add qwen image edit support

leejet 2025-10-08 19:51:15 +08:00
parent 58e81adf61
commit 40752b629f
5 changed files with 346 additions and 150 deletions

View File

@@ -15,28 +15,28 @@ struct SDCondition {
: c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat) {}
};
struct ConditionerParams {
std::string text;
int clip_skip = -1;
int width = -1;
int height = -1;
int adm_in_channels = -1;
bool zero_out_masked = false;
int num_input_imgs = 0; // for photomaker
std::vector<sd_image_t*> ref_images = {}; // for qwen image edit
};
struct Conditioner {
virtual SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
const std::string& text,
int clip_skip,
int width,
int height,
int adm_in_channels = -1,
bool zero_out_masked = false) = 0;
virtual void alloc_params_buffer() = 0;
virtual void free_params_buffer() = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
virtual size_t get_params_buffer_size() = 0;
const ConditionerParams& conditioner_params) = 0;
virtual void alloc_params_buffer() = 0;
virtual void free_params_buffer() = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
virtual size_t get_params_buffer_size() = 0;
virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
int n_threads,
const std::string& text,
int clip_skip,
int width,
int height,
int num_input_imgs,
int adm_in_channels = -1,
bool zero_out_masked = false) {
const ConditionerParams& conditioner_params) {
GGML_ABORT("Not implemented yet!");
}
virtual std::string remove_trigger_from_prompt(ggml_context* work_ctx,
@@ -555,20 +555,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
std::tuple<SDCondition, std::vector<bool>>
get_learned_condition_with_trigger(ggml_context* work_ctx,
int n_threads,
const std::string& text,
int clip_skip,
int width,
int height,
int num_input_imgs,
int adm_in_channels = -1,
bool zero_out_masked = false) {
const ConditionerParams& conditioner_params) {
auto image_tokens = convert_token_to_id(trigger_word);
// if(image_tokens.size() == 1){
// printf(" image token id is: %d \n", image_tokens[0]);
// }
GGML_ASSERT(image_tokens.size() == 1);
auto tokens_and_weights = tokenize_with_trigger_token(text,
num_input_imgs,
auto tokens_and_weights = tokenize_with_trigger_token(conditioner_params.text,
conditioner_params.num_input_imgs,
image_tokens[0],
true);
std::vector<int>& tokens = std::get<0>(tokens_and_weights);
@@ -582,7 +576,15 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
// for(int i = 0; i < clsm.size(); ++i)
// printf("%d ", clsm[i]?1:0);
// printf("\n");
auto cond = get_learned_condition_common(work_ctx, n_threads, tokens, weights, clip_skip, width, height, adm_in_channels, zero_out_masked);
auto cond = get_learned_condition_common(work_ctx,
n_threads,
tokens,
weights,
conditioner_params.clip_skip,
conditioner_params.width,
conditioner_params.height,
conditioner_params.adm_in_channels,
conditioner_params.zero_out_masked);
return std::make_tuple(cond, clsm);
}
@@ -600,16 +602,19 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
const std::string& text,
int clip_skip,
int width,
int height,
int adm_in_channels = -1,
bool zero_out_masked = false) {
auto tokens_and_weights = tokenize(text, true);
const ConditionerParams& conditioner_params) {
auto tokens_and_weights = tokenize(conditioner_params.text, true);
std::vector<int>& tokens = tokens_and_weights.first;
std::vector<float>& weights = tokens_and_weights.second;
return get_learned_condition_common(work_ctx, n_threads, tokens, weights, clip_skip, width, height, adm_in_channels, zero_out_masked);
return get_learned_condition_common(work_ctx,
n_threads,
tokens,
weights,
conditioner_params.clip_skip,
conditioner_params.width,
conditioner_params.height,
conditioner_params.adm_in_channels,
conditioner_params.zero_out_masked);
}
};
@@ -974,14 +979,13 @@ struct SD3CLIPEmbedder : public Conditioner {
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
const std::string& text,
int clip_skip,
int width,
int height,
int adm_in_channels = -1,
bool zero_out_masked = false) {
auto tokens_and_weights = tokenize(text, 77, true);
return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked);
const ConditionerParams& conditioner_params) {
auto tokens_and_weights = tokenize(conditioner_params.text, 77, true);
return get_learned_condition_common(work_ctx,
n_threads,
tokens_and_weights,
conditioner_params.clip_skip,
conditioner_params.zero_out_masked);
}
};
@@ -1174,14 +1178,13 @@ struct FluxCLIPEmbedder : public Conditioner {
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
const std::string& text,
int clip_skip,
int width,
int height,
int adm_in_channels = -1,
bool zero_out_masked = false) {
auto tokens_and_weights = tokenize(text, chunk_len, true);
return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked);
const ConditionerParams& conditioner_params) {
auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true);
return get_learned_condition_common(work_ctx,
n_threads,
tokens_and_weights,
conditioner_params.clip_skip,
conditioner_params.zero_out_masked);
}
};
@@ -1360,14 +1363,13 @@ struct T5CLIPEmbedder : public Conditioner {
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
const std::string& text,
int clip_skip,
int width,
int height,
int adm_in_channels = -1,
bool zero_out_masked = false) {
auto tokens_and_weights = tokenize(text, chunk_len, true);
return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked);
const ConditionerParams& conditioner_params) {
auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true);
return get_learned_condition_common(work_ctx,
n_threads,
tokens_and_weights,
conditioner_params.clip_skip,
conditioner_params.zero_out_masked);
}
};
@@ -1379,8 +1381,13 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
Qwen2_5_VLCLIPEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {},
const std::string prefix = "") {
qwenvl = std::make_shared<Qwen::Qwen2_5_VLRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.qwen2vl");
const std::string prefix = "",
bool enable_vision = false) {
qwenvl = std::make_shared<Qwen::Qwen2_5_VLRunner>(backend,
offload_params_to_cpu,
tensor_types,
"text_encoders.qwen2vl",
enable_vision);
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -1436,13 +1443,78 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
return {tokens, weights};
}
SDCondition get_learned_condition_common(ggml_context* work_ctx,
int n_threads,
std::tuple<std::vector<int>, std::vector<float>> token_and_weights,
int clip_skip,
bool zero_out_masked = false) {
auto& tokens = std::get<0>(token_and_weights);
auto& weights = std::get<1>(token_and_weights);
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
const ConditionerParams& conditioner_params) {
std::string prompt;
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
if (qwenvl->enable_vision && conditioner_params.ref_images.size() > 0) {
LOG_INFO("QwenImageEditPlusPipeline");
prompt_template_encode_start_idx = 64;
int image_embed_idx = 64 + 6;
int min_pixels = 56 * 56;
int max_pixels = 560 * 560;
std::string placeholder = "<|image_pad|>";
std::string img_prompt;
for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);
double factor = qwenvl->params.vision.patch_size * qwenvl->params.vision.spatial_merge_size;
int height = image.height;
int width = image.width;
int h_bar = static_cast<int>(std::round(height / factor)) * factor;
int w_bar = static_cast<int>(std::round(width / factor)) * factor;
if (static_cast<double>(h_bar) * w_bar > max_pixels) {
double beta = std::sqrt((height * width) / static_cast<double>(max_pixels));
h_bar = std::max(static_cast<int>(factor),
static_cast<int>(std::floor(height / beta / factor)) * static_cast<int>(factor));
w_bar = std::max(static_cast<int>(factor),
static_cast<int>(std::floor(width / beta / factor)) * static_cast<int>(factor));
} else if (static_cast<double>(h_bar) * w_bar < min_pixels) {
double beta = std::sqrt(static_cast<double>(min_pixels) / (height * width));
h_bar = static_cast<int>(std::ceil(height * beta / factor)) * static_cast<int>(factor);
w_bar = static_cast<int>(std::ceil(width * beta / factor)) * static_cast<int>(factor);
}
LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, image.height, image.width, h_bar, w_bar);
sd_image_f32_t resized_image = clip_preprocess(image, w_bar, h_bar);
free(image.data);
image.data = nullptr;
ggml_tensor* image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
sd_image_f32_to_tensor(resized_image.data, image_tensor, false);
free(resized_image.data);
resized_image.data = nullptr;
ggml_tensor* image_embed = nullptr;
qwenvl->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
image_embeds.emplace_back(image_embed_idx, image_embed);
image_embed_idx += 1 + image_embed->ne[1] + 6;
img_prompt += "Picture " + std::to_string(i + 1) + ": <|vision_start|>"; // [24669, 220, index, 25, 220, 151652]
int64_t num_image_tokens = image_embed->ne[1];
img_prompt.reserve(num_image_tokens * placeholder.size());
for (int j = 0; j < num_image_tokens; j++) {
img_prompt += placeholder;
}
img_prompt += "<|vision_end|>";
}
prompt = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
prompt += img_prompt;
prompt += conditioner_params.text;
prompt += "<|im_end|>\n<|im_start|>assistant\n";
} else {
prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n" + conditioner_params.text + "<|im_end|>\n<|im_start|>assistant\n";
}
auto tokens_and_weights = tokenize(prompt, 0, false);
auto& tokens = std::get<0>(tokens_and_weights);
auto& weights = std::get<1>(tokens_and_weights);
int64_t t0 = ggml_time_ms();
struct ggml_tensor* hidden_states = NULL; // [N, n_token, 3584]
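The loop above snaps each reference image to the vision patch grid before encoding: both sides are rounded to a multiple of factor = patch_size * spatial_merge_size, then rescaled so the total pixel count stays between 56*56 and 560*560. A minimal standalone sketch of that rule, assuming the usual Qwen2.5-VL values patch_size = 14 and spatial_merge_size = 2 (so factor = 28); the helper name smart_resize is illustrative, not part of the commit:
#include <algorithm>
#include <cmath>
#include <cstdio>
// Illustrative helper mirroring the rounding/clamping above (not the committed API).
// factor = patch_size * spatial_merge_size, assumed 14 * 2 = 28 here.
static void smart_resize(int height, int width, double factor,
                         double min_pixels, double max_pixels,
                         int* h_bar, int* w_bar) {
    // round each side to the nearest multiple of factor
    *h_bar = static_cast<int>(std::round(height / factor) * factor);
    *w_bar = static_cast<int>(std::round(width / factor) * factor);
    if (static_cast<double>(*h_bar) * (*w_bar) > max_pixels) {
        // too many pixels: shrink both sides by the same ratio, flooring to a multiple of factor
        double beta = std::sqrt(height * static_cast<double>(width) / max_pixels);
        *h_bar = std::max(static_cast<int>(factor), static_cast<int>(std::floor(height / beta / factor)) * static_cast<int>(factor));
        *w_bar = std::max(static_cast<int>(factor), static_cast<int>(std::floor(width / beta / factor)) * static_cast<int>(factor));
    } else if (static_cast<double>(*h_bar) * (*w_bar) < min_pixels) {
        // too few pixels: grow both sides, ceiling to a multiple of factor
        double beta = std::sqrt(min_pixels / (height * static_cast<double>(width)));
        *h_bar = static_cast<int>(std::ceil(height * beta / factor)) * static_cast<int>(factor);
        *w_bar = static_cast<int>(std::ceil(width * beta / factor)) * static_cast<int>(factor);
    }
}
int main() {
    int h_bar = 0, w_bar = 0;
    // a 600x800 reference image rounds to 588x812 (too many pixels), then shrinks to 476x644,
    // i.e. 34x46 patches of 14, or 17x23 merged patches
    smart_resize(600, 800, 28.0, 56.0 * 56.0, 560.0 * 560.0, &h_bar, &w_bar);
    std::printf("600x800 -> %dx%d\n", h_bar, w_bar);
    return 0;
}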
@@ -1451,6 +1523,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
qwenvl->compute(n_threads,
input_ids,
image_embeds,
&hidden_states,
work_ctx);
{
@@ -1486,19 +1559,6 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
return SDCondition(new_hidden_states, nullptr, nullptr);
}
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
const std::string& text,
int clip_skip,
int width,
int height,
int adm_in_channels = -1,
bool zero_out_masked = false) {
std::string prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n" + text + "<|im_end|>\n<|im_start|>assistant\n";
auto tokens_and_weights = tokenize(prompt, 0, false);
return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked);
}
};
#endif
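In the edit path above, every reference image is announced in the user prompt as "Picture N: <|vision_start|>", followed by one <|image_pad|> per image token and a closing <|vision_end|>. The recorded offsets follow directly from that layout: the edit template prefix is assumed to tokenize to 64 tokens (prompt_template_encode_start_idx), the picture header is 6 tokens, and each subsequent image starts after the previous pads plus the single vision-end token plus the next 6-token header. A short sketch of that bookkeeping with hypothetical per-image token counts:
#include <cstdio>
#include <string>
#include <vector>
int main() {
    // hypothetical per-image token counts; in the code above this is image_embed->ne[1]
    std::vector<int> num_image_tokens = {391, 120};
    const int template_prefix_tokens = 64; // prompt_template_encode_start_idx
    const int picture_header_tokens  = 6;  // "Picture N: <|vision_start|>"
    const int vision_end_tokens      = 1;  // "<|vision_end|>"
    std::string img_prompt;
    int image_embed_idx = template_prefix_tokens + picture_header_tokens; // 64 + 6
    for (size_t i = 0; i < num_image_tokens.size(); i++) {
        std::printf("image %zu embeddings start at token %d\n", i, image_embed_idx);
        img_prompt += "Picture " + std::to_string(i + 1) + ": <|vision_start|>";
        for (int j = 0; j < num_image_tokens[i]; j++) {
            img_prompt += "<|image_pad|>";
        }
        img_prompt += "<|vision_end|>";
        image_embed_idx += num_image_tokens[i] + vision_end_tokens + picture_header_tokens;
    }
    // prints 70 for the first image and 468 for the second (70 + 391 + 1 + 6)
    return 0;
}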

View File

@@ -1144,10 +1144,6 @@ bool load_images_from_dir(const std::string dir,
int main(int argc, const char* argv[]) {
SDParams params;
params.verbose = true;
sd_set_log_callback(sd_log_cb, (void*)&params);
Qwen::Qwen2_5_VLEmbedder::load_from_file_and_test(argv[1]);
return 1;
parse_args(argc, argv, params);
params.sample_params.guidance.slg.layers = params.skip_layers.data();
params.sample_params.guidance.slg.layer_count = params.skip_layers.size();

View File

@@ -113,7 +113,6 @@ const char* unused_tensors[] = {
"text_encoders.t5xxl.transformer.encoder.embed_tokens.weight", // only used during training
"text_encoders.qwen2vl.output.weight",
"text_encoders.qwen2vl.lm_head.",
"text_encoders.qwen2vl.visual.",
};
bool is_unused_tensor(std::string name) {

View File

@@ -692,7 +692,8 @@ namespace Qwen {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* input_ids,
struct ggml_tensor* input_pos) {
struct ggml_tensor* input_pos,
std::vector<std::pair<int, ggml_tensor*>> image_embeds) {
// input_ids: [N, n_token]
// return: [N, n_token, hidden_size]
@@ -701,6 +702,46 @@ namespace Qwen {
auto x = embed_tokens->forward(ctx, input_ids);
if (image_embeds.size() > 0) {
GGML_ASSERT(x->ne[2] == 1); // N == 1
auto raw_x = ggml_cast(ctx, x, image_embeds[0].second->type);
int64_t txt_token_start = 0;
int64_t txt_token_end = 0;
ggml_tensor* input_embed = nullptr;
for (int i = 0; i < image_embeds.size(); i++) {
if (i == 0) {
txt_token_start = 0;
} else {
txt_token_start = image_embeds[i - 1].first + image_embeds[i - 1].second->ne[1];
}
txt_token_end = image_embeds[i].first;
auto txt_embed = ggml_slice(ctx, raw_x, 1, txt_token_start, txt_token_end);
if (input_embed == nullptr) {
input_embed = txt_embed;
} else {
input_embed = ggml_concat(ctx, input_embed, txt_embed, 1);
}
auto image_embed = image_embeds[i].second;
input_embed = ggml_concat(ctx, input_embed, image_embed, 1);
}
auto final_txt_embed = ggml_slice(ctx,
raw_x,
1,
image_embeds[image_embeds.size() - 1].first + image_embeds[image_embeds.size() - 1].second->ne[1],
raw_x->ne[1]);
input_embed = ggml_concat(ctx, input_embed, final_txt_embed, 1);
GGML_ASSERT(raw_x->ne[1] == input_embed->ne[1]);
x = input_embed;
}
for (int i = 0; i < num_layers; i++) {
auto block = std::dynamic_pointer_cast<Qwen2_5_VLBlock>(blocks["layers." + std::to_string(i)]);
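The block above swaps each run of <|image_pad|> embeddings for the corresponding vision-encoder output while keeping the total token count unchanged. The same index bookkeeping, sketched on plain per-token vectors instead of ggml tensors (splice_image_embeds is an illustrative helper, not part of the commit):
#include <cassert>
#include <utility>
#include <vector>
using Rows = std::vector<std::vector<float>>; // one row of hidden values per token
// image_embeds holds (start token index, vision rows) pairs, ordered by index,
// where the start index points at the first <|image_pad|> token of that image.
static Rows splice_image_embeds(const Rows& text_rows,
                                const std::vector<std::pair<int, Rows>>& image_embeds) {
    Rows out;
    size_t txt_token_start = 0;
    for (const auto& item : image_embeds) {
        size_t idx = static_cast<size_t>(item.first);
        const Rows& img_rows = item.second;
        // text embeddings before this image's pad run
        out.insert(out.end(), text_rows.begin() + txt_token_start, text_rows.begin() + idx);
        // the vision embeddings replace the pad embeddings one-for-one
        out.insert(out.end(), img_rows.begin(), img_rows.end());
        txt_token_start = idx + img_rows.size();
    }
    // trailing text embeddings after the last image
    out.insert(out.end(), text_rows.begin() + txt_token_start, text_rows.end());
    assert(out.size() == text_rows.size()); // same check as GGML_ASSERT(raw_x->ne[1] == input_embed->ne[1])
    return out;
}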
@@ -770,11 +811,12 @@ namespace Qwen {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* input_ids,
struct ggml_tensor* input_pos) {
struct ggml_tensor* input_pos,
std::vector<std::pair<int, ggml_tensor*>> image_embeds) {
// input_ids: [N, n_token]
auto model = std::dynamic_pointer_cast<Qwen2_5_VLTextModel>(blocks["model"]);
auto x = model->forward(ctx, backend, input_ids, input_pos);
auto x = model->forward(ctx, backend, input_ids, input_pos, image_embeds);
return x;
}
@@ -793,6 +835,7 @@ namespace Qwen {
struct Qwen2_5_VLRunner : public GGMLRunner {
Qwen2_5_VLParams params;
bool enable_vision;
Qwen2_5_VL model;
std::vector<int> input_pos_vec;
@@ -805,8 +848,27 @@ namespace Qwen {
bool offload_params_to_cpu,
const String2GGMLType& tensor_types,
const std::string prefix,
bool enable_vision = false)
: GGMLRunner(backend, offload_params_to_cpu), model(params, enable_vision) {
bool enable_vision_ = false)
: GGMLRunner(backend, offload_params_to_cpu), enable_vision(enable_vision_) {
bool have_vision_weight = false;
for (auto pair : tensor_types) {
std::string tensor_name = pair.first;
if (tensor_name.find(prefix) == std::string::npos)
continue;
size_t pos = tensor_name.find("visual.");
if (pos != std::string::npos) {
have_vision_weight = true;
break;
}
}
if (enable_vision && !have_vision_weight) {
LOG_WARN("no vision weights detected, vision disabled");
enable_vision = false;
}
if (enable_vision) {
LOG_DEBUG("enable qwen2vl vision");
}
model = Qwen2_5_VL(params, enable_vision);
model.init(params_ctx, tensor_types, prefix);
}
@@ -821,8 +883,9 @@ namespace Qwen {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* input_ids,
struct ggml_tensor* input_pos) {
auto hidden_states = model.forward(ctx, backend, input_ids, input_pos); // [N, n_token, hidden_size]
struct ggml_tensor* input_pos,
std::vector<std::pair<int, ggml_tensor*>> image_embeds) {
auto hidden_states = model.forward(ctx, backend, input_ids, input_pos, image_embeds); // [N, n_token, hidden_size]
return hidden_states;
}
@@ -837,11 +900,15 @@ namespace Qwen {
return hidden_states;
}
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids) {
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, std::vector<std::pair<int, ggml_tensor*>> image_embeds) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
input_ids = to_backend(input_ids);
for (auto& image_embed : image_embeds) {
image_embed.second = to_backend(image_embed.second);
}
int64_t n_tokens = input_ids->ne[0];
input_pos_vec.resize(n_tokens * 4);
for (int i = 0; i < n_tokens; ++i) {
@@ -856,7 +923,7 @@ namespace Qwen {
n_tokens * 4);
set_backend_tensor_data(input_pos, input_pos_vec.data());
struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, input_pos);
struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, input_pos, image_embeds);
ggml_build_forward_expand(gf, hidden_states);
@@ -865,14 +932,24 @@ namespace Qwen {
void compute(const int n_threads,
struct ggml_tensor* input_ids,
std::vector<std::pair<int, ggml_tensor*>> image_embeds,
ggml_tensor** output,
ggml_context* output_ctx = NULL) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(input_ids);
return build_graph(input_ids, image_embeds);
};
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
}
int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) {
int grid_t = 1;
int grid_h = h / params.vision.patch_size;
int grid_w = w / params.vision.patch_size;
int llm_grid_h = grid_h / params.vision.spatial_merge_size;
int llm_grid_w = grid_w / params.vision.spatial_merge_size;
return grid_t * grid_h * grid_w;
}
struct ggml_tensor* process_image(struct ggml_context* ctx, struct ggml_tensor* image) {
// image: [C, H, W]
// return: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw], grid_t == 1
@@ -1030,7 +1107,7 @@ namespace Qwen {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_encode_image_graph(image);
};
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
};
@@ -1097,9 +1174,59 @@ namespace Qwen {
struct ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != NULL);
bool test_vit = true;
bool test_vit = true;
bool test_decoder_with_vit = true;
if (test_vit) {
if (test_decoder_with_vit) {
ggml_tensor* image_embed = nullptr;
{
auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin");
print_ggml_tensor(image, false, "image");
struct ggml_tensor* out = NULL;
int t0 = ggml_time_ms();
model.encode_image(8, image, &out, work_ctx);
int t1 = ggml_time_ms();
print_ggml_tensor(out, false, "image_embed");
image_embed = out;
LOG_DEBUG("qwen2vl encode_image test done in %dms", t1 - t0);
}
std::string placeholder = "<|image_pad|>";
std::string img_prompt = "Picture 1: <|vision_start|>"; // [24669, 220, 16, 25, 220, 151652]
int64_t num_image_tokens = image_embed->ne[1];
img_prompt.reserve(num_image_tokens * placeholder.size());
for (int i = 0; i < num_image_tokens; i++) {
img_prompt += placeholder;
}
img_prompt += "<|vision_end|>";
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
image_embeds.emplace_back(64, image_embed);
std::string text = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
text += img_prompt;
text += "change 'flux.cpp' to 'edit.cpp'";
text += "<|im_end|>\n<|im_start|>assistant\n";
auto tokens_and_weights = tokenize(text, 0, false);
std::vector<int>& tokens = std::get<0>(tokens_and_weights);
std::vector<float>& weights = std::get<1>(tokens_and_weights);
for (auto token : tokens) {
printf("%d ", token);
}
printf("\n");
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
struct ggml_tensor* out = NULL;
int t0 = ggml_time_ms();
model.compute(8, input_ids, image_embeds, &out, work_ctx);
int t1 = ggml_time_ms();
print_ggml_tensor(out);
LOG_DEBUG("qwen2vl test done in %dms", t1 - t0);
} else if (test_vit) {
// auto image = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 280, 280, 3);
// ggml_set_f32(image, 0.f);
auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin");
@@ -1129,7 +1256,7 @@ namespace Qwen {
struct ggml_tensor* out = NULL;
int t0 = ggml_time_ms();
model.compute(8, input_ids, &out, work_ctx);
model.compute(8, input_ids, {}, &out, work_ctx);
int t1 = ggml_time_ms();
print_ggml_tensor(out);

View File

@@ -272,6 +272,15 @@ public:
return false;
}
auto& tensor_types = model_loader.tensor_storages_types;
for (auto& item : tensor_types) {
// LOG_DEBUG("%s %u", item.first.c_str(), item.second);
if (contains(item.first, "qwen2vl") && ends_with(item.first, "weight") && (item.second == GGML_TYPE_F32 || item.second == GGML_TYPE_BF16)) {
item.second = GGML_TYPE_F16;
// LOG_DEBUG(" change %s %u", item.first.c_str(), item.second);
}
}
LOG_INFO("Version: %s ", model_version_to_str[version]);
ggml_type wtype = (int)sd_ctx_params->wtype < std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT)
? (ggml_type)sd_ctx_params->wtype
@@ -420,9 +429,15 @@ public:
clip_vision->get_param_tensors(tensors);
}
} else if (sd_version_is_qwen_image(version)) {
bool enable_vision = false;
if (!vae_decode_only) {
enable_vision = true;
}
cond_stage_model = std::make_shared<Qwen2_5_VLCLIPEmbedder>(clip_backend,
offload_params_to_cpu,
model_loader.tensor_storages_types);
model_loader.tensor_storages_types,
"",
enable_vision);
diffusion_model = std::make_shared<QwenImageModel>(backend,
offload_params_to_cpu,
model_loader.tensor_storages_types,
@@ -594,6 +609,7 @@ public:
if (vae_decode_only) {
ignore_tensors.insert("first_stage_model.encoder");
ignore_tensors.insert("first_stage_model.quant");
ignore_tensors.insert("text_encoders.qwen2vl.visual.");
}
if (version == VERSION_SVD) {
ignore_tensors.insert("conditioner.embedders.3");
@@ -1977,6 +1993,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
sd_image_t control_image,
float control_strength,
sd_pm_params_t pm_params,
std::vector<sd_image_t*> ref_images,
std::vector<ggml_tensor*> ref_latents,
bool increase_ref_index,
ggml_tensor* concat_latent = NULL,
@@ -2007,6 +2024,14 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
ggml_tensor* init_img = NULL;
SDCondition id_cond;
std::vector<bool> class_tokens_mask;
ConditionerParams condition_params;
condition_params.clip_skip = clip_skip;
condition_params.width = width;
condition_params.height = height;
condition_params.ref_images = ref_images;
condition_params.adm_in_channels = sd_ctx->sd->diffusion_model->get_adm_in_channels();
if (sd_ctx->sd->stacked_id) {
if (!sd_ctx->sd->pmid_lora->applied) {
int64_t t0 = ggml_time_ms();
@@ -2046,17 +2071,15 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
}
processed_id_images.clear();
int64_t t0 = ggml_time_ms();
auto cond_tup = sd_ctx->sd->cond_stage_model->get_learned_condition_with_trigger(work_ctx,
sd_ctx->sd->n_threads, prompt,
clip_skip,
width,
height,
pm_params.id_images_count,
sd_ctx->sd->diffusion_model->get_adm_in_channels());
id_cond = std::get<0>(cond_tup);
class_tokens_mask = std::get<1>(cond_tup); //
struct ggml_tensor* id_embeds = NULL;
int64_t t0 = ggml_time_ms();
condition_params.text = prompt;
condition_params.num_input_imgs = pm_params.id_images_count;
auto cond_tup = sd_ctx->sd->cond_stage_model->get_learned_condition_with_trigger(work_ctx,
sd_ctx->sd->n_threads,
condition_params);
id_cond = std::get<0>(cond_tup);
class_tokens_mask = std::get<1>(cond_tup); //
struct ggml_tensor* id_embeds = NULL;
if (pmv2 && pm_params.id_embed_path != nullptr) {
id_embeds = load_tensor_from_file(work_ctx, pm_params.id_embed_path);
// print_ggml_tensor(id_embeds, true, "id_embeds:");
@@ -2082,14 +2105,12 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
}
// Get learned condition
t0 = ggml_time_ms();
SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
sd_ctx->sd->n_threads,
prompt,
clip_skip,
width,
height,
sd_ctx->sd->diffusion_model->get_adm_in_channels());
t0 = ggml_time_ms();
condition_params.text = prompt;
condition_params.zero_out_masked = false;
SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
sd_ctx->sd->n_threads,
condition_params);
SDCondition uncond;
if (guidance.txt_cfg != 1.0 ||
@@ -2098,14 +2119,11 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) {
zero_out_masked = true;
}
uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
sd_ctx->sd->n_threads,
negative_prompt,
clip_skip,
width,
height,
sd_ctx->sd->diffusion_model->get_adm_in_channels(),
zero_out_masked);
condition_params.text = negative_prompt;
condition_params.zero_out_masked = zero_out_masked;
uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
sd_ctx->sd->n_threads,
condition_params);
}
int64_t t1 = ggml_time_ms();
LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);
@@ -2507,6 +2525,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
sd_img_gen_params->control_image,
sd_img_gen_params->control_strength,
sd_img_gen_params->pm_params,
ref_images,
ref_latents,
sd_img_gen_params->increase_ref_index,
concat_latent,
@@ -2764,30 +2783,25 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
}
// Get learned condition
bool zero_out_masked = true;
int64_t t1 = ggml_time_ms();
SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
sd_ctx->sd->n_threads,
prompt,
sd_vid_gen_params->clip_skip,
width,
height,
sd_ctx->sd->diffusion_model->get_adm_in_channels(),
zero_out_masked);
cond.c_concat = concat_latent;
cond.c_vector = clip_vision_output;
ConditionerParams condition_params;
condition_params.clip_skip = sd_vid_gen_params->clip_skip;
condition_params.zero_out_masked = true;
condition_params.text = prompt;
int64_t t1 = ggml_time_ms();
SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
sd_ctx->sd->n_threads,
condition_params);
cond.c_concat = concat_latent;
cond.c_vector = clip_vision_output;
SDCondition uncond;
if (sd_vid_gen_params->sample_params.guidance.txt_cfg != 1.0 || sd_vid_gen_params->high_noise_sample_params.guidance.txt_cfg != 1.0) {
uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
sd_ctx->sd->n_threads,
negative_prompt,
sd_vid_gen_params->clip_skip,
width,
height,
sd_ctx->sd->diffusion_model->get_adm_in_channels(),
zero_out_masked);
uncond.c_concat = concat_latent;
uncond.c_vector = clip_vision_output;
condition_params.text = negative_prompt;
uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
sd_ctx->sd->n_threads,
condition_params);
uncond.c_concat = concat_latent;
uncond.c_vector = clip_vision_output;
}
int64_t t2 = ggml_time_ms();
LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t2 - t1);