rename qwenvl to llm

leejet 2025-11-29 14:06:46 +08:00
parent 66e27de9bd
commit 7a2a7d0767
10 changed files with 60 additions and 52 deletions

View File

@@ -2,7 +2,7 @@
 #define __CONDITIONER_HPP__
 #include "clip.hpp"
-#include "qwenvl.hpp"
+#include "llm.hpp"
 #include "t5.hpp"
 struct SDCondition {
@@ -1648,12 +1648,12 @@ struct LLMEmbedder : public Conditioner {
 backend,
 offload_params_to_cpu,
 tensor_storage_map,
-"text_encoders.qwen2vl",
+"text_encoders.llm",
 enable_vision);
 }
 void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
-llm->get_param_tensors(tensors, "text_encoders.qwen2vl");
+llm->get_param_tensors(tensors, "text_encoders.llm");
 }
 void alloc_params_buffer() override {

View File

@@ -14,7 +14,7 @@
 ## Examples
 ```
-.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线 探索视觉生成基础模型的极限开创理解与生成一体化的未来。二、Qwen-Image的模型特色1、复杂文字渲染。支持中英渲染、自动布局 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
+.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线 探索视觉生成基础模型的极限开创理解与生成一体化的未来。二、Qwen-Image的模型特色1、复杂文字渲染。支持中英渲染、自动布局 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
 ```
 <img alt="qwen example" src="../assets/qwen/example.png" />

View File

@@ -20,7 +20,7 @@
 ### Qwen Image Edit
 ```
-.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
+.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
 ```
 <img alt="qwen_image_edit" src="../assets/qwen/qwen_image_edit.png" />
@@ -29,7 +29,7 @@
 ### Qwen Image Edit 2509
 ```
-.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --qwen2vl_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
+.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
 ```
 <img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />

View File

@@ -9,8 +9,10 @@ Options:
 --clip_g <string> path to the clip-g text encoder
 --clip_vision <string> path to the clip-vision encoder
 --t5xxl <string> path to the t5xxl text encoder
---qwen2vl <string> path to the qwen2vl text encoder
---qwen2vl_vision <string> path to the qwen2vl vit
+--llm <string> path to the llm text encoder (e.g. qwen2.5-vl for qwen-image, mistral-small3.2 for flux2, ...)
+--llm_vision <string> path to the llm vision encoder (ViT)
+--qwen2vl <string> alias of --llm. Deprecated.
+--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
 --diffusion-model <string> path to the standalone diffusion model
 --high-noise-diffusion-model <string> path to the standalone high noise diffusion model
 --vae <string> path to standalone vae model
@@ -33,7 +35,6 @@ Options:
 -p, --prompt <string> the prompt to render
 -n, --negative-prompt <string> the negative prompt (default: "")
 --preview-path <string> path to write preview image to (default: ./preview.png)
---easycache <string> enable EasyCache for DiT models, accepts optional "threshold,start_percent,end_percent" values (defaults to 0.2,0.15,0.95)
 --upscale-model <string> path to esrgan model.
 -t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
 CPU physical cores
@@ -105,15 +106,13 @@ Options:
 contain any quantized parameters, the at_runtime mode will be used; otherwise,
 immediately will be used. The immediately mode may have precision and
 compatibility issues with quantized parameters, but it usually offers faster inference
-speed and, in some cases, lower memory usage. The at_runtime mode, on the other
-hand, is exactly the opposite.
+speed and, in some cases, lower memory usage. The at_runtime mode, on the
+other hand, is exactly the opposite.
 --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm],
 default: discrete
 --skip-layers layers to skip for SLG steps (default: [7,8,9])
 --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
 ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
---high-noise-scheduler (high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform,
-simple], default: discrete
 --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
 -r, --ref-image reference image for Flux Kontext models (can be used multiple times)
 -h, --help show this help message and exit
@@ -121,4 +120,5 @@ Options:
 --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
 (overrides --vae-tile-size)
 --preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
+--easycache enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95)
 ```

View File

@@ -70,8 +70,8 @@ struct SDParams {
 std::string clip_g_path;
 std::string clip_vision_path;
 std::string t5xxl_path;
-std::string qwen2vl_path;
-std::string qwen2vl_vision_path;
+std::string llm_path;
+std::string llm_vision_path;
 std::string diffusion_model_path;
 std::string high_noise_diffusion_model_path;
 std::string vae_path;
@@ -174,8 +174,8 @@ void print_params(SDParams params) {
 printf(" clip_g_path: %s\n", params.clip_g_path.c_str());
 printf(" clip_vision_path: %s\n", params.clip_vision_path.c_str());
 printf(" t5xxl_path: %s\n", params.t5xxl_path.c_str());
-printf(" qwen2vl_path: %s\n", params.qwen2vl_path.c_str());
-printf(" qwen2vl_vision_path: %s\n", params.qwen2vl_vision_path.c_str());
+printf(" llm_path: %s\n", params.llm_path.c_str());
+printf(" llm_vision_path: %s\n", params.llm_vision_path.c_str());
 printf(" diffusion_model_path: %s\n", params.diffusion_model_path.c_str());
 printf(" high_noise_diffusion_model_path: %s\n", params.high_noise_diffusion_model_path.c_str());
 printf(" vae_path: %s\n", params.vae_path.c_str());
@@ -532,14 +532,22 @@ void parse_args(int argc, const char** argv, SDParams& params) {
 "--t5xxl",
 "path to the t5xxl text encoder",
 &params.t5xxl_path},
+{"",
+"--llm",
+"path to the llm text encoder (e.g. qwen2.5-vl for qwen-image, mistral-small3.2 for flux2, ...)",
+&params.llm_path},
+{"",
+"--llm_vision",
+"path to the llm vision encoder (ViT)",
+&params.llm_vision_path},
 {"",
 "--qwen2vl",
-"path to the qwen2vl text encoder",
-&params.qwen2vl_path},
+"alias of --llm. Deprecated.",
+&params.llm_path},
 {"",
 "--qwen2vl_vision",
-"path to the qwen2vl vit",
-&params.qwen2vl_vision_path},
+"alias of --llm_vision. Deprecated.",
+&params.llm_vision_path},
 {"",
 "--diffusion-model",
 "path to the standalone diffusion model",
@@ -1230,7 +1238,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
 on_relative_tile_size_arg},
 {"",
 "--preview",
-std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")\n",
+std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")",
 on_preview_arg},
 {"",
 "--easycache",
@@ -1428,7 +1436,7 @@ std::string get_image_params(SDParams params, int64_t seed) {
 parameter_string += " " + std::string(sd_scheduler_name(params.sample_params.scheduler));
 }
 parameter_string += ", ";
-for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path, params.qwen2vl_path, params.qwen2vl_vision_path}) {
+for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path, params.llm_path, params.llm_vision_path}) {
 if (!te.empty()) {
 parameter_string += "TE: " + sd_basename(te) + ", ";
 }
@@ -1845,8 +1853,8 @@ int main(int argc, const char* argv[]) {
 params.clip_g_path.c_str(),
 params.clip_vision_path.c_str(),
 params.t5xxl_path.c_str(),
-params.qwen2vl_path.c_str(),
-params.qwen2vl_vision_path.c_str(),
+params.llm_path.c_str(),
+params.llm_vision_path.c_str(),
 params.diffusion_model_path.c_str(),
 params.high_noise_diffusion_model_path.c_str(),
 params.vae_path.c_str(),
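Note how the backward compatibility works here: the deprecated `--qwen2vl` and `--qwen2vl_vision` entries now share their destination fields (`&params.llm_path`, `&params.llm_vision_path`) with the new flags, so old command lines keep functioning. Below is a minimal standalone sketch of that shared-destination pattern, with a hypothetical parser loop rather than the project's actual one:

```cpp
#include <cstring>
#include <string>
#include <vector>

// Hedged sketch, not the repo's parser: an option and its deprecated alias
// both point at the same field, so whichever flag appears writes llm_path.
struct SDParams {
    std::string llm_path;
    std::string llm_vision_path;
};

struct StringOption {
    const char* flag;
    const char* help;
    std::string* dest;  // shared between a flag and its alias
};

int main(int argc, const char** argv) {
    SDParams params;
    std::vector<StringOption> options = {
        {"--llm", "path to the llm text encoder", &params.llm_path},
        {"--qwen2vl", "alias of --llm. Deprecated.", &params.llm_path},  // same dest
        {"--llm_vision", "path to the llm vision encoder (ViT)", &params.llm_vision_path},
        {"--qwen2vl_vision", "alias of --llm_vision. Deprecated.", &params.llm_vision_path},
    };
    for (int i = 1; i + 1 < argc; i++) {
        for (auto& opt : options) {
            if (std::strcmp(argv[i], opt.flag) == 0) {
                *opt.dest = argv[++i];  // consume the flag's value
                break;
            }
        }
    }
    return 0;
}
```

Because both entries write the same field, the last flag given wins, which is the behavior one would expect from an alias.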

View File

@@ -1549,7 +1549,7 @@ namespace LLM {
 ggml_type model_data_type = GGML_TYPE_COUNT;
 ModelLoader model_loader;
-if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.qwen2vl.")) {
+if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.llm.")) {
 LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
 return;
 }
@@ -1569,12 +1569,12 @@ namespace LLM {
 backend,
 true,
 tensor_storage_map,
-"text_encoders.qwen2vl",
+"text_encoders.llm",
 true);
 llm->alloc_params_buffer();
 std::map<std::string, ggml_tensor*> tensors;
-llm->get_param_tensors(tensors, "text_encoders.qwen2vl");
+llm->get_param_tensors(tensors, "text_encoders.llm");
 bool success = model_loader.load_tensors(tensors);

View File

@@ -105,8 +105,8 @@ const char* unused_tensors[] = {
 "denoiser.sigmas",
 "edm_vpred.sigma_max",
 "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight", // only used during training
-"text_encoders.qwen2vl.output.weight",
-"text_encoders.qwen2vl.lm_head.",
+"text_encoders.llm.output.weight",
+"text_encoders.llm.lm_head.",
 "first_stage_model.bn.",
 };
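The trailing dot in `"text_encoders.llm.lm_head."` suggests these entries are matched as name prefixes rather than exact tensor names. A small sketch of such a check, assuming a starts_with-style comparison (the matching helper itself is outside this diff):

```cpp
#include <cstring>
#include <string>

// Assumed prefix semantics: a tensor is skipped if its name begins with any
// entry, so "text_encoders.llm.lm_head.weight" matches the second entry.
static const char* unused_tensors[] = {
    "text_encoders.llm.output.weight",
    "text_encoders.llm.lm_head.",
};

static bool is_unused_tensor(const std::string& name) {
    for (const char* prefix : unused_tensors) {
        if (name.compare(0, std::strlen(prefix), prefix) == 0) {
            return true;  // name starts with a listed prefix -> don't load it
        }
    }
    return false;
}
```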

View File

@@ -127,7 +127,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
 {"token_embd.", "shared."},
 };
-static const std::vector<std::pair<std::string, std::string>> qwenvl_name_map{
+static const std::vector<std::pair<std::string, std::string>> llm_name_map{
 {"token_embd.", "model.embed_tokens."},
 {"blk.", "model.layers."},
 {"attn_q.", "self_attn.q_proj."},
@@ -142,7 +142,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
 {"output_norm.", "model.norm."},
 };
-static const std::vector<std::pair<std::string, std::string>> qwenvl_vision_name_map{
+static const std::vector<std::pair<std::string, std::string>> llm_vision_name_map{
 {"mm.", "merger.mlp."},
 {"v.post_ln.", "merger.ln_q."},
 {"v.patch_embd.weight", "patch_embed.proj.0.weight"},
@@ -161,11 +161,11 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
 };
 if (contains(name, "t5xxl")) {
 replace_with_name_map(name, t5_name_map);
-} else if (contains(name, "qwen2vl")) {
-if (contains(name, "qwen2vl.visual")) {
-replace_with_name_map(name, qwenvl_vision_name_map);
+} else if (contains(name, "llm")) {
+if (contains(name, "llm.visual")) {
+replace_with_name_map(name, llm_vision_name_map);
 } else {
-replace_with_name_map(name, llm_name_map);
 }
 } else {
 name = convert_open_clip_to_hf_clip_name(name);
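These maps feed `replace_with_name_map`, which rewrites GGUF-style tensor names (`blk.0.attn_q.`) into the HF-style names (`model.layers.0.self_attn.q_proj.`) that the rest of the loader expects. A minimal sketch of that substitution, assuming each key is replaced at its first occurrence; the helper's exact semantics are not shown in this diff:

```cpp
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for the repo's replace_with_name_map: substitute the
// first occurrence of each map key with its value (assumed semantics).
static void replace_with_name_map(std::string& name,
                                  const std::vector<std::pair<std::string, std::string>>& name_map) {
    for (const auto& [from, to] : name_map) {
        auto pos = name.find(from);
        if (pos != std::string::npos) {
            name.replace(pos, from.size(), to);
        }
    }
}

int main() {
    // Two entries taken from llm_name_map above.
    std::vector<std::pair<std::string, std::string>> llm_name_map = {
        {"blk.", "model.layers."},
        {"attn_q.", "self_attn.q_proj."},
    };
    std::string name = "text_encoders.llm.blk.0.attn_q.weight";
    replace_with_name_map(name, llm_name_map);
    // name is now "text_encoders.llm.model.layers.0.self_attn.q_proj.weight"
    return 0;
}
```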

View File

@@ -276,17 +276,17 @@ public:
 }
 }
-if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_path)) > 0) {
-LOG_INFO("loading qwen2vl from '%s'", sd_ctx_params->qwen2vl_path);
-if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_path, "text_encoders.qwen2vl.")) {
-LOG_WARN("loading qwen2vl from '%s' failed", sd_ctx_params->qwen2vl_path);
+if (strlen(SAFE_STR(sd_ctx_params->llm_path)) > 0) {
+LOG_INFO("loading llm from '%s'", sd_ctx_params->llm_path);
+if (!model_loader.init_from_file(sd_ctx_params->llm_path, "text_encoders.llm.")) {
+LOG_WARN("loading llm from '%s' failed", sd_ctx_params->llm_path);
 }
 }
-if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_vision_path)) > 0) {
-LOG_INFO("loading qwen2vl vision from '%s'", sd_ctx_params->qwen2vl_vision_path);
-if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_vision_path, "text_encoders.qwen2vl.visual.")) {
-LOG_WARN("loading qwen2vl vision from '%s' failed", sd_ctx_params->qwen2vl_vision_path);
+if (strlen(SAFE_STR(sd_ctx_params->llm_vision_path)) > 0) {
+LOG_INFO("loading llm vision from '%s'", sd_ctx_params->llm_vision_path);
+if (!model_loader.init_from_file(sd_ctx_params->llm_vision_path, "text_encoders.llm.visual.")) {
+LOG_WARN("loading llm vision from '%s' failed", sd_ctx_params->llm_vision_path);
 }
 }
@@ -307,7 +307,7 @@ public:
 auto& tensor_storage_map = model_loader.get_tensor_storage_map();
 for (auto& [name, tensor_storage] : tensor_storage_map) {
-if (contains(name, "qwen2vl") &&
+if (contains(name, "llm") &&
 ends_with(name, "weight") &&
 (tensor_storage.type == GGML_TYPE_F32 || tensor_storage.type == GGML_TYPE_BF16)) {
 tensor_storage.expected_type = GGML_TYPE_F16;
@@ -684,7 +684,7 @@ public:
 ignore_tensors.insert("first_stage_model.encoder");
 ignore_tensors.insert("first_stage_model.conv1");
 ignore_tensors.insert("first_stage_model.quant");
-ignore_tensors.insert("text_encoders.qwen2vl.visual.");
+ignore_tensors.insert("text_encoders.llm.visual.");
 }
 if (version == VERSION_SVD) {
 ignore_tensors.insert("conditioner.embedders.3");
@@ -2465,8 +2465,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
 "clip_g_path: %s\n"
 "clip_vision_path: %s\n"
 "t5xxl_path: %s\n"
-"qwen2vl_path: %s\n"
-"qwen2vl_vision_path: %s\n"
+"llm_path: %s\n"
+"llm_vision_path: %s\n"
 "diffusion_model_path: %s\n"
 "high_noise_diffusion_model_path: %s\n"
 "vae_path: %s\n"
@@ -2496,8 +2496,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
 SAFE_STR(sd_ctx_params->clip_g_path),
 SAFE_STR(sd_ctx_params->clip_vision_path),
 SAFE_STR(sd_ctx_params->t5xxl_path),
-SAFE_STR(sd_ctx_params->qwen2vl_path),
-SAFE_STR(sd_ctx_params->qwen2vl_vision_path),
+SAFE_STR(sd_ctx_params->llm_path),
+SAFE_STR(sd_ctx_params->llm_vision_path),
 SAFE_STR(sd_ctx_params->diffusion_model_path),
 SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
 SAFE_STR(sd_ctx_params->vae_path),

View File

@@ -156,8 +156,8 @@ typedef struct {
 const char* clip_g_path;
 const char* clip_vision_path;
 const char* t5xxl_path;
-const char* qwen2vl_path;
-const char* qwen2vl_vision_path;
+const char* llm_path;
+const char* llm_vision_path;
 const char* diffusion_model_path;
 const char* high_noise_diffusion_model_path;
 const char* vae_path;