rename qwenvl to llm

leejet 2025-11-29 14:06:46 +08:00
parent 66e27de9bd
commit 7a2a7d0767
10 changed files with 60 additions and 52 deletions

View File

@@ -2,7 +2,7 @@
 #define __CONDITIONER_HPP__
 #include "clip.hpp"
-#include "qwenvl.hpp"
+#include "llm.hpp"
 #include "t5.hpp"
 struct SDCondition {
@@ -1648,12 +1648,12 @@ struct LLMEmbedder : public Conditioner {
 backend,
 offload_params_to_cpu,
 tensor_storage_map,
-"text_encoders.qwen2vl",
+"text_encoders.llm",
 enable_vision);
 }
 void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
-llm->get_param_tensors(tensors, "text_encoders.qwen2vl");
+llm->get_param_tensors(tensors, "text_encoders.llm");
 }
 void alloc_params_buffer() override {

View File

@@ -14,7 +14,7 @@
 ## Examples
 ```
-.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线 探索视觉生成基础模型的极限开创理解与生成一体化的未来。二、Qwen-Image的模型特色1、复杂文字渲染。支持中英渲染、自动布局 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
+.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线 探索视觉生成基础模型的极限开创理解与生成一体化的未来。二、Qwen-Image的模型特色1、复杂文字渲染。支持中英渲染、自动布局 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
 ```
 <img alt="qwen example" src="../assets/qwen/example.png" />

View File

@@ -20,7 +20,7 @@
 ### Qwen Image Edit
 ```
-.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
+.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
 ```
 <img alt="qwen_image_edit" src="../assets/qwen/qwen_image_edit.png" />
@@ -29,7 +29,7 @@
 ### Qwen Image Edit 2509
 ```
-.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --qwen2vl_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
+.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
 ```
 <img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />

View File

@@ -9,8 +9,10 @@ Options:
 --clip_g <string> path to the clip-g text encoder
 --clip_vision <string> path to the clip-vision encoder
 --t5xxl <string> path to the t5xxl text encoder
---qwen2vl <string> path to the qwen2vl text encoder
---qwen2vl_vision <string> path to the qwen2vl vit
+--llm <string> path to the llm text encoder (e.g. qwen2.5-vl for qwen-image, mistral-small3.2 for flux2, ...)
+--llm_vision <string> path to the llm vision encoder (ViT)
+--qwen2vl <string> alias of --llm. Deprecated.
+--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
 --diffusion-model <string> path to the standalone diffusion model
 --high-noise-diffusion-model <string> path to the standalone high noise diffusion model
 --vae <string> path to standalone vae model
@@ -33,7 +35,6 @@ Options:
 -p, --prompt <string> the prompt to render
 -n, --negative-prompt <string> the negative prompt (default: "")
 --preview-path <string> path to write preview image to (default: ./preview.png)
---easycache <string> enable EasyCache for DiT models, accepts optional "threshold,start_percent,end_percent" values (defaults to 0.2,0.15,0.95)
 --upscale-model <string> path to esrgan model.
 -t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
 CPU physical cores
@@ -105,15 +106,13 @@ Options:
 contain any quantized parameters, the at_runtime mode will be used; otherwise,
 immediately will be used. The immediately mode may have precision and
 compatibility issues with quantized parameters, but it usually offers faster inference
-speed and, in some cases, lower memory usage. The at_runtime mode, on the other
-hand, is exactly the opposite.
+speed and, in some cases, lower memory usage. The at_runtime mode, on the
+other hand, is exactly the opposite.
 --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm],
 default: discrete
 --skip-layers layers to skip for SLG steps (default: [7,8,9])
 --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
 ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
---high-noise-scheduler (high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform,
-simple], default: discrete
 --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
 -r, --ref-image reference image for Flux Kontext models (can be used multiple times)
 -h, --help show this help message and exit
@@ -121,4 +120,5 @@ Options:
 --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
 (overrides --vae-tile-size)
 --preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
+--easycache enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95)
 ```

View File

@@ -70,8 +70,8 @@ struct SDParams {
 std::string clip_g_path;
 std::string clip_vision_path;
 std::string t5xxl_path;
-std::string qwen2vl_path;
-std::string qwen2vl_vision_path;
+std::string llm_path;
+std::string llm_vision_path;
 std::string diffusion_model_path;
 std::string high_noise_diffusion_model_path;
 std::string vae_path;
@@ -174,8 +174,8 @@ void print_params(SDParams params) {
 printf(" clip_g_path: %s\n", params.clip_g_path.c_str());
 printf(" clip_vision_path: %s\n", params.clip_vision_path.c_str());
 printf(" t5xxl_path: %s\n", params.t5xxl_path.c_str());
-printf(" qwen2vl_path: %s\n", params.qwen2vl_path.c_str());
-printf(" qwen2vl_vision_path: %s\n", params.qwen2vl_vision_path.c_str());
+printf(" llm_path: %s\n", params.llm_path.c_str());
+printf(" llm_vision_path: %s\n", params.llm_vision_path.c_str());
 printf(" diffusion_model_path: %s\n", params.diffusion_model_path.c_str());
 printf(" high_noise_diffusion_model_path: %s\n", params.high_noise_diffusion_model_path.c_str());
 printf(" vae_path: %s\n", params.vae_path.c_str());
@@ -532,14 +532,22 @@ void parse_args(int argc, const char** argv, SDParams& params) {
 "--t5xxl",
 "path to the t5xxl text encoder",
 &params.t5xxl_path},
+{"",
+"--llm",
+"path to the llm text encoder (e.g. qwen2.5-vl for qwen-image, mistral-small3.2 for flux2, ...)",
+&params.llm_path},
+{"",
+"--llm_vision",
+"path to the llm vision encoder (ViT)",
+&params.llm_vision_path},
 {"",
 "--qwen2vl",
-"path to the qwen2vl text encoder",
-&params.qwen2vl_path},
+"alias of --llm. Deprecated.",
+&params.llm_path},
 {"",
 "--qwen2vl_vision",
-"path to the qwen2vl vit",
-&params.qwen2vl_vision_path},
+"alias of --llm_vision. Deprecated.",
+&params.llm_vision_path},
 {"",
 "--diffusion-model",
 "path to the standalone diffusion model",
@@ -1230,7 +1238,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
 on_relative_tile_size_arg},
 {"",
 "--preview",
-std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")\n",
+std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")",
 on_preview_arg},
 {"",
 "--easycache",
@@ -1428,7 +1436,7 @@ std::string get_image_params(SDParams params, int64_t seed) {
 parameter_string += " " + std::string(sd_scheduler_name(params.sample_params.scheduler));
 }
 parameter_string += ", ";
-for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path, params.qwen2vl_path, params.qwen2vl_vision_path}) {
+for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path, params.llm_path, params.llm_vision_path}) {
 if (!te.empty()) {
 parameter_string += "TE: " + sd_basename(te) + ", ";
 }
@@ -1845,8 +1853,8 @@ int main(int argc, const char* argv[]) {
 params.clip_g_path.c_str(),
 params.clip_vision_path.c_str(),
 params.t5xxl_path.c_str(),
-params.qwen2vl_path.c_str(),
-params.qwen2vl_vision_path.c_str(),
+params.llm_path.c_str(),
+params.llm_vision_path.c_str(),
 params.diffusion_model_path.c_str(),
 params.high_noise_diffusion_model_path.c_str(),
 params.vae_path.c_str(),
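Note how the backward compatibility works here: the deprecated `--qwen2vl` and `--qwen2vl_vision` entries now share their destination fields (`&params.llm_path`, `&params.llm_vision_path`) with the new flags, so old command lines keep functioning. Below is a minimal standalone sketch of that shared-destination pattern, with a hypothetical parser loop rather than the project's actual one:

```cpp
#include <cstring>
#include <string>
#include <vector>

// Hedged sketch, not the repo's parser: an option and its deprecated alias
// both point at the same field, so whichever flag appears writes llm_path.
struct SDParams {
    std::string llm_path;
    std::string llm_vision_path;
};

struct StringOption {
    const char* flag;
    const char* help;
    std::string* dest;  // shared between a flag and its alias
};

int main(int argc, const char** argv) {
    SDParams params;
    std::vector<StringOption> options = {
        {"--llm", "path to the llm text encoder", &params.llm_path},
        {"--qwen2vl", "alias of --llm. Deprecated.", &params.llm_path},  // same dest
        {"--llm_vision", "path to the llm vision encoder (ViT)", &params.llm_vision_path},
        {"--qwen2vl_vision", "alias of --llm_vision. Deprecated.", &params.llm_vision_path},
    };
    for (int i = 1; i + 1 < argc; i++) {
        for (auto& opt : options) {
            if (std::strcmp(argv[i], opt.flag) == 0) {
                *opt.dest = argv[++i];  // consume the flag's value
                break;
            }
        }
    }
    return 0;
}
```

Because both entries write the same field, the last flag given wins, which is the behavior one would expect from an alias.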

View File

@@ -1549,7 +1549,7 @@ namespace LLM {
 ggml_type model_data_type = GGML_TYPE_COUNT;
 ModelLoader model_loader;
-if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.qwen2vl.")) {
+if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.llm.")) {
 LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
 return;
 }
@@ -1569,12 +1569,12 @@ namespace LLM {
 backend,
 true,
 tensor_storage_map,
-"text_encoders.qwen2vl",
+"text_encoders.llm",
 true);
 llm->alloc_params_buffer();
 std::map<std::string, ggml_tensor*> tensors;
-llm->get_param_tensors(tensors, "text_encoders.qwen2vl");
+llm->get_param_tensors(tensors, "text_encoders.llm");
 bool success = model_loader.load_tensors(tensors);

View File

@@ -105,8 +105,8 @@ const char* unused_tensors[] = {
 "denoiser.sigmas",
 "edm_vpred.sigma_max",
 "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight", // only used during training
-"text_encoders.qwen2vl.output.weight",
-"text_encoders.qwen2vl.lm_head.",
+"text_encoders.llm.output.weight",
+"text_encoders.llm.lm_head.",
 "first_stage_model.bn.",
 };
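The trailing dot in `"text_encoders.llm.lm_head."` suggests these entries are matched as name prefixes rather than exact tensor names. A small sketch of such a check, assuming a starts_with-style comparison (the matching helper itself is outside this diff):

```cpp
#include <cstring>
#include <string>

// Assumed prefix semantics: a tensor is skipped if its name begins with any
// entry, so "text_encoders.llm.lm_head.weight" matches the second entry.
static const char* unused_tensors[] = {
    "text_encoders.llm.output.weight",
    "text_encoders.llm.lm_head.",
};

static bool is_unused_tensor(const std::string& name) {
    for (const char* prefix : unused_tensors) {
        if (name.compare(0, std::strlen(prefix), prefix) == 0) {
            return true;  // name starts with a listed prefix -> don't load it
        }
    }
    return false;
}
```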

View File

@@ -127,7 +127,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
 {"token_embd.", "shared."},
 };
-static const std::vector<std::pair<std::string, std::string>> qwenvl_name_map{
+static const std::vector<std::pair<std::string, std::string>> llm_name_map{
 {"token_embd.", "model.embed_tokens."},
 {"blk.", "model.layers."},
 {"attn_q.", "self_attn.q_proj."},
@@ -142,7 +142,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
 {"output_norm.", "model.norm."},
 };
-static const std::vector<std::pair<std::string, std::string>> qwenvl_vision_name_map{
+static const std::vector<std::pair<std::string, std::string>> llm_vision_name_map{
 {"mm.", "merger.mlp."},
 {"v.post_ln.", "merger.ln_q."},
 {"v.patch_embd.weight", "patch_embed.proj.0.weight"},
@@ -161,11 +161,11 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
 };
 if (contains(name, "t5xxl")) {
 replace_with_name_map(name, t5_name_map);
-} else if (contains(name, "qwen2vl")) {
-if (contains(name, "qwen2vl.visual")) {
-replace_with_name_map(name, qwenvl_vision_name_map);
+} else if (contains(name, "llm")) {
+if (contains(name, "llm.visual")) {
+replace_with_name_map(name, llm_vision_name_map);
 } else {
-replace_with_name_map(name, llm_name_map);
 }
 } else {
 name = convert_open_clip_to_hf_clip_name(name);
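These maps feed `replace_with_name_map`, which rewrites GGUF-style tensor names (`blk.0.attn_q.`) into the HF-style names (`model.layers.0.self_attn.q_proj.`) that the rest of the loader expects. A minimal sketch of that substitution, assuming each key is replaced at its first occurrence; the helper's exact semantics are not shown in this diff:

```cpp
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for the repo's replace_with_name_map: substitute the
// first occurrence of each map key with its value (assumed semantics).
static void replace_with_name_map(std::string& name,
                                  const std::vector<std::pair<std::string, std::string>>& name_map) {
    for (const auto& [from, to] : name_map) {
        auto pos = name.find(from);
        if (pos != std::string::npos) {
            name.replace(pos, from.size(), to);
        }
    }
}

int main() {
    // Two entries taken from llm_name_map above.
    std::vector<std::pair<std::string, std::string>> llm_name_map = {
        {"blk.", "model.layers."},
        {"attn_q.", "self_attn.q_proj."},
    };
    std::string name = "text_encoders.llm.blk.0.attn_q.weight";
    replace_with_name_map(name, llm_name_map);
    // name is now "text_encoders.llm.model.layers.0.self_attn.q_proj.weight"
    return 0;
}
```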

View File

@@ -276,17 +276,17 @@ public:
 }
 }
-if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_path)) > 0) {
-LOG_INFO("loading qwen2vl from '%s'", sd_ctx_params->qwen2vl_path);
-if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_path, "text_encoders.qwen2vl.")) {
-LOG_WARN("loading qwen2vl from '%s' failed", sd_ctx_params->qwen2vl_path);
+if (strlen(SAFE_STR(sd_ctx_params->llm_path)) > 0) {
+LOG_INFO("loading llm from '%s'", sd_ctx_params->llm_path);
+if (!model_loader.init_from_file(sd_ctx_params->llm_path, "text_encoders.llm.")) {
+LOG_WARN("loading llm from '%s' failed", sd_ctx_params->llm_path);
 }
 }
-if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_vision_path)) > 0) {
-LOG_INFO("loading qwen2vl vision from '%s'", sd_ctx_params->qwen2vl_vision_path);
-if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_vision_path, "text_encoders.qwen2vl.visual.")) {
-LOG_WARN("loading qwen2vl vision from '%s' failed", sd_ctx_params->qwen2vl_vision_path);
+if (strlen(SAFE_STR(sd_ctx_params->llm_vision_path)) > 0) {
+LOG_INFO("loading llm vision from '%s'", sd_ctx_params->llm_vision_path);
+if (!model_loader.init_from_file(sd_ctx_params->llm_vision_path, "text_encoders.llm.visual.")) {
+LOG_WARN("loading llm vision from '%s' failed", sd_ctx_params->llm_vision_path);
 }
 }
@@ -307,7 +307,7 @@ public:
 auto& tensor_storage_map = model_loader.get_tensor_storage_map();
 for (auto& [name, tensor_storage] : tensor_storage_map) {
-if (contains(name, "qwen2vl") &&
+if (contains(name, "llm") &&
 ends_with(name, "weight") &&
 (tensor_storage.type == GGML_TYPE_F32 || tensor_storage.type == GGML_TYPE_BF16)) {
 tensor_storage.expected_type = GGML_TYPE_F16;
@@ -684,7 +684,7 @@ public:
 ignore_tensors.insert("first_stage_model.encoder");
 ignore_tensors.insert("first_stage_model.conv1");
 ignore_tensors.insert("first_stage_model.quant");
-ignore_tensors.insert("text_encoders.qwen2vl.visual.");
+ignore_tensors.insert("text_encoders.llm.visual.");
 }
 if (version == VERSION_SVD) {
 ignore_tensors.insert("conditioner.embedders.3");
@@ -2465,8 +2465,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
 "clip_g_path: %s\n"
 "clip_vision_path: %s\n"
 "t5xxl_path: %s\n"
-"qwen2vl_path: %s\n"
-"qwen2vl_vision_path: %s\n"
+"llm_path: %s\n"
+"llm_vision_path: %s\n"
 "diffusion_model_path: %s\n"
 "high_noise_diffusion_model_path: %s\n"
 "vae_path: %s\n"
@@ -2496,8 +2496,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
 SAFE_STR(sd_ctx_params->clip_g_path),
 SAFE_STR(sd_ctx_params->clip_vision_path),
 SAFE_STR(sd_ctx_params->t5xxl_path),
-SAFE_STR(sd_ctx_params->qwen2vl_path),
-SAFE_STR(sd_ctx_params->qwen2vl_vision_path),
+SAFE_STR(sd_ctx_params->llm_path),
+SAFE_STR(sd_ctx_params->llm_vision_path),
 SAFE_STR(sd_ctx_params->diffusion_model_path),
 SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
 SAFE_STR(sd_ctx_params->vae_path),

View File

@@ -156,8 +156,8 @@ typedef struct {
 const char* clip_g_path;
 const char* clip_vision_path;
 const char* t5xxl_path;
-const char* qwen2vl_path;
-const char* qwen2vl_vision_path;
+const char* llm_path;
+const char* llm_vision_path;
 const char* diffusion_model_path;
 const char* high_noise_diffusion_model_path;
 const char* vae_path;