Mirror of https://github.com/leejet/stable-diffusion.cpp.git
rename qwenvl to llm
commit 7a2a7d0767 (parent 66e27de9bd)
@@ -2,7 +2,7 @@
 #define __CONDITIONER_HPP__
 
 #include "clip.hpp"
-#include "qwenvl.hpp"
+#include "llm.hpp"
 #include "t5.hpp"
 
 struct SDCondition {
@@ -1648,12 +1648,12 @@ struct LLMEmbedder : public Conditioner {
             backend,
             offload_params_to_cpu,
             tensor_storage_map,
-            "text_encoders.qwen2vl",
+            "text_encoders.llm",
             enable_vision);
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
-        llm->get_param_tensors(tensors, "text_encoders.qwen2vl");
+        llm->get_param_tensors(tensors, "text_encoders.llm");
     }
 
     void alloc_params_buffer() override {
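
Mechanically, the conditioner change is a tensor-name prefix swap: parameters the embedder used to register under `text_encoders.qwen2vl.*` now live under `text_encoders.llm.*`. A minimal, self-contained sketch of this prefix-keyed registration pattern (the struct, the layer names, and the `int` stand-in for `ggml_tensor*` are illustrative, not the repo's actual types):

```
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Hypothetical module: registers its parameters under a caller-supplied prefix,
// mirroring how get_param_tensors(tensors, "text_encoders.llm") is used above.
struct FakeModule {
    std::vector<std::string> local_names = {"model.embed_tokens.weight",
                                            "model.norm.weight"};
    void get_param_tensors(std::map<std::string, int>& tensors,
                           const std::string& prefix) {
        for (const auto& n : local_names)
            tensors[prefix + "." + n] = 0;  // stand-in for a ggml_tensor*
    }
};

int main() {
    FakeModule llm;
    std::map<std::string, int> tensors;
    llm.get_param_tensors(tensors, "text_encoders.llm");  // was "text_encoders.qwen2vl"
    for (const auto& [name, _] : tensors)
        std::cout << name << "\n";  // e.g. text_encoders.llm.model.embed_tokens.weight
}
```

Because every lookup goes through the prefix argument, renaming the namespace is a one-string change at each call site, which is exactly what the hunks in this commit do.
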
@@ -14,7 +14,7 @@
 ## Examples
 
 ```
-.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线: 探索视觉生成基础模型的极限,开创理解与生成一体化的未来。二、Qwen-Image的模型特色:1、复杂文字渲染。支持中英渲染、自动布局; 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景:赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
+.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线: 探索视觉生成基础模型的极限,开创理解与生成一体化的未来。二、Qwen-Image的模型特色:1、复杂文字渲染。支持中英渲染、自动布局; 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景:赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
 ```
 
 <img alt="qwen example" src="../assets/qwen/example.png" />
@@ -20,7 +20,7 @@
 ### Qwen Image Edit
 
 ```
-.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
+.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
 ```
 
 <img alt="qwen_image_edit" src="../assets/qwen/qwen_image_edit.png" />
@@ -29,7 +29,7 @@
 ### Qwen Image Edit 2509
 
 ```
-.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --qwen2vl_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
+.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
 ```
 
 <img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />
@@ -9,8 +9,10 @@ Options:
   --clip_g <string>                      path to the clip-g text encoder
   --clip_vision <string>                 path to the clip-vision encoder
   --t5xxl <string>                       path to the t5xxl text encoder
-  --qwen2vl <string>                     path to the qwen2vl text encoder
-  --qwen2vl_vision <string>              path to the qwen2vl vit
+  --llm <string>                         path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
+  --llm_vision <string>                  path to the llm vit
+  --qwen2vl <string>                     alias of --llm. Deprecated.
+  --qwen2vl_vision <string>              alias of --llm_vision. Deprecated.
   --diffusion-model <string>             path to the standalone diffusion model
   --high-noise-diffusion-model <string>  path to the standalone high noise diffusion model
   --vae <string>                         path to standalone vae model
@@ -33,7 +35,6 @@ Options:
   -p, --prompt <string>                  the prompt to render
   -n, --negative-prompt <string>         the negative prompt (default: "")
   --preview-path <string>                path to write preview image to (default: ./preview.png)
-  --easycache <string>                   enable EasyCache for DiT models, accepts optional "threshold,start_percent,end_percent" values (defaults to 0.2,0.15,0.95)
   --upscale-model <string>               path to esrgan model.
   -t, --threads <int>                    number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
                                          CPU physical cores
@@ -105,15 +106,13 @@ Options:
                                          contain any quantized parameters, the at_runtime mode will be used; otherwise,
                                          immediately will be used. The immediately mode may have precision and
                                          compatibility issues with quantized parameters, but it usually offers faster inference
-                                         speed and, in some cases, lower memory usage. The at_runtime mode, on the other
-                                         hand, is exactly the opposite.
+                                         speed and, in some cases, lower memory usage. The at_runtime mode, on the
+                                         other hand, is exactly the opposite.
   --scheduler                            denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm],
                                          default: discrete
   --skip-layers                          layers to skip for SLG steps (default: [7,8,9])
   --high-noise-sampling-method           (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
                                          ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
   --high-noise-scheduler                 (high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform,
                                          simple], default: discrete
   --high-noise-skip-layers               (high noise) layers to skip for SLG steps (default: [7,8,9])
   -r, --ref-image                        reference image for Flux Kontext models (can be used multiple times)
   -h, --help                             show this help message and exit
@@ -121,4 +120,5 @@ Options:
   --vae-relative-tile-size               relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
                                          (overrides --vae-tile-size)
   --preview                              preview method. must be one of the following [none, proj, tae, vae] (default is none)
+  --easycache                            enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95)
 ```
 
@@ -70,8 +70,8 @@ struct SDParams {
     std::string clip_g_path;
     std::string clip_vision_path;
     std::string t5xxl_path;
-    std::string qwen2vl_path;
-    std::string qwen2vl_vision_path;
+    std::string llm_path;
+    std::string llm_vision_path;
     std::string diffusion_model_path;
     std::string high_noise_diffusion_model_path;
     std::string vae_path;
@@ -174,8 +174,8 @@ void print_params(SDParams params) {
     printf("    clip_g_path: %s\n", params.clip_g_path.c_str());
     printf("    clip_vision_path: %s\n", params.clip_vision_path.c_str());
     printf("    t5xxl_path: %s\n", params.t5xxl_path.c_str());
-    printf("    qwen2vl_path: %s\n", params.qwen2vl_path.c_str());
-    printf("    qwen2vl_vision_path: %s\n", params.qwen2vl_vision_path.c_str());
+    printf("    llm_path: %s\n", params.llm_path.c_str());
+    printf("    llm_vision_path: %s\n", params.llm_vision_path.c_str());
     printf("    diffusion_model_path: %s\n", params.diffusion_model_path.c_str());
     printf("    high_noise_diffusion_model_path: %s\n", params.high_noise_diffusion_model_path.c_str());
     printf("    vae_path: %s\n", params.vae_path.c_str());
@@ -532,14 +532,22 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         "--t5xxl",
         "path to the t5xxl text encoder",
         &params.t5xxl_path},
+    {"",
+        "--llm",
+        "path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)",
+        &params.llm_path},
+    {"",
+        "--llm_vision",
+        "path to the llm vit",
+        &params.llm_vision_path},
     {"",
         "--qwen2vl",
-        "path to the qwen2vl text encoder",
-        &params.qwen2vl_path},
+        "alias of --llm. Deprecated.",
+        &params.llm_path},
     {"",
         "--qwen2vl_vision",
-        "path to the qwen2vl vit",
-        &params.qwen2vl_vision_path},
+        "alias of --llm_vision. Deprecated.",
+        &params.llm_vision_path},
     {"",
         "--diffusion-model",
         "path to the standalone diffusion model",
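
The alias mechanism falls out of the option table shown above: `--qwen2vl` and `--llm` both point at `&params.llm_path`, so either flag writes the same field and existing scripts keep working. A reduced sketch of that table-driven pattern (the structs and the naive flag/value parsing loop are illustrative, not the repo's actual parser):

```
#include <cstring>
#include <iostream>
#include <string>

struct Params { std::string llm_path; };

// Illustrative option entry: flag name, help text, pointer to the target field.
struct StringOption {
    const char* flag;
    const char* desc;
    std::string* target;
};

int main(int argc, const char** argv) {
    Params params;
    // Both the new flag and its deprecated alias write to the same field.
    StringOption options[] = {
        {"--llm", "path to the llm text encoder", &params.llm_path},
        {"--qwen2vl", "alias of --llm. Deprecated.", &params.llm_path},
    };
    // Naive "--flag value" walk, enough to show the aliasing.
    for (int i = 1; i + 1 < argc; i += 2)
        for (auto& opt : options)
            if (std::strcmp(argv[i], opt.flag) == 0)
                *opt.target = argv[i + 1];
    std::cout << "llm_path = " << params.llm_path << "\n";
}
```

Pointing two entries at one field is the cheapest deprecation path: no extra state, no post-parse reconciliation, and the help text alone signals the rename.
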
@@ -1230,7 +1238,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         on_relative_tile_size_arg},
     {"",
         "--preview",
-        std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")\n",
+        std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")",
         on_preview_arg},
     {"",
         "--easycache",
@@ -1428,7 +1436,7 @@ std::string get_image_params(SDParams params, int64_t seed) {
         parameter_string += " " + std::string(sd_scheduler_name(params.sample_params.scheduler));
     }
     parameter_string += ", ";
-    for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path, params.qwen2vl_path, params.qwen2vl_vision_path}) {
+    for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path, params.llm_path, params.llm_vision_path}) {
         if (!te.empty()) {
             parameter_string += "TE: " + sd_basename(te) + ", ";
         }
@@ -1845,8 +1853,8 @@ int main(int argc, const char* argv[]) {
         params.clip_g_path.c_str(),
         params.clip_vision_path.c_str(),
         params.t5xxl_path.c_str(),
-        params.qwen2vl_path.c_str(),
-        params.qwen2vl_vision_path.c_str(),
+        params.llm_path.c_str(),
+        params.llm_vision_path.c_str(),
         params.diffusion_model_path.c_str(),
         params.high_noise_diffusion_model_path.c_str(),
         params.vae_path.c_str(),
 
@@ -1549,7 +1549,7 @@ namespace LLM {
        ggml_type model_data_type = GGML_TYPE_COUNT;
 
        ModelLoader model_loader;
-       if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.qwen2vl.")) {
+       if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.llm.")) {
            LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
            return;
        }
@@ -1569,12 +1569,12 @@ namespace LLM {
            backend,
            true,
            tensor_storage_map,
-           "text_encoders.qwen2vl",
+           "text_encoders.llm",
            true);
 
        llm->alloc_params_buffer();
        std::map<std::string, ggml_tensor*> tensors;
-       llm->get_param_tensors(tensors, "text_encoders.qwen2vl");
+       llm->get_param_tensors(tensors, "text_encoders.llm");
 
        bool success = model_loader.load_tensors(tensors);
 
@@ -105,8 +105,8 @@ const char* unused_tensors[] = {
     "denoiser.sigmas",
     "edm_vpred.sigma_max",
     "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight",  // only used during training
-    "text_encoders.qwen2vl.output.weight",
-    "text_encoders.qwen2vl.lm_head.",
+    "text_encoders.llm.output.weight",
+    "text_encoders.llm.lm_head.",
     "first_stage_model.bn.",
 };
 
@@ -127,7 +127,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
         {"token_embd.", "shared."},
     };
 
-    static const std::vector<std::pair<std::string, std::string>> qwenvl_name_map{
+    static const std::vector<std::pair<std::string, std::string>> llm_name_map{
         {"token_embd.", "model.embed_tokens."},
         {"blk.", "model.layers."},
         {"attn_q.", "self_attn.q_proj."},
@@ -142,7 +142,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
         {"output_norm.", "model.norm."},
     };
 
-    static const std::vector<std::pair<std::string, std::string>> qwenvl_vision_name_map{
+    static const std::vector<std::pair<std::string, std::string>> llm_vision_name_map{
         {"mm.", "merger.mlp."},
         {"v.post_ln.", "merger.ln_q."},
         {"v.patch_embd.weight", "patch_embed.proj.0.weight"},
@@ -161,11 +161,11 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
     };
     if (contains(name, "t5xxl")) {
         replace_with_name_map(name, t5_name_map);
-    } else if (contains(name, "qwen2vl")) {
-        if (contains(name, "qwen2vl.visual")) {
-            replace_with_name_map(name, qwenvl_vision_name_map);
+    } else if (contains(name, "llm")) {
+        if (contains(name, "llm.visual")) {
+            replace_with_name_map(name, llm_vision_name_map);
         } else {
-            replace_with_name_map(name, qwenvl_name_map);
+            replace_with_name_map(name, llm_name_map);
         }
     } else {
         name = convert_open_clip_to_hf_clip_name(name);
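
The renamed tables are substring-pair maps that translate GGUF-style tensor names into the HF-style names the graph code expects. A minimal sketch of that translation, assuming `replace_with_name_map` substitutes the first occurrence of each listed fragment (the repo's actual helper in model.cpp may differ in details):

```
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Assumed behavior: for each (from, to) pair, replace the first occurrence
// of `from` in the name with `to`.
static void replace_with_name_map(
    std::string& name,
    const std::vector<std::pair<std::string, std::string>>& name_map) {
    for (const auto& [from, to] : name_map) {
        size_t pos = name.find(from);
        if (pos != std::string::npos)
            name.replace(pos, from.size(), to);
    }
}

int main() {
    // A few entries copied from llm_name_map above.
    const std::vector<std::pair<std::string, std::string>> llm_name_map{
        {"token_embd.", "model.embed_tokens."},
        {"blk.", "model.layers."},
        {"attn_q.", "self_attn.q_proj."},
    };
    std::string name = "blk.0.attn_q.weight";
    replace_with_name_map(name, llm_name_map);
    std::cout << name << "\n";  // model.layers.0.self_attn.q_proj.weight
}
```
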
@@ -276,17 +276,17 @@ public:
             }
         }
 
-        if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_path)) > 0) {
-            LOG_INFO("loading qwen2vl from '%s'", sd_ctx_params->qwen2vl_path);
-            if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_path, "text_encoders.qwen2vl.")) {
-                LOG_WARN("loading qwen2vl from '%s' failed", sd_ctx_params->qwen2vl_path);
+        if (strlen(SAFE_STR(sd_ctx_params->llm_path)) > 0) {
+            LOG_INFO("loading llm from '%s'", sd_ctx_params->llm_path);
+            if (!model_loader.init_from_file(sd_ctx_params->llm_path, "text_encoders.llm.")) {
+                LOG_WARN("loading llm from '%s' failed", sd_ctx_params->llm_path);
             }
         }
 
-        if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_vision_path)) > 0) {
-            LOG_INFO("loading qwen2vl vision from '%s'", sd_ctx_params->qwen2vl_vision_path);
-            if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_vision_path, "text_encoders.qwen2vl.visual.")) {
-                LOG_WARN("loading qwen2vl vision from '%s' failed", sd_ctx_params->qwen2vl_vision_path);
+        if (strlen(SAFE_STR(sd_ctx_params->llm_vision_path)) > 0) {
+            LOG_INFO("loading llm vision from '%s'", sd_ctx_params->llm_vision_path);
+            if (!model_loader.init_from_file(sd_ctx_params->llm_vision_path, "text_encoders.llm.visual.")) {
+                LOG_WARN("loading llm vision from '%s' failed", sd_ctx_params->llm_vision_path);
             }
         }
@@ -307,7 +307,7 @@ public:
 
         auto& tensor_storage_map = model_loader.get_tensor_storage_map();
         for (auto& [name, tensor_storage] : tensor_storage_map) {
-            if (contains(name, "qwen2vl") &&
+            if (contains(name, "llm") &&
                 ends_with(name, "weight") &&
                 (tensor_storage.type == GGML_TYPE_F32 || tensor_storage.type == GGML_TYPE_BF16)) {
                 tensor_storage.expected_type = GGML_TYPE_F16;
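
The hunk above also encodes a precision policy: any llm weight stored as F32 or BF16 gets an expected type of F16 at load time. A tiny sketch of that predicate, with `contains`/`ends_with` reimplemented locally and a stand-in enum, since the repo's helpers and `ggml_type` aren't reproduced here:

```
#include <iostream>
#include <string>

enum SketchType { TYPE_F32, TYPE_F16, TYPE_BF16, TYPE_Q8_0 };  // stand-in for ggml_type

static bool contains(const std::string& s, const std::string& sub) {
    return s.find(sub) != std::string::npos;
}
static bool ends_with(const std::string& s, const std::string& suffix) {
    return s.size() >= suffix.size() &&
           s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}

// Mirrors the condition in the hunk: llm weights stored as F32/BF16 run as F16.
static SketchType expected_type(const std::string& name, SketchType stored) {
    if (contains(name, "llm") && ends_with(name, "weight") &&
        (stored == TYPE_F32 || stored == TYPE_BF16))
        return TYPE_F16;
    return stored;
}

int main() {
    std::cout << expected_type("text_encoders.llm.model.norm.weight", TYPE_F32)
              << "\n";  // 1 (TYPE_F16): coerced down to half precision
    std::cout << expected_type("text_encoders.llm.token_embd.bias", TYPE_F32)
              << "\n";  // 0 (TYPE_F32): not a .weight tensor, left alone
}
```
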
@@ -684,7 +684,7 @@ public:
         ignore_tensors.insert("first_stage_model.encoder");
         ignore_tensors.insert("first_stage_model.conv1");
         ignore_tensors.insert("first_stage_model.quant");
-        ignore_tensors.insert("text_encoders.qwen2vl.visual.");
+        ignore_tensors.insert("text_encoders.llm.visual.");
     }
     if (version == VERSION_SVD) {
         ignore_tensors.insert("conditioner.embedders.3");
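
Because the vision tower now lives under `text_encoders.llm.visual.`, skipping it when vision is unused is a single ignore entry. A sketch of prefix-based skipping, assuming `ignore_tensors` entries are matched as name prefixes (the repo's matching rule isn't shown in this diff):

```
#include <iostream>
#include <set>
#include <string>

// Assumption: any tensor whose full name starts with an ignore entry is skipped.
static bool is_ignored(const std::string& name, const std::set<std::string>& ignore) {
    for (const auto& prefix : ignore)
        if (name.rfind(prefix, 0) == 0)  // starts_with
            return true;
    return false;
}

int main() {
    std::set<std::string> ignore_tensors = {"text_encoders.llm.visual."};
    std::cout << is_ignored("text_encoders.llm.visual.patch_embed.proj.0.weight",
                            ignore_tensors)
              << "\n";  // 1: vision tower skipped
    std::cout << is_ignored("text_encoders.llm.model.norm.weight", ignore_tensors)
              << "\n";  // 0: text-side weight still loads
}
```
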
@@ -2465,8 +2465,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
         "clip_g_path: %s\n"
         "clip_vision_path: %s\n"
         "t5xxl_path: %s\n"
-        "qwen2vl_path: %s\n"
-        "qwen2vl_vision_path: %s\n"
+        "llm_path: %s\n"
+        "llm_vision_path: %s\n"
         "diffusion_model_path: %s\n"
         "high_noise_diffusion_model_path: %s\n"
         "vae_path: %s\n"
@@ -2496,8 +2496,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
         SAFE_STR(sd_ctx_params->clip_g_path),
         SAFE_STR(sd_ctx_params->clip_vision_path),
         SAFE_STR(sd_ctx_params->t5xxl_path),
-        SAFE_STR(sd_ctx_params->qwen2vl_path),
-        SAFE_STR(sd_ctx_params->qwen2vl_vision_path),
+        SAFE_STR(sd_ctx_params->llm_path),
+        SAFE_STR(sd_ctx_params->llm_vision_path),
         SAFE_STR(sd_ctx_params->diffusion_model_path),
         SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
         SAFE_STR(sd_ctx_params->vae_path),
 
@@ -156,8 +156,8 @@ typedef struct {
     const char* clip_g_path;
     const char* clip_vision_path;
     const char* t5xxl_path;
-    const char* qwen2vl_path;
-    const char* qwen2vl_vision_path;
+    const char* llm_path;
+    const char* llm_vision_path;
     const char* diffusion_model_path;
     const char* high_noise_diffusion_model_path;
     const char* vae_path;