diff --git a/conditioner.hpp b/conditioner.hpp
index 38852f2..bce625a 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -2,7 +2,7 @@
 #define __CONDITIONER_HPP__
 
 #include "clip.hpp"
-#include "qwenvl.hpp"
+#include "llm.hpp"
 #include "t5.hpp"
 
 struct SDCondition {
@@ -1648,12 +1648,12 @@ struct LLMEmbedder : public Conditioner {
                               backend,
                               offload_params_to_cpu,
                               tensor_storage_map,
-                              "text_encoders.qwen2vl",
+                              "text_encoders.llm",
                               enable_vision);
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
-        llm->get_param_tensors(tensors, "text_encoders.qwen2vl");
+        llm->get_param_tensors(tensors, "text_encoders.llm");
     }
 
     void alloc_params_buffer() override {
diff --git a/docs/qwen_image.md b/docs/qwen_image.md
index a6f26c5..cfd9da2 100644
--- a/docs/qwen_image.md
+++ b/docs/qwen_image.md
@@ -14,7 +14,7 @@
 ## Examples
 
 ```
-.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线: 探索视觉生成基础模型的极限,开创理解与生成一体化的未来。二、Qwen-Image的模型特色:1、复杂文字渲染。支持中英渲染、自动布局; 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景:赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
+.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线: 探索视觉生成基础模型的极限,开创理解与生成一体化的未来。二、Qwen-Image的模型特色:1、复杂文字渲染。支持中英渲染、自动布局; 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景:赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
 ```
 
 qwen example
diff --git a/docs/qwen_image_edit.md b/docs/qwen_image_edit.md
index 3a5242f..36be1c9 100644
--- a/docs/qwen_image_edit.md
+++ b/docs/qwen_image_edit.md
@@ -20,7 +20,7 @@
 ### Qwen Image Edit
 
 ```
-.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
+.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
 ```
 
 qwen_image_edit
@@ -29,7 +29,7 @@
 ### Qwen Image Edit 2509
 
 ```
-.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --qwen2vl_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
+.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
 ```
 
 qwen_image_edit_2509
\ No newline at end of file
diff --git a/examples/cli/README.md b/examples/cli/README.md
index e923360..d0062cf 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -9,8 +9,10 @@ Options:
       --clip_g                        path to the clip-g text encoder
       --clip_vision                   path to the clip-vision encoder
       --t5xxl                         path to the t5xxl text encoder
-      --qwen2vl                       path to the qwen2vl text encoder
-      --qwen2vl_vision                path to the qwen2vl vit
+      --llm                           path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
+      --llm_vision                    path to the llm vit
+      --qwen2vl                       alias of --llm. Deprecated.
+      --qwen2vl_vision                alias of --llm_vision. Deprecated.
       --diffusion-model               path to the standalone diffusion model
       --high-noise-diffusion-model    path to the standalone high noise diffusion model
       --vae                           path to standalone vae model
@@ -33,7 +35,6 @@
   -p, --prompt                        the prompt to render
   -n, --negative-prompt               the negative prompt (default: "")
       --preview-path                  path to write preview image to (default: ./preview.png)
-      --easycache                     enable EasyCache for DiT models, accepts optional "threshold,start_percent,end_percent" values (defaults to 0.2,0.15,0.95)
       --upscale-model                 path to esrgan model.
   -t, --threads                       number of threads to use during computation (default: -1).
                                       If threads <= 0, then threads will be set to the number of CPU physical cores
@@ -105,15 +106,13 @@
                                       contain any quantized parameters, the at_runtime mode will be used; otherwise,
                                       immediately will be used.The immediately mode may have precision and compatibility
                                       issues with quantized parameters, but it usually offers faster inference
-                                      speed and, in some cases, lower memory usage. The at_runtime mode, on the other
-                                      hand, is exactly the opposite.
+                                      speed and, in some cases, lower memory usage. The at_runtime mode, on the
+                                      other hand, is exactly the opposite.
       --scheduler                     denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform,
                                       simple, lcm], default: discrete
       --skip-layers                   layers to skip for SLG steps (default: [7,8,9])
       --high-noise-sampling-method    (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]
                                       default: euler for Flux/SD3/Wan, euler_a otherwise
-      --high-noise-scheduler          (high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform,
-                                      simple], default: discrete
       --high-noise-skip-layers        (high noise) layers to skip for SLG steps (default: [7,8,9])
   -r, --ref-image                     reference image for Flux Kontext models (can be used multiple times)
   -h, --help                          show this help message and exit
@@ -121,4 +120,5 @@
       --vae-relative-tile-size        relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1,
                                       in number of tiles per dim if >=1 (overrides --vae-tile-size)
       --preview                       preview method. must be one of the following [none, proj, tae, vae] (default is none)
+      --easycache                     enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95)
 ```
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 427364a..cb4f868 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -70,8 +70,8 @@ struct SDParams {
     std::string clip_g_path;
     std::string clip_vision_path;
     std::string t5xxl_path;
-    std::string qwen2vl_path;
-    std::string qwen2vl_vision_path;
+    std::string llm_path;
+    std::string llm_vision_path;
     std::string diffusion_model_path;
     std::string high_noise_diffusion_model_path;
     std::string vae_path;
@@ -174,8 +174,8 @@ void print_params(SDParams params) {
     printf("    clip_g_path: %s\n", params.clip_g_path.c_str());
     printf("    clip_vision_path: %s\n", params.clip_vision_path.c_str());
     printf("    t5xxl_path: %s\n", params.t5xxl_path.c_str());
-    printf("    qwen2vl_path: %s\n", params.qwen2vl_path.c_str());
-    printf("    qwen2vl_vision_path: %s\n", params.qwen2vl_vision_path.c_str());
+    printf("    llm_path: %s\n", params.llm_path.c_str());
+    printf("    llm_vision_path: %s\n", params.llm_vision_path.c_str());
     printf("    diffusion_model_path: %s\n", params.diffusion_model_path.c_str());
     printf("    high_noise_diffusion_model_path: %s\n", params.high_noise_diffusion_model_path.c_str());
     printf("    vae_path: %s\n", params.vae_path.c_str());
@@ -532,14 +532,22 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          "--t5xxl",
          "path to the t5xxl text encoder",
          &params.t5xxl_path},
+        {"",
+         "--llm",
+         "path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)",
+         &params.llm_path},
+        {"",
+         "--llm_vision",
+         "path to the llm vit",
+         &params.llm_vision_path},
         {"",
          "--qwen2vl",
-         "path to the qwen2vl text encoder",
-         &params.qwen2vl_path},
+         "alias of --llm. Deprecated.",
+         &params.llm_path},
         {"",
         "--qwen2vl_vision",
-         "path to the qwen2vl vit",
-         &params.qwen2vl_vision_path},
+         "alias of --llm_vision. Deprecated.",
+         &params.llm_vision_path},
         {"",
         "--diffusion-model",
         "path to the standalone diffusion model",
@@ -1230,7 +1238,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         on_relative_tile_size_arg},
        {"",
         "--preview",
-        std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")\n",
+        std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")",
         on_preview_arg},
        {"",
         "--easycache",
@@ -1428,7 +1436,7 @@ std::string get_image_params(SDParams params, int64_t seed) {
         parameter_string += " " + std::string(sd_scheduler_name(params.sample_params.scheduler));
     }
     parameter_string += ", ";
-    for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path, params.qwen2vl_path, params.qwen2vl_vision_path}) {
+    for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path, params.llm_path, params.llm_vision_path}) {
         if (!te.empty()) {
             parameter_string += "TE: " + sd_basename(te) + ", ";
         }
@@ -1845,8 +1853,8 @@ int main(int argc, const char* argv[]) {
         params.clip_g_path.c_str(),
         params.clip_vision_path.c_str(),
         params.t5xxl_path.c_str(),
-        params.qwen2vl_path.c_str(),
-        params.qwen2vl_vision_path.c_str(),
+        params.llm_path.c_str(),
+        params.llm_vision_path.c_str(),
         params.diffusion_model_path.c_str(),
         params.high_noise_diffusion_model_path.c_str(),
         params.vae_path.c_str(),
diff --git a/qwenvl.hpp b/llm.hpp
similarity index 99%
rename from qwenvl.hpp
rename to llm.hpp
index 3d77265..c96ba0f 100644
--- a/qwenvl.hpp
+++ b/llm.hpp
@@ -1549,7 +1549,7 @@ namespace LLM {
        ggml_type model_data_type = GGML_TYPE_COUNT;
 
        ModelLoader model_loader;
-       if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.qwen2vl.")) {
+       if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.llm.")) {
            LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
            return;
        }
@@ -1569,12 +1569,12 @@
                                        backend,
                                        true,
                                        tensor_storage_map,
-                                       "text_encoders.qwen2vl",
+                                       "text_encoders.llm",
                                        true);
 
        llm->alloc_params_buffer();
 
        std::map<std::string, struct ggml_tensor*> tensors;
-       llm->get_param_tensors(tensors, "text_encoders.qwen2vl");
+       llm->get_param_tensors(tensors, "text_encoders.llm");
 
        bool success = model_loader.load_tensors(tensors);
diff --git a/model.cpp b/model.cpp
index b720329..05afde9 100644
--- a/model.cpp
+++ b/model.cpp
@@ -105,8 +105,8 @@ const char* unused_tensors[] = {
     "denoiser.sigmas",
     "edm_vpred.sigma_max",
     "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight",  // only used during training
-    "text_encoders.qwen2vl.output.weight",
-    "text_encoders.qwen2vl.lm_head.",
+    "text_encoders.llm.output.weight",
+    "text_encoders.llm.lm_head.",
     "first_stage_model.bn.",
 };
 
diff --git a/name_conversion.cpp b/name_conversion.cpp
index 590410a..c4670df 100644
--- a/name_conversion.cpp
+++ b/name_conversion.cpp
@@ -127,7 +127,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
         {"token_embd.", "shared."},
     };
 
-    static const std::vector<std::pair<std::string, std::string>> qwenvl_name_map{
+    static const std::vector<std::pair<std::string, std::string>> llm_name_map{
         {"token_embd.", "model.embed_tokens."},
         {"blk.", "model.layers."},
         {"attn_q.", "self_attn.q_proj."},
@@ -142,7 +142,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
         {"output_norm.", "model.norm."},
     };
 
-    static const std::vector<std::pair<std::string, std::string>> qwenvl_vision_name_map{
+    static const std::vector<std::pair<std::string, std::string>> llm_vision_name_map{
         {"mm.", "merger.mlp."},
         {"v.post_ln.", "merger.ln_q."},
         {"v.patch_embd.weight", "patch_embed.proj.0.weight"},
@@ -161,11 +161,11 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
     };
     if (contains(name, "t5xxl")) {
         replace_with_name_map(name, t5_name_map);
-    } else if (contains(name, "qwen2vl")) {
-        if (contains(name, "qwen2vl.visual")) {
-            replace_with_name_map(name, qwenvl_vision_name_map);
+    } else if (contains(name, "llm")) {
+        if (contains(name, "llm.visual")) {
+            replace_with_name_map(name, llm_vision_name_map);
         } else {
-            replace_with_name_map(name, qwenvl_name_map);
+            replace_with_name_map(name, llm_name_map);
         }
     } else {
         name = convert_open_clip_to_hf_clip_name(name);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 48ed72b..92b719f 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -276,17 +276,17 @@ public:
             }
         }
 
-        if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_path)) > 0) {
-            LOG_INFO("loading qwen2vl from '%s'", sd_ctx_params->qwen2vl_path);
-            if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_path, "text_encoders.qwen2vl.")) {
-                LOG_WARN("loading qwen2vl from '%s' failed", sd_ctx_params->qwen2vl_path);
+        if (strlen(SAFE_STR(sd_ctx_params->llm_path)) > 0) {
+            LOG_INFO("loading llm from '%s'", sd_ctx_params->llm_path);
+            if (!model_loader.init_from_file(sd_ctx_params->llm_path, "text_encoders.llm.")) {
+                LOG_WARN("loading llm from '%s' failed", sd_ctx_params->llm_path);
             }
         }
 
-        if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_vision_path)) > 0) {
-            LOG_INFO("loading qwen2vl vision from '%s'", sd_ctx_params->qwen2vl_vision_path);
-            if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_vision_path, "text_encoders.qwen2vl.visual.")) {
-                LOG_WARN("loading qwen2vl vision from '%s' failed", sd_ctx_params->qwen2vl_vision_path);
+        if (strlen(SAFE_STR(sd_ctx_params->llm_vision_path)) > 0) {
+            LOG_INFO("loading llm vision from '%s'", sd_ctx_params->llm_vision_path);
+            if (!model_loader.init_from_file(sd_ctx_params->llm_vision_path, "text_encoders.llm.visual.")) {
+                LOG_WARN("loading llm vision from '%s' failed", sd_ctx_params->llm_vision_path);
             }
         }
 
@@ -307,7 +307,7 @@
 
         auto& tensor_storage_map = model_loader.get_tensor_storage_map();
         for (auto& [name, tensor_storage] : tensor_storage_map) {
-            if (contains(name, "qwen2vl") &&
+            if (contains(name, "llm") &&
                 ends_with(name, "weight") &&
                 (tensor_storage.type == GGML_TYPE_F32 || tensor_storage.type == GGML_TYPE_BF16)) {
                 tensor_storage.expected_type = GGML_TYPE_F16;
@@ -684,7 +684,7 @@
             ignore_tensors.insert("first_stage_model.encoder");
             ignore_tensors.insert("first_stage_model.conv1");
             ignore_tensors.insert("first_stage_model.quant");
-            ignore_tensors.insert("text_encoders.qwen2vl.visual.");
+            ignore_tensors.insert("text_encoders.llm.visual.");
         }
         if (version == VERSION_SVD) {
             ignore_tensors.insert("conditioner.embedders.3");
@@ -2465,8 +2465,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                         "clip_g_path: %s\n"
                         "clip_vision_path: %s\n"
                         "t5xxl_path: %s\n"
-                        "qwen2vl_path: %s\n"
-                        "qwen2vl_vision_path: %s\n"
+                        "llm_path: %s\n"
+                        "llm_vision_path: %s\n"
                         "diffusion_model_path: %s\n"
                         "high_noise_diffusion_model_path: %s\n"
                         "vae_path: %s\n"
@@ -2496,8 +2496,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                        SAFE_STR(sd_ctx_params->clip_g_path),
                        SAFE_STR(sd_ctx_params->clip_vision_path),
                        SAFE_STR(sd_ctx_params->t5xxl_path),
-                       SAFE_STR(sd_ctx_params->qwen2vl_path),
-                       SAFE_STR(sd_ctx_params->qwen2vl_vision_path),
+                       SAFE_STR(sd_ctx_params->llm_path),
+                       SAFE_STR(sd_ctx_params->llm_vision_path),
                        SAFE_STR(sd_ctx_params->diffusion_model_path),
                        SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
                        SAFE_STR(sd_ctx_params->vae_path),
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 309da9b..505bb3c 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -156,8 +156,8 @@ typedef struct {
     const char* clip_g_path;
     const char* clip_vision_path;
     const char* t5xxl_path;
-    const char* qwen2vl_path;
-    const char* qwen2vl_vision_path;
+    const char* llm_path;
+    const char* llm_vision_path;
     const char* diffusion_model_path;
     const char* high_noise_diffusion_model_path;
     const char* vae_path;