mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-23 14:46:39 +00:00
Compare commits
4 Commits
7f0e728b7d
...
b12098f5d0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b12098f5d0 | ||
|
|
2bd249c971 | ||
|
|
e9e952462f | ||
|
|
e8e012eef2 |
@ -50,12 +50,14 @@ API and command-line option may change frequently.***
|
|||||||
- [Ovis-Image](./docs/ovis_image.md)
|
- [Ovis-Image](./docs/ovis_image.md)
|
||||||
- [Anima](./docs/anima.md)
|
- [Anima](./docs/anima.md)
|
||||||
- [ERNIE-Image](./docs/ernie_image.md)
|
- [ERNIE-Image](./docs/ernie_image.md)
|
||||||
|
- [Boogu Image](./docs/boogu_image.md)
|
||||||
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
|
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
|
||||||
- [Ideogram4](./docs/ideogram4.md)
|
- [Ideogram4](./docs/ideogram4.md)
|
||||||
- Image Edit Models
|
- Image Edit Models
|
||||||
- [FLUX.1-Kontext-dev](./docs/kontext.md)
|
- [FLUX.1-Kontext-dev](./docs/kontext.md)
|
||||||
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
|
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
|
||||||
- [LongCat Image Edit](./docs/longcat_image.md)
|
- [LongCat Image Edit](./docs/longcat_image.md)
|
||||||
|
- [Boogu Image Edit](./docs/boogu_image.md)
|
||||||
- Video Models
|
- Video Models
|
||||||
- [Wan2.1/Wan2.2](./docs/wan.md)
|
- [Wan2.1/Wan2.2](./docs/wan.md)
|
||||||
- [LTX-2.3](./docs/ltx2.md)
|
- [LTX-2.3](./docs/ltx2.md)
|
||||||
|
|||||||
BIN
assets/boogu/edit_example.png
Normal file
BIN
assets/boogu/edit_example.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 477 KiB |
BIN
assets/boogu/example.png
Normal file
BIN
assets/boogu/example.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 489 KiB |
31
docs/boogu_image.md
Normal file
31
docs/boogu_image.md
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
# How to Use
|
||||||
|
|
||||||
|
Boogu Image uses a Boogu diffusion transformer, the FLUX VAE, and Qwen3-VL as the LLM text and vision encoder.
|
||||||
|
|
||||||
|
## Download weights
|
||||||
|
|
||||||
|
- Download Boogu Image
|
||||||
|
- safetensors: https://huggingface.co/Comfy-Org/Boogu-Image/tree/main/diffusion_models
|
||||||
|
- Download vae
|
||||||
|
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
|
||||||
|
- Download Qwen3-VL 8B
|
||||||
|
- gguf: https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main
|
||||||
|
- For image editing with GGUF text encoders, also download the matching mmproj file and pass it with `--llm_vision`.
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
### Boogu Image Base
|
||||||
|
|
||||||
|
```
|
||||||
|
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\boogu_image_base_bf16.safetensors --llm ..\..\llm\Qwen3VL-8B-Instruct-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ae.sft -p "a lovely cat" --diffusion-fa -v --offload-to-cpu
|
||||||
|
```
|
||||||
|
|
||||||
|
<img width="256" alt="Boogu Image Base example" src="../assets/boogu/example.png" />
|
||||||
|
|
||||||
|
### Boogu Image Edit
|
||||||
|
|
||||||
|
```
|
||||||
|
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\boogu_image_edit_bf16.safetensors --llm ..\..\llm\Qwen3VL-8B-Instruct-Q4_K_M.gguf --llm_vision ..\..\llm\mmproj-Qwen3VL-8B-Instruct-F16.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --diffusion-fa -v --offload-to-cpu -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'boogu.cpp'"
|
||||||
|
```
|
||||||
|
|
||||||
|
<img width="256" alt="Boogu Image Edit example" src="../assets/boogu/edit_example.png" />
|
||||||
@ -62,18 +62,22 @@ struct SDCliParams {
|
|||||||
{"-o",
|
{"-o",
|
||||||
"--output",
|
"--output",
|
||||||
"path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs support .avi, .webm, and animated .webp",
|
"path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs support .avi, .webm, and animated .webp",
|
||||||
|
0,
|
||||||
&output_path},
|
&output_path},
|
||||||
{"",
|
{"",
|
||||||
"--image",
|
"--image",
|
||||||
"path to the image to inspect (for metadata mode)",
|
"path to the image to inspect (for metadata mode)",
|
||||||
|
0,
|
||||||
&image_path},
|
&image_path},
|
||||||
{"",
|
{"",
|
||||||
"--metadata-format",
|
"--metadata-format",
|
||||||
"metadata output format, one of [text, json] (default: text)",
|
"metadata output format, one of [text, json] (default: text)",
|
||||||
|
0,
|
||||||
&metadata_format},
|
&metadata_format},
|
||||||
{"",
|
{"",
|
||||||
"--preview-path",
|
"--preview-path",
|
||||||
"path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp",
|
"path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp",
|
||||||
|
0,
|
||||||
&preview_path},
|
&preview_path},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -260,7 +260,14 @@ bool parse_options(int argc, const char** argv, const std::vector<ArgOptions>& o
|
|||||||
invalid_arg = true;
|
invalid_arg = true;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
*option.target = argv_to_utf8(i, argv);
|
if(option.concat && !option.target->empty()){
|
||||||
|
if(option.concat > 0 && option.concat <= 0xff){
|
||||||
|
*option.target += static_cast<char>(option.concat);
|
||||||
|
}
|
||||||
|
*option.target += argv_to_utf8(i, argv);
|
||||||
|
} else {
|
||||||
|
*option.target = argv_to_utf8(i, argv);
|
||||||
|
}
|
||||||
found_arg = true;
|
found_arg = true;
|
||||||
}))
|
}))
|
||||||
break;
|
break;
|
||||||
@ -324,120 +331,151 @@ ArgOptions SDContextParams::get_options() {
|
|||||||
{"-m",
|
{"-m",
|
||||||
"--model",
|
"--model",
|
||||||
"path to full model",
|
"path to full model",
|
||||||
|
0,
|
||||||
&model_path},
|
&model_path},
|
||||||
{"",
|
{"",
|
||||||
"--clip_l",
|
"--clip_l",
|
||||||
"path to the clip-l text encoder", &clip_l_path},
|
"path to the clip-l text encoder",
|
||||||
|
0,
|
||||||
|
&clip_l_path},
|
||||||
{"", "--clip_g",
|
{"", "--clip_g",
|
||||||
"path to the clip-g text encoder",
|
"path to the clip-g text encoder",
|
||||||
|
0,
|
||||||
&clip_g_path},
|
&clip_g_path},
|
||||||
{"",
|
{"",
|
||||||
"--clip_vision",
|
"--clip_vision",
|
||||||
"path to the clip-vision encoder",
|
"path to the clip-vision encoder",
|
||||||
|
0,
|
||||||
&clip_vision_path},
|
&clip_vision_path},
|
||||||
{"",
|
{"",
|
||||||
"--t5xxl",
|
"--t5xxl",
|
||||||
"path to the t5xxl text encoder",
|
"path to the t5xxl text encoder",
|
||||||
|
0,
|
||||||
&t5xxl_path},
|
&t5xxl_path},
|
||||||
{"",
|
{"",
|
||||||
"--llm",
|
"--llm",
|
||||||
"path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)",
|
"path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)",
|
||||||
|
0,
|
||||||
&llm_path},
|
&llm_path},
|
||||||
{"",
|
{"",
|
||||||
"--llm_vision",
|
"--llm_vision",
|
||||||
"path to the llm vit",
|
"path to the llm vit",
|
||||||
|
0,
|
||||||
&llm_vision_path},
|
&llm_vision_path},
|
||||||
{"",
|
{"",
|
||||||
"--qwen2vl",
|
"--qwen2vl",
|
||||||
"alias of --llm. Deprecated.",
|
"alias of --llm. Deprecated.",
|
||||||
|
0,
|
||||||
&llm_path},
|
&llm_path},
|
||||||
{"",
|
{"",
|
||||||
"--qwen2vl_vision",
|
"--qwen2vl_vision",
|
||||||
"alias of --llm_vision. Deprecated.",
|
"alias of --llm_vision. Deprecated.",
|
||||||
|
0,
|
||||||
&llm_vision_path},
|
&llm_vision_path},
|
||||||
{"",
|
{"",
|
||||||
"--diffusion-model",
|
"--diffusion-model",
|
||||||
"path to the standalone diffusion model",
|
"path to the standalone diffusion model",
|
||||||
|
0,
|
||||||
&diffusion_model_path},
|
&diffusion_model_path},
|
||||||
{"",
|
{"",
|
||||||
"--high-noise-diffusion-model",
|
"--high-noise-diffusion-model",
|
||||||
"path to the standalone high noise diffusion model",
|
"path to the standalone high noise diffusion model",
|
||||||
|
0,
|
||||||
&high_noise_diffusion_model_path},
|
&high_noise_diffusion_model_path},
|
||||||
{"",
|
{"",
|
||||||
"--uncond-diffusion-model",
|
"--uncond-diffusion-model",
|
||||||
"path to the standalone unconditional diffusion model, currently used by Ideogram4 CFG",
|
"path to the standalone unconditional diffusion model, currently used by Ideogram4 CFG",
|
||||||
|
0,
|
||||||
&uncond_diffusion_model_path},
|
&uncond_diffusion_model_path},
|
||||||
{"",
|
{"",
|
||||||
"--embeddings-connectors",
|
"--embeddings-connectors",
|
||||||
"path to LTXAV embeddings connectors",
|
"path to LTXAV embeddings connectors",
|
||||||
|
0,
|
||||||
&embeddings_connectors_path},
|
&embeddings_connectors_path},
|
||||||
{"",
|
{"",
|
||||||
"--vae",
|
"--vae",
|
||||||
"path to standalone vae model",
|
"path to standalone vae model",
|
||||||
|
0,
|
||||||
&vae_path},
|
&vae_path},
|
||||||
{"",
|
{"",
|
||||||
"--vae-format",
|
"--vae-format",
|
||||||
"VAE latent format override: auto, flux, sd3, or flux2 (default: auto)",
|
"VAE latent format override: auto, flux, sd3, or flux2 (default: auto)",
|
||||||
|
0,
|
||||||
&vae_format},
|
&vae_format},
|
||||||
{"",
|
{"",
|
||||||
"--audio-vae",
|
"--audio-vae",
|
||||||
"path to standalone LTX audio vae model",
|
"path to standalone LTX audio vae model",
|
||||||
|
0,
|
||||||
&audio_vae_path},
|
&audio_vae_path},
|
||||||
{"",
|
{"",
|
||||||
"--taesd",
|
"--taesd",
|
||||||
"path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)",
|
"path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)",
|
||||||
|
0,
|
||||||
&taesd_path},
|
&taesd_path},
|
||||||
{"",
|
{"",
|
||||||
"--tae",
|
"--tae",
|
||||||
"alias of --taesd",
|
"alias of --taesd",
|
||||||
|
0,
|
||||||
&taesd_path},
|
&taesd_path},
|
||||||
{"",
|
{"",
|
||||||
"--control-net",
|
"--control-net",
|
||||||
"path to control net model",
|
"path to control net model",
|
||||||
|
0,
|
||||||
&control_net_path},
|
&control_net_path},
|
||||||
{"",
|
{"",
|
||||||
"--embd-dir",
|
"--embd-dir",
|
||||||
"embeddings directory",
|
"embeddings directory",
|
||||||
|
0,
|
||||||
&embedding_dir},
|
&embedding_dir},
|
||||||
{"",
|
{"",
|
||||||
"--lora-model-dir",
|
"--lora-model-dir",
|
||||||
"lora model directory",
|
"lora model directory",
|
||||||
|
0,
|
||||||
&lora_model_dir},
|
&lora_model_dir},
|
||||||
{"",
|
{"",
|
||||||
"--hires-upscalers-dir",
|
"--hires-upscalers-dir",
|
||||||
"highres fix upscaler model directory",
|
"highres fix upscaler model directory",
|
||||||
|
0,
|
||||||
&hires_upscalers_dir},
|
&hires_upscalers_dir},
|
||||||
{"",
|
{"",
|
||||||
"--tensor-type-rules",
|
"--tensor-type-rules",
|
||||||
"weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")",
|
"weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")",
|
||||||
|
(int)',',
|
||||||
&tensor_type_rules},
|
&tensor_type_rules},
|
||||||
{"",
|
{"",
|
||||||
"--photo-maker",
|
"--photo-maker",
|
||||||
"path to PHOTOMAKER model",
|
"path to PHOTOMAKER model",
|
||||||
|
0,
|
||||||
&photo_maker_path},
|
&photo_maker_path},
|
||||||
{"",
|
{"",
|
||||||
"--pulid-weights",
|
"--pulid-weights",
|
||||||
"path to PuLID Flux weights",
|
"path to PuLID Flux weights",
|
||||||
|
0,
|
||||||
&pulid_weights_path},
|
&pulid_weights_path},
|
||||||
{"",
|
{"",
|
||||||
"--upscale-model",
|
"--upscale-model",
|
||||||
"path to esrgan model.",
|
"path to esrgan model.",
|
||||||
|
0,
|
||||||
&esrgan_path},
|
&esrgan_path},
|
||||||
{"",
|
{"",
|
||||||
"--backend",
|
"--backend",
|
||||||
"runtime backend assignment, e.g. cpu or clip=cpu,vae=cuda0,diffusion=vulkan0",
|
"runtime backend assignment, e.g. cpu or clip=cpu,vae=cuda0,diffusion=vulkan0",
|
||||||
|
(int)',',
|
||||||
&backend},
|
&backend},
|
||||||
{"",
|
{"",
|
||||||
"--params-backend",
|
"--params-backend",
|
||||||
"parameter backend assignment, e.g. disk, cpu, or diffusion=disk,clip=cpu",
|
"parameter backend assignment, e.g. disk, cpu, or diffusion=disk,clip=cpu",
|
||||||
|
(int)',',
|
||||||
&params_backend},
|
&params_backend},
|
||||||
{"",
|
{"",
|
||||||
"--rpc-servers",
|
"--rpc-servers",
|
||||||
"comma-separated list of RPC servers to connect to for offloading, in the format host:port, e.g. localhost:50052,192.168.1.3:50052",
|
"comma-separated list of RPC servers to connect to for offloading, in the format host:port, e.g. localhost:50052,192.168.1.3:50052",
|
||||||
|
(int)',',
|
||||||
&rpc_servers},
|
&rpc_servers},
|
||||||
{"",
|
{"",
|
||||||
"--max-vram",
|
"--max-vram",
|
||||||
"maximum VRAM budget in GiB for graph-cut segmented execution. Accepts a single value or assignments by backend/device, e.g. 6 or cuda0=6,vulkan0=4. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value",
|
"maximum VRAM budget in GiB for graph-cut segmented execution. Accepts a single value or assignments by backend/device, e.g. 6 or cuda0=6,vulkan0=4. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value",
|
||||||
|
0,
|
||||||
&max_vram},
|
&max_vram},
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -857,58 +895,71 @@ ArgOptions SDGenerationParams::get_options() {
|
|||||||
{"-p",
|
{"-p",
|
||||||
"--prompt",
|
"--prompt",
|
||||||
"the prompt to render",
|
"the prompt to render",
|
||||||
|
0,
|
||||||
&prompt},
|
&prompt},
|
||||||
{"-n",
|
{"-n",
|
||||||
"--negative-prompt",
|
"--negative-prompt",
|
||||||
"the negative prompt (default: \"\")",
|
"the negative prompt (default: \"\")",
|
||||||
|
0,
|
||||||
&negative_prompt},
|
&negative_prompt},
|
||||||
{"-i",
|
{"-i",
|
||||||
"--init-img",
|
"--init-img",
|
||||||
"path to the init image",
|
"path to the init image",
|
||||||
|
0,
|
||||||
&init_image_path},
|
&init_image_path},
|
||||||
{"",
|
{"",
|
||||||
"--end-img",
|
"--end-img",
|
||||||
"path to the end image, required by flf2v",
|
"path to the end image, required by flf2v",
|
||||||
|
0,
|
||||||
&end_image_path},
|
&end_image_path},
|
||||||
{"",
|
{"",
|
||||||
"--mask",
|
"--mask",
|
||||||
"path to the mask image",
|
"path to the mask image",
|
||||||
|
0,
|
||||||
&mask_image_path},
|
&mask_image_path},
|
||||||
{"",
|
{"",
|
||||||
"--control-image",
|
"--control-image",
|
||||||
"path to control image, control net",
|
"path to control image, control net",
|
||||||
|
0,
|
||||||
&control_image_path},
|
&control_image_path},
|
||||||
{"",
|
{"",
|
||||||
"--control-video",
|
"--control-video",
|
||||||
"path to control video frames, It must be a directory path. The video frames inside should be stored as images in "
|
"path to control video frames, It must be a directory path. The video frames inside should be stored as images in "
|
||||||
"lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images "
|
"lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images "
|
||||||
"such as 00.png, 01.png, ... etc.",
|
"such as 00.png, 01.png, ... etc.",
|
||||||
|
0,
|
||||||
&control_video_path},
|
&control_video_path},
|
||||||
{"",
|
{"",
|
||||||
"--pm-id-images-dir",
|
"--pm-id-images-dir",
|
||||||
"path to PHOTOMAKER input id images dir",
|
"path to PHOTOMAKER input id images dir",
|
||||||
|
0,
|
||||||
&pm_id_images_dir},
|
&pm_id_images_dir},
|
||||||
{"",
|
{"",
|
||||||
"--pm-id-embed-path",
|
"--pm-id-embed-path",
|
||||||
"path to PHOTOMAKER v2 id embed",
|
"path to PHOTOMAKER v2 id embed",
|
||||||
|
0,
|
||||||
&pm_id_embed_path},
|
&pm_id_embed_path},
|
||||||
{"",
|
{"",
|
||||||
"--pulid-id-embedding",
|
"--pulid-id-embedding",
|
||||||
"path to PuLID id embedding",
|
"path to PuLID id embedding",
|
||||||
|
0,
|
||||||
&pulid_id_embedding_path},
|
&pulid_id_embedding_path},
|
||||||
{"",
|
{"",
|
||||||
"--hires-upscaler",
|
"--hires-upscaler",
|
||||||
"highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), "
|
"highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), "
|
||||||
"Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name "
|
"Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name "
|
||||||
"under --hires-upscalers-dir (default: Latent)",
|
"under --hires-upscalers-dir (default: Latent)",
|
||||||
|
0,
|
||||||
&hires_upscaler},
|
&hires_upscaler},
|
||||||
{"",
|
{"",
|
||||||
"--extra-sample-args",
|
"--extra-sample-args",
|
||||||
"extra sampler/scheduler/guidance args, key=value list. APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma",
|
"extra sampler/scheduler/guidance args, key=value list. APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma",
|
||||||
|
(int)',',
|
||||||
&extra_sample_args},
|
&extra_sample_args},
|
||||||
{"",
|
{"",
|
||||||
"--extra-tiling-args",
|
"--extra-tiling-args",
|
||||||
"extra VAE tiling args, key=value list. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)",
|
"extra VAE tiling args, key=value list. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)",
|
||||||
|
(int)',',
|
||||||
&extra_tiling_args},
|
&extra_tiling_args},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -31,6 +31,7 @@ struct StringOption {
|
|||||||
std::string short_name;
|
std::string short_name;
|
||||||
std::string long_name;
|
std::string long_name;
|
||||||
std::string desc;
|
std::string desc;
|
||||||
|
int concat;
|
||||||
std::string* target;
|
std::string* target;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -190,8 +190,8 @@ ArgOptions SDSvrParams::get_options() {
|
|||||||
ArgOptions options;
|
ArgOptions options;
|
||||||
|
|
||||||
options.string_options = {
|
options.string_options = {
|
||||||
{"-l", "--listen-ip", "server listen ip (default: 127.0.0.1)", &listen_ip},
|
{"-l", "--listen-ip", "server listen ip (default: 127.0.0.1)", 0, &listen_ip},
|
||||||
{"", "--serve-html-path", "path to HTML file to serve at root (optional)", &serve_html_path},
|
{"", "--serve-html-path", "path to HTML file to serve at root (optional)", 0, &serve_html_path},
|
||||||
};
|
};
|
||||||
|
|
||||||
options.int_options = {
|
options.int_options = {
|
||||||
|
|||||||
@ -1518,7 +1518,7 @@ struct LLMEmbedder : public Conditioner {
|
|||||||
arch = LLM::LLMArch::GPT_OSS_20B;
|
arch = LLM::LLMArch::GPT_OSS_20B;
|
||||||
} else if (sd_version_is_pid(version)) {
|
} else if (sd_version_is_pid(version)) {
|
||||||
arch = LLM::LLMArch::GEMMA2_2B;
|
arch = LLM::LLMArch::GEMMA2_2B;
|
||||||
} else if (sd_version_is_ideogram4(version)) {
|
} else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version)) {
|
||||||
arch = LLM::LLMArch::QWEN3_VL;
|
arch = LLM::LLMArch::QWEN3_VL;
|
||||||
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
|
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
|
||||||
arch = LLM::LLMArch::QWEN3;
|
arch = LLM::LLMArch::QWEN3;
|
||||||
@ -1778,6 +1778,65 @@ struct LLMEmbedder : public Conditioner {
|
|||||||
|
|
||||||
prompt += "<|im_end|>\n<|im_start|>assistant\n";
|
prompt += "<|im_end|>\n<|im_start|>assistant\n";
|
||||||
}
|
}
|
||||||
|
} else if (sd_version_is_boogu_image(version)) {
|
||||||
|
prompt_template_encode_start_idx = 0;
|
||||||
|
|
||||||
|
const std::string t2i_system_prompt =
|
||||||
|
"You are a helpful assistant that generates high-quality images based on user instructions. The instructions are as follows.";
|
||||||
|
const std::string edit_system_prompt =
|
||||||
|
"Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.";
|
||||||
|
const bool has_ref_images = llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty();
|
||||||
|
const bool text_empty = conditioner_params.text.find_first_not_of(" \t\r\n") == std::string::npos;
|
||||||
|
|
||||||
|
if (has_ref_images) {
|
||||||
|
LOG_INFO("BooguImageEditPipeline");
|
||||||
|
const std::string prompt_prefix = "<|im_start|>system\n" + edit_system_prompt + "<|im_end|>\n<|im_start|>user\n";
|
||||||
|
std::string img_prompt;
|
||||||
|
const std::string placeholder = "<|image_pad|>";
|
||||||
|
|
||||||
|
for (int i = 0; i < conditioner_params.ref_images->size(); i++) {
|
||||||
|
const auto& image = (*conditioner_params.ref_images)[i];
|
||||||
|
double factor = llm->config.vision.patch_size * llm->config.vision.spatial_merge_size;
|
||||||
|
int height = static_cast<int>(image.shape()[1]);
|
||||||
|
int width = static_cast<int>(image.shape()[0]);
|
||||||
|
double beta = std::sqrt((384.0 * 384.0) / (static_cast<double>(height) * static_cast<double>(width)));
|
||||||
|
int h_bar = std::max(static_cast<int>(factor),
|
||||||
|
static_cast<int>(std::round(height * beta / factor)) * static_cast<int>(factor));
|
||||||
|
int w_bar = std::max(static_cast<int>(factor),
|
||||||
|
static_cast<int>(std::round(width * beta / factor)) * static_cast<int>(factor));
|
||||||
|
|
||||||
|
LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar);
|
||||||
|
|
||||||
|
auto resized_image = clip_preprocess(image, w_bar, h_bar);
|
||||||
|
auto image_embed = llm->encode_image(n_threads, resized_image, false, true, true);
|
||||||
|
GGML_ASSERT(!image_embed.empty());
|
||||||
|
|
||||||
|
std::string image_prefix = prompt_prefix + img_prompt + "<|vision_start|>";
|
||||||
|
int image_embed_idx = static_cast<int>(tokenizer->encode(image_prefix, nullptr).size());
|
||||||
|
image_embeds.emplace_back(image_embed_idx, image_embed);
|
||||||
|
|
||||||
|
img_prompt += "<|vision_start|>";
|
||||||
|
int64_t num_image_tokens = image_embed.shape()[1];
|
||||||
|
img_prompt.reserve(img_prompt.size() + static_cast<size_t>(num_image_tokens) * placeholder.size() + 32);
|
||||||
|
for (int j = 0; j < num_image_tokens; j++) {
|
||||||
|
img_prompt += placeholder;
|
||||||
|
}
|
||||||
|
img_prompt += "<|vision_end|>";
|
||||||
|
}
|
||||||
|
|
||||||
|
prompt = prompt_prefix + img_prompt;
|
||||||
|
prompt_attn_range.first = static_cast<int>(prompt.size());
|
||||||
|
prompt += conditioner_params.text;
|
||||||
|
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||||
|
prompt += "<|im_end|>\n";
|
||||||
|
} else {
|
||||||
|
const std::string& system_prompt = text_empty ? edit_system_prompt : t2i_system_prompt;
|
||||||
|
prompt = "<|im_start|>system\n" + system_prompt + "<|im_end|>\n<|im_start|>user\n";
|
||||||
|
prompt_attn_range.first = static_cast<int>(prompt.size());
|
||||||
|
prompt += conditioner_params.text;
|
||||||
|
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||||
|
prompt += "<|im_end|>\n";
|
||||||
|
}
|
||||||
} else if (sd_version_is_longcat(version)) {
|
} else if (sd_version_is_longcat(version)) {
|
||||||
spell_quotes = true;
|
spell_quotes = true;
|
||||||
|
|
||||||
|
|||||||
@ -42,6 +42,7 @@ enum SDVersion {
|
|||||||
VERSION_LTXAV,
|
VERSION_LTXAV,
|
||||||
VERSION_HIDREAM_O1,
|
VERSION_HIDREAM_O1,
|
||||||
VERSION_Z_IMAGE,
|
VERSION_Z_IMAGE,
|
||||||
|
VERSION_BOOGU_IMAGE,
|
||||||
VERSION_OVIS_IMAGE,
|
VERSION_OVIS_IMAGE,
|
||||||
VERSION_ERNIE_IMAGE,
|
VERSION_ERNIE_IMAGE,
|
||||||
VERSION_LENS,
|
VERSION_LENS,
|
||||||
@ -143,6 +144,13 @@ static inline bool sd_version_is_z_image(SDVersion version) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reports whether `version` is the Boogu Image model version
// (true only for VERSION_BOOGU_IMAGE).
static inline bool sd_version_is_boogu_image(SDVersion version) {
    return version == VERSION_BOOGU_IMAGE;
}
|
||||||
|
|
||||||
static inline bool sd_version_is_longcat(SDVersion version) {
|
static inline bool sd_version_is_longcat(SDVersion version) {
|
||||||
if (version == VERSION_LONGCAT) {
|
if (version == VERSION_LONGCAT) {
|
||||||
return true;
|
return true;
|
||||||
@ -206,6 +214,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
|
|||||||
version == VERSION_HIDREAM_O1 ||
|
version == VERSION_HIDREAM_O1 ||
|
||||||
sd_version_is_anima(version) ||
|
sd_version_is_anima(version) ||
|
||||||
sd_version_is_z_image(version) ||
|
sd_version_is_z_image(version) ||
|
||||||
|
sd_version_is_boogu_image(version) ||
|
||||||
sd_version_is_ernie_image(version) ||
|
sd_version_is_ernie_image(version) ||
|
||||||
sd_version_is_lens(version) ||
|
sd_version_is_lens(version) ||
|
||||||
sd_version_is_longcat(version) ||
|
sd_version_is_longcat(version) ||
|
||||||
|
|||||||
@ -899,10 +899,12 @@ namespace Rope {
|
|||||||
// q,k,v: [N, L, n_head, d_head]
|
// q,k,v: [N, L, n_head, d_head]
|
||||||
// pe: [L, d_head/2, 2, 2]
|
// pe: [L, d_head/2, 2, 2]
|
||||||
// return: [N, L, n_head*d_head]
|
// return: [N, L, n_head*d_head]
|
||||||
|
int64_t n_head = q->ne[1];
|
||||||
|
|
||||||
q = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved); // [N*n_head, L, d_head]
|
q = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved); // [N*n_head, L, d_head]
|
||||||
k = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved); // [N*n_head, L, d_head]
|
k = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved); // [N*n_head, L, d_head]
|
||||||
|
|
||||||
auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, v->ne[1], mask, true, ctx->flash_attn_enabled, kv_scale); // [N, L, n_head*d_head]
|
auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, mask, true, ctx->flash_attn_enabled, kv_scale); // [N, L, n_head*d_head]
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
}; // namespace Rope
|
}; // namespace Rope
|
||||||
|
|||||||
@ -227,6 +227,7 @@ namespace Anima {
|
|||||||
k4 = k_norm->forward(ctx, k4);
|
k4 = k_norm->forward(ctx, k4);
|
||||||
|
|
||||||
ggml_tensor* attn_out = nullptr;
|
ggml_tensor* attn_out = nullptr;
|
||||||
|
float scale = (sd_backend_is(ctx->backend, "Vulkan") && ctx->flash_attn_enabled) ? 1.0f / 32.0f : 1.0f;
|
||||||
if (pe_q != nullptr || pe_k != nullptr) {
|
if (pe_q != nullptr || pe_k != nullptr) {
|
||||||
if (pe_q == nullptr) {
|
if (pe_q == nullptr) {
|
||||||
pe_q = pe_k;
|
pe_q = pe_k;
|
||||||
@ -244,7 +245,8 @@ namespace Anima {
|
|||||||
num_heads,
|
num_heads,
|
||||||
nullptr,
|
nullptr,
|
||||||
true,
|
true,
|
||||||
ctx->flash_attn_enabled);
|
ctx->flash_attn_enabled,
|
||||||
|
scale);
|
||||||
} else {
|
} else {
|
||||||
auto q_flat = ggml_reshape_3d(ctx->ggml_ctx, q4, head_dim * num_heads, L_q, N);
|
auto q_flat = ggml_reshape_3d(ctx->ggml_ctx, q4, head_dim * num_heads, L_q, N);
|
||||||
auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k4, head_dim * num_heads, L_k, N);
|
auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k4, head_dim * num_heads, L_k, N);
|
||||||
@ -256,7 +258,8 @@ namespace Anima {
|
|||||||
num_heads,
|
num_heads,
|
||||||
nullptr,
|
nullptr,
|
||||||
false,
|
false,
|
||||||
ctx->flash_attn_enabled);
|
ctx->flash_attn_enabled,
|
||||||
|
scale);
|
||||||
}
|
}
|
||||||
|
|
||||||
return out_proj->forward(ctx, attn_out);
|
return out_proj->forward(ctx, attn_out);
|
||||||
|
|||||||
835
src/model/diffusion/boogu.hpp
Normal file
835
src/model/diffusion/boogu.hpp
Normal file
@ -0,0 +1,835 @@
|
|||||||
|
#ifndef __SD_MODEL_DIFFUSION_BOOGU_HPP__
|
||||||
|
#define __SD_MODEL_DIFFUSION_BOOGU_HPP__
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "core/ggml_extend.hpp"
|
||||||
|
#include "model/common/rope.hpp"
|
||||||
|
#include "model/diffusion/dit.hpp"
|
||||||
|
#include "model/diffusion/model.hpp"
|
||||||
|
#include "model/diffusion/qwen_image.hpp"
|
||||||
|
#include "model_loader.h"
|
||||||
|
|
||||||
|
namespace Boogu {
|
||||||
|
constexpr int BOOGU_GRAPH_SIZE = 65536;
|
||||||
|
|
||||||
|
struct BooguConfig {
|
||||||
|
int patch_size = 2;
|
||||||
|
int64_t in_channels = 16;
|
||||||
|
int64_t out_channels = 16;
|
||||||
|
int64_t hidden_size = 3360;
|
||||||
|
int64_t num_layers = 32;
|
||||||
|
int64_t num_double_stream_layers = 8;
|
||||||
|
int64_t num_refiner_layers = 2;
|
||||||
|
int64_t num_attention_heads = 28;
|
||||||
|
int64_t num_kv_heads = 7;
|
||||||
|
int64_t head_dim = 120;
|
||||||
|
int64_t multiple_of = 256;
|
||||||
|
int64_t instruction_feat_dim = 4096;
|
||||||
|
int64_t timestep_embed_dim = 1024;
|
||||||
|
int theta = 10000;
|
||||||
|
float timestep_scale = 1000.0f;
|
||||||
|
float norm_eps = 1e-5f;
|
||||||
|
std::vector<int> axes_dim = {40, 40, 40};
|
||||||
|
int64_t axes_dim_sum = 120;
|
||||||
|
|
||||||
|
static int64_t count_blocks(const String2TensorStorage& tensor_storage_map,
|
||||||
|
const std::string& prefix,
|
||||||
|
const std::string& block_prefix) {
|
||||||
|
int64_t count = 0;
|
||||||
|
for (const auto& [name, _] : tensor_storage_map) {
|
||||||
|
if (!starts_with(name, prefix)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
size_t pos = name.find(block_prefix);
|
||||||
|
if (pos == std::string::npos) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
auto items = split_string(name.substr(pos), '.');
|
||||||
|
if (items.size() > 1) {
|
||||||
|
count = std::max<int64_t>(count, atoi(items[1].c_str()) + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
static BooguConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
|
||||||
|
BooguConfig config;
|
||||||
|
int64_t detected_head_dim = 0;
|
||||||
|
int64_t detected_kv_dim = 0;
|
||||||
|
|
||||||
|
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||||
|
if (!starts_with(name, prefix)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (ends_with(name, "x_embedder.weight") && tensor_storage.n_dims == 2) {
|
||||||
|
int64_t patch_area = config.patch_size * config.patch_size;
|
||||||
|
config.in_channels = tensor_storage.ne[0] / patch_area;
|
||||||
|
config.hidden_size = tensor_storage.ne[1];
|
||||||
|
} else if (ends_with(name, "time_caption_embed.caption_embedder.1.weight") && tensor_storage.n_dims == 2) {
|
||||||
|
config.instruction_feat_dim = tensor_storage.ne[0];
|
||||||
|
config.hidden_size = tensor_storage.ne[1];
|
||||||
|
} else if (ends_with(name, "single_stream_layers.0.attn.norm_q.weight") && tensor_storage.n_dims == 1) {
|
||||||
|
detected_head_dim = tensor_storage.ne[0];
|
||||||
|
} else if (ends_with(name, "double_stream_layers.0.img_self_attn.norm_q.weight") && tensor_storage.n_dims == 1) {
|
||||||
|
detected_head_dim = tensor_storage.ne[0];
|
||||||
|
} else if (ends_with(name, "single_stream_layers.0.attn.to_k.weight") && tensor_storage.n_dims == 2) {
|
||||||
|
detected_kv_dim = tensor_storage.ne[1];
|
||||||
|
} else if (ends_with(name, "double_stream_layers.0.img_instruct_attn.processor.img_to_k.weight") && tensor_storage.n_dims == 2) {
|
||||||
|
detected_kv_dim = tensor_storage.ne[1];
|
||||||
|
} else if (ends_with(name, "norm_out.linear_2.weight") && tensor_storage.n_dims == 2) {
|
||||||
|
int64_t patch_area = config.patch_size * config.patch_size;
|
||||||
|
config.out_channels = tensor_storage.ne[1] / patch_area;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
config.num_layers = std::max<int64_t>(1, count_blocks(tensor_storage_map, prefix, "single_stream_layers."));
|
||||||
|
config.num_double_stream_layers = std::max<int64_t>(0, count_blocks(tensor_storage_map, prefix, "double_stream_layers."));
|
||||||
|
int64_t noise_refiner_layers = count_blocks(tensor_storage_map, prefix, "noise_refiner.");
|
||||||
|
int64_t ref_refiner_layers = count_blocks(tensor_storage_map, prefix, "ref_image_refiner.");
|
||||||
|
int64_t context_refiner_layers = count_blocks(tensor_storage_map, prefix, "context_refiner.");
|
||||||
|
config.num_refiner_layers = std::max<int64_t>(1, std::max(noise_refiner_layers, std::max(ref_refiner_layers, context_refiner_layers)));
|
||||||
|
|
||||||
|
if (detected_head_dim > 0) {
|
||||||
|
config.head_dim = detected_head_dim;
|
||||||
|
config.num_attention_heads = config.hidden_size / config.head_dim;
|
||||||
|
config.axes_dim_sum = config.head_dim;
|
||||||
|
if (detected_kv_dim > 0) {
|
||||||
|
config.num_kv_heads = detected_kv_dim / config.head_dim;
|
||||||
|
}
|
||||||
|
if (config.axes_dim_sum == 120) {
|
||||||
|
config.axes_dim = {40, 40, 40};
|
||||||
|
} else if (config.axes_dim_sum % 3 == 0) {
|
||||||
|
int axis = static_cast<int>(config.axes_dim_sum / 3);
|
||||||
|
config.axes_dim = {axis, axis, axis};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
config.timestep_embed_dim = std::min<int64_t>(config.hidden_size, 1024);
|
||||||
|
|
||||||
|
LOG_DEBUG("boogu_image: layers=%" PRId64 ", double_stream_layers=%" PRId64 ", refiner_layers=%" PRId64 ", hidden=%" PRId64 ", heads=%" PRId64 ", kv_heads=%" PRId64 ", head_dim=%" PRId64 ", in_channels=%" PRId64 ", out_channels=%" PRId64,
|
||||||
|
config.num_layers,
|
||||||
|
config.num_double_stream_layers,
|
||||||
|
config.num_refiner_layers,
|
||||||
|
config.hidden_size,
|
||||||
|
config.num_attention_heads,
|
||||||
|
config.num_kv_heads,
|
||||||
|
config.head_dim,
|
||||||
|
config.in_channels,
|
||||||
|
config.out_channels);
|
||||||
|
return config;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Multiplicative modulation: returns x + x * scale.
// `scale` is first viewed as (ne0, 1, ne1), i.e. a singleton axis is inserted
// at dimension 1 before the element-wise product with x.
__STATIC_INLINE__ ggml_tensor* scale_modulate(ggml_context* ctx, ggml_tensor* x, ggml_tensor* scale) {
    ggml_tensor* scale_3d = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]);
    ggml_tensor* scaled_x = ggml_mul(ctx, x, scale_3d);
    return ggml_add(ctx, x, scaled_x);
}
|
||||||
|
|
||||||
|
// Gated residual connection: returns residual + x * tanh(gate).
// The tanh-squashed gate is viewed as (ne0, 1, ne1) before multiplying x.
__STATIC_INLINE__ ggml_tensor* gate_residual(ggml_context* ctx, ggml_tensor* residual, ggml_tensor* x, ggml_tensor* gate) {
    ggml_tensor* squashed = ggml_tanh(ctx, gate);
    ggml_tensor* gate_3d  = ggml_reshape_3d(ctx, squashed, squashed->ne[0], 1, squashed->ne[1]);
    ggml_tensor* gated_x  = ggml_mul(ctx, x, gate_3d);
    return ggml_add(ctx, residual, gated_x);
}
|
||||||
|
|
||||||
|
struct LuminaCombinedTimestepCaptionEmbedding : public GGMLBlock {
|
||||||
|
int64_t frequency_embedding_size;
|
||||||
|
float timestep_scale;
|
||||||
|
|
||||||
|
LuminaCombinedTimestepCaptionEmbedding(int64_t hidden_size,
|
||||||
|
int64_t instruction_feat_dim,
|
||||||
|
int64_t frequency_embedding_size,
|
||||||
|
float norm_eps,
|
||||||
|
float timestep_scale)
|
||||||
|
: frequency_embedding_size(frequency_embedding_size),
|
||||||
|
timestep_scale(timestep_scale) {
|
||||||
|
blocks["timestep_embedder"] = std::make_shared<Qwen::TimestepEmbedding>(frequency_embedding_size, std::min<int64_t>(hidden_size, 1024));
|
||||||
|
blocks["caption_embedder.0"] = std::make_shared<RMSNorm>(instruction_feat_dim, norm_eps);
|
||||||
|
blocks["caption_embedder.1"] = std::make_shared<Linear>(instruction_feat_dim, hidden_size, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx, ggml_tensor* timestep, ggml_tensor* text_hidden_states) {
|
||||||
|
auto timestep_embedder = std::dynamic_pointer_cast<Qwen::TimestepEmbedding>(blocks["timestep_embedder"]);
|
||||||
|
auto caption_embedder_0 = std::dynamic_pointer_cast<RMSNorm>(blocks["caption_embedder.0"]);
|
||||||
|
auto caption_embedder_1 = std::dynamic_pointer_cast<Linear>(blocks["caption_embedder.1"]);
|
||||||
|
|
||||||
|
auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(frequency_embedding_size), 10000, timestep_scale);
|
||||||
|
auto time_embed = timestep_embedder->forward(ctx, timestep_proj);
|
||||||
|
auto caption_embed = caption_embedder_1->forward(ctx, caption_embedder_0->forward(ctx, text_hidden_states));
|
||||||
|
return {time_embed, caption_embed};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct LuminaRMSNormZero : public GGMLBlock {
|
||||||
|
LuminaRMSNormZero(int64_t embedding_dim, int64_t conditioning_embedding_dim, float norm_eps) {
|
||||||
|
blocks["linear"] = std::make_shared<Linear>(conditioning_embedding_dim, 4 * embedding_dim, true);
|
||||||
|
blocks["norm"] = std::make_shared<RMSNorm>(embedding_dim, norm_eps);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* emb) {
|
||||||
|
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
|
||||||
|
auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
|
||||||
|
|
||||||
|
emb = linear->forward(ctx, ggml_silu(ctx->ggml_ctx, emb));
|
||||||
|
auto mods = ggml_ext_chunk(ctx->ggml_ctx, emb, 4, 0);
|
||||||
|
|
||||||
|
auto scale_msa = mods[0];
|
||||||
|
auto gate_msa = mods[1];
|
||||||
|
auto scale_mlp = mods[2];
|
||||||
|
auto gate_mlp = mods[3];
|
||||||
|
|
||||||
|
x = scale_modulate(ctx->ggml_ctx, norm->forward(ctx, x), scale_msa);
|
||||||
|
return {x, gate_msa, scale_mlp, gate_mlp};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct LuminaFeedForward : public GGMLBlock {
|
||||||
|
LuminaFeedForward(int64_t dim, int64_t inner_dim, int64_t multiple_of) {
|
||||||
|
inner_dim = multiple_of * ((inner_dim + multiple_of - 1) / multiple_of);
|
||||||
|
blocks["linear_1"] = std::make_shared<Linear>(dim, inner_dim, false);
|
||||||
|
blocks["linear_2"] = std::make_shared<Linear>(inner_dim, dim, false);
|
||||||
|
blocks["linear_3"] = std::make_shared<Linear>(dim, inner_dim, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
||||||
|
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
|
||||||
|
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
|
||||||
|
auto linear_3 = std::dynamic_pointer_cast<Linear>(blocks["linear_3"]);
|
||||||
|
|
||||||
|
if (sd_backend_is(ctx->backend, "Vulkan")) {
|
||||||
|
linear_2->set_force_prec_f32(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto h1 = linear_1->forward(ctx, x);
|
||||||
|
auto h2 = linear_3->forward(ctx, x);
|
||||||
|
x = ggml_swiglu_split(ctx->ggml_ctx, h1, h2);
|
||||||
|
x = linear_2->forward(ctx, x);
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct LuminaLayerNormContinuous : public GGMLBlock {
|
||||||
|
LuminaLayerNormContinuous(int64_t embedding_dim,
|
||||||
|
int64_t conditioning_embedding_dim,
|
||||||
|
int64_t out_dim) {
|
||||||
|
blocks["linear_1"] = std::make_shared<Linear>(conditioning_embedding_dim, embedding_dim, true);
|
||||||
|
blocks["norm"] = std::make_shared<LayerNorm>(embedding_dim, 1e-6f, false);
|
||||||
|
blocks["linear_2"] = std::make_shared<Linear>(embedding_dim, out_dim, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* conditioning_embedding) {
|
||||||
|
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
|
||||||
|
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
|
||||||
|
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
|
||||||
|
|
||||||
|
auto emb = linear_1->forward(ctx, ggml_silu(ctx->ggml_ctx, conditioning_embedding));
|
||||||
|
x = scale_modulate(ctx->ggml_ctx, norm->forward(ctx, x), emb);
|
||||||
|
x = linear_2->forward(ctx, x);
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Attention : public GGMLBlock {
|
||||||
|
int64_t dim_head;
|
||||||
|
int64_t heads;
|
||||||
|
int64_t kv_heads;
|
||||||
|
|
||||||
|
Attention(int64_t query_dim, int64_t dim_head, int64_t heads, int64_t kv_heads, float eps = 1e-5f)
|
||||||
|
: dim_head(dim_head), heads(heads), kv_heads(kv_heads) {
|
||||||
|
blocks["to_q"] = std::make_shared<Linear>(query_dim, heads * dim_head, false);
|
||||||
|
blocks["to_k"] = std::make_shared<Linear>(query_dim, kv_heads * dim_head, false);
|
||||||
|
blocks["to_v"] = std::make_shared<Linear>(query_dim, kv_heads * dim_head, false);
|
||||||
|
blocks["norm_q"] = std::make_shared<RMSNorm>(dim_head, eps);
|
||||||
|
blocks["norm_k"] = std::make_shared<RMSNorm>(dim_head, eps);
|
||||||
|
blocks["to_out.0"] = std::make_shared<Linear>(heads * dim_head, query_dim, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* hidden_states,
|
||||||
|
ggml_tensor* encoder_hidden_states,
|
||||||
|
ggml_tensor* rotary_emb,
|
||||||
|
ggml_tensor* attention_mask = nullptr) {
|
||||||
|
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
|
||||||
|
auto to_k = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
|
||||||
|
auto to_v = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
|
||||||
|
auto norm_q = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
|
||||||
|
auto norm_k = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
|
||||||
|
auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
|
||||||
|
|
||||||
|
if (sd_backend_is(ctx->backend, "Vulkan")) {
|
||||||
|
to_out_0->set_force_prec_f32(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t N = hidden_states->ne[2];
|
||||||
|
int64_t Lq = hidden_states->ne[1];
|
||||||
|
int64_t Lk = encoder_hidden_states->ne[1];
|
||||||
|
|
||||||
|
auto q = to_q->forward(ctx, hidden_states);
|
||||||
|
q = ggml_reshape_4d(ctx->ggml_ctx, q, dim_head, heads, Lq, N);
|
||||||
|
auto k = to_k->forward(ctx, encoder_hidden_states);
|
||||||
|
k = ggml_reshape_4d(ctx->ggml_ctx, k, dim_head, kv_heads, Lk, N);
|
||||||
|
auto v = to_v->forward(ctx, encoder_hidden_states);
|
||||||
|
v = ggml_reshape_4d(ctx->ggml_ctx, v, dim_head, kv_heads, Lk, N);
|
||||||
|
|
||||||
|
q = norm_q->forward(ctx, q);
|
||||||
|
k = norm_k->forward(ctx, k);
|
||||||
|
|
||||||
|
auto out = Rope::attention(ctx, q, k, v, rotary_emb, attention_mask);
|
||||||
|
out = to_out_0->forward(ctx, out);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct BooguImageTransformerBlock : public GGMLBlock {
|
||||||
|
bool modulation;
|
||||||
|
|
||||||
|
BooguImageTransformerBlock(int64_t dim,
|
||||||
|
int64_t num_attention_heads,
|
||||||
|
int64_t num_kv_heads,
|
||||||
|
int64_t multiple_of,
|
||||||
|
float norm_eps,
|
||||||
|
bool modulation)
|
||||||
|
: modulation(modulation) {
|
||||||
|
int64_t head_dim = dim / num_attention_heads;
|
||||||
|
blocks["attn"] = std::make_shared<Attention>(dim, head_dim, num_attention_heads, num_kv_heads, 1e-5f);
|
||||||
|
blocks["feed_forward"] = std::make_shared<LuminaFeedForward>(dim, 4 * dim, multiple_of);
|
||||||
|
if (modulation) {
|
||||||
|
blocks["norm1"] = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
|
||||||
|
} else {
|
||||||
|
blocks["norm1"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
}
|
||||||
|
blocks["ffn_norm1"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["norm2"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["ffn_norm2"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* hidden_states,
|
||||||
|
ggml_tensor* rotary_emb,
|
||||||
|
ggml_tensor* temb = nullptr,
|
||||||
|
ggml_tensor* attention_mask = nullptr) {
|
||||||
|
auto attn = std::dynamic_pointer_cast<Attention>(blocks["attn"]);
|
||||||
|
auto feed_forward = std::dynamic_pointer_cast<LuminaFeedForward>(blocks["feed_forward"]);
|
||||||
|
auto ffn_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm1"]);
|
||||||
|
auto norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["norm2"]);
|
||||||
|
auto ffn_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm2"]);
|
||||||
|
|
||||||
|
if (modulation) {
|
||||||
|
auto norm1 = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["norm1"]);
|
||||||
|
auto mods = norm1->forward(ctx, hidden_states, temb);
|
||||||
|
|
||||||
|
auto norm_hidden_states = std::get<0>(mods);
|
||||||
|
auto gate_msa = std::get<1>(mods);
|
||||||
|
auto scale_mlp = std::get<2>(mods);
|
||||||
|
auto gate_mlp = std::get<3>(mods);
|
||||||
|
|
||||||
|
auto attn_output = attn->forward(ctx, norm_hidden_states, norm_hidden_states, rotary_emb, attention_mask);
|
||||||
|
hidden_states = gate_residual(ctx->ggml_ctx, hidden_states, norm2->forward(ctx, attn_output), gate_msa);
|
||||||
|
|
||||||
|
auto mlp_input = scale_modulate(ctx->ggml_ctx, ffn_norm1->forward(ctx, hidden_states), scale_mlp);
|
||||||
|
auto mlp_output = feed_forward->forward(ctx, mlp_input);
|
||||||
|
hidden_states = gate_residual(ctx->ggml_ctx, hidden_states, ffn_norm2->forward(ctx, mlp_output), gate_mlp);
|
||||||
|
} else {
|
||||||
|
auto norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["norm1"]);
|
||||||
|
|
||||||
|
auto norm_hidden_states = norm1->forward(ctx, hidden_states);
|
||||||
|
auto attn_output = attn->forward(ctx, norm_hidden_states, norm_hidden_states, rotary_emb, attention_mask);
|
||||||
|
hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, norm2->forward(ctx, attn_output));
|
||||||
|
|
||||||
|
auto mlp_output = feed_forward->forward(ctx, ffn_norm1->forward(ctx, hidden_states));
|
||||||
|
hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, ffn_norm2->forward(ctx, mlp_output));
|
||||||
|
}
|
||||||
|
return hidden_states;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct BooguImageJointAttention : public GGMLBlock {
|
||||||
|
int64_t dim_head;
|
||||||
|
int64_t heads;
|
||||||
|
int64_t kv_heads;
|
||||||
|
|
||||||
|
BooguImageJointAttention(int64_t dim, int64_t dim_head, int64_t heads, int64_t kv_heads)
|
||||||
|
: dim_head(dim_head), heads(heads), kv_heads(kv_heads) {
|
||||||
|
blocks["norm_q"] = std::make_shared<RMSNorm>(dim_head, 1e-5f);
|
||||||
|
blocks["norm_k"] = std::make_shared<RMSNorm>(dim_head, 1e-5f);
|
||||||
|
blocks["to_out.0"] = std::make_shared<Linear>(heads * dim_head, dim, false);
|
||||||
|
blocks["processor.img_to_q"] = std::make_shared<Linear>(dim, heads * dim_head, false);
|
||||||
|
blocks["processor.img_to_k"] = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
|
||||||
|
blocks["processor.img_to_v"] = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
|
||||||
|
blocks["processor.instruct_to_q"] = std::make_shared<Linear>(dim, heads * dim_head, false);
|
||||||
|
blocks["processor.instruct_to_k"] = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
|
||||||
|
blocks["processor.instruct_to_v"] = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
|
||||||
|
blocks["processor.instruct_out"] = std::make_shared<Linear>(heads * dim_head, dim, false);
|
||||||
|
blocks["processor.img_out"] = std::make_shared<Linear>(heads * dim_head, dim, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* img_hidden_states,
|
||||||
|
ggml_tensor* instruct_hidden_states,
|
||||||
|
ggml_tensor* rotary_emb,
|
||||||
|
ggml_tensor* attention_mask = nullptr) {
|
||||||
|
auto norm_q = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
|
||||||
|
auto norm_k = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
|
||||||
|
auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
|
||||||
|
auto img_to_q = std::dynamic_pointer_cast<Linear>(blocks["processor.img_to_q"]);
|
||||||
|
auto img_to_k = std::dynamic_pointer_cast<Linear>(blocks["processor.img_to_k"]);
|
||||||
|
auto img_to_v = std::dynamic_pointer_cast<Linear>(blocks["processor.img_to_v"]);
|
||||||
|
auto instruct_to_q = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_to_q"]);
|
||||||
|
auto instruct_to_k = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_to_k"]);
|
||||||
|
auto instruct_to_v = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_to_v"]);
|
||||||
|
auto instruct_out = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_out"]);
|
||||||
|
auto img_out = std::dynamic_pointer_cast<Linear>(blocks["processor.img_out"]);
|
||||||
|
|
||||||
|
if (sd_backend_is(ctx->backend, "Vulkan")) {
|
||||||
|
to_out_0->set_force_prec_f32(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t N = img_hidden_states->ne[2];
|
||||||
|
int64_t L_img = img_hidden_states->ne[1];
|
||||||
|
int64_t L_instruct = instruct_hidden_states->ne[1];
|
||||||
|
|
||||||
|
auto img_q = img_to_q->forward(ctx, img_hidden_states);
|
||||||
|
img_q = ggml_reshape_4d(ctx->ggml_ctx, img_q, dim_head, heads, L_img, N);
|
||||||
|
auto img_k = img_to_k->forward(ctx, img_hidden_states);
|
||||||
|
img_k = ggml_reshape_4d(ctx->ggml_ctx, img_k, dim_head, kv_heads, L_img, N);
|
||||||
|
auto img_v = img_to_v->forward(ctx, img_hidden_states);
|
||||||
|
img_v = ggml_reshape_4d(ctx->ggml_ctx, img_v, dim_head, kv_heads, L_img, N);
|
||||||
|
|
||||||
|
auto instruct_q = instruct_to_q->forward(ctx, instruct_hidden_states);
|
||||||
|
instruct_q = ggml_reshape_4d(ctx->ggml_ctx, instruct_q, dim_head, heads, L_instruct, N);
|
||||||
|
auto instruct_k = instruct_to_k->forward(ctx, instruct_hidden_states);
|
||||||
|
instruct_k = ggml_reshape_4d(ctx->ggml_ctx, instruct_k, dim_head, kv_heads, L_instruct, N);
|
||||||
|
auto instruct_v = instruct_to_v->forward(ctx, instruct_hidden_states);
|
||||||
|
instruct_v = ggml_reshape_4d(ctx->ggml_ctx, instruct_v, dim_head, kv_heads, L_instruct, N);
|
||||||
|
|
||||||
|
auto q = ggml_concat(ctx->ggml_ctx, instruct_q, img_q, 2);
|
||||||
|
auto k = ggml_concat(ctx->ggml_ctx, instruct_k, img_k, 2);
|
||||||
|
auto v = ggml_concat(ctx->ggml_ctx, instruct_v, img_v, 2);
|
||||||
|
q = norm_q->forward(ctx, q);
|
||||||
|
k = norm_k->forward(ctx, k);
|
||||||
|
|
||||||
|
auto hidden_states = Rope::attention(ctx, q, k, v, rotary_emb, attention_mask);
|
||||||
|
auto instruct_attn = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, 0, L_instruct);
|
||||||
|
auto img_attn = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, L_instruct, L_instruct + L_img);
|
||||||
|
|
||||||
|
instruct_attn = instruct_out->forward(ctx, instruct_attn);
|
||||||
|
img_attn = img_out->forward(ctx, img_attn);
|
||||||
|
hidden_states = ggml_concat(ctx->ggml_ctx, instruct_attn, img_attn, 1);
|
||||||
|
hidden_states = to_out_0->forward(ctx, hidden_states);
|
||||||
|
return hidden_states;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct BooguImageDoubleStreamBlock : public GGMLBlock {
|
||||||
|
BooguImageDoubleStreamBlock(int64_t dim,
|
||||||
|
int64_t num_attention_heads,
|
||||||
|
int64_t num_kv_heads,
|
||||||
|
int64_t multiple_of,
|
||||||
|
float norm_eps) {
|
||||||
|
int64_t head_dim = dim / num_attention_heads;
|
||||||
|
blocks["img_instruct_attn"] = std::make_shared<BooguImageJointAttention>(dim, head_dim, num_attention_heads, num_kv_heads);
|
||||||
|
blocks["img_self_attn"] = std::make_shared<Attention>(dim, head_dim, num_attention_heads, num_kv_heads, 1e-5f);
|
||||||
|
blocks["img_feed_forward"] = std::make_shared<LuminaFeedForward>(dim, 4 * dim, multiple_of);
|
||||||
|
blocks["instruct_feed_forward"] = std::make_shared<LuminaFeedForward>(dim, 4 * dim, multiple_of);
|
||||||
|
blocks["img_norm1"] = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
|
||||||
|
blocks["img_norm2"] = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
|
||||||
|
blocks["img_norm3"] = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
|
||||||
|
blocks["instruct_norm1"] = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
|
||||||
|
blocks["instruct_norm2"] = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
|
||||||
|
blocks["img_attn_norm"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["img_self_attn_norm"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["img_ffn_norm1"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["img_ffn_norm2"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["instruct_attn_norm"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["instruct_ffn_norm1"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["instruct_ffn_norm2"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* img_hidden_states,
|
||||||
|
ggml_tensor* instruct_hidden_states,
|
||||||
|
ggml_tensor* joint_rotary_emb,
|
||||||
|
ggml_tensor* img_rotary_emb,
|
||||||
|
ggml_tensor* temb) {
|
||||||
|
auto img_instruct_attn = std::dynamic_pointer_cast<BooguImageJointAttention>(blocks["img_instruct_attn"]);
|
||||||
|
auto img_self_attn = std::dynamic_pointer_cast<Attention>(blocks["img_self_attn"]);
|
||||||
|
auto img_feed_forward = std::dynamic_pointer_cast<LuminaFeedForward>(blocks["img_feed_forward"]);
|
||||||
|
auto instruct_feed_forward = std::dynamic_pointer_cast<LuminaFeedForward>(blocks["instruct_feed_forward"]);
|
||||||
|
auto img_norm1 = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["img_norm1"]);
|
||||||
|
auto img_norm2 = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["img_norm2"]);
|
||||||
|
auto img_norm3 = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["img_norm3"]);
|
||||||
|
auto instruct_norm1 = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["instruct_norm1"]);
|
||||||
|
auto instruct_norm2 = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["instruct_norm2"]);
|
||||||
|
auto img_attn_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["img_attn_norm"]);
|
||||||
|
auto img_self_attn_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["img_self_attn_norm"]);
|
||||||
|
auto img_ffn_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["img_ffn_norm1"]);
|
||||||
|
auto img_ffn_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["img_ffn_norm2"]);
|
||||||
|
auto instruct_attn_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["instruct_attn_norm"]);
|
||||||
|
auto instruct_ffn_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["instruct_ffn_norm1"]);
|
||||||
|
auto instruct_ffn_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["instruct_ffn_norm2"]);
|
||||||
|
|
||||||
|
int64_t L_instruct = instruct_hidden_states->ne[1];
|
||||||
|
|
||||||
|
auto img_norm1_out_vec = img_norm1->forward(ctx, img_hidden_states, temb);
|
||||||
|
auto img_norm2_out_vec = img_norm2->forward(ctx, img_hidden_states, temb);
|
||||||
|
auto img_norm3_out_vec = img_norm3->forward(ctx, img_hidden_states, temb);
|
||||||
|
auto instruct_norm1_out_vec = instruct_norm1->forward(ctx, instruct_hidden_states, temb);
|
||||||
|
auto instruct_norm2_out_vec = instruct_norm2->forward(ctx, instruct_hidden_states, temb);
|
||||||
|
|
||||||
|
auto img_norm1_out = std::get<0>(img_norm1_out_vec);
|
||||||
|
auto img_gate_msa = std::get<1>(img_norm1_out_vec);
|
||||||
|
auto img_scale_mlp = std::get<2>(img_norm1_out_vec);
|
||||||
|
auto img_gate_mlp = std::get<3>(img_norm1_out_vec);
|
||||||
|
|
||||||
|
auto img_norm2_out = std::get<0>(img_norm2_out_vec);
|
||||||
|
auto img_shift_mlp = std::get<1>(img_norm2_out_vec);
|
||||||
|
|
||||||
|
auto img_norm3_out = std::get<0>(img_norm3_out_vec);
|
||||||
|
auto img_gate_self = std::get<1>(img_norm3_out_vec);
|
||||||
|
|
||||||
|
auto instruct_norm1_out = std::get<0>(instruct_norm1_out_vec);
|
||||||
|
auto instruct_gate_msa = std::get<1>(instruct_norm1_out_vec);
|
||||||
|
auto instruct_scale_mlp = std::get<2>(instruct_norm1_out_vec);
|
||||||
|
auto instruct_gate_mlp = std::get<3>(instruct_norm1_out_vec);
|
||||||
|
|
||||||
|
auto instruct_norm2_out = std::get<0>(instruct_norm2_out_vec);
|
||||||
|
auto instruct_shift_mlp = std::get<1>(instruct_norm2_out_vec);
|
||||||
|
|
||||||
|
auto joint_attn_out = img_instruct_attn->forward(ctx, img_norm1_out, instruct_norm1_out, joint_rotary_emb);
|
||||||
|
auto instruct_attn_out = ggml_ext_slice(ctx->ggml_ctx, joint_attn_out, 1, 0, L_instruct);
|
||||||
|
auto img_attn_out = ggml_ext_slice(ctx->ggml_ctx, joint_attn_out, 1, L_instruct, joint_attn_out->ne[1]);
|
||||||
|
|
||||||
|
auto img_self_attn_out = img_self_attn->forward(ctx, img_norm3_out, img_norm3_out, img_rotary_emb);
|
||||||
|
|
||||||
|
img_hidden_states = gate_residual(ctx->ggml_ctx, img_hidden_states, img_attn_norm->forward(ctx, img_attn_out), img_gate_msa);
|
||||||
|
img_hidden_states = gate_residual(ctx->ggml_ctx, img_hidden_states, img_self_attn_norm->forward(ctx, img_self_attn_out), img_gate_self);
|
||||||
|
|
||||||
|
auto img_mlp_input = scale_modulate(ctx->ggml_ctx, img_norm2_out, img_scale_mlp);
|
||||||
|
img_shift_mlp = ggml_reshape_3d(ctx->ggml_ctx, img_shift_mlp, img_shift_mlp->ne[0], 1, img_shift_mlp->ne[1]);
|
||||||
|
img_mlp_input = ggml_add(ctx->ggml_ctx, img_mlp_input, img_shift_mlp);
|
||||||
|
auto img_mlp_out = img_feed_forward->forward(ctx, img_ffn_norm1->forward(ctx, img_mlp_input));
|
||||||
|
img_hidden_states = gate_residual(ctx->ggml_ctx, img_hidden_states, img_ffn_norm2->forward(ctx, img_mlp_out), img_gate_mlp);
|
||||||
|
|
||||||
|
instruct_hidden_states = gate_residual(ctx->ggml_ctx, instruct_hidden_states, instruct_attn_norm->forward(ctx, instruct_attn_out), instruct_gate_msa);
|
||||||
|
auto instruct_mlp_input = scale_modulate(ctx->ggml_ctx, instruct_norm2_out, instruct_scale_mlp);
|
||||||
|
instruct_shift_mlp = ggml_reshape_3d(ctx->ggml_ctx, instruct_shift_mlp, instruct_shift_mlp->ne[0], 1, instruct_shift_mlp->ne[1]);
|
||||||
|
instruct_mlp_input = ggml_add(ctx->ggml_ctx, instruct_mlp_input, instruct_shift_mlp);
|
||||||
|
auto instruct_mlp_out = instruct_feed_forward->forward(ctx, instruct_ffn_norm1->forward(ctx, instruct_mlp_input));
|
||||||
|
instruct_hidden_states = gate_residual(ctx->ggml_ctx, instruct_hidden_states, instruct_ffn_norm2->forward(ctx, instruct_mlp_out), instruct_gate_mlp);
|
||||||
|
|
||||||
|
return {img_hidden_states, instruct_hidden_states};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct BooguImageModel : public GGMLBlock {
|
||||||
|
BooguConfig config;
|
||||||
|
|
||||||
|
// Registers the model's own parameter: an F32 table named
// "image_index_embedding" with hidden_size columns and 5 rows.
// The storage map and prefix are not consulted.
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
    GGML_UNUSED(tensor_storage_map);
    GGML_UNUSED(prefix);

    const int64_t num_index_rows = 5;
    ggml_tensor* index_table     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, config.hidden_size, num_index_rows);
    params["image_index_embedding"] = index_table;
}
|
||||||
|
|
||||||
|
BooguImageModel() = default;
|
||||||
|
BooguImageModel(BooguConfig config)
|
||||||
|
: config(std::move(config)) {
|
||||||
|
blocks["x_embedder"] = std::make_shared<Linear>(this->config.patch_size * this->config.patch_size * this->config.in_channels, this->config.hidden_size, true);
|
||||||
|
blocks["ref_image_patch_embedder"] = std::make_shared<Linear>(this->config.patch_size * this->config.patch_size * this->config.in_channels, this->config.hidden_size, true);
|
||||||
|
blocks["time_caption_embed"] = std::make_shared<LuminaCombinedTimestepCaptionEmbedding>(this->config.hidden_size,
|
||||||
|
this->config.instruction_feat_dim,
|
||||||
|
256,
|
||||||
|
this->config.norm_eps,
|
||||||
|
this->config.timestep_scale);
|
||||||
|
|
||||||
|
for (int i = 0; i < this->config.num_refiner_layers; i++) {
|
||||||
|
blocks["noise_refiner." + std::to_string(i)] = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
|
||||||
|
this->config.num_attention_heads,
|
||||||
|
this->config.num_kv_heads,
|
||||||
|
this->config.multiple_of,
|
||||||
|
this->config.norm_eps,
|
||||||
|
true);
|
||||||
|
blocks["ref_image_refiner." + std::to_string(i)] = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
|
||||||
|
this->config.num_attention_heads,
|
||||||
|
this->config.num_kv_heads,
|
||||||
|
this->config.multiple_of,
|
||||||
|
this->config.norm_eps,
|
||||||
|
true);
|
||||||
|
blocks["context_refiner." + std::to_string(i)] = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
|
||||||
|
this->config.num_attention_heads,
|
||||||
|
this->config.num_kv_heads,
|
||||||
|
this->config.multiple_of,
|
||||||
|
this->config.norm_eps,
|
||||||
|
false);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < this->config.num_double_stream_layers; i++) {
|
||||||
|
blocks["double_stream_layers." + std::to_string(i)] = std::make_shared<BooguImageDoubleStreamBlock>(this->config.hidden_size,
|
||||||
|
this->config.num_attention_heads,
|
||||||
|
this->config.num_kv_heads,
|
||||||
|
this->config.multiple_of,
|
||||||
|
this->config.norm_eps);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < this->config.num_layers; i++) {
|
||||||
|
blocks["single_stream_layers." + std::to_string(i)] = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
|
||||||
|
this->config.num_attention_heads,
|
||||||
|
this->config.num_kv_heads,
|
||||||
|
this->config.multiple_of,
|
||||||
|
this->config.norm_eps,
|
||||||
|
true);
|
||||||
|
}
|
||||||
|
|
||||||
|
blocks["norm_out"] = std::make_shared<LuminaLayerNormContinuous>(this->config.hidden_size,
|
||||||
|
this->config.timestep_embed_dim,
|
||||||
|
this->config.patch_size * this->config.patch_size * this->config.out_channels);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns row `index` (0..4) of the "image_index_embedding" parameter as a
// (hidden_size, 1, 1) tensor. Asserts that the index is in range.
ggml_tensor* image_index_embedding(GGMLRunnerContext* ctx, int index) {
    GGML_ASSERT(index >= 0 && index < 5);

    auto table = params["image_index_embedding"];

    // Byte offset of the requested row: index * row length * element size.
    const size_t row_offset = index * config.hidden_size * ggml_element_size(table);

    auto row = ggml_view_1d(ctx->ggml_ctx, table, config.hidden_size, row_offset);
    return ggml_reshape_3d(ctx->ggml_ctx, row, config.hidden_size, 1, 1);
}
|
||||||
|
|
||||||
|
// Embeds every reference latent and concatenates the results along dimension 1.
// Each latent is patchified, projected by "ref_image_patch_embedder" and offset
// by an image-index embedding; latents beyond the fifth reuse index 4.
// Returns nullptr when no reference latents are given.
ggml_tensor* embed_refs(GGMLRunnerContext* ctx, const std::vector<ggml_tensor*>& ref_latents) {
    ggml_tensor* merged = nullptr;
    if (ref_latents.empty()) {
        return merged;
    }

    auto patch_embedder = std::dynamic_pointer_cast<Linear>(blocks["ref_image_patch_embedder"]);

    const int ref_count = static_cast<int>(ref_latents.size());
    for (int idx = 0; idx < ref_count; idx++) {
        // Only five index embeddings exist, so clamp the slot.
        const int slot = std::min(idx, 4);

        auto tokens = DiT::pad_and_patchify(ctx, ref_latents[idx], config.patch_size, config.patch_size, false);
        tokens      = patch_embedder->forward(ctx, tokens);
        tokens      = ggml_add(ctx->ggml_ctx, tokens, image_index_embedding(ctx, slot));

        if (merged == nullptr) {
            merged = tokens;
        } else {
            merged = ggml_concat(ctx->ggml_ctx, merged, tokens, 1);
        }
    }
    return merged;
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* x,
|
||||||
|
ggml_tensor* timesteps,
|
||||||
|
ggml_tensor* context,
|
||||||
|
ggml_tensor* pe,
|
||||||
|
std::vector<ggml_tensor*> ref_latents = {}) {
|
||||||
|
int64_t W = x->ne[0];
|
||||||
|
int64_t H = x->ne[1];
|
||||||
|
int64_t N = x->ne[3];
|
||||||
|
GGML_ASSERT(N == 1);
|
||||||
|
|
||||||
|
auto x_embedder = std::dynamic_pointer_cast<Linear>(blocks["x_embedder"]);
|
||||||
|
auto time_caption_embed = std::dynamic_pointer_cast<LuminaCombinedTimestepCaptionEmbedding>(blocks["time_caption_embed"]);
|
||||||
|
auto norm_out = std::dynamic_pointer_cast<LuminaLayerNormContinuous>(blocks["norm_out"]);
|
||||||
|
|
||||||
|
auto timestep = ggml_sub(ctx->ggml_ctx, ggml_ext_ones_like(ctx->ggml_ctx, timesteps), timesteps);
|
||||||
|
auto embeds = time_caption_embed->forward(ctx, timestep, context);
|
||||||
|
auto temb = embeds.first;
|
||||||
|
auto txt = embeds.second;
|
||||||
|
|
||||||
|
auto img = DiT::pad_and_patchify(ctx, x, config.patch_size, config.patch_size, false);
|
||||||
|
int64_t img_len = img->ne[1];
|
||||||
|
img = x_embedder->forward(ctx, img);
|
||||||
|
auto ref_img = embed_refs(ctx, ref_latents);
|
||||||
|
int64_t ref_len = ref_img != nullptr ? ref_img->ne[1] : 0;
|
||||||
|
int64_t txt_len = txt->ne[1];
|
||||||
|
|
||||||
|
GGML_ASSERT(pe->ne[3] == txt_len + ref_len + img_len);
|
||||||
|
auto txt_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, 0, txt_len);
|
||||||
|
auto noise_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len + ref_len, txt_len + ref_len + img_len);
|
||||||
|
|
||||||
|
for (int i = 0; i < config.num_refiner_layers; i++) {
|
||||||
|
auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["context_refiner." + std::to_string(i)]);
|
||||||
|
txt = block->forward(ctx, txt, txt_pe);
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(txt, "boogu.context_refiner." + std::to_string(i), "txt");
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < config.num_refiner_layers; i++) {
|
||||||
|
auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["noise_refiner." + std::to_string(i)]);
|
||||||
|
img = block->forward(ctx, img, noise_pe, temb);
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(img, "boogu.noise_refiner." + std::to_string(i), "img");
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* combined_img = img;
|
||||||
|
if (ref_img != nullptr) {
|
||||||
|
auto ref_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len, txt_len + ref_len);
|
||||||
|
for (int i = 0; i < config.num_refiner_layers; i++) {
|
||||||
|
auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["ref_image_refiner." + std::to_string(i)]);
|
||||||
|
ref_img = block->forward(ctx, ref_img, ref_pe, temb);
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(ref_img, "boogu.ref_image_refiner." + std::to_string(i), "ref_img");
|
||||||
|
}
|
||||||
|
combined_img = ggml_concat(ctx->ggml_ctx, ref_img, img, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto img_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len, txt_len + combined_img->ne[1]);
|
||||||
|
for (int i = 0; i < config.num_double_stream_layers; i++) {
|
||||||
|
auto block = std::dynamic_pointer_cast<BooguImageDoubleStreamBlock>(blocks["double_stream_layers." + std::to_string(i)]);
|
||||||
|
auto result = block->forward(ctx, combined_img, txt, pe, img_pe, temb);
|
||||||
|
combined_img = result.first;
|
||||||
|
txt = result.second;
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(combined_img, "boogu.double_stream_layers." + std::to_string(i), "img");
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(txt, "boogu.double_stream_layers." + std::to_string(i), "txt");
|
||||||
|
}
|
||||||
|
|
||||||
|
auto hidden_states = ggml_concat(ctx->ggml_ctx, txt, combined_img, 1);
|
||||||
|
for (int i = 0; i < config.num_layers; i++) {
|
||||||
|
auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["single_stream_layers." + std::to_string(i)]);
|
||||||
|
hidden_states = block->forward(ctx, hidden_states, pe, temb);
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(hidden_states, "boogu.single_stream_layers." + std::to_string(i), "hidden_states");
|
||||||
|
}
|
||||||
|
|
||||||
|
hidden_states = norm_out->forward(ctx, hidden_states, temb);
|
||||||
|
hidden_states = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, hidden_states->ne[1] - img_len, hidden_states->ne[1]);
|
||||||
|
hidden_states = DiT::unpatchify_and_crop(ctx->ggml_ctx, hidden_states, H, W, config.patch_size, config.patch_size, false);
|
||||||
|
hidden_states = ggml_ext_scale(ctx->ggml_ctx, hidden_states, -1.f);
|
||||||
|
return hidden_states;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Number of patches needed to cover `size` elements along one axis when the
// axis is padded up to the next multiple of `patch_size` (ceiling division).
// `size` is narrowed to int before any arithmetic, as in the callers' int-based
// position bookkeeping.
__STATIC_INLINE__ int patched_token_count(int64_t size, int patch_size) {
    const int extent    = static_cast<int>(size);
    const int remainder = extent % patch_size;
    // Padding that rounds `extent` up to a multiple of patch_size; the outer
    // modulo maps the "already aligned" case (remainder == 0) to 0.
    const int padding = (patch_size - remainder) % patch_size;
    return (extent + padding) / patch_size;
}
|
||||||
|
|
||||||
|
// Append the 3-axis position ids of an h_tokens x w_tokens patch grid to `ids`,
// repeated `bs` times back to back.
//
// Each appended row is {pe_shift, row, col}: the first axis carries the shared
// offset `pe_shift`, the other two carry the patch's row and column index.
// Rows are emitted in row-major order (col varies fastest).
//
// The rows are written straight into `ids` after a single reserve, instead of
// materialising a temporary grid and copying it once per batch entry.
// Non-positive extents append nothing; previously a negative h_tokens * w_tokens
// product was converted to a huge unsigned size for the temporary grid.
__STATIC_INLINE__ void append_spatial_ids(std::vector<std::vector<float>>& ids,
                                          int bs,
                                          int pe_shift,
                                          int h_tokens,
                                          int w_tokens) {
    if (bs <= 0 || h_tokens <= 0 || w_tokens <= 0) {
        return;
    }

    const size_t grid_size = static_cast<size_t>(h_tokens) * static_cast<size_t>(w_tokens);
    ids.reserve(ids.size() + static_cast<size_t>(bs) * grid_size);

    const float shift = static_cast<float>(pe_shift);
    for (int b = 0; b < bs; b++) {
        for (int row = 0; row < h_tokens; row++) {
            for (int col = 0; col < w_tokens; col++) {
                ids.push_back({shift, static_cast<float>(row), static_cast<float>(col)});
            }
        }
    }
}
|
||||||
|
|
||||||
|
// Build the position-embedding table for one Boogu forward pass.
//
// Position ids are laid out in three consecutive segments:
//   1. text:       `context_len` ids per batch entry, token i -> {i, i, i}
//   2. references: one patch grid per entry of `ref_latents`, in order
//   3. image:      the patch grid of the h x w latent being denoised
// Grid ids are {pe_shift, row, col}. `pe_shift` starts at `context_len` and
// advances by max(rows, cols) after each reference grid, so every grid gets its
// own offset on the first axis.
//
// h, w        latent height / width (padded up to a multiple of patch_size)
// bs          batch size
// theta       rope base, forwarded to Rope::embed_nd as float
// axes_dim    per-axis rope dimensions, forwarded to Rope::embed_nd
// returns     whatever Rope::embed_nd produces for the assembled ids
//
// NOTE(review): for bs > 1 the ids are grouped by segment (all text copies, then
// all copies of each reference grid, then all image copies) rather than per
// sample. The runner in this file asserts a batch size of 1, so this is not
// reachable today -- confirm the layout Rope::embed_nd expects before lifting
// that restriction.
__STATIC_INLINE__ std::vector<float> gen_boogu_pe(int h,
                                                  int w,
                                                  int patch_size,
                                                  int bs,
                                                  int context_len,
                                                  const std::vector<ggml_tensor*>& ref_latents,
                                                  int theta,
                                                  const std::vector<int>& axes_dim) {
    const int h_tokens = patched_token_count(h, patch_size);
    const int w_tokens = patched_token_count(w, patch_size);

    // Count every id one sample contributes so the outer vector is allocated
    // once; reserving only the text part forced reallocations (and row moves)
    // when the much larger reference/image grids were appended.
    int64_t ids_per_sample = static_cast<int64_t>(context_len) +
                             static_cast<int64_t>(h_tokens) * w_tokens;
    for (const ggml_tensor* ref : ref_latents) {
        ids_per_sample += static_cast<int64_t>(patched_token_count(ref->ne[1], patch_size)) *
                          patched_token_count(ref->ne[0], patch_size);
    }

    std::vector<std::vector<float>> ids;
    if (bs > 0 && ids_per_sample > 0) {
        ids.reserve(static_cast<size_t>(bs) * static_cast<size_t>(ids_per_sample));
    }

    // Text segment: all three axes carry the token index.
    for (int b = 0; b < bs; b++) {
        for (int i = 0; i < context_len; i++) {
            float pos = static_cast<float>(i);
            ids.push_back({pos, pos, pos});
        }
    }

    // Reference segment: ne[1] is treated as height and ne[0] as width.
    int pe_shift = context_len;
    for (const ggml_tensor* ref : ref_latents) {
        int ref_h_tokens = patched_token_count(ref->ne[1], patch_size);
        int ref_w_tokens = patched_token_count(ref->ne[0], patch_size);
        append_spatial_ids(ids, bs, pe_shift, ref_h_tokens, ref_w_tokens);
        pe_shift += std::max(ref_h_tokens, ref_w_tokens);
    }

    // Image segment, placed after every reference grid.
    append_spatial_ids(ids, bs, pe_shift, h_tokens, w_tokens);

    return Rope::embed_nd(ids, bs, static_cast<float>(theta), axes_dim);
}
|
||||||
|
|
||||||
|
struct BooguImageRunner : public DiffusionModelRunner {
|
||||||
|
BooguConfig config;
|
||||||
|
BooguImageModel boogu;
|
||||||
|
std::vector<float> pe_vec;
|
||||||
|
|
||||||
|
BooguImageRunner(ggml_backend_t backend,
|
||||||
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
|
const std::string prefix = "",
|
||||||
|
SDVersion version = VERSION_BOOGU_IMAGE,
|
||||||
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: DiffusionModelRunner(backend, prefix, weight_manager),
|
||||||
|
config(BooguConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
||||||
|
boogu = BooguImageModel(config);
|
||||||
|
boogu.init(params_ctx, tensor_storage_map, prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string get_desc() override {
|
||||||
|
return "boogu_image";
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
|
||||||
|
boogu.get_param_tensors(tensors, prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
|
||||||
|
const sd::Tensor<float>& timesteps_tensor,
|
||||||
|
const sd::Tensor<float>& context_tensor,
|
||||||
|
const std::vector<sd::Tensor<float>>& ref_latents_tensor = {}) {
|
||||||
|
ggml_cgraph* gf = new_graph_custom(BOOGU_GRAPH_SIZE);
|
||||||
|
ggml_tensor* x = make_input(x_tensor);
|
||||||
|
ggml_tensor* timesteps = make_input(timesteps_tensor);
|
||||||
|
GGML_ASSERT(x->ne[3] == 1);
|
||||||
|
GGML_ASSERT(!context_tensor.empty());
|
||||||
|
ggml_tensor* context = make_input(context_tensor);
|
||||||
|
|
||||||
|
std::vector<ggml_tensor*> ref_latents;
|
||||||
|
ref_latents.reserve(ref_latents_tensor.size());
|
||||||
|
for (const auto& ref_latent_tensor : ref_latents_tensor) {
|
||||||
|
ref_latents.push_back(make_input(ref_latent_tensor));
|
||||||
|
}
|
||||||
|
|
||||||
|
pe_vec = gen_boogu_pe(static_cast<int>(x->ne[1]),
|
||||||
|
static_cast<int>(x->ne[0]),
|
||||||
|
config.patch_size,
|
||||||
|
static_cast<int>(x->ne[3]),
|
||||||
|
static_cast<int>(context->ne[1]),
|
||||||
|
ref_latents,
|
||||||
|
config.theta,
|
||||||
|
config.axes_dim);
|
||||||
|
int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
|
||||||
|
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
|
||||||
|
set_backend_tensor_data(pe, pe_vec.data());
|
||||||
|
|
||||||
|
auto runner_ctx = get_context();
|
||||||
|
ggml_tensor* out = boogu.forward(&runner_ctx, x, timesteps, context, pe, ref_latents);
|
||||||
|
ggml_build_forward_expand(gf, out);
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
sd::Tensor<float> compute(int n_threads,
|
||||||
|
const sd::Tensor<float>& x,
|
||||||
|
const sd::Tensor<float>& timesteps,
|
||||||
|
const sd::Tensor<float>& context,
|
||||||
|
const std::vector<sd::Tensor<float>>& ref_latents = {}) {
|
||||||
|
auto get_graph = [&]() -> ggml_cgraph* {
|
||||||
|
return build_graph(x, timesteps, context, ref_latents);
|
||||||
|
};
|
||||||
|
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
|
||||||
|
}
|
||||||
|
|
||||||
|
sd::Tensor<float> compute(int n_threads,
|
||||||
|
const DiffusionParams& diffusion_params) override {
|
||||||
|
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||||
|
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||||
|
static const std::vector<sd::Tensor<float>> empty_ref_latents;
|
||||||
|
return compute(n_threads,
|
||||||
|
*diffusion_params.x,
|
||||||
|
*diffusion_params.timesteps,
|
||||||
|
tensor_or_empty(diffusion_params.context),
|
||||||
|
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
} // namespace Boogu
|
||||||
|
|
||||||
|
#endif // __SD_MODEL_DIFFUSION_BOOGU_HPP__
|
||||||
@ -162,6 +162,8 @@ namespace ErnieImage {
|
|||||||
int64_t S = x->ne[1];
|
int64_t S = x->ne[1];
|
||||||
int64_t N = x->ne[2];
|
int64_t N = x->ne[2];
|
||||||
|
|
||||||
|
float scale = (sd_backend_is(ctx->backend, "Vulkan") && ctx->flash_attn_enabled) ? 1.0f / 32.0f : 1.0f;
|
||||||
|
|
||||||
auto q = to_q->forward(ctx, x);
|
auto q = to_q->forward(ctx, x);
|
||||||
auto k = to_k->forward(ctx, x);
|
auto k = to_k->forward(ctx, x);
|
||||||
auto v = to_v->forward(ctx, x);
|
auto v = to_v->forward(ctx, x);
|
||||||
@ -182,7 +184,7 @@ namespace ErnieImage {
|
|||||||
k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 0, 2, 1, 3)); // [N, heads, S, head_dim]
|
k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 0, 2, 1, 3)); // [N, heads, S, head_dim]
|
||||||
k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);
|
k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);
|
||||||
|
|
||||||
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled); // [N, S, hidden_size]
|
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled, scale); // [N, S, hidden_size]
|
||||||
x = to_out_0->forward(ctx, x);
|
x = to_out_0->forward(ctx, x);
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -79,6 +79,7 @@ namespace LLM {
|
|||||||
int window_size = 112;
|
int window_size = 112;
|
||||||
int num_position_embeddings = 0;
|
int num_position_embeddings = 0;
|
||||||
std::set<int> fullatt_block_indexes = {7, 15, 23, 31};
|
std::set<int> fullatt_block_indexes = {7, 15, 23, 31};
|
||||||
|
bool split_patch_embed = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LLMConfig {
|
struct LLMConfig {
|
||||||
@ -179,7 +180,8 @@ namespace LLM {
|
|||||||
config.num_experts_per_tok = 4;
|
config.num_experts_per_tok = 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
config.num_layers = 0;
|
config.num_layers = 0;
|
||||||
|
int detected_vision_layers = 0;
|
||||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||||
if (!starts_with(name, prefix)) {
|
if (!starts_with(name, prefix)) {
|
||||||
continue;
|
continue;
|
||||||
@ -190,6 +192,38 @@ namespace LLM {
|
|||||||
if (contains(name, "attn.q_proj")) {
|
if (contains(name, "attn.q_proj")) {
|
||||||
config.llama_cpp_style = true;
|
config.llama_cpp_style = true;
|
||||||
}
|
}
|
||||||
|
if (contains(name, "visual.patch_embed.proj.1.weight")) {
|
||||||
|
config.vision.split_patch_embed = true;
|
||||||
|
}
|
||||||
|
if (contains(name, "visual.patch_embed.proj.0.weight")) {
|
||||||
|
config.vision.patch_size = static_cast<int>(tensor_storage.ne[0]);
|
||||||
|
config.vision.in_channels = tensor_storage.ne[2];
|
||||||
|
config.vision.hidden_size = tensor_storage.ne[3];
|
||||||
|
}
|
||||||
|
if (contains(name, "visual.patch_embed.bias")) {
|
||||||
|
config.vision.hidden_size = tensor_storage.ne[0];
|
||||||
|
}
|
||||||
|
if (contains(name, "visual.pos_embed.weight")) {
|
||||||
|
config.vision.hidden_size = tensor_storage.ne[0];
|
||||||
|
config.vision.num_position_embeddings = static_cast<int>(tensor_storage.ne[1]);
|
||||||
|
}
|
||||||
|
if (contains(name, "visual.blocks.")) {
|
||||||
|
auto items = split_string(name.substr(pos), '.');
|
||||||
|
if (items.size() > 2) {
|
||||||
|
int block_index = atoi(items[2].c_str());
|
||||||
|
if (block_index + 1 > detected_vision_layers) {
|
||||||
|
detected_vision_layers = block_index + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (contains(name, "visual.blocks.0.mlp.linear_fc1.weight") ||
|
||||||
|
contains(name, "visual.blocks.0.mlp.gate_proj.weight")) {
|
||||||
|
config.vision.intermediate_size = tensor_storage.ne[1];
|
||||||
|
}
|
||||||
|
if (contains(name, "visual.merger.linear_fc2.weight") ||
|
||||||
|
contains(name, "visual.merger.mlp.2.weight")) {
|
||||||
|
config.vision.out_hidden_size = tensor_storage.ne[1];
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
pos = name.find("layers.");
|
pos = name.find("layers.");
|
||||||
@ -219,6 +253,9 @@ namespace LLM {
|
|||||||
if (arch == LLMArch::QWEN3 && config.num_layers == 28) {
|
if (arch == LLMArch::QWEN3 && config.num_layers == 28) {
|
||||||
config.num_heads = 16;
|
config.num_heads = 16;
|
||||||
}
|
}
|
||||||
|
if (detected_vision_layers > 0) {
|
||||||
|
config.vision.num_layers = detected_vision_layers;
|
||||||
|
}
|
||||||
LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
|
LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
|
||||||
config.num_layers,
|
config.num_layers,
|
||||||
config.vocab_size,
|
config.vocab_size,
|
||||||
@ -539,40 +576,51 @@ namespace LLM {
|
|||||||
|
|
||||||
struct VisionPatchEmbed : public GGMLBlock {
|
struct VisionPatchEmbed : public GGMLBlock {
|
||||||
protected:
|
protected:
|
||||||
bool llama_cpp_style;
|
bool split_patch_embed;
|
||||||
|
bool bias;
|
||||||
int patch_size;
|
int patch_size;
|
||||||
int temporal_patch_size;
|
int temporal_patch_size;
|
||||||
int64_t in_channels;
|
int64_t in_channels;
|
||||||
int64_t embed_dim;
|
int64_t embed_dim;
|
||||||
|
|
||||||
|
// Registers the block's own parameters. With the split layout the two Conv2d
// projections are created without a bias, so a single shared F32 bias of length
// embed_dim is declared here when `bias` is enabled; otherwise nothing is added.
void init_params(ggml_context* ctx,
                 const String2TensorStorage& tensor_storage_map = {},
                 const std::string prefix                       = "") override {
    GGML_UNUSED(tensor_storage_map);
    GGML_UNUSED(prefix);

    const bool needs_shared_bias = split_patch_embed && bias;
    if (!needs_shared_bias) {
        return;
    }
    params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, embed_dim);
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
VisionPatchEmbed(bool llama_cpp_style,
|
VisionPatchEmbed(bool split_patch_embed,
|
||||||
LLMVisionArch arch,
|
LLMVisionArch arch,
|
||||||
int patch_size = 14,
|
int patch_size = 14,
|
||||||
int temporal_patch_size = 2,
|
int temporal_patch_size = 2,
|
||||||
int64_t in_channels = 3,
|
int64_t in_channels = 3,
|
||||||
int64_t embed_dim = 1152)
|
int64_t embed_dim = 1152)
|
||||||
: llama_cpp_style(llama_cpp_style),
|
: split_patch_embed(split_patch_embed),
|
||||||
|
bias(arch == LLMVisionArch::QWEN3_VL),
|
||||||
patch_size(patch_size),
|
patch_size(patch_size),
|
||||||
temporal_patch_size(temporal_patch_size),
|
temporal_patch_size(temporal_patch_size),
|
||||||
in_channels(in_channels),
|
in_channels(in_channels),
|
||||||
embed_dim(embed_dim) {
|
embed_dim(embed_dim) {
|
||||||
bool bias = arch == LLMVisionArch::QWEN3_VL;
|
if (split_patch_embed) {
|
||||||
if (llama_cpp_style) {
|
|
||||||
blocks["proj.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
|
blocks["proj.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
|
||||||
embed_dim,
|
embed_dim,
|
||||||
{patch_size, patch_size},
|
{patch_size, patch_size},
|
||||||
{patch_size, patch_size},
|
{patch_size, patch_size},
|
||||||
{0, 0},
|
{0, 0},
|
||||||
{1, 1},
|
{1, 1},
|
||||||
bias));
|
false));
|
||||||
blocks["proj.1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
|
blocks["proj.1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
|
||||||
embed_dim,
|
embed_dim,
|
||||||
{patch_size, patch_size},
|
{patch_size, patch_size},
|
||||||
{patch_size, patch_size},
|
{patch_size, patch_size},
|
||||||
{0, 0},
|
{0, 0},
|
||||||
{1, 1},
|
{1, 1},
|
||||||
bias));
|
false));
|
||||||
} else {
|
} else {
|
||||||
std::tuple<int, int, int> kernel_size = {(int)temporal_patch_size, (int)patch_size, (int)patch_size};
|
std::tuple<int, int, int> kernel_size = {(int)temporal_patch_size, (int)patch_size, (int)patch_size};
|
||||||
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels,
|
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels,
|
||||||
@ -593,7 +641,7 @@ namespace LLM {
|
|||||||
temporal_patch_size,
|
temporal_patch_size,
|
||||||
ggml_nelements(x) / (temporal_patch_size * patch_size * patch_size));
|
ggml_nelements(x) / (temporal_patch_size * patch_size * patch_size));
|
||||||
|
|
||||||
if (llama_cpp_style) {
|
if (split_patch_embed) {
|
||||||
auto proj_0 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.0"]);
|
auto proj_0 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.0"]);
|
||||||
auto proj_1 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.1"]);
|
auto proj_1 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.1"]);
|
||||||
|
|
||||||
@ -606,6 +654,10 @@ namespace LLM {
|
|||||||
x1 = proj_1->forward(ctx, x1);
|
x1 = proj_1->forward(ctx, x1);
|
||||||
|
|
||||||
x = ggml_add(ctx->ggml_ctx, x0, x1);
|
x = ggml_add(ctx->ggml_ctx, x0, x1);
|
||||||
|
if (bias) {
|
||||||
|
auto b = ggml_reshape_4d(ctx->ggml_ctx, params["bias"], 1, 1, embed_dim, 1);
|
||||||
|
x = ggml_add_inplace(ctx->ggml_ctx, x, b);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
auto proj = std::dynamic_pointer_cast<Conv3d>(blocks["proj"]);
|
auto proj = std::dynamic_pointer_cast<Conv3d>(blocks["proj"]);
|
||||||
|
|
||||||
@ -798,7 +850,7 @@ namespace LLM {
|
|||||||
spatial_merge_size(vision_params.spatial_merge_size),
|
spatial_merge_size(vision_params.spatial_merge_size),
|
||||||
num_grid_per_side(vision_params.num_position_embeddings > 0 ? static_cast<int>(std::sqrt(vision_params.num_position_embeddings)) : 0),
|
num_grid_per_side(vision_params.num_position_embeddings > 0 ? static_cast<int>(std::sqrt(vision_params.num_position_embeddings)) : 0),
|
||||||
fullatt_block_indexes(vision_params.fullatt_block_indexes) {
|
fullatt_block_indexes(vision_params.fullatt_block_indexes) {
|
||||||
blocks["patch_embed"] = std::shared_ptr<GGMLBlock>(new VisionPatchEmbed(llama_cpp_style,
|
blocks["patch_embed"] = std::shared_ptr<GGMLBlock>(new VisionPatchEmbed(vision_params.split_patch_embed,
|
||||||
arch_,
|
arch_,
|
||||||
vision_params.patch_size,
|
vision_params.patch_size,
|
||||||
vision_params.temporal_patch_size,
|
vision_params.temporal_patch_size,
|
||||||
|
|||||||
@ -682,7 +682,7 @@ struct AutoEncoderKL : public VAE {
|
|||||||
} else if (sd_version_is_sd3(version)) {
|
} else if (sd_version_is_sd3(version)) {
|
||||||
scale_factor = 1.5305f;
|
scale_factor = 1.5305f;
|
||||||
shift_factor = 0.0609f;
|
shift_factor = 0.0609f;
|
||||||
} else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) {
|
} else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_boogu_image(version) || sd_version_is_longcat(version)) {
|
||||||
scale_factor = 0.3611f;
|
scale_factor = 0.3611f;
|
||||||
shift_factor = 0.1159f;
|
shift_factor = 0.1159f;
|
||||||
} else if (sd_version_uses_flux2_vae(version)) {
|
} else if (sd_version_uses_flux2_vae(version)) {
|
||||||
|
|||||||
@ -485,6 +485,9 @@ SDVersion ModelLoader::get_sd_version() {
|
|||||||
if (tensor_storage.name.find("model.diffusion_model.cap_embedder.0.weight") != std::string::npos) {
|
if (tensor_storage.name.find("model.diffusion_model.cap_embedder.0.weight") != std::string::npos) {
|
||||||
return VERSION_Z_IMAGE;
|
return VERSION_Z_IMAGE;
|
||||||
}
|
}
|
||||||
|
if (tensor_storage.name.find("double_stream_layers.0.img_instruct_attn.processor.img_to_q.weight") != std::string::npos) {
|
||||||
|
return VERSION_BOOGU_IMAGE;
|
||||||
|
}
|
||||||
if (tensor_storage.name.find("model.diffusion_model.layers.0.adaLN_sa_ln.weight") != std::string::npos) {
|
if (tensor_storage.name.find("model.diffusion_model.layers.0.adaLN_sa_ln.weight") != std::string::npos) {
|
||||||
return VERSION_ERNIE_IMAGE;
|
return VERSION_ERNIE_IMAGE;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -184,6 +184,27 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
|
|||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Rewrite a Qwen3-VL vision-tower tensor name from the short `v.* / mm.*`
// spelling to the `patch_embed / blocks / merger` spelling by applying the
// substitution table below through replace_with_name_map.
std::string convert_qwen3_vl_vision_name(std::string name) {
    using Rename = std::pair<std::string, std::string>;

    // Entry order is kept exactly as written. "v.patch_embd.weight.1" is listed
    // ahead of its prefix "v.patch_embd.weight", presumably so the longer key is
    // tried first -- confirm against replace_with_name_map before reordering.
    static const std::vector<Rename> vision_renames = {
        // merger
        {"mm.0.", "merger.linear_fc1."},
        {"mm.2.", "merger.linear_fc2."},
        {"v.post_ln.", "merger.norm."},
        // embeddings
        {"v.position_embd.weight", "pos_embed.weight"},
        {"v.patch_embd.weight.1", "patch_embed.proj.1.weight"},
        {"v.patch_embd.weight", "patch_embed.proj.0.weight"},
        {"v.patch_embd.bias", "patch_embed.bias"},
        // transformer blocks
        {"v.blk.", "blocks."},
        {"attn_qkv.", "attn.qkv."},
        {"attn_out.", "attn.proj."},
        {"ffn_up.", "mlp.linear_fc1."},
        {"ffn_down.", "mlp.linear_fc2."},
        {"ln1.", "norm1."},
        {"ln2.", "norm2."},
    };

    replace_with_name_map(name, vision_renames);
    return name;
}
|
||||||
|
|
||||||
// ref: https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py
|
// ref: https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py
|
||||||
std::string convert_diffusers_unet_to_original_sd1(std::string name) {
|
std::string convert_diffusers_unet_to_original_sd1(std::string name) {
|
||||||
// (stable-diffusion, HF Diffusers)
|
// (stable-diffusion, HF Diffusers)
|
||||||
@ -1154,6 +1175,10 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
|
|||||||
|
|
||||||
replace_with_prefix_map(name, prefix_map);
|
replace_with_prefix_map(name, prefix_map);
|
||||||
|
|
||||||
|
if (sd_version_is_boogu_image(version) && starts_with(name, "text_encoders.llm.visual.")) {
|
||||||
|
name = convert_qwen3_vl_vision_name(std::move(name));
|
||||||
|
}
|
||||||
|
|
||||||
// diffusion model
|
// diffusion model
|
||||||
{
|
{
|
||||||
for (const auto& prefix : diffuison_model_prefix_vec) {
|
for (const auto& prefix : diffuison_model_prefix_vec) {
|
||||||
|
|||||||
@ -20,6 +20,7 @@
|
|||||||
#include "extensions/generation_extension.h"
|
#include "extensions/generation_extension.h"
|
||||||
#include "model/adapter/lora.hpp"
|
#include "model/adapter/lora.hpp"
|
||||||
#include "model/diffusion/anima.hpp"
|
#include "model/diffusion/anima.hpp"
|
||||||
|
#include "model/diffusion/boogu.hpp"
|
||||||
#include "model/diffusion/control.hpp"
|
#include "model/diffusion/control.hpp"
|
||||||
#include "model/diffusion/ernie_image.hpp"
|
#include "model/diffusion/ernie_image.hpp"
|
||||||
#include "model/diffusion/flux.hpp"
|
#include "model/diffusion/flux.hpp"
|
||||||
@ -87,6 +88,7 @@ const char* model_version_to_str[] = {
|
|||||||
"LTXAV",
|
"LTXAV",
|
||||||
"HiDream O1",
|
"HiDream O1",
|
||||||
"Z-Image",
|
"Z-Image",
|
||||||
|
"Boogu Image",
|
||||||
"Ovis Image",
|
"Ovis Image",
|
||||||
"Ernie Image",
|
"Ernie Image",
|
||||||
"Lens",
|
"Lens",
|
||||||
@ -124,7 +126,8 @@ static bool sd_version_supports_ref_latent_img_cfg(SDVersion version) {
|
|||||||
sd_version_is_flux2(version) ||
|
sd_version_is_flux2(version) ||
|
||||||
sd_version_is_qwen_image(version) ||
|
sd_version_is_qwen_image(version) ||
|
||||||
sd_version_is_longcat(version) ||
|
sd_version_is_longcat(version) ||
|
||||||
sd_version_is_z_image(version);
|
sd_version_is_z_image(version) ||
|
||||||
|
sd_version_is_boogu_image(version);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool sd_version_supports_img_cfg(SDVersion version, bool has_ref_images) {
|
static bool sd_version_supports_img_cfg(SDVersion version, bool has_ref_images) {
|
||||||
@ -784,6 +787,18 @@ public:
|
|||||||
"model.diffusion_model",
|
"model.diffusion_model",
|
||||||
version,
|
version,
|
||||||
model_manager);
|
model_manager);
|
||||||
|
} else if (sd_version_is_boogu_image(version)) {
|
||||||
|
cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
|
||||||
|
tensor_storage_map,
|
||||||
|
version,
|
||||||
|
"",
|
||||||
|
true,
|
||||||
|
model_manager);
|
||||||
|
diffusion_model = std::make_shared<Boogu::BooguImageRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||||
|
tensor_storage_map,
|
||||||
|
"model.diffusion_model",
|
||||||
|
version,
|
||||||
|
model_manager);
|
||||||
} else if (sd_version_is_ernie_image(version)) {
|
} else if (sd_version_is_ernie_image(version)) {
|
||||||
cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
|
cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
@ -1220,6 +1235,7 @@ public:
|
|||||||
sd_version_is_anima(version) ||
|
sd_version_is_anima(version) ||
|
||||||
sd_version_is_ernie_image(version) ||
|
sd_version_is_ernie_image(version) ||
|
||||||
sd_version_is_z_image(version) ||
|
sd_version_is_z_image(version) ||
|
||||||
|
sd_version_is_boogu_image(version) ||
|
||||||
sd_version_is_pid(version) ||
|
sd_version_is_pid(version) ||
|
||||||
sd_version_is_ideogram4(version)) {
|
sd_version_is_ideogram4(version)) {
|
||||||
pred_type = FLOW_PRED;
|
pred_type = FLOW_PRED;
|
||||||
@ -1231,6 +1247,8 @@ public:
|
|||||||
default_flow_shift = 1.5f;
|
default_flow_shift = 1.5f;
|
||||||
} else if (sd_version_is_ideogram4(version)) {
|
} else if (sd_version_is_ideogram4(version)) {
|
||||||
default_flow_shift = 1.0f;
|
default_flow_shift = 1.0f;
|
||||||
|
} else if (sd_version_is_boogu_image(version)) {
|
||||||
|
default_flow_shift = 3.16f;
|
||||||
} else {
|
} else {
|
||||||
default_flow_shift = 3.f;
|
default_flow_shift = 3.f;
|
||||||
}
|
}
|
||||||
@ -1691,7 +1709,7 @@ public:
|
|||||||
if (sd_version_is_sd3(version)) {
|
if (sd_version_is_sd3(version)) {
|
||||||
latent_rgb_proj = sd3_latent_rgb_proj;
|
latent_rgb_proj = sd3_latent_rgb_proj;
|
||||||
latent_rgb_bias = sd3_latent_rgb_bias;
|
latent_rgb_bias = sd3_latent_rgb_bias;
|
||||||
} else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) {
|
} else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_boogu_image(version) || sd_version_is_longcat(version)) {
|
||||||
latent_rgb_proj = flux_latent_rgb_proj;
|
latent_rgb_proj = flux_latent_rgb_proj;
|
||||||
latent_rgb_bias = flux_latent_rgb_bias;
|
latent_rgb_bias = flux_latent_rgb_bias;
|
||||||
} else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
|
} else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
|
||||||
@ -1786,6 +1804,9 @@ public:
|
|||||||
if (sd_version_is_anima(version)) {
|
if (sd_version_is_anima(version)) {
|
||||||
return std::vector<float>{t / static_cast<float>(TIMESTEPS)};
|
return std::vector<float>{t / static_cast<float>(TIMESTEPS)};
|
||||||
}
|
}
|
||||||
|
if (sd_version_is_boogu_image(version)) {
|
||||||
|
return std::vector<float>{t / static_cast<float>(TIMESTEPS)};
|
||||||
|
}
|
||||||
if (version == VERSION_HIDREAM_O1) {
|
if (version == VERSION_HIDREAM_O1) {
|
||||||
return std::vector<float>{1.0f - (t / static_cast<float>(TIMESTEPS))};
|
return std::vector<float>{1.0f - (t / static_cast<float>(TIMESTEPS))};
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user