mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-24 15:16:38 +00:00
Compare commits
27 Commits
master-692
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f440ad9c29 | ||
|
|
41f7acbfb0 | ||
|
|
b395a6972d | ||
|
|
854bebfe02 | ||
|
|
787d229d84 | ||
|
|
b12098f5d0 | ||
|
|
2bd249c971 | ||
|
|
e9e952462f | ||
|
|
e8e012eef2 | ||
|
|
7f0e728b7d | ||
|
|
92a3b73cdb | ||
|
|
710bc91c8f | ||
|
|
5a34bc7f6e | ||
|
|
146b6cc49e | ||
|
|
93527fda74 | ||
|
|
6e66a1a4a4 | ||
|
|
bb90bfa00f | ||
|
|
517abc777d | ||
|
|
6f00939f75 | ||
|
|
c2df4e1228 | ||
|
|
9838264c49 | ||
|
|
17d70b91e6 | ||
|
|
5db680c2c7 | ||
|
|
749186c0eb | ||
|
|
bdb431ad95 | ||
|
|
276025e054 | ||
|
|
8d4c7af95b |
@ -204,6 +204,12 @@ if(SD_WEBM)
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (SD_RPC)
|
||||||
|
message("-- Use RPC as backend stable-diffusion")
|
||||||
|
set(GGML_RPC ON)
|
||||||
|
add_definitions(-DSD_USE_RPC)
|
||||||
|
endif ()
|
||||||
|
|
||||||
set(SD_LIB stable-diffusion)
|
set(SD_LIB stable-diffusion)
|
||||||
|
|
||||||
file(GLOB SD_LIB_SOURCES CONFIGURE_DEPENDS
|
file(GLOB SD_LIB_SOURCES CONFIGURE_DEPENDS
|
||||||
|
|||||||
33
README.md
33
README.md
@ -34,8 +34,8 @@ API and command-line option may change frequently.***
|
|||||||
- Super lightweight and without external dependencies
|
- Super lightweight and without external dependencies
|
||||||
- Supported models
|
- Supported models
|
||||||
- Image Models
|
- Image Models
|
||||||
- SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
|
- [SD1.x, SD2.x, SD-Turbo](./docs/sd.md)
|
||||||
- SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
|
- [SDXL, SDXL-Turbo](./docs/sd.md)
|
||||||
- [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
|
- [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
|
||||||
- [SD3/SD3.5](./docs/sd3.md)
|
- [SD3/SD3.5](./docs/sd3.md)
|
||||||
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
|
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
|
||||||
@ -50,21 +50,23 @@ API and command-line option may change frequently.***
|
|||||||
- [Ovis-Image](./docs/ovis_image.md)
|
- [Ovis-Image](./docs/ovis_image.md)
|
||||||
- [Anima](./docs/anima.md)
|
- [Anima](./docs/anima.md)
|
||||||
- [ERNIE-Image](./docs/ernie_image.md)
|
- [ERNIE-Image](./docs/ernie_image.md)
|
||||||
|
- [Boogu Image](./docs/boogu_image.md)
|
||||||
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
|
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
|
||||||
- [Ideogram4](./docs/ideogram4.md)
|
- [Ideogram4](./docs/ideogram4.md)
|
||||||
- Image Edit Models
|
- Image Edit Models
|
||||||
- [FLUX.1-Kontext-dev](./docs/kontext.md)
|
- [FLUX.1-Kontext-dev](./docs/kontext.md)
|
||||||
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
|
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
|
||||||
- [LongCat Image Edit](./docs/longcat_image.md)
|
- [LongCat Image Edit](./docs/longcat_image.md)
|
||||||
|
- [Boogu Image Edit](./docs/boogu_image.md)
|
||||||
- Video Models
|
- Video Models
|
||||||
- [Wan2.1/Wan2.2](./docs/wan.md)
|
- [Wan2.1/Wan2.2](./docs/wan.md)
|
||||||
- [LTX-2.3](./docs/ltx2.md)
|
- [LTX-2.3](./docs/ltx2.md)
|
||||||
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
|
- [PhotoMaker](./docs/photo_maker.md) support.
|
||||||
- Control Net support with SD 1.5
|
- Control Net support with SD 1.5
|
||||||
- LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
|
- LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
|
||||||
- Latent Consistency Models support (LCM/LCM-LoRA)
|
- Latent Consistency Models support (LCM/LCM-LoRA)
|
||||||
- Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
|
- Faster and memory efficient latent decoding with [TAESD](./docs/taesd.md)
|
||||||
- Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
|
- Upscale images generated with [ESRGAN](./docs/esrgan.md)
|
||||||
- Supported backends
|
- Supported backends
|
||||||
- CPU (AVX, AVX2 and AVX512 support for x86 architectures)
|
- CPU (AVX, AVX2 and AVX512 support for x86 architectures)
|
||||||
- CUDA
|
- CUDA
|
||||||
@ -133,28 +135,9 @@ For runtime and parameter backend placement, see the [backend selection guide](.
|
|||||||
## More Guides
|
## More Guides
|
||||||
|
|
||||||
- [Backend selection](./docs/backend.md)
|
- [Backend selection](./docs/backend.md)
|
||||||
- [SD1.x/SD2.x/SDXL](./docs/sd.md)
|
- [RPC](./docs/rpc.md)
|
||||||
- [SD3/SD3.5](./docs/sd3.md)
|
|
||||||
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
|
|
||||||
- [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
|
|
||||||
- [FLUX.1-Kontext-dev](./docs/kontext.md)
|
|
||||||
- [Chroma](./docs/chroma.md)
|
|
||||||
- [🔥Qwen Image](./docs/qwen_image.md)
|
|
||||||
- [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
|
|
||||||
- [🔥Wan2.1/Wan2.2](./docs/wan.md)
|
|
||||||
- [🔥LTX-2.3](./docs/ltx2.md)
|
|
||||||
- [🔥Z-Image](./docs/z_image.md)
|
|
||||||
- [Ovis-Image](./docs/ovis_image.md)
|
|
||||||
- [Anima](./docs/anima.md)
|
|
||||||
- [ERNIE-Image](./docs/ernie_image.md)
|
|
||||||
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
|
|
||||||
- [Lens](./docs/lens.md)
|
|
||||||
- [LongCat Image / LongCat Image Edit](./docs/longcat_image.md)
|
|
||||||
- [LoRA](./docs/lora.md)
|
- [LoRA](./docs/lora.md)
|
||||||
- [LCM/LCM-LoRA](./docs/lcm.md)
|
- [LCM/LCM-LoRA](./docs/lcm.md)
|
||||||
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
|
|
||||||
- [Using ESRGAN to upscale results](./docs/esrgan.md)
|
|
||||||
- [Using TAESD to faster decoding](./docs/taesd.md)
|
|
||||||
- [Docker](./docs/docker.md)
|
- [Docker](./docs/docker.md)
|
||||||
- [Quantization and GGUF](./docs/quantization_and_gguf.md)
|
- [Quantization and GGUF](./docs/quantization_and_gguf.md)
|
||||||
- [Inference acceleration via caching](./docs/caching.md)
|
- [Inference acceleration via caching](./docs/caching.md)
|
||||||
|
|||||||
BIN
assets/boogu/edit_example.png
Normal file
BIN
assets/boogu/edit_example.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 477 KiB |
BIN
assets/boogu/example.png
Normal file
BIN
assets/boogu/example.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 489 KiB |
@ -3,7 +3,7 @@
|
|||||||
`stable-diffusion.cpp` has two backend assignments:
|
`stable-diffusion.cpp` has two backend assignments:
|
||||||
|
|
||||||
- `--backend` selects the runtime backend used to execute model graphs.
|
- `--backend` selects the runtime backend used to execute model graphs.
|
||||||
- `--params-backend` selects the backend used to allocate model parameters.
|
- `--params-backend` selects where model parameters are kept.
|
||||||
|
|
||||||
If `--params-backend` is not set, parameters use the same backend as their module runtime backend.
|
If `--params-backend` is not set, parameters use the same backend as their module runtime backend.
|
||||||
|
|
||||||
@ -29,6 +29,20 @@ The same syntax is used for parameter placement:
|
|||||||
sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend te=cpu,vae=cpu
|
sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend te=cpu,vae=cpu
|
||||||
```
|
```
|
||||||
|
|
||||||
|
`--params-backend` also accepts the special value `disk`:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend disk
|
||||||
|
```
|
||||||
|
|
||||||
|
`--max-vram` can target resolved backend/device names:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sd-cli -m model.safetensors -p "a cat" --backend diffusion=cuda0,vae=vulkan0 --max-vram cuda0=6,vulkan0=2
|
||||||
|
```
|
||||||
|
|
||||||
|
The budget applies to every module running on that backend.
|
||||||
|
|
||||||
Module names are case-insensitive. Hyphens and underscores in module names are ignored, so `clip_vision`, `clip-vision`, and `clipvision` are equivalent.
|
Module names are case-insensitive. Hyphens and underscores in module names are ignored, so `clip_vision`, `clip-vision`, and `clipvision` are equivalent.
|
||||||
|
|
||||||
`all=`, `default=`, and `*=` can be used to set the default backend inside a mixed assignment:
|
`all=`, `default=`, and `*=` can be used to set the default backend inside a mixed assignment:
|
||||||
@ -64,9 +78,11 @@ The special values `auto`, `default`, and an empty backend name select the defau
|
|||||||
|
|
||||||
The special value `gpu` selects the first GPU backend, falling back to the first integrated GPU backend.
|
The special value `gpu` selects the first GPU backend, falling back to the first integrated GPU backend.
|
||||||
|
|
||||||
|
The special value `disk` is accepted only by `--params-backend`. `--backend disk` is invalid because `disk` is a parameter residency mode, not a runtime compute backend.
|
||||||
|
|
||||||
## Runtime backend vs. parameter backend
|
## Runtime backend vs. parameter backend
|
||||||
|
|
||||||
The runtime backend controls where graph execution runs. The parameter backend controls where model weights are allocated.
|
The runtime backend controls where graph execution runs. The parameter backend controls where model weights are allocated or whether they are reloaded from disk on demand.
|
||||||
|
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
@ -76,6 +92,16 @@ sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend cpu
|
|||||||
|
|
||||||
This runs all modules on `cuda0`, but stores parameters in CPU RAM. During execution, parameters are moved to the runtime backend as needed.
|
This runs all modules on `cuda0`, but stores parameters in CPU RAM. During execution, parameters are moved to the runtime backend as needed.
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend disk
|
||||||
|
```
|
||||||
|
|
||||||
|
This runs all modules on `cuda0`, reloads parameters from the model file as needed, and releases those parameter buffers after use.
|
||||||
|
|
||||||
|
`disk` is never selected implicitly. If `--params-backend` is not set, parameters use the runtime backend.
|
||||||
|
|
||||||
Per-module assignments can be mixed:
|
Per-module assignments can be mixed:
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
@ -100,23 +126,27 @@ uses one shared CPU backend for both `te` and `vae` runtime execution.
|
|||||||
|
|
||||||
Runtime and parameter assignments also share the same backend cache. If `--backend diffusion=cuda0` and `--params-backend diffusion=cuda0` resolve to the same device, both use the same backend instance.
|
Runtime and parameter assignments also share the same backend cache. If `--backend diffusion=cuda0` and `--params-backend diffusion=cuda0` resolve to the same device, both use the same backend instance.
|
||||||
|
|
||||||
|
`--params-backend disk` does not create a separate backend instance. Parameters are loaded lazily using the module runtime backend.
|
||||||
|
|
||||||
`SDBackendManager` owns the backend instances and frees them when the context or upscaler is destroyed. Model runners receive non-owning runtime and parameter backend pointers and do not free them.
|
`SDBackendManager` owns the backend instances and frees them when the context or upscaler is destroyed. Model runners receive non-owning runtime and parameter backend pointers and do not free them.
|
||||||
|
|
||||||
## Compatibility flags
|
## Compatibility flags
|
||||||
|
|
||||||
The older CPU placement flags are still supported:
|
The example CLI/server still accepts these older CPU placement flags as compatibility aliases:
|
||||||
|
|
||||||
- `--clip-on-cpu`
|
- `--clip-on-cpu`
|
||||||
- `--vae-on-cpu`
|
- `--vae-on-cpu`
|
||||||
- `--control-net-cpu`
|
- `--control-net-cpu`
|
||||||
- `--offload-to-cpu`
|
- `--offload-to-cpu`
|
||||||
|
|
||||||
`--clip-on-cpu`, `--vae-on-cpu`, and `--control-net-cpu` affect runtime backend assignment only when `--backend` is not set. They map to `te=cpu`, `vae=cpu`, and `controlnet=cpu`.
|
`--clip-on-cpu`, `--vae-on-cpu`, and `--control-net-cpu` are deprecated. The example argument layer prepends `te=cpu`, `vae=cpu`, and `controlnet=cpu` to `--backend` before creating the context.
|
||||||
|
|
||||||
`--offload-to-cpu` affects parameter backend assignment only when `--params-backend` is not set. It is equivalent to:
|
`--offload-to-cpu` prepends a CPU default to the parameter assignment in the caller before creating the context:
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
--params-backend cpu
|
--params-backend '*=cpu'
|
||||||
```
|
```
|
||||||
|
|
||||||
Explicit `--backend` and `--params-backend` assignments are preferred for new commands.
|
Because this default is inserted first, later explicit `--params-backend` entries can still override it, for example `--offload-to-cpu --params-backend te=disk` keeps non-TE parameters on CPU and reloads TE parameters from disk.
|
||||||
|
|
||||||
|
Library callers should set `backend` and `params_backend` directly. The old CPU/offload fields are no longer part of the C API. Explicit `--backend` and `--params-backend` assignments are preferred for new commands.
|
||||||
|
|||||||
31
docs/boogu_image.md
Normal file
31
docs/boogu_image.md
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
# How to Use
|
||||||
|
|
||||||
|
Boogu Image uses a Boogu diffusion transformer, the FLUX VAE, and Qwen3-VL as the LLM text and vision encoder.
|
||||||
|
|
||||||
|
## Download weights
|
||||||
|
|
||||||
|
- Download Boogu Image
|
||||||
|
- safetensors: https://huggingface.co/Comfy-Org/Boogu-Image/tree/main/diffusion_models
|
||||||
|
- Download vae
|
||||||
|
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
|
||||||
|
- Download Qwen3-VL 8B
|
||||||
|
- gguf: https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main
|
||||||
|
- For image editing with GGUF text encoders, also download the matching mmproj file and pass it with `--llm_vision`.
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
### Boogu Image Base
|
||||||
|
|
||||||
|
```
|
||||||
|
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\boogu_image_base_bf16.safetensors --llm ..\..\llm\Qwen3VL-8B-Instruct-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ae.sft -p "a lovely cat" --diffusion-fa -v --offload-to-cpu
|
||||||
|
```
|
||||||
|
|
||||||
|
<img width="256" alt="Boogu Image Base example" src="../assets/boogu/example.png" />
|
||||||
|
|
||||||
|
### Boogu Image Edit
|
||||||
|
|
||||||
|
```
|
||||||
|
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\boogu_image_edit_bf16.safetensors --llm ..\..\llm\Qwen3VL-8B-Instruct-Q4_K_M.gguf --llm_vision ..\..\llm\mmproj-Qwen3VL-8B-Instruct-F16.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --diffusion-fa -v --offload-to-cpu -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'boogu.cpp'"
|
||||||
|
```
|
||||||
|
|
||||||
|
<img width="256" alt="Boogu Image Edit example" src="../assets/boogu/edit_example.png" />
|
||||||
@ -21,6 +21,38 @@ and the compute buffer shrink in the debug log:
|
|||||||
|
|
||||||
Using `--offload-to-cpu` allows you to offload weights to the CPU, saving VRAM without reducing generation speed.
|
Using `--offload-to-cpu` allows you to offload weights to the CPU, saving VRAM without reducing generation speed.
|
||||||
|
|
||||||
|
## Use params backend to reduce VRAM or RAM usage.
|
||||||
|
|
||||||
|
`--params-backend` controls where model parameters are kept. If it is not set, parameters use the same backend as `--backend`, so a GPU runtime backend also keeps parameters in VRAM.
|
||||||
|
|
||||||
|
Use CPU params to reduce VRAM usage:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
--backend cuda0 --params-backend cpu
|
||||||
|
```
|
||||||
|
|
||||||
|
This keeps model weights in system RAM and moves them to the runtime backend when needed. In the example CLI/server, `--offload-to-cpu` is a compatibility shortcut that prepends `*=cpu` to `--params-backend` before creating the context, so explicit module assignments can still override it:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
--offload-to-cpu --params-backend te=disk
|
||||||
|
```
|
||||||
|
|
||||||
|
Use disk params to reduce both VRAM and RAM usage:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
--backend cuda0 --params-backend disk
|
||||||
|
```
|
||||||
|
|
||||||
|
This reloads parameters from the model file on demand and releases them after use. It has the lowest memory residency, but can be slower because weights must be read again. `disk` is never selected implicitly; set it explicitly when RAM usage matters more than reload cost.
|
||||||
|
|
||||||
|
Per-module assignments can target only the largest modules:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
--backend cuda0 --params-backend diffusion=disk,te=cpu,vae=cpu
|
||||||
|
```
|
||||||
|
|
||||||
|
See [backend selection](./backend.md) for full syntax.
|
||||||
|
|
||||||
## Use quantization to reduce memory usage.
|
## Use quantization to reduce memory usage.
|
||||||
|
|
||||||
[quantization](./quantization_and_gguf.md)
|
[quantization](./quantization_and_gguf.md)
|
||||||
196
docs/pulid.md
Normal file
196
docs/pulid.md
Normal file
@ -0,0 +1,196 @@
|
|||||||
|
# PuLID-Flux face-identity preservation
|
||||||
|
|
||||||
|
stable-diffusion.cpp supports the [PuLID-Flux](https://github.com/ToTheBeginning/PuLID)
|
||||||
|
identity-injection technique on top of Flux.1 (schnell or dev) models.
|
||||||
|
Given a single source portrait, PuLID-Flux produces new generations that
|
||||||
|
preserve the source person's face across arbitrary scenes, poses, and
|
||||||
|
prompts.
|
||||||
|
|
||||||
|
Unlike PhotoMaker (which extracts the identity inside the inference
|
||||||
|
process from a directory of images), PuLID-Flux's identity extractor is
|
||||||
|
a heavy stack (insightface ArcFace + EVA-CLIP-L + IDFormer encoder) that
|
||||||
|
is impractical to port to C++/ggml. To keep this implementation small and
|
||||||
|
cross-vendor, **stable-diffusion.cpp consumes a precomputed identity
|
||||||
|
embedding** produced by an external Python tool that runs once per source
|
||||||
|
portrait. Everything downstream of that one-shot extraction is C++ and
|
||||||
|
runs on any backend (Vulkan, CUDA, Metal, ROCm, CPU).
|
||||||
|
|
||||||
|
## Architecture summary
|
||||||
|
|
||||||
|
The PuLID-Flux contribution to the Flux denoise loop is a stack of 20
|
||||||
|
small cross-attention modules (`PerceiverAttentionCA`) inserted between
|
||||||
|
the Flux transformer blocks:
|
||||||
|
|
||||||
|
- After every 2nd of the 19 double-stream blocks (10 hook points)
|
||||||
|
- After every 4th of the 38 single-stream blocks (10 hook points)
|
||||||
|
|
||||||
|
Each cross-attention layer takes the current image tokens as query, the
|
||||||
|
32-token / 2048-dim identity embedding as key+value, and adds its output
|
||||||
|
(scaled by `id_weight`, typically 1.0) back to the image tokens.
|
||||||
|
|
||||||
|
## Required weights
|
||||||
|
|
||||||
|
Three files in addition to the standard Flux weight set:
|
||||||
|
|
||||||
|
1. **Flux base** (transformer + VAE + clip_l + t5xxl) -- exactly as
|
||||||
|
[docs/flux.md](flux.md) describes.
|
||||||
|
2. **PuLID weights** -- download from
|
||||||
|
[guozinan/PuLID](https://huggingface.co/guozinan/PuLID):
|
||||||
|
- `pulid_flux_v0.9.0.safetensors` or `pulid_flux_v0.9.1.safetensors`
|
||||||
|
(recommended; this implementation is verified against v0.9.1)
|
||||||
|
- **v1.1 (`pulid_v1.1.safetensors`) is NOT yet supported** -- it uses
|
||||||
|
renamed keys (`id_adapter_attn_layers.*` instead of `pulid_ca.*`)
|
||||||
|
and possibly a different module structure. Support is left for a future PR.
|
||||||
|
3. **Identity embedding (.pulidembd)** -- produced by the precompute
|
||||||
|
tool below.
|
||||||
|
|
||||||
|
## Precompute the identity embedding
|
||||||
|
|
||||||
|
The precompute tool runs the PyTorch identity-extraction stack on a
|
||||||
|
single portrait image and writes the resulting `(32, 2048)` embedding
|
||||||
|
to a `.pulidembd` binary file (about 131 KB). Run it once per source
|
||||||
|
person; the same file is reused for any number of generations.
|
||||||
|
|
||||||
|
A reference Python script is provided in this repository at
|
||||||
|
[`script/pulid_extract_id.py`](../script/pulid_extract_id.py). It
|
||||||
|
requires:
|
||||||
|
- A working CUDA / CPU PyTorch stack
|
||||||
|
- `insightface`, `facexlib`, `eva-clip`, `torchvision`, `opencv-python`,
|
||||||
|
`huggingface_hub`, `gguf`
|
||||||
|
- The PuLID weights file (same one stable-diffusion.cpp will load below)
|
||||||
|
- The ToTheBeginning/PuLID repo's `pulid/` package (including
|
||||||
|
`pulid/pipeline_flux.py`) and `eva_clip/` package on `PYTHONPATH`; `flux/`
|
||||||
|
is not needed for embedding extraction
|
||||||
|
|
||||||
|
Run it as:
|
||||||
|
|
||||||
|
```
|
||||||
|
python pulid_extract_id.py \
|
||||||
|
--portrait /path/to/source-photo.jpg \
|
||||||
|
--pulid-weights /path/to/pulid_flux_v0.9.1.safetensors \
|
||||||
|
--out /path/to/source.pulidembd
|
||||||
|
```
|
||||||
|
|
||||||
|
## Format (gguf)
|
||||||
|
|
||||||
|
The embedding is a standard **gguf** container holding a single tensor:
|
||||||
|
|
||||||
|
```
|
||||||
|
tensor name : "pulid_id"
|
||||||
|
shape : [token_dim, num_tokens] (ggml order; typically [2048, 32])
|
||||||
|
type : F16 (also accepts F32 / BF16)
|
||||||
|
metadata : general.architecture = "pulid", pulid.version = 1
|
||||||
|
```
|
||||||
|
|
||||||
|
stable-diffusion.cpp loads it with the normal gguf reader
|
||||||
|
(`gguf_init_from_file`) and converts to fp32 at load time -- no bespoke
|
||||||
|
parser. Total file size for the typical (32, 2048, fp16) case is ~131 KB.
|
||||||
|
|
||||||
|
## Command-line usage
|
||||||
|
|
||||||
|
```
|
||||||
|
.\bin\Release\sd-cli.exe \
|
||||||
|
--diffusion-model models\flux1-schnell-Q4_K_S.gguf \
|
||||||
|
--vae models\ae.safetensors \
|
||||||
|
--clip_l models\clip_l.safetensors \
|
||||||
|
--t5xxl models\t5xxl_fp16.safetensors \
|
||||||
|
--pulid-weights models\pulid_flux_v0.9.1.safetensors \
|
||||||
|
--pulid-id-embedding source.pulidembd \
|
||||||
|
--pulid-id-weight 1.0 \
|
||||||
|
-p "candid photograph of a young woman on a beach at sunset" \
|
||||||
|
--cfg-scale 1.0 --sampling-method euler --steps 4 -W 512 -H 512 \
|
||||||
|
--seed 42 --clip-on-cpu \
|
||||||
|
-o out.png
|
||||||
|
```
|
||||||
|
|
||||||
|
For Flux Dev (instead of Schnell), add `--guidance 3.5` and change `--steps 4` to `--steps 20`.
|
||||||
|
|
||||||
|
## Flags
|
||||||
|
|
||||||
|
| Flag | Purpose |
|
||||||
|
|----------------------------|-------------------------------------------------------------------|
|
||||||
|
| `--pulid-weights <path>` | Path to `pulid_flux_v0.9.x.safetensors`. Loaded with the model. |
|
||||||
|
| `--pulid-id-embedding <p>` | Path to a `.pulidembd` binary produced by the precompute tool. |
|
||||||
|
| `--pulid-id-weight <f>` | Identity-injection strength. Typical 0.7-1.2; default 1.0. |
|
||||||
|
|
||||||
|
All three flags must be set together to activate PuLID. Setting only
|
||||||
|
`--pulid-weights` (no embedding) loads the weights but disables injection
|
||||||
|
at runtime. Setting `--pulid-id-weight 0` zeros out the contribution
|
||||||
|
(useful for falsification testing: outputs should be byte-identical to
|
||||||
|
a no-PuLID run with the same seed).
|
||||||
|
|
||||||
|
## Memory budget
|
||||||
|
|
||||||
|
At 512x512, 4 steps (Schnell), the 20 cross-attention layers add roughly
|
||||||
|
10% to denoise time and almost nothing to peak VRAM. Tested on a 12 GB
|
||||||
|
consumer card alongside Flux Schnell Q4 GGUF + CPU-offloaded clip_l and
|
||||||
|
t5xxl + GPU-resident VAE.
|
||||||
|
|
||||||
|
At 1024x1024 with Flux Dev Q4 + 20 steps + PuLID, the VAE decode compute
|
||||||
|
buffer doesn't fit on a 12 GB card even with `--vae-on-cpu`. Workaround:
|
||||||
|
explicitly route VAE to the CPU backend instead of the offload flag:
|
||||||
|
|
||||||
|
```
|
||||||
|
--backend "diffusion=vulkan0,vae=cpu"
|
||||||
|
```
|
||||||
|
|
||||||
|
The `--vae-on-cpu` flag offloads VAE weights but leaves the compute graph
|
||||||
|
on the default backend; this is existing stable-diffusion.cpp behavior,
|
||||||
|
not a PuLID-specific issue. Documented here because anyone running PuLID
|
||||||
|
at 1024 will hit it.
|
||||||
|
|
||||||
|
## Backend selection
|
||||||
|
|
||||||
|
The standard `--backend` flag works as documented. Common patterns:
|
||||||
|
|
||||||
|
```
|
||||||
|
# AMD Vulkan
|
||||||
|
--backend "diffusion=vulkan0,vae=cpu"
|
||||||
|
|
||||||
|
# NVIDIA Vulkan
|
||||||
|
--backend "diffusion=vulkan1,vae=cpu"
|
||||||
|
|
||||||
|
# CUDA
|
||||||
|
--backend "diffusion=cuda0,vae=cpu"
|
||||||
|
```
|
||||||
|
|
||||||
|
The PuLID cross-attention layers run on the same backend as the main
|
||||||
|
diffusion model. They have not yet been independently profiled on every
|
||||||
|
backend; only Vulkan and CPU have been tested by the original contributor.
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
A three-way SHA-256 check is the recommended sanity test when bringing up
|
||||||
|
a new combination of model + backend + hardware:
|
||||||
|
|
||||||
|
| Run | Expected hash relation |
|
||||||
|
|----------------------------------------------|------------------------------------|
|
||||||
|
| A: no `--pulid-*` flags | baseline |
|
||||||
|
| B: PuLID flags, `--pulid-id-weight 0.0` | **byte-identical to A** |
|
||||||
|
| C: PuLID flags, `--pulid-id-weight 1.0` | **different from A,B**, preserves source identity |
|
||||||
|
|
||||||
|
If A and B differ (even though A and C differ as expected), the injection is allocating
|
||||||
|
or computing something even at zero weight -- likely a bug.
|
||||||
|
|
||||||
|
## Limitations / not yet supported
|
||||||
|
|
||||||
|
- **`--skip-layers` (skip-layer-guidance / SLG) combined with PuLID** is not
|
||||||
|
supported. The `pulid_ca` index advances per non-skipped block, so a
|
||||||
|
skipped block silently misaligns the cross-attention weight assignment
|
||||||
|
vs. the trained intervals. The reference PyTorch implementation does
|
||||||
|
not have SLG either, so there is no well-defined behavior to emulate.
|
||||||
|
Use either feature alone.
|
||||||
|
- **PuLID v1.1 weights** (`pulid_v1.1.safetensors`, renamed key layout).
|
||||||
|
- **Multiple ID images.** The reference PyTorch implementation can fuse
|
||||||
|
several portraits into one embedding for stronger identity. This
|
||||||
|
implementation accepts a single embedding produced from one or more
|
||||||
|
images by the external precompute tool.
|
||||||
|
- **Negative-prompt branch of CFG.** PuLID only injects on the positive
|
||||||
|
conditioning path in the published reference, and the implementation
|
||||||
|
here follows that. Flux's distilled guidance doesn't run a separate
|
||||||
|
uncond branch in normal use, so this matters only for `--true-cfg`
|
||||||
|
workflows that aren't standard for Flux.
|
||||||
|
- **Backends other than Vulkan and CPU** are untested by the original
|
||||||
|
contributor. The implementation is pure-ggml and should work on CUDA,
|
||||||
|
ROCm, and Metal, but verification by users on those backends is
|
||||||
|
welcomed.
|
||||||
220
docs/rpc.md
Normal file
220
docs/rpc.md
Normal file
@ -0,0 +1,220 @@
|
|||||||
|
# Building and Using the RPC Server with `stable-diffusion.cpp`
|
||||||
|
|
||||||
|
This guide covers how to build a version of [the RPC server from `llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/tools/rpc/README.md) that is compatible with your version of `stable-diffusion.cpp` to manage multi-backend setups. RPC allows you to offload specific model components to a remote server.
|
||||||
|
|
||||||
|
> **Note on Model Location:** The model files (e.g., `.safetensors` or `.gguf`) remain on the **Client** machine. The client parses the file and transmits the necessary tensor data and computational graphs to the server. The server does not need to store the model files locally.
|
||||||
|
|
||||||
|
## 1. Building `stable-diffusion.cpp` with RPC client
|
||||||
|
|
||||||
|
First, you should build the client application from source. It requires `SD_RPC=ON` to include the RPC backend in your client.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake .. \
|
||||||
|
-DSD_RPC=ON \
|
||||||
|
# Add other build flags here (e.g., -DSD_VULKAN=ON)
|
||||||
|
cmake --build . --config Release -j $(nproc)
|
||||||
|
```
|
||||||
|
|
||||||
|
> **Note:** Ensure you add the other flags you would normally use (e.g., `-DSD_VULKAN=ON`, `-DSD_CUDA=ON`, `-DSD_HIPBLAS=ON`, or `-DGGML_METAL=ON`). For more information about building `stable-diffusion.cpp` from source, please refer to the [build.md](build.md) documentation.
|
||||||
|
|
||||||
|
## 2. Ensure `llama.cpp` is at the correct commit
|
||||||
|
|
||||||
|
`stable-diffusion.cpp`'s RPC client is designed to work with a specific version of `llama.cpp` (compatible with the `ggml` submodule) to ensure API compatibility. The commit hash for `llama.cpp` is stored in `ggml/scripts/sync-llama.last`.
|
||||||
|
|
||||||
|
> **Start from Root:** Perform these steps from the root of your `stable-diffusion.cpp` directory.
|
||||||
|
|
||||||
|
1. Read the target commit hash from the submodule tracker:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Linux / WSL / MacOS
|
||||||
|
HASH=$(cat ggml/scripts/sync-llama.last)
|
||||||
|
|
||||||
|
# Windows (PowerShell)
|
||||||
|
$HASH = Get-Content -Path "ggml\scripts\sync-llama.last"
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Clone `llama.cpp` at the target commit.
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/ggml-org/llama.cpp.git
|
||||||
|
cd llama.cpp
|
||||||
|
git checkout $HASH
|
||||||
|
```
|
||||||
|
To save on download time and storage, you can use a shallow clone to download only the target commit:
|
||||||
|
```bash
|
||||||
|
mkdir -p llama.cpp
|
||||||
|
cd llama.cpp
|
||||||
|
git init
|
||||||
|
git remote add origin https://github.com/ggml-org/llama.cpp.git
|
||||||
|
git fetch --depth 1 origin $HASH
|
||||||
|
git checkout FETCH_HEAD
|
||||||
|
```
|
||||||
|
|
||||||
|
## 3. Build `llama.cpp` (RPC Server)
|
||||||
|
|
||||||
|
The RPC server acts as the worker. You must explicitly enable the **backend** (the hardware interface, such as CUDA for Nvidia, Metal for Apple Silicon, or Vulkan) when building, otherwise the server will default to using only the CPU.
|
||||||
|
|
||||||
|
To find the correct flags for your system, refer to the official documentation for the [`llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) repository.
|
||||||
|
|
||||||
|
> **Crucial:** You must include the compiler flags required to satisfy the API compatibility with `stable-diffusion.cpp` (`-DGGML_MAX_NAME=128`). Without this flag, `GGML_MAX_NAME` will default to `64` for the server, and data transfers between the client and server will fail. Of course, `-DGGML_RPC=ON` must also be set.
|
||||||
|
>
|
||||||
|
> I recommend disabling the `LLAMA_CURL` flag to avoid unnecessary dependencies, and disabling shared library builds to avoid potential conflicts.
|
||||||
|
|
||||||
|
> **Build Target:** We are specifically building the `rpc-server` target. This prevents the build system from compiling the entire `llama.cpp` suite (like `llama-server`), making the build significantly faster.
|
||||||
|
|
||||||
|
### Linux / WSL (Vulkan)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
# Ensure the backend flag for your hardware is enabled (Vulkan here)
cmake .. -DGGML_RPC=ON \
|
||||||
|
-DGGML_VULKAN=ON \
|
||||||
|
-DGGML_BUILD_SHARED_LIBS=OFF \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 \
|
||||||
|
-DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128
|
||||||
|
cmake --build . --config Release --target rpc-server -j $(nproc)
|
||||||
|
```
|
||||||
|
|
||||||
|
### macOS (Metal)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake .. -DGGML_RPC=ON \
|
||||||
|
-DGGML_METAL=ON \
|
||||||
|
-DGGML_BUILD_SHARED_LIBS=OFF \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 \
|
||||||
|
-DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128
|
||||||
|
cmake --build . --config Release --target rpc-server
|
||||||
|
```
|
||||||
|
|
||||||
|
### Windows (Visual Studio 2022, Vulkan)
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake .. -G "Visual Studio 17 2022" -A x64 `
|
||||||
|
-DGGML_RPC=ON `
|
||||||
|
-DGGML_VULKAN=ON `
|
||||||
|
-DGGML_BUILD_SHARED_LIBS=OFF `
|
||||||
|
-DLLAMA_CURL=OFF `
|
||||||
|
-DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 `
|
||||||
|
-DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128
|
||||||
|
cmake --build . --config Release --target rpc-server
|
||||||
|
```
|
||||||
|
|
||||||
|
## 4. Usage
|
||||||
|
|
||||||
|
Once both applications are built, you can run the server and the client to manage your GPU allocation.
|
||||||
|
|
||||||
|
### Step A: Run the RPC Server
|
||||||
|
|
||||||
|
Start the server. It listens for connections on the default address (usually `localhost:50052`). If your server is on a different machine, ensure the server binds to the correct interface and your firewall allows the connection.
|
||||||
|
|
||||||
|
**On the Server:**
|
||||||
|
If running on the same machine, you can use the default address:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./rpc-server
|
||||||
|
```
|
||||||
|
|
||||||
|
If you want to allow connections from other machines on the network:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./rpc-server --host 0.0.0.0
|
||||||
|
```
|
||||||
|
|
||||||
|
> **Security Warning:** The RPC server does not currently support authentication or encryption. **Only run the server on trusted local networks**. Never expose the RPC server directly to the open internet.
|
||||||
|
|
||||||
|
> **Drivers & Hardware:** Ensure the Server machine has the necessary drivers installed and functional (e.g., Nvidia Drivers for CUDA, Vulkan SDK, or Metal). If no devices are found, the server will simply fall back to CPU usage.
|
||||||
|
|
||||||
|
<!-- ### Step B: Check if the client is able to connect to the server and see the available devices
|
||||||
|
|
||||||
|
We're assuming the server is running on your local machine, and listening on the default port `50052`. If it's running on a different machine, you can replace `localhost` with the IP address of the server.
|
||||||
|
|
||||||
|
**On the Client:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./sd-cli --rpc-servers localhost:50052 --list-devices
|
||||||
|
```
|
||||||
|
|
||||||
|
If the server is running and the client is able to connect, you should see `RPC0 localhost:50052` in the list of devices.
|
||||||
|
|
||||||
|
Example output:
|
||||||
|
(Client built without GPU acceleration, two GPUs available on the server)
|
||||||
|
|
||||||
|
```
|
||||||
|
List of available GGML devices:
|
||||||
|
Name Description
|
||||||
|
-------------------
|
||||||
|
CPU AMD Ryzen 9 5900X 12-Core Processor
|
||||||
|
RPC0 localhost:50052
|
||||||
|
RPC1 localhost:50052
|
||||||
|
``` -->
|
||||||
|
|
||||||
|
### Step B: Run with RPC device
|
||||||
|
|
||||||
|
If everything is working correctly, you can now run the client while offloading some or all of the work to the RPC server.
|
||||||
|
|
||||||
|
Example: Setting the main backend to the RPC0 device for doing all the work on the server.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./sd-cli -m models/sd1.5.safetensors -p "A cat" --rpc-servers localhost:50052 --backend RPC0
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Scaling: Multiple RPC Servers
|
||||||
|
|
||||||
|
You can connect the client to multiple RPC servers simultaneously to scale out your hardware usage.
|
||||||
|
|
||||||
|
Example: A main machine (192.168.1.10) with 3 GPUs, with one GPU running CUDA and the other two running Vulkan, and a second machine (192.168.1.11) with only one GPU.
|
||||||
|
|
||||||
|
**On the first machine (Running two server instances):**
|
||||||
|
|
||||||
|
**Terminal 1 (CUDA):**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Linux / WSL
|
||||||
|
export CUDA_VISIBLE_DEVICES=0
|
||||||
|
cd ./build_cuda/bin/Release
|
||||||
|
./rpc-server --host 0.0.0.0
|
||||||
|
|
||||||
|
# Windows PowerShell
|
||||||
|
$env:CUDA_VISIBLE_DEVICES="0"
|
||||||
|
cd .\build_cuda\bin\Release
|
||||||
|
./rpc-server --host 0.0.0.0
|
||||||
|
```
|
||||||
|
|
||||||
|
**Terminal 2 (Vulkan):**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd ./build_vulkan/bin/Release
|
||||||
|
# ignore the first GPU (used by CUDA server)
|
||||||
|
./rpc-server --host 0.0.0.0 --port 50053 -d Vulkan1,Vulkan2
|
||||||
|
```
|
||||||
|
|
||||||
|
**On the second machine:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd ./build/bin/Release
|
||||||
|
./rpc-server --host 0.0.0.0
|
||||||
|
```
|
||||||
|
|
||||||
|
**On the Client:**
|
||||||
|
Pass multiple server addresses separated by commas.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./sd-cli --rpc-servers 192.168.1.10:50052,192.168.1.10:50053,192.168.1.11:50052 [...]
|
||||||
|
```
|
||||||
|
|
||||||
|
The client will map these servers to sequential device IDs (e.g., RPC0 from the first server, RPC1 and RPC2 from the second, and RPC3 from the third). With this setup, you could for example use RPC0 for the main backend, RPC1 and RPC2 for the text encoders, and RPC3 for the VAE.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Performance Considerations
|
||||||
|
|
||||||
|
RPC performance is heavily dependent on network bandwidth, as large weights and activations must be transferred back and forth over the network, especially for large models, or when using high resolutions. For best results, ensure your network connection is stable and has sufficient bandwidth (>1Gbps recommended). This should not be a concern if you are running the server and client on the same machine, as the data transfer will happen over the loopback interface.
|
||||||
@ -1,204 +1,9 @@
|
|||||||
# Run
|
# Usage
|
||||||
|
|
||||||
```
|
For detailed command-line arguments, run:
|
||||||
usage: ./bin/sd-cli [options]
|
|
||||||
|
|
||||||
CLI Options:
|
```bash
|
||||||
-o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image
|
./bin/sd-cli -h
|
||||||
sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs
|
|
||||||
support .avi, .webm, and animated .webp
|
|
||||||
--image <string> path to the image to inspect (for metadata mode)
|
|
||||||
--metadata-format <string> metadata output format, one of [text, json] (default: text)
|
|
||||||
--preview-path <string> path to write preview image to (default: ./preview.png). Multi-frame previews support
|
|
||||||
.avi, .webm, and animated .webp
|
|
||||||
--preview-interval <int> interval in denoising steps between consecutive updates of the image preview file
|
|
||||||
(default is 1, meaning updating at every step)
|
|
||||||
--output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified
|
|
||||||
%d in output path, 1 otherwise)
|
|
||||||
--canny apply canny preprocessor (edge detection)
|
|
||||||
--convert-name convert tensor name (for convert mode)
|
|
||||||
-v, --verbose print extra info
|
|
||||||
--color colors the logging tags according to level
|
|
||||||
--taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae)
|
|
||||||
--preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs
|
|
||||||
--metadata-raw include raw hex previews for unparsed metadata payloads
|
|
||||||
--metadata-brief truncate long metadata text values in text output
|
|
||||||
--metadata-all include structural/container entries such as IHDR, IDAT, and non-metadata JPEG segments
|
|
||||||
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert, metadata], default: img_gen
|
|
||||||
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
|
|
||||||
-h, --help show this help message and exit
|
|
||||||
|
|
||||||
Context Options:
|
|
||||||
-m, --model <string> path to full model
|
|
||||||
--clip_l <string> path to the clip-l text encoder
|
|
||||||
--clip_g <string> path to the clip-g text encoder
|
|
||||||
--clip_vision <string> path to the clip-vision encoder
|
|
||||||
--t5xxl <string> path to the t5xxl text encoder
|
|
||||||
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image,
|
|
||||||
mistral-small3.2 for flux2, ...)
|
|
||||||
--llm_vision <string> path to the llm vit
|
|
||||||
--qwen2vl <string> alias of --llm. Deprecated.
|
|
||||||
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
|
|
||||||
--diffusion-model <string> path to the standalone diffusion model
|
|
||||||
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model
|
|
||||||
--uncond-diffusion-model <string> path to the standalone unconditional diffusion model, currently used by
|
|
||||||
Ideogram4 CFG
|
|
||||||
--vae <string> path to standalone vae model
|
|
||||||
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
|
|
||||||
--tae <string> alias of --taesd
|
|
||||||
--control-net <string> path to control net model
|
|
||||||
--embd-dir <string> embeddings directory
|
|
||||||
--lora-model-dir <string> lora model directory
|
|
||||||
--hires-upscalers-dir <string> highres fix upscaler model directory
|
|
||||||
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
|
|
||||||
--photo-maker <string> path to PHOTOMAKER model
|
|
||||||
--upscale-model <string> path to esrgan model.
|
|
||||||
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0,
|
|
||||||
then threads will be set to the number of CPU physical cores
|
|
||||||
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
|
|
||||||
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
|
|
||||||
graph splitting; a negative value auto-detects free VRAM, sparing the
|
|
||||||
specified value (e.g. -0.5 will keep at least 0.5 GiB free)
|
|
||||||
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
|
|
||||||
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
|
|
||||||
when needed
|
|
||||||
--mmap whether to memory-map model
|
|
||||||
--control-net-cpu keep controlnet in cpu (for low vram)
|
|
||||||
--clip-on-cpu keep clip in cpu (for low vram)
|
|
||||||
--vae-on-cpu keep vae in cpu (for low vram)
|
|
||||||
--fa use flash attention
|
|
||||||
--diffusion-fa use flash attention in the diffusion model only
|
|
||||||
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
|
|
||||||
--vae-conv-direct use ggml_conv2d_direct in the vae model
|
|
||||||
--circular enable circular padding for convolutions
|
|
||||||
--circularx enable circular RoPE wrapping on x-axis (width) only
|
|
||||||
--circulary enable circular RoPE wrapping on y-axis (height) only
|
|
||||||
--chroma-disable-dit-mask disable dit mask for chroma
|
|
||||||
--qwen-image-zero-cond-t enable zero_cond_t for qwen image
|
|
||||||
--chroma-enable-t5-mask enable t5 mask for chroma
|
|
||||||
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K,
|
|
||||||
q4_K). If not specified, the default is the type of the weight file
|
|
||||||
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
|
|
||||||
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
|
|
||||||
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow,
|
|
||||||
flux2_flow]
|
|
||||||
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is
|
|
||||||
auto. In auto mode, if the model weights contain any quantized parameters,
|
|
||||||
the at_runtime mode will be used; otherwise, immediately will be used.The
|
|
||||||
immediately mode may have precision and compatibility issues with quantized
|
|
||||||
parameters, but it usually offers faster inference speed and, in some cases,
|
|
||||||
lower memory usage. The at_runtime mode, on the other hand, is exactly the
|
|
||||||
opposite.
|
|
||||||
|
|
||||||
Generation Options:
|
|
||||||
-p, --prompt <string> the prompt to render
|
|
||||||
-n, --negative-prompt <string> the negative prompt (default: "")
|
|
||||||
-i, --init-img <string> path to the init image
|
|
||||||
--end-img <string> path to the end image, required by flf2v
|
|
||||||
--mask <string> path to the mask image
|
|
||||||
--control-image <string> path to control image, control net
|
|
||||||
--control-video <string> path to control video frames, It must be a directory path. The video frames
|
|
||||||
inside should be stored as images in lexicographical (character) order. For
|
|
||||||
example, if the control video path is `frames`, the directory contain images
|
|
||||||
such as 00.png, 01.png, ... etc.
|
|
||||||
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir
|
|
||||||
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
|
|
||||||
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
|
|
||||||
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
|
|
||||||
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
|
|
||||||
--extra-sample-args <string> extra sampler/scheduler/guidance args, key=value list. APG supports apg_eta,
|
|
||||||
apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports
|
|
||||||
slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end;
|
|
||||||
ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma
|
|
||||||
--extra-tiling-args <string> extra VAE tiling args, key=value list. LTX video VAE supports
|
|
||||||
temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)
|
|
||||||
-H, --height <int> image height, in pixel space (default: 512)
|
|
||||||
-W, --width <int> image width, in pixel space (default: 512)
|
|
||||||
--steps <int> number of sample steps (default: 20)
|
|
||||||
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
|
|
||||||
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer
|
|
||||||
(default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
|
|
||||||
-b, --batch-count <int> batch count
|
|
||||||
--video-frames <int> video frames (default: 1)
|
|
||||||
--fps <int> fps (default: 24)
|
|
||||||
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for
|
|
||||||
NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
|
|
||||||
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
|
|
||||||
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
|
|
||||||
--hires-width <int> highres fix target width, 0 to use --hires-scale (default: 0)
|
|
||||||
--hires-height <int> highres fix target height, 0 to use --hires-scale (default: 0)
|
|
||||||
--hires-steps <int> highres fix second pass sample steps, 0 to reuse --steps (default: 0)
|
|
||||||
--hires-upscale-tile-size <int> highres fix upscaler tile size, reserved for model-backed upscalers (default:
|
|
||||||
128)
|
|
||||||
--cfg-scale <float> unconditional guidance scale: (default: 7.0)
|
|
||||||
--img-cfg-scale <float> image guidance scale for inpaint or image edit models: (default: same as
|
|
||||||
--cfg-scale)
|
|
||||||
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
|
|
||||||
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means
|
|
||||||
disabled, a value of 2.5 is nice for sd3.5 medium
|
|
||||||
--skip-layer-start <float> SLG enabling point (default: 0.01)
|
|
||||||
--skip-layer-end <float> SLG disabling point (default: 0.2)
|
|
||||||
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and
|
|
||||||
res_2s; 1 for euler_a, er_sde and dpm++2s_a)
|
|
||||||
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
|
|
||||||
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
|
|
||||||
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or image edit models (default:
|
|
||||||
same as --cfg-scale)
|
|
||||||
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input
|
|
||||||
(default: 3.5)
|
|
||||||
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default:
|
|
||||||
0)
|
|
||||||
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
|
|
||||||
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
|
|
||||||
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd,
|
|
||||||
res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
|
|
||||||
--strength <float> strength for noising/unnoising (default: 0.75)
|
|
||||||
--pm-style-strength <float>
|
|
||||||
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full
|
|
||||||
destruction of information in init image
|
|
||||||
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if
|
|
||||||
`--high-noise-steps` is set to -1
|
|
||||||
--vace-strength <float> wan vace strength
|
|
||||||
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
|
|
||||||
--hires-scale <float> highres fix scale when target size is not set (default: 2.0)
|
|
||||||
--hires-denoising-strength <float> highres fix second pass denoising strength (default: 0.7)
|
|
||||||
--increase-ref-index automatically increase the indices of references images based on the order
|
|
||||||
they are listed (starting with 1).
|
|
||||||
--disable-auto-resize-ref-image disable auto resize of ref images
|
|
||||||
--disable-image-metadata do not embed generation metadata on image files
|
|
||||||
--vae-tiling process vae in tiles to reduce memory usage
|
|
||||||
--temporal-tiling enable temporal tiling for LTX video VAE decode
|
|
||||||
--hires enable highres fix
|
|
||||||
-s, --seed RNG seed (default: 42, use random seed for < 0)
|
|
||||||
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
|
|
||||||
dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s,
|
|
||||||
er_sde, euler_cfg_pp, euler_a_cfg_pp] (default: euler for Flux/SD3/Wan, euler_a otherwise)
|
|
||||||
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a,
|
|
||||||
dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
|
|
||||||
res_2s, er_sde, euler_cfg_pp, euler_a_cfg_pp] default: euler for Flux/SD3/Wan, euler_a otherwise
|
|
||||||
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
|
|
||||||
smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent, ltx2], default:
|
|
||||||
model-specific
|
|
||||||
--sigmas custom sigma values for the sampler, comma-separated (e.g.,
|
|
||||||
"14.61,7.8,3.5,0.0").
|
|
||||||
--hires-sigmas custom sigma values for the highres fix second pass, comma-separated (e.g.,
|
|
||||||
"0.85,0.725,0.421875,0.0").
|
|
||||||
--skip-layers layers to skip for SLG steps (default: [7,8,9])
|
|
||||||
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
|
|
||||||
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
|
|
||||||
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET),
|
|
||||||
'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT
|
|
||||||
Chebyshev+Taylor forecasting)
|
|
||||||
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
|
|
||||||
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit:
|
|
||||||
Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=.
|
|
||||||
Examples: "threshold=0.25" or "threshold=1.5,reset=0"
|
|
||||||
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g.,
|
|
||||||
"1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
|
|
||||||
--scm-policy SCM policy: 'dynamic' (default) or 'static'
|
|
||||||
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
|
|
||||||
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size
|
|
||||||
if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Metadata mode inspects PNG/JPEG container metadata without loading any model:
|
Metadata mode inspects PNG/JPEG container metadata without loading any model:
|
||||||
|
|||||||
@ -62,18 +62,22 @@ struct SDCliParams {
|
|||||||
{"-o",
|
{"-o",
|
||||||
"--output",
|
"--output",
|
||||||
"path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs support .avi, .webm, and animated .webp",
|
"path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs support .avi, .webm, and animated .webp",
|
||||||
|
0,
|
||||||
&output_path},
|
&output_path},
|
||||||
{"",
|
{"",
|
||||||
"--image",
|
"--image",
|
||||||
"path to the image to inspect (for metadata mode)",
|
"path to the image to inspect (for metadata mode)",
|
||||||
|
0,
|
||||||
&image_path},
|
&image_path},
|
||||||
{"",
|
{"",
|
||||||
"--metadata-format",
|
"--metadata-format",
|
||||||
"metadata output format, one of [text, json] (default: text)",
|
"metadata output format, one of [text, json] (default: text)",
|
||||||
|
0,
|
||||||
&metadata_format},
|
&metadata_format},
|
||||||
{"",
|
{"",
|
||||||
"--preview-path",
|
"--preview-path",
|
||||||
"path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp",
|
"path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp",
|
||||||
|
0,
|
||||||
&preview_path},
|
&preview_path},
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -623,8 +627,6 @@ int main(int argc, const char* argv[]) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool vae_decode_only = true;
|
|
||||||
|
|
||||||
auto load_image_and_update_size = [&](const std::string& path,
|
auto load_image_and_update_size = [&](const std::string& path,
|
||||||
SDImageOwner& image,
|
SDImageOwner& image,
|
||||||
bool resize_image = true,
|
bool resize_image = true,
|
||||||
@ -646,21 +648,18 @@ int main(int argc, const char* argv[]) {
|
|||||||
};
|
};
|
||||||
|
|
||||||
if (gen_params.init_image_path.size() > 0) {
|
if (gen_params.init_image_path.size() > 0) {
|
||||||
vae_decode_only = false;
|
|
||||||
if (!load_image_and_update_size(gen_params.init_image_path, gen_params.init_image)) {
|
if (!load_image_and_update_size(gen_params.init_image_path, gen_params.init_image)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (gen_params.end_image_path.size() > 0) {
|
if (gen_params.end_image_path.size() > 0) {
|
||||||
vae_decode_only = false;
|
|
||||||
if (!load_image_and_update_size(gen_params.end_image_path, gen_params.end_image)) {
|
if (!load_image_and_update_size(gen_params.end_image_path, gen_params.end_image)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (gen_params.ref_image_paths.size() > 0) {
|
if (gen_params.ref_image_paths.size() > 0) {
|
||||||
vae_decode_only = false;
|
|
||||||
gen_params.ref_images.clear();
|
gen_params.ref_images.clear();
|
||||||
for (auto& path : gen_params.ref_image_paths) {
|
for (auto& path : gen_params.ref_image_paths) {
|
||||||
SDImageOwner ref_image({0, 0, 3, nullptr});
|
SDImageOwner ref_image({0, 0, 3, nullptr});
|
||||||
@ -735,18 +734,7 @@ int main(int argc, const char* argv[]) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cli_params.mode == VID_GEN) {
|
sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(cli_params.taesd_preview);
|
||||||
vae_decode_only = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (gen_params.hires_enabled &&
|
|
||||||
(gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_MODEL ||
|
|
||||||
gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_LANCZOS ||
|
|
||||||
gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_NEAREST)) {
|
|
||||||
vae_decode_only = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(vae_decode_only, true, cli_params.taesd_preview);
|
|
||||||
|
|
||||||
SDImageVec results;
|
SDImageVec results;
|
||||||
int num_results = 0;
|
int num_results = 0;
|
||||||
@ -798,12 +786,11 @@ int main(int argc, const char* argv[]) {
|
|||||||
int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth
|
int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth
|
||||||
if (ctx_params.esrgan_path.size() > 0 && gen_params.upscale_repeats > 0) {
|
if (ctx_params.esrgan_path.size() > 0 && gen_params.upscale_repeats > 0) {
|
||||||
UpscalerCtxPtr upscaler_ctx(new_upscaler_ctx(ctx_params.esrgan_path.c_str(),
|
UpscalerCtxPtr upscaler_ctx(new_upscaler_ctx(ctx_params.esrgan_path.c_str(),
|
||||||
ctx_params.offload_params_to_cpu,
|
|
||||||
ctx_params.diffusion_conv_direct,
|
ctx_params.diffusion_conv_direct,
|
||||||
ctx_params.n_threads,
|
ctx_params.n_threads,
|
||||||
gen_params.upscale_tile_size,
|
gen_params.upscale_tile_size,
|
||||||
ctx_params.backend.c_str(),
|
sd_ctx_params.backend,
|
||||||
ctx_params.params_backend.c_str()));
|
sd_ctx_params.params_backend));
|
||||||
|
|
||||||
if (upscaler_ctx == nullptr) {
|
if (upscaler_ctx == nullptr) {
|
||||||
LOG_ERROR("new_upscaler_ctx failed");
|
LOG_ERROR("new_upscaler_ctx failed");
|
||||||
|
|||||||
@ -6,6 +6,7 @@
|
|||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <filesystem>
|
#include <filesystem>
|
||||||
|
#include <fstream>
|
||||||
#include <iomanip>
|
#include <iomanip>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <regex>
|
#include <regex>
|
||||||
@ -51,6 +52,10 @@ static sd_vae_format_t str_to_vae_format(const std::string& value) {
|
|||||||
return SD_VAE_FORMAT_COUNT;
|
return SD_VAE_FORMAT_COUNT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Puts `assignment` at the front of `spec`, joining the two with a comma.
// If `spec` is empty it simply becomes `assignment` (no leading/trailing comma).
// `spec` is modified in place; `assignment` must be a valid C string.
static void prepend_backend_assignment(std::string& spec, const char* assignment) {
|
||||||
|
spec = spec.empty() ? assignment : std::string(assignment) + "," + spec;
|
||||||
|
}
|
||||||
|
|
||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
static std::string utf16_to_utf8(const std::wstring& wstr) {
|
static std::string utf16_to_utf8(const std::wstring& wstr) {
|
||||||
if (wstr.empty())
|
if (wstr.empty())
|
||||||
@ -256,7 +261,14 @@ bool parse_options(int argc, const char** argv, const std::vector<ArgOptions>& o
|
|||||||
invalid_arg = true;
|
invalid_arg = true;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (option.concat && !option.target->empty()) {
|
||||||
|
if (option.concat > 0 && option.concat <= 0xff) {
|
||||||
|
*option.target += static_cast<char>(option.concat);
|
||||||
|
}
|
||||||
|
*option.target += argv_to_utf8(i, argv);
|
||||||
|
} else {
|
||||||
*option.target = argv_to_utf8(i, argv);
|
*option.target = argv_to_utf8(i, argv);
|
||||||
|
}
|
||||||
found_arg = true;
|
found_arg = true;
|
||||||
}))
|
}))
|
||||||
break;
|
break;
|
||||||
@ -320,109 +332,152 @@ ArgOptions SDContextParams::get_options() {
|
|||||||
{"-m",
|
{"-m",
|
||||||
"--model",
|
"--model",
|
||||||
"path to full model",
|
"path to full model",
|
||||||
|
0,
|
||||||
&model_path},
|
&model_path},
|
||||||
{"",
|
{"",
|
||||||
"--clip_l",
|
"--clip_l",
|
||||||
"path to the clip-l text encoder", &clip_l_path},
|
"path to the clip-l text encoder",
|
||||||
|
0,
|
||||||
|
&clip_l_path},
|
||||||
{"", "--clip_g",
|
{"", "--clip_g",
|
||||||
"path to the clip-g text encoder",
|
"path to the clip-g text encoder",
|
||||||
|
0,
|
||||||
&clip_g_path},
|
&clip_g_path},
|
||||||
{"",
|
{"",
|
||||||
"--clip_vision",
|
"--clip_vision",
|
||||||
"path to the clip-vision encoder",
|
"path to the clip-vision encoder",
|
||||||
|
0,
|
||||||
&clip_vision_path},
|
&clip_vision_path},
|
||||||
{"",
|
{"",
|
||||||
"--t5xxl",
|
"--t5xxl",
|
||||||
"path to the t5xxl text encoder",
|
"path to the t5xxl text encoder",
|
||||||
|
0,
|
||||||
&t5xxl_path},
|
&t5xxl_path},
|
||||||
{"",
|
{"",
|
||||||
"--llm",
|
"--llm",
|
||||||
"path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)",
|
"path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)",
|
||||||
|
0,
|
||||||
&llm_path},
|
&llm_path},
|
||||||
{"",
|
{"",
|
||||||
"--llm_vision",
|
"--llm_vision",
|
||||||
"path to the llm vit",
|
"path to the llm vit",
|
||||||
|
0,
|
||||||
&llm_vision_path},
|
&llm_vision_path},
|
||||||
{"",
|
{"",
|
||||||
"--qwen2vl",
|
"--qwen2vl",
|
||||||
"alias of --llm. Deprecated.",
|
"alias of --llm. Deprecated.",
|
||||||
|
0,
|
||||||
&llm_path},
|
&llm_path},
|
||||||
{"",
|
{"",
|
||||||
"--qwen2vl_vision",
|
"--qwen2vl_vision",
|
||||||
"alias of --llm_vision. Deprecated.",
|
"alias of --llm_vision. Deprecated.",
|
||||||
|
0,
|
||||||
&llm_vision_path},
|
&llm_vision_path},
|
||||||
{"",
|
{"",
|
||||||
"--diffusion-model",
|
"--diffusion-model",
|
||||||
"path to the standalone diffusion model",
|
"path to the standalone diffusion model",
|
||||||
|
0,
|
||||||
&diffusion_model_path},
|
&diffusion_model_path},
|
||||||
{"",
|
{"",
|
||||||
"--high-noise-diffusion-model",
|
"--high-noise-diffusion-model",
|
||||||
"path to the standalone high noise diffusion model",
|
"path to the standalone high noise diffusion model",
|
||||||
|
0,
|
||||||
&high_noise_diffusion_model_path},
|
&high_noise_diffusion_model_path},
|
||||||
{"",
|
{"",
|
||||||
"--uncond-diffusion-model",
|
"--uncond-diffusion-model",
|
||||||
"path to the standalone unconditional diffusion model, currently used by Ideogram4 CFG",
|
"path to the standalone unconditional diffusion model, currently used by Ideogram4 CFG",
|
||||||
|
0,
|
||||||
&uncond_diffusion_model_path},
|
&uncond_diffusion_model_path},
|
||||||
{"",
|
{"",
|
||||||
"--embeddings-connectors",
|
"--embeddings-connectors",
|
||||||
"path to LTXAV embeddings connectors",
|
"path to LTXAV embeddings connectors",
|
||||||
|
0,
|
||||||
&embeddings_connectors_path},
|
&embeddings_connectors_path},
|
||||||
{"",
|
{"",
|
||||||
"--vae",
|
"--vae",
|
||||||
"path to standalone vae model",
|
"path to standalone vae model",
|
||||||
|
0,
|
||||||
&vae_path},
|
&vae_path},
|
||||||
{"",
|
{"",
|
||||||
"--vae-format",
|
"--vae-format",
|
||||||
"VAE latent format override: auto, flux, sd3, or flux2 (default: auto)",
|
"VAE latent format override: auto, flux, sd3, or flux2 (default: auto)",
|
||||||
|
0,
|
||||||
&vae_format},
|
&vae_format},
|
||||||
{"",
|
{"",
|
||||||
"--audio-vae",
|
"--audio-vae",
|
||||||
"path to standalone LTX audio vae model",
|
"path to standalone LTX audio vae model",
|
||||||
|
0,
|
||||||
&audio_vae_path},
|
&audio_vae_path},
|
||||||
{"",
|
{"",
|
||||||
"--taesd",
|
"--taesd",
|
||||||
"path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)",
|
"path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)",
|
||||||
|
0,
|
||||||
&taesd_path},
|
&taesd_path},
|
||||||
{"",
|
{"",
|
||||||
"--tae",
|
"--tae",
|
||||||
"alias of --taesd",
|
"alias of --taesd",
|
||||||
|
0,
|
||||||
&taesd_path},
|
&taesd_path},
|
||||||
{"",
|
{"",
|
||||||
"--control-net",
|
"--control-net",
|
||||||
"path to control net model",
|
"path to control net model",
|
||||||
|
0,
|
||||||
&control_net_path},
|
&control_net_path},
|
||||||
{"",
|
{"",
|
||||||
"--embd-dir",
|
"--embd-dir",
|
||||||
"embeddings directory",
|
"embeddings directory",
|
||||||
|
0,
|
||||||
&embedding_dir},
|
&embedding_dir},
|
||||||
{"",
|
{"",
|
||||||
"--lora-model-dir",
|
"--lora-model-dir",
|
||||||
"lora model directory",
|
"lora model directory",
|
||||||
|
0,
|
||||||
&lora_model_dir},
|
&lora_model_dir},
|
||||||
{"",
|
{"",
|
||||||
"--hires-upscalers-dir",
|
"--hires-upscalers-dir",
|
||||||
"highres fix upscaler model directory",
|
"highres fix upscaler model directory",
|
||||||
|
0,
|
||||||
&hires_upscalers_dir},
|
&hires_upscalers_dir},
|
||||||
{"",
|
{"",
|
||||||
"--tensor-type-rules",
|
"--tensor-type-rules",
|
||||||
"weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")",
|
"weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")",
|
||||||
|
(int)',',
|
||||||
&tensor_type_rules},
|
&tensor_type_rules},
|
||||||
{"",
|
{"",
|
||||||
"--photo-maker",
|
"--photo-maker",
|
||||||
"path to PHOTOMAKER model",
|
"path to PHOTOMAKER model",
|
||||||
|
0,
|
||||||
&photo_maker_path},
|
&photo_maker_path},
|
||||||
|
{"",
|
||||||
|
"--pulid-weights",
|
||||||
|
"path to PuLID Flux weights",
|
||||||
|
0,
|
||||||
|
&pulid_weights_path},
|
||||||
{"",
|
{"",
|
||||||
"--upscale-model",
|
"--upscale-model",
|
||||||
"path to esrgan model.",
|
"path to esrgan model.",
|
||||||
|
0,
|
||||||
&esrgan_path},
|
&esrgan_path},
|
||||||
{"",
|
{"",
|
||||||
"--backend",
|
"--backend",
|
||||||
"runtime backend assignment, e.g. cpu or clip=cpu,vae=cuda0,diffusion=vulkan0",
|
"runtime backend assignment, e.g. cpu or clip=cpu,vae=cuda0,diffusion=vulkan0",
|
||||||
|
(int)',',
|
||||||
&backend},
|
&backend},
|
||||||
{"",
|
{"",
|
||||||
"--params-backend",
|
"--params-backend",
|
||||||
"parameter backend assignment, e.g. cpu or diffusion=cpu,clip=cpu",
|
"parameter backend assignment, e.g. disk, cpu, or diffusion=disk,clip=cpu",
|
||||||
|
(int)',',
|
||||||
&params_backend},
|
&params_backend},
|
||||||
|
{"",
|
||||||
|
"--rpc-servers",
|
||||||
|
"comma-separated list of RPC servers to connect to for offloading, in the format host:port, e.g. localhost:50052,192.168.1.3:50052",
|
||||||
|
(int)',',
|
||||||
|
&rpc_servers},
|
||||||
|
{"",
|
||||||
|
"--max-vram",
|
||||||
|
"maximum VRAM budget in GiB for graph-cut segmented execution. Accepts a single value or assignments by backend/device, e.g. 6 or cuda0=6,vulkan0=4. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value",
|
||||||
|
0,
|
||||||
|
&max_vram},
|
||||||
};
|
};
|
||||||
|
|
||||||
options.int_options = {
|
options.int_options = {
|
||||||
@ -437,18 +492,15 @@ ArgOptions SDContextParams::get_options() {
|
|||||||
&chroma_t5_mask_pad},
|
&chroma_t5_mask_pad},
|
||||||
};
|
};
|
||||||
|
|
||||||
options.float_options = {
|
|
||||||
{"",
|
|
||||||
"--max-vram",
|
|
||||||
"maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value (e.g. -0.5 will keep at least 0.5 GiB free)",
|
|
||||||
&max_vram},
|
|
||||||
};
|
|
||||||
|
|
||||||
options.bool_options = {
|
options.bool_options = {
|
||||||
{"",
|
{"",
|
||||||
"--stream-layers",
|
"--stream-layers",
|
||||||
"enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)",
|
"enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)",
|
||||||
true, &stream_layers},
|
true, &stream_layers},
|
||||||
|
{"",
|
||||||
|
"--eager-load",
|
||||||
|
"load all params into the params backend at model-load time instead of lazily on first use (defaults to false)",
|
||||||
|
true, &eager_load},
|
||||||
{"",
|
{"",
|
||||||
"--force-sdxl-vae-conv-scale",
|
"--force-sdxl-vae-conv-scale",
|
||||||
"force use of conv scale on sdxl vae",
|
"force use of conv scale on sdxl vae",
|
||||||
@ -463,15 +515,15 @@ ArgOptions SDContextParams::get_options() {
|
|||||||
true, &enable_mmap},
|
true, &enable_mmap},
|
||||||
{"",
|
{"",
|
||||||
"--control-net-cpu",
|
"--control-net-cpu",
|
||||||
"keep controlnet in cpu (for low vram)",
|
"deprecated; use --backend controlnet=cpu",
|
||||||
true, &control_net_cpu},
|
true, &control_net_cpu},
|
||||||
{"",
|
{"",
|
||||||
"--clip-on-cpu",
|
"--clip-on-cpu",
|
||||||
"keep clip in cpu (for low vram)",
|
"deprecated; use --backend te=cpu",
|
||||||
true, &clip_on_cpu},
|
true, &clip_on_cpu},
|
||||||
{"",
|
{"",
|
||||||
"--vae-on-cpu",
|
"--vae-on-cpu",
|
||||||
"keep vae in cpu (for low vram)",
|
"deprecated; use --backend vae=cpu",
|
||||||
true, &vae_on_cpu},
|
true, &vae_on_cpu},
|
||||||
{"",
|
{"",
|
||||||
"--fa",
|
"--fa",
|
||||||
@ -688,6 +740,25 @@ bool SDContextParams::resolve_and_validate(SDMode mode) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void SDContextParams::prepare_backend_assignments() {
|
||||||
|
effective_backend = backend;
|
||||||
|
effective_params_backend = params_backend;
|
||||||
|
|
||||||
|
if (offload_params_to_cpu) {
|
||||||
|
prepend_backend_assignment(effective_params_backend, "*=cpu");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (clip_on_cpu) {
|
||||||
|
prepend_backend_assignment(effective_backend, "te=cpu");
|
||||||
|
}
|
||||||
|
if (vae_on_cpu) {
|
||||||
|
prepend_backend_assignment(effective_backend, "vae=cpu");
|
||||||
|
}
|
||||||
|
if (control_net_cpu) {
|
||||||
|
prepend_backend_assignment(effective_backend, "controlnet=cpu");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
std::string SDContextParams::to_string() const {
|
std::string SDContextParams::to_string() const {
|
||||||
std::ostringstream emb_ss;
|
std::ostringstream emb_ss;
|
||||||
emb_ss << "{\n";
|
emb_ss << "{\n";
|
||||||
@ -731,8 +802,9 @@ std::string SDContextParams::to_string() const {
|
|||||||
<< " rng_type: " << sd_rng_type_name(rng_type) << ",\n"
|
<< " rng_type: " << sd_rng_type_name(rng_type) << ",\n"
|
||||||
<< " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
|
<< " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
|
||||||
<< " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
|
<< " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
|
||||||
<< " max_vram: " << max_vram << ",\n"
|
<< " max_vram: \"" << max_vram << "\",\n"
|
||||||
<< " stream_layers: " << (stream_layers ? "true" : "false") << ",\n"
|
<< " stream_layers: " << (stream_layers ? "true" : "false") << ",\n"
|
||||||
|
<< " eager_load: " << (eager_load ? "true" : "false") << ",\n"
|
||||||
<< " backend: \"" << backend << "\",\n"
|
<< " backend: \"" << backend << "\",\n"
|
||||||
<< " params_backend: \"" << params_backend << "\",\n"
|
<< " params_backend: \"" << params_backend << "\",\n"
|
||||||
<< " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
|
<< " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
|
||||||
@ -757,7 +829,8 @@ std::string SDContextParams::to_string() const {
|
|||||||
return oss.str();
|
return oss.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview) {
|
sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) {
|
||||||
|
prepare_backend_assignments();
|
||||||
embedding_vec.clear();
|
embedding_vec.clear();
|
||||||
embedding_vec.reserve(embedding_map.size());
|
embedding_vec.reserve(embedding_map.size());
|
||||||
for (const auto& kv : embedding_map) {
|
for (const auto& kv : embedding_map) {
|
||||||
@ -767,57 +840,54 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
|
|||||||
embedding_vec.emplace_back(item);
|
embedding_vec.emplace_back(item);
|
||||||
}
|
}
|
||||||
|
|
||||||
sd_ctx_params_t sd_ctx_params = {
|
sd_ctx_params_t sd_ctx_params;
|
||||||
model_path.c_str(),
|
sd_ctx_params_init(&sd_ctx_params);
|
||||||
clip_l_path.c_str(),
|
sd_ctx_params.model_path = model_path.c_str();
|
||||||
clip_g_path.c_str(),
|
sd_ctx_params.clip_l_path = clip_l_path.c_str();
|
||||||
clip_vision_path.c_str(),
|
sd_ctx_params.clip_g_path = clip_g_path.c_str();
|
||||||
t5xxl_path.c_str(),
|
sd_ctx_params.clip_vision_path = clip_vision_path.c_str();
|
||||||
llm_path.c_str(),
|
sd_ctx_params.t5xxl_path = t5xxl_path.c_str();
|
||||||
llm_vision_path.c_str(),
|
sd_ctx_params.llm_path = llm_path.c_str();
|
||||||
diffusion_model_path.c_str(),
|
sd_ctx_params.llm_vision_path = llm_vision_path.c_str();
|
||||||
high_noise_diffusion_model_path.c_str(),
|
sd_ctx_params.diffusion_model_path = diffusion_model_path.c_str();
|
||||||
uncond_diffusion_model_path.c_str(),
|
sd_ctx_params.high_noise_diffusion_model_path = high_noise_diffusion_model_path.c_str();
|
||||||
embeddings_connectors_path.c_str(),
|
sd_ctx_params.uncond_diffusion_model_path = uncond_diffusion_model_path.c_str();
|
||||||
vae_path.c_str(),
|
sd_ctx_params.embeddings_connectors_path = embeddings_connectors_path.c_str();
|
||||||
audio_vae_path.c_str(),
|
sd_ctx_params.vae_path = vae_path.c_str();
|
||||||
taesd_path.c_str(),
|
sd_ctx_params.audio_vae_path = audio_vae_path.c_str();
|
||||||
control_net_path.c_str(),
|
sd_ctx_params.taesd_path = taesd_path.c_str();
|
||||||
embedding_vec.data(),
|
sd_ctx_params.control_net_path = control_net_path.c_str();
|
||||||
static_cast<uint32_t>(embedding_vec.size()),
|
sd_ctx_params.embeddings = embedding_vec.data();
|
||||||
photo_maker_path.c_str(),
|
sd_ctx_params.embedding_count = static_cast<uint32_t>(embedding_vec.size());
|
||||||
tensor_type_rules.c_str(),
|
sd_ctx_params.photo_maker_path = photo_maker_path.c_str();
|
||||||
vae_decode_only,
|
sd_ctx_params.pulid_weights_path = pulid_weights_path.c_str();
|
||||||
free_params_immediately,
|
sd_ctx_params.tensor_type_rules = tensor_type_rules.c_str();
|
||||||
n_threads,
|
sd_ctx_params.n_threads = n_threads;
|
||||||
wtype,
|
sd_ctx_params.wtype = wtype;
|
||||||
rng_type,
|
sd_ctx_params.rng_type = rng_type;
|
||||||
sampler_rng_type,
|
sd_ctx_params.sampler_rng_type = sampler_rng_type;
|
||||||
prediction,
|
sd_ctx_params.prediction = prediction;
|
||||||
lora_apply_mode,
|
sd_ctx_params.lora_apply_mode = lora_apply_mode;
|
||||||
offload_params_to_cpu,
|
sd_ctx_params.enable_mmap = enable_mmap;
|
||||||
enable_mmap,
|
sd_ctx_params.flash_attn = flash_attn;
|
||||||
clip_on_cpu,
|
sd_ctx_params.diffusion_flash_attn = diffusion_flash_attn;
|
||||||
control_net_cpu,
|
sd_ctx_params.tae_preview_only = taesd_preview;
|
||||||
vae_on_cpu,
|
sd_ctx_params.diffusion_conv_direct = diffusion_conv_direct;
|
||||||
flash_attn,
|
sd_ctx_params.vae_conv_direct = vae_conv_direct;
|
||||||
diffusion_flash_attn,
|
sd_ctx_params.circular_x = circular || circular_x;
|
||||||
taesd_preview,
|
sd_ctx_params.circular_y = circular || circular_y;
|
||||||
diffusion_conv_direct,
|
sd_ctx_params.force_sdxl_vae_conv_scale = force_sdxl_vae_conv_scale;
|
||||||
vae_conv_direct,
|
sd_ctx_params.chroma_use_dit_mask = chroma_use_dit_mask;
|
||||||
circular || circular_x,
|
sd_ctx_params.chroma_use_t5_mask = chroma_use_t5_mask;
|
||||||
circular || circular_y,
|
sd_ctx_params.chroma_t5_mask_pad = chroma_t5_mask_pad;
|
||||||
force_sdxl_vae_conv_scale,
|
sd_ctx_params.qwen_image_zero_cond_t = qwen_image_zero_cond_t;
|
||||||
chroma_use_dit_mask,
|
sd_ctx_params.vae_format = str_to_vae_format(vae_format);
|
||||||
chroma_use_t5_mask,
|
sd_ctx_params.max_vram = max_vram.c_str();
|
||||||
chroma_t5_mask_pad,
|
sd_ctx_params.stream_layers = stream_layers;
|
||||||
qwen_image_zero_cond_t,
|
sd_ctx_params.eager_load = eager_load;
|
||||||
str_to_vae_format(vae_format),
|
sd_ctx_params.backend = effective_backend.c_str();
|
||||||
max_vram,
|
sd_ctx_params.params_backend = effective_params_backend.c_str();
|
||||||
stream_layers,
|
sd_ctx_params.rpc_servers = rpc_servers.c_str();
|
||||||
backend.c_str(),
|
|
||||||
params_backend.c_str(),
|
|
||||||
};
|
|
||||||
return sd_ctx_params;
|
return sd_ctx_params;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -832,54 +902,71 @@ ArgOptions SDGenerationParams::get_options() {
|
|||||||
{"-p",
|
{"-p",
|
||||||
"--prompt",
|
"--prompt",
|
||||||
"the prompt to render",
|
"the prompt to render",
|
||||||
|
0,
|
||||||
&prompt},
|
&prompt},
|
||||||
{"-n",
|
{"-n",
|
||||||
"--negative-prompt",
|
"--negative-prompt",
|
||||||
"the negative prompt (default: \"\")",
|
"the negative prompt (default: \"\")",
|
||||||
|
0,
|
||||||
&negative_prompt},
|
&negative_prompt},
|
||||||
{"-i",
|
{"-i",
|
||||||
"--init-img",
|
"--init-img",
|
||||||
"path to the init image",
|
"path to the init image",
|
||||||
|
0,
|
||||||
&init_image_path},
|
&init_image_path},
|
||||||
{"",
|
{"",
|
||||||
"--end-img",
|
"--end-img",
|
||||||
"path to the end image, required by flf2v",
|
"path to the end image, required by flf2v",
|
||||||
|
0,
|
||||||
&end_image_path},
|
&end_image_path},
|
||||||
{"",
|
{"",
|
||||||
"--mask",
|
"--mask",
|
||||||
"path to the mask image",
|
"path to the mask image",
|
||||||
|
0,
|
||||||
&mask_image_path},
|
&mask_image_path},
|
||||||
{"",
|
{"",
|
||||||
"--control-image",
|
"--control-image",
|
||||||
"path to control image, control net",
|
"path to control image, control net",
|
||||||
|
0,
|
||||||
&control_image_path},
|
&control_image_path},
|
||||||
{"",
|
{"",
|
||||||
"--control-video",
|
"--control-video",
|
||||||
"path to control video frames, It must be a directory path. The video frames inside should be stored as images in "
|
"path to control video frames, It must be a directory path. The video frames inside should be stored as images in "
|
||||||
"lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images "
|
"lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images "
|
||||||
"such as 00.png, 01.png, ... etc.",
|
"such as 00.png, 01.png, ... etc.",
|
||||||
|
0,
|
||||||
&control_video_path},
|
&control_video_path},
|
||||||
{"",
|
{"",
|
||||||
"--pm-id-images-dir",
|
"--pm-id-images-dir",
|
||||||
"path to PHOTOMAKER input id images dir",
|
"path to PHOTOMAKER input id images dir",
|
||||||
|
0,
|
||||||
&pm_id_images_dir},
|
&pm_id_images_dir},
|
||||||
{"",
|
{"",
|
||||||
"--pm-id-embed-path",
|
"--pm-id-embed-path",
|
||||||
"path to PHOTOMAKER v2 id embed",
|
"path to PHOTOMAKER v2 id embed",
|
||||||
|
0,
|
||||||
&pm_id_embed_path},
|
&pm_id_embed_path},
|
||||||
|
{"",
|
||||||
|
"--pulid-id-embedding",
|
||||||
|
"path to PuLID id embedding",
|
||||||
|
0,
|
||||||
|
&pulid_id_embedding_path},
|
||||||
{"",
|
{"",
|
||||||
"--hires-upscaler",
|
"--hires-upscaler",
|
||||||
"highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), "
|
"highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), "
|
||||||
"Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name "
|
"Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name "
|
||||||
"under --hires-upscalers-dir (default: Latent)",
|
"under --hires-upscalers-dir (default: Latent)",
|
||||||
|
0,
|
||||||
&hires_upscaler},
|
&hires_upscaler},
|
||||||
{"",
|
{"",
|
||||||
"--extra-sample-args",
|
"--extra-sample-args",
|
||||||
"extra sampler/scheduler/guidance args, key=value list. APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma",
|
"extra sampler/scheduler/guidance args, key=value list. CFG supports guidance_schedule; APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma;",
|
||||||
|
(int)',',
|
||||||
&extra_sample_args},
|
&extra_sample_args},
|
||||||
{"",
|
{"",
|
||||||
"--extra-tiling-args",
|
"--extra-tiling-args",
|
||||||
"extra VAE tiling args, key=value list. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)",
|
"extra VAE tiling args, key=value list. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)",
|
||||||
|
(int)',',
|
||||||
&extra_tiling_args},
|
&extra_tiling_args},
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1017,6 +1104,10 @@ ArgOptions SDGenerationParams::get_options() {
|
|||||||
"--pm-style-strength",
|
"--pm-style-strength",
|
||||||
"",
|
"",
|
||||||
&pm_style_strength},
|
&pm_style_strength},
|
||||||
|
{"",
|
||||||
|
"--pulid-id-weight",
|
||||||
|
"strength of PuLID identity injection",
|
||||||
|
&pulid_id_weight},
|
||||||
{"",
|
{"",
|
||||||
"--control-strength",
|
"--control-strength",
|
||||||
"strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image",
|
"strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image",
|
||||||
@ -1331,6 +1422,42 @@ ArgOptions SDGenerationParams::get_options() {
|
|||||||
return 1;
|
return 1;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
auto on_prompt_file_arg = [&](int argc, const char** argv, int index) {
|
||||||
|
if (++index >= argc) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
const char* arg = argv[index];
|
||||||
|
std::ifstream f(arg, std::ios::binary);
|
||||||
|
try {
|
||||||
|
prompt = std::string(std::istreambuf_iterator<char>{f}, {});
|
||||||
|
} catch (const std::ios_base::failure&) {
|
||||||
|
f.setstate(std::ios_base::failbit);
|
||||||
|
}
|
||||||
|
if (f.fail()) {
|
||||||
|
LOG_ERROR("error: failed to read prompt file '%s'\n", arg);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto on_negative_prompt_file_arg = [&](int argc, const char** argv, int index) {
|
||||||
|
if (++index >= argc) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
const char* arg = argv[index];
|
||||||
|
std::ifstream f(arg, std::ios::binary);
|
||||||
|
try {
|
||||||
|
negative_prompt = std::string(std::istreambuf_iterator<char>{f}, {});
|
||||||
|
} catch (const std::ios_base::failure&) {
|
||||||
|
f.setstate(std::ios_base::failbit);
|
||||||
|
}
|
||||||
|
if (f.fail()) {
|
||||||
|
LOG_ERROR("error: failed to read negative prompt file '%s'\n", arg);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
};
|
||||||
|
|
||||||
options.manual_options = {
|
options.manual_options = {
|
||||||
{"-s",
|
{"-s",
|
||||||
"--seed",
|
"--seed",
|
||||||
@ -1394,6 +1521,14 @@ ArgOptions SDGenerationParams::get_options() {
|
|||||||
"--vae-relative-tile-size",
|
"--vae-relative-tile-size",
|
||||||
"relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)",
|
"relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)",
|
||||||
on_relative_tile_size_arg},
|
on_relative_tile_size_arg},
|
||||||
|
{"",
|
||||||
|
"--prompt-file",
|
||||||
|
"path to the file containing the prompt to render",
|
||||||
|
on_prompt_file_arg},
|
||||||
|
{"",
|
||||||
|
"--negative-prompt-file",
|
||||||
|
"path to the file containing the negative prompt",
|
||||||
|
on_negative_prompt_file_arg},
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -2249,6 +2384,11 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
|
|||||||
pm_style_strength,
|
pm_style_strength,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
sd_pulid_params_t pulid_params = {
|
||||||
|
pulid_id_embedding_path.empty() ? nullptr : pulid_id_embedding_path.c_str(),
|
||||||
|
pulid_id_weight,
|
||||||
|
};
|
||||||
|
|
||||||
params.loras = lora_vec.empty() ? nullptr : lora_vec.data();
|
params.loras = lora_vec.empty() ? nullptr : lora_vec.data();
|
||||||
params.lora_count = static_cast<uint32_t>(lora_vec.size());
|
params.lora_count = static_cast<uint32_t>(lora_vec.size());
|
||||||
params.prompt = prompt.c_str();
|
params.prompt = prompt.c_str();
|
||||||
@ -2269,6 +2409,7 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
|
|||||||
params.control_image = control_image.get();
|
params.control_image = control_image.get();
|
||||||
params.control_strength = control_strength;
|
params.control_strength = control_strength;
|
||||||
params.pm_params = pm_params;
|
params.pm_params = pm_params;
|
||||||
|
params.pulid_params = pulid_params;
|
||||||
params.vae_tiling_params = vae_tiling_params;
|
params.vae_tiling_params = vae_tiling_params;
|
||||||
params.cache = cache_params;
|
params.cache = cache_params;
|
||||||
|
|
||||||
|
|||||||
@ -31,6 +31,7 @@ struct StringOption {
|
|||||||
std::string short_name;
|
std::string short_name;
|
||||||
std::string long_name;
|
std::string long_name;
|
||||||
std::string desc;
|
std::string desc;
|
||||||
|
int concat;
|
||||||
std::string* target;
|
std::string* target;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -133,6 +134,7 @@ struct SDContextParams {
|
|||||||
std::string control_net_path;
|
std::string control_net_path;
|
||||||
std::string embedding_dir;
|
std::string embedding_dir;
|
||||||
std::string photo_maker_path;
|
std::string photo_maker_path;
|
||||||
|
std::string pulid_weights_path;
|
||||||
sd_type_t wtype = SD_TYPE_COUNT;
|
sd_type_t wtype = SD_TYPE_COUNT;
|
||||||
std::string tensor_type_rules;
|
std::string tensor_type_rules;
|
||||||
std::string lora_model_dir = ".";
|
std::string lora_model_dir = ".";
|
||||||
@ -144,10 +146,14 @@ struct SDContextParams {
|
|||||||
rng_type_t rng_type = CUDA_RNG;
|
rng_type_t rng_type = CUDA_RNG;
|
||||||
rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
|
rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
|
||||||
bool offload_params_to_cpu = false;
|
bool offload_params_to_cpu = false;
|
||||||
float max_vram = 0.f;
|
std::string max_vram = "0";
|
||||||
bool stream_layers = false;
|
bool stream_layers = false;
|
||||||
|
bool eager_load = false;
|
||||||
std::string backend;
|
std::string backend;
|
||||||
std::string params_backend;
|
std::string params_backend;
|
||||||
|
std::string rpc_servers;
|
||||||
|
std::string effective_backend;
|
||||||
|
std::string effective_params_backend;
|
||||||
bool enable_mmap = false;
|
bool enable_mmap = false;
|
||||||
bool control_net_cpu = false;
|
bool control_net_cpu = false;
|
||||||
bool clip_on_cpu = false;
|
bool clip_on_cpu = false;
|
||||||
@ -175,11 +181,12 @@ struct SDContextParams {
|
|||||||
float flow_shift = INFINITY;
|
float flow_shift = INFINITY;
|
||||||
ArgOptions get_options();
|
ArgOptions get_options();
|
||||||
void build_embedding_map();
|
void build_embedding_map();
|
||||||
|
void prepare_backend_assignments();
|
||||||
bool resolve(SDMode mode);
|
bool resolve(SDMode mode);
|
||||||
bool validate(SDMode mode);
|
bool validate(SDMode mode);
|
||||||
bool resolve_and_validate(SDMode mode);
|
bool resolve_and_validate(SDMode mode);
|
||||||
std::string to_string() const;
|
std::string to_string() const;
|
||||||
sd_ctx_params_t to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview);
|
sd_ctx_params_t to_sd_ctx_params_t(bool taesd_preview);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct SDGenerationParams {
|
struct SDGenerationParams {
|
||||||
@ -230,6 +237,9 @@ struct SDGenerationParams {
|
|||||||
std::string pm_id_embed_path;
|
std::string pm_id_embed_path;
|
||||||
float pm_style_strength = 20.f;
|
float pm_style_strength = 20.f;
|
||||||
|
|
||||||
|
std::string pulid_id_embedding_path;
|
||||||
|
float pulid_id_weight = 1.0f;
|
||||||
|
|
||||||
int upscale_repeats = 1;
|
int upscale_repeats = 1;
|
||||||
int upscale_tile_size = 128;
|
int upscale_tile_size = 128;
|
||||||
|
|
||||||
|
|||||||
@ -117,188 +117,10 @@ In this case, the server will load and serve the specified `index.html` file ins
|
|||||||
* using a custom UI
|
* using a custom UI
|
||||||
* avoiding rebuilding the binary after frontend modifications
|
* avoiding rebuilding the binary after frontend modifications
|
||||||
|
|
||||||
# Run
|
# Usage
|
||||||
|
|
||||||
```
|
For detailed command-line arguments, run:
|
||||||
usage: ./bin/sd-server [options]
|
|
||||||
|
```bash
|
||||||
Svr Options:
|
./bin/sd-server -h
|
||||||
-l, --listen-ip <string> server listen ip (default: 127.0.0.1)
|
|
||||||
--serve-html-path <string> path to HTML file to serve at root (optional)
|
|
||||||
--listen-port <int> server listen port (default: 1234)
|
|
||||||
-v, --verbose print extra info
|
|
||||||
--color colors the logging tags according to level
|
|
||||||
-h, --help show this help message and exit
|
|
||||||
|
|
||||||
Context Options:
|
|
||||||
-m, --model <string> path to full model
|
|
||||||
--clip_l <string> path to the clip-l text encoder
|
|
||||||
--clip_g <string> path to the clip-g text encoder
|
|
||||||
--clip_vision <string> path to the clip-vision encoder
|
|
||||||
--t5xxl <string> path to the t5xxl text encoder
|
|
||||||
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image,
|
|
||||||
mistral-small3.2 for flux2, ...)
|
|
||||||
--llm_vision <string> path to the llm vit
|
|
||||||
--qwen2vl <string> alias of --llm. Deprecated.
|
|
||||||
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
|
|
||||||
--diffusion-model <string> path to the standalone diffusion model
|
|
||||||
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model
|
|
||||||
--uncond-diffusion-model <string> path to the standalone unconditional diffusion model, currently used by
|
|
||||||
Ideogram4 CFG
|
|
||||||
--vae <string> path to standalone vae model
|
|
||||||
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
|
|
||||||
--tae <string> alias of --taesd
|
|
||||||
--control-net <string> path to control net model
|
|
||||||
--embd-dir <string> embeddings directory
|
|
||||||
--lora-model-dir <string> lora model directory
|
|
||||||
--hires-upscalers-dir <string> highres fix upscaler model directory
|
|
||||||
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
|
|
||||||
--photo-maker <string> path to PHOTOMAKER model
|
|
||||||
--upscale-model <string> path to esrgan model.
|
|
||||||
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0,
|
|
||||||
then threads will be set to the number of CPU physical cores
|
|
||||||
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
|
|
||||||
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
|
|
||||||
graph splitting; a negative value auto-detects free VRAM, sparing the
|
|
||||||
specified value (e.g. -0.5 will keep at least 0.5 GiB free)
|
|
||||||
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
|
|
||||||
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
|
|
||||||
when needed
|
|
||||||
--mmap whether to memory-map model
|
|
||||||
--control-net-cpu keep controlnet in cpu (for low vram)
|
|
||||||
--clip-on-cpu keep clip in cpu (for low vram)
|
|
||||||
--vae-on-cpu keep vae in cpu (for low vram)
|
|
||||||
--fa use flash attention
|
|
||||||
--diffusion-fa use flash attention in the diffusion model only
|
|
||||||
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
|
|
||||||
--vae-conv-direct use ggml_conv2d_direct in the vae model
|
|
||||||
--circular enable circular padding for convolutions
|
|
||||||
--circularx enable circular RoPE wrapping on x-axis (width) only
|
|
||||||
--circulary enable circular RoPE wrapping on y-axis (height) only
|
|
||||||
--chroma-disable-dit-mask disable dit mask for chroma
|
|
||||||
--qwen-image-zero-cond-t enable zero_cond_t for qwen image
|
|
||||||
--chroma-enable-t5-mask enable t5 mask for chroma
|
|
||||||
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K,
|
|
||||||
q4_K). If not specified, the default is the type of the weight file
|
|
||||||
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
|
|
||||||
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
|
|
||||||
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow,
|
|
||||||
flux2_flow]
|
|
||||||
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is
|
|
||||||
auto. In auto mode, if the model weights contain any quantized parameters,
|
|
||||||
the at_runtime mode will be used; otherwise, immediately will be used. The
|
|
||||||
immediately mode may have precision and compatibility issues with quantized
|
|
||||||
parameters, but it usually offers faster inference speed and, in some cases,
|
|
||||||
lower memory usage. The at_runtime mode, on the other hand, is exactly the
|
|
||||||
opposite.
|
|
||||||
|
|
||||||
Default Generation Options:
|
|
||||||
-p, --prompt <string> the prompt to render
|
|
||||||
-n, --negative-prompt <string> the negative prompt (default: "")
|
|
||||||
-i, --init-img <string> path to the init image
|
|
||||||
--end-img <string> path to the end image, required by flf2v
|
|
||||||
--mask <string> path to the mask image
|
|
||||||
--control-image <string> path to control image, control net
|
|
||||||
--control-video <string> path to control video frames. It must be a directory path. The video frames
|
|
||||||
inside should be stored as images in lexicographical (character) order. For
|
|
||||||
example, if the control video path is `frames`, the directory contains images
|
|
||||||
such as 00.png, 01.png, ... etc.
|
|
||||||
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir
|
|
||||||
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
|
|
||||||
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
|
|
||||||
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
|
|
||||||
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
|
|
||||||
--extra-sample-args <string> extra sampler/scheduler/guidance args, key=value list. APG supports apg_eta,
|
|
||||||
apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports
|
|
||||||
slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end;
|
|
||||||
ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma
|
|
||||||
--extra-tiling-args <string> extra VAE tiling args, key=value list. LTX video VAE supports
|
|
||||||
temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)
|
|
||||||
-H, --height <int> image height, in pixel space (default: 512)
|
|
||||||
-W, --width <int> image width, in pixel space (default: 512)
|
|
||||||
--steps <int> number of sample steps (default: 20)
|
|
||||||
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
|
|
||||||
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer
|
|
||||||
(default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
|
|
||||||
-b, --batch-count <int> batch count
|
|
||||||
--video-frames <int> video frames (default: 1)
|
|
||||||
--fps <int> fps (default: 24)
|
|
||||||
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for
|
|
||||||
NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
|
|
||||||
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
|
|
||||||
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
|
|
||||||
--hires-width <int> highres fix target width, 0 to use --hires-scale (default: 0)
|
|
||||||
--hires-height <int> highres fix target height, 0 to use --hires-scale (default: 0)
|
|
||||||
--hires-steps <int> highres fix second pass sample steps, 0 to reuse --steps (default: 0)
|
|
||||||
--hires-upscale-tile-size <int> highres fix upscaler tile size, reserved for model-backed upscalers (default:
|
|
||||||
128)
|
|
||||||
--cfg-scale <float> unconditional guidance scale: (default: 7.0)
|
|
||||||
--img-cfg-scale <float> image guidance scale for inpaint or image edit models: (default: same as
|
|
||||||
--cfg-scale)
|
|
||||||
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
|
|
||||||
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means
|
|
||||||
disabled, a value of 2.5 is nice for sd3.5 medium
|
|
||||||
--skip-layer-start <float> SLG enabling point (default: 0.01)
|
|
||||||
--skip-layer-end <float> SLG disabling point (default: 0.2)
|
|
||||||
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and
|
|
||||||
res_2s; 1 for euler_a, er_sde and dpm++2s_a)
|
|
||||||
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
|
|
||||||
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
|
|
||||||
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or image edit models (default:
|
|
||||||
same as --cfg-scale)
|
|
||||||
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input
|
|
||||||
(default: 3.5)
|
|
||||||
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default:
|
|
||||||
0)
|
|
||||||
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
|
|
||||||
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
|
|
||||||
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd,
|
|
||||||
res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
|
|
||||||
--strength <float> strength for noising/unnoising (default: 0.75)
|
|
||||||
--pm-style-strength <float>
|
|
||||||
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full
|
|
||||||
destruction of information in init image
|
|
||||||
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if
|
|
||||||
`--high-noise-steps` is set to -1
|
|
||||||
--vace-strength <float> wan vace strength
|
|
||||||
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
|
|
||||||
--hires-scale <float> highres fix scale when target size is not set (default: 2.0)
|
|
||||||
--hires-denoising-strength <float> highres fix second pass denoising strength (default: 0.7)
|
|
||||||
--increase-ref-index automatically increase the indices of reference images based on the order
|
|
||||||
they are listed (starting with 1).
|
|
||||||
--disable-auto-resize-ref-image disable auto resize of ref images
|
|
||||||
--disable-image-metadata do not embed generation metadata on image files
|
|
||||||
--vae-tiling process vae in tiles to reduce memory usage
|
|
||||||
--temporal-tiling enable temporal tiling for LTX video VAE decode
|
|
||||||
--hires enable highres fix
|
|
||||||
-s, --seed RNG seed (default: 42, use random seed for < 0)
|
|
||||||
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
|
|
||||||
dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s,
|
|
||||||
er_sde, euler_cfg_pp, euler_a_cfg_pp] (default: euler for Flux/SD3/Wan, euler_a otherwise)
|
|
||||||
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a,
|
|
||||||
dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
|
|
||||||
res_2s, er_sde, euler_cfg_pp, euler_a_cfg_pp] default: euler for Flux/SD3/Wan, euler_a otherwise
|
|
||||||
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
|
|
||||||
smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent, ltx2], default:
|
|
||||||
model-specific
|
|
||||||
--sigmas custom sigma values for the sampler, comma-separated (e.g.,
|
|
||||||
"14.61,7.8,3.5,0.0").
|
|
||||||
--hires-sigmas custom sigma values for the highres fix second pass, comma-separated (e.g.,
|
|
||||||
"0.85,0.725,0.421875,0.0").
|
|
||||||
--skip-layers layers to skip for SLG steps (default: [7,8,9])
|
|
||||||
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
|
|
||||||
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
|
|
||||||
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET),
|
|
||||||
'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT
|
|
||||||
Chebyshev+Taylor forecasting)
|
|
||||||
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
|
|
||||||
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit:
|
|
||||||
Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=.
|
|
||||||
Examples: "threshold=0.25" or "threshold=1.5,reset=0"
|
|
||||||
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g.,
|
|
||||||
"1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
|
|
||||||
--scm-policy SCM policy: 'dynamic' (default) or 'static'
|
|
||||||
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
|
|
||||||
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size
|
|
||||||
if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
|
|
||||||
```
|
```
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
Subproject commit 797ccf80825cc035508ba9b599b2a21953e7f835
|
Subproject commit c4bce3d6b3f236614cca21014f076083b7270ba8
|
||||||
@ -85,7 +85,7 @@ int main(int argc, const char** argv) {
|
|||||||
LOG_DEBUG("%s", ctx_params.to_string().c_str());
|
LOG_DEBUG("%s", ctx_params.to_string().c_str());
|
||||||
LOG_DEBUG("%s", default_gen_params.to_string().c_str());
|
LOG_DEBUG("%s", default_gen_params.to_string().c_str());
|
||||||
|
|
||||||
sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(false, false, false);
|
sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(false);
|
||||||
SDCtxPtr sd_ctx(new_sd_ctx(&sd_ctx_params));
|
SDCtxPtr sd_ctx(new_sd_ctx(&sd_ctx_params));
|
||||||
|
|
||||||
if (sd_ctx == nullptr) {
|
if (sd_ctx == nullptr) {
|
||||||
|
|||||||
@ -190,8 +190,8 @@ ArgOptions SDSvrParams::get_options() {
|
|||||||
ArgOptions options;
|
ArgOptions options;
|
||||||
|
|
||||||
options.string_options = {
|
options.string_options = {
|
||||||
{"-l", "--listen-ip", "server listen ip (default: 127.0.0.1)", &listen_ip},
|
{"-l", "--listen-ip", "server listen ip (default: 127.0.0.1)", 0, &listen_ip},
|
||||||
{"", "--serve-html-path", "path to HTML file to serve at root (optional)", &serve_html_path},
|
{"", "--serve-html-path", "path to HTML file to serve at root (optional)", 0, &serve_html_path},
|
||||||
};
|
};
|
||||||
|
|
||||||
options.int_options = {
|
options.int_options = {
|
||||||
|
|||||||
2
ggml
2
ggml
@ -1 +1 @@
|
|||||||
Subproject commit 0ce7ad348a3151e1da9f65d962044546bcaad421
|
Subproject commit 3af5f5760e19a96427f5f7a93b79cbdf3d4b265b
|
||||||
@ -195,20 +195,15 @@ typedef struct {
|
|||||||
const sd_embedding_t* embeddings;
|
const sd_embedding_t* embeddings;
|
||||||
uint32_t embedding_count;
|
uint32_t embedding_count;
|
||||||
const char* photo_maker_path;
|
const char* photo_maker_path;
|
||||||
|
const char* pulid_weights_path;
|
||||||
const char* tensor_type_rules;
|
const char* tensor_type_rules;
|
||||||
bool vae_decode_only;
|
|
||||||
bool free_params_immediately;
|
|
||||||
int n_threads;
|
int n_threads;
|
||||||
enum sd_type_t wtype;
|
enum sd_type_t wtype;
|
||||||
enum rng_type_t rng_type;
|
enum rng_type_t rng_type;
|
||||||
enum rng_type_t sampler_rng_type;
|
enum rng_type_t sampler_rng_type;
|
||||||
enum prediction_t prediction;
|
enum prediction_t prediction;
|
||||||
enum lora_apply_mode_t lora_apply_mode;
|
enum lora_apply_mode_t lora_apply_mode;
|
||||||
bool offload_params_to_cpu;
|
|
||||||
bool enable_mmap;
|
bool enable_mmap;
|
||||||
bool keep_clip_on_cpu;
|
|
||||||
bool keep_control_net_on_cpu;
|
|
||||||
bool keep_vae_on_cpu;
|
|
||||||
bool flash_attn;
|
bool flash_attn;
|
||||||
bool diffusion_flash_attn;
|
bool diffusion_flash_attn;
|
||||||
bool tae_preview_only;
|
bool tae_preview_only;
|
||||||
@ -222,10 +217,12 @@ typedef struct {
|
|||||||
int chroma_t5_mask_pad;
|
int chroma_t5_mask_pad;
|
||||||
bool qwen_image_zero_cond_t;
|
bool qwen_image_zero_cond_t;
|
||||||
enum sd_vae_format_t vae_format;
|
enum sd_vae_format_t vae_format;
|
||||||
float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
|
const char* max_vram; // GiB budget or backend assignment spec for graph-cut segmented param offload (0 = disabled, -1 = auto)
|
||||||
bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
|
bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
|
||||||
|
bool eager_load; // Load all params into the params backend at model-load time instead of lazily on first use
|
||||||
const char* backend;
|
const char* backend;
|
||||||
const char* params_backend;
|
const char* params_backend;
|
||||||
|
const char* rpc_servers;
|
||||||
} sd_ctx_params_t;
|
} sd_ctx_params_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
@ -277,6 +274,11 @@ typedef struct {
|
|||||||
float style_strength;
|
float style_strength;
|
||||||
} sd_pm_params_t; // photo maker
|
} sd_pm_params_t; // photo maker
|
||||||
|
|
||||||
|
// PuLID identity-conditioning parameters: a path to an identity embedding
// plus a scalar weight.
// NOTE(review): presumably consumed per generation request alongside
// sd_pm_params_t — confirm against the generation code.
typedef struct {
|
||||||
|
// Path to the identity embedding file.
// NOTE(review): presumably the gguf file written by
// script/pulid_extract_id.py (single tensor "pulid_id") — confirm.
const char* id_embedding_path;
|
||||||
|
// Weight applied to the identity embedding.
// NOTE(review): exact scaling semantics are not visible here — confirm.
float id_weight;
|
||||||
|
} sd_pulid_params_t;
|
||||||
|
|
||||||
enum sd_cache_mode_t {
|
enum sd_cache_mode_t {
|
||||||
SD_CACHE_DISABLED = 0,
|
SD_CACHE_DISABLED = 0,
|
||||||
SD_CACHE_EASYCACHE,
|
SD_CACHE_EASYCACHE,
|
||||||
@ -369,6 +371,7 @@ typedef struct {
|
|||||||
sd_image_t control_image;
|
sd_image_t control_image;
|
||||||
float control_strength;
|
float control_strength;
|
||||||
sd_pm_params_t pm_params;
|
sd_pm_params_t pm_params;
|
||||||
|
sd_pulid_params_t pulid_params;
|
||||||
sd_tiling_params_t vae_tiling_params;
|
sd_tiling_params_t vae_tiling_params;
|
||||||
sd_cache_params_t cache;
|
sd_cache_params_t cache;
|
||||||
sd_hires_params_t hires;
|
sd_hires_params_t hires;
|
||||||
@ -450,6 +453,17 @@ SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
|
|||||||
SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
|
SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
|
||||||
SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
|
SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
|
||||||
|
|
||||||
|
// Cancellation requests accepted by sd_cancel_generation().
enum sd_cancel_mode_t {
    // Abort the generation that is currently running as soon as possible.
    SD_CANCEL_ALL = 0,
    // Let the image sample in progress finish, skip any further batch
    // latents, and return the images completed so far.
    SD_CANCEL_NEW_LATENTS = 1,
    // Withdraw a cancellation request that is still pending.
    SD_CANCEL_RESET = 2
};
|
||||||
|
|
||||||
|
SD_API void sd_cancel_generation(sd_ctx_t* sd_ctx, enum sd_cancel_mode_t mode);
|
||||||
|
|
||||||
SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
|
SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
|
||||||
SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
||||||
const sd_vid_gen_params_t* sd_vid_gen_params,
|
const sd_vid_gen_params_t* sd_vid_gen_params,
|
||||||
@ -460,7 +474,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
|||||||
typedef struct upscaler_ctx_t upscaler_ctx_t;
|
typedef struct upscaler_ctx_t upscaler_ctx_t;
|
||||||
|
|
||||||
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
|
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
|
||||||
bool offload_params_to_cpu,
|
|
||||||
bool direct,
|
bool direct,
|
||||||
int n_threads,
|
int n_threads,
|
||||||
int tile_size,
|
int tile_size,
|
||||||
|
|||||||
134
script/pulid_extract_id.py
Normal file
134
script/pulid_extract_id.py
Normal file
@ -0,0 +1,134 @@
|
|||||||
|
"""
|
||||||
|
Precompute a PuLID-Flux identity embedding from a single source portrait.
|
||||||
|
|
||||||
|
Writes a gguf file (a single tensor `pulid_id`) that stable-diffusion.cpp's
|
||||||
|
`--pulid-id-embedding` flag consumes.
|
||||||
|
|
||||||
|
Dependencies (recommended: vendor rather than pip-install due to upstream
|
||||||
|
packaging quirks):
|
||||||
|
- torch + safetensors
|
||||||
|
- The ToTheBeginning/PuLID repository's `pulid/` package and `eva_clip/`.
|
||||||
|
Put them on PYTHONPATH or sys.path before running this script.
|
||||||
|
- insightface, facexlib, torchvision, opencv-python, huggingface_hub, gguf
|
||||||
|
- numpy, Pillow
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python script/pulid_extract_id.py \\
|
||||||
|
--portrait /path/to/source-photo.jpg \\
|
||||||
|
--pulid-weights /path/to/pulid_flux_v0.9.1.safetensors \\
|
||||||
|
--out /path/to/source.pulidembd
|
||||||
|
|
||||||
|
The portrait must contain a clearly visible face. insightface's antelopev2
|
||||||
|
detector will be auto-downloaded on first run.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
|
||||||
|
def extract(portrait_path: str, pulid_weights: str) -> "torch.Tensor":
    """Compute the PuLID identity embedding for one portrait image.

    Args:
        portrait_path: Image file to read; it is converted to RGB before use.
        pulid_weights: Path handed to ``PuLIDPipeline.load_pretrain``.

    Returns:
        The embedding returned by ``PuLIDPipeline.get_id_embedding``. A
        3-D result whose leading dimension is 1 has that dimension removed.
    """
    import numpy as np
    import torch
    from PIL import Image
    from pulid.pipeline_flux import PuLIDPipeline

    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    onnx_provider = "gpu" if use_cuda else "cpu"
    print(f"device={device}", flush=True)

    # The pipeline only hangs pulid_ca attributes on `dit` while it is being
    # constructed, and get_id_embedding() never runs Flux. An empty namespace
    # therefore stands in for the model, so no Flux skeleton has to be built.
    print("instantiating PuLIDPipeline with a dummy Flux object", flush=True)
    pipeline = PuLIDPipeline(
        dit=SimpleNamespace(),
        device=device,
        weight_dtype=torch.bfloat16,
        onnx_provider=onnx_provider,
    )

    print(f"loading PuLID weights from {pulid_weights}", flush=True)
    pipeline.load_pretrain(pretrain_path=pulid_weights, version="v0.9.1")

    print(f"extracting ID embedding from {portrait_path}", flush=True)
    portrait_rgb = np.array(Image.open(portrait_path).convert("RGB"))
    embedding, _ = pipeline.get_id_embedding(portrait_rgb)
    print(
        f"id embedding shape={tuple(embedding.shape)} dtype={embedding.dtype}",
        flush=True,
    )

    # Drop a leading batch dimension of size one, if present.
    has_unit_batch = embedding.ndim == 3 and embedding.shape[0] == 1
    return embedding[0] if has_unit_batch else embedding
|
||||||
|
|
||||||
|
|
||||||
|
def write_embd(tensor, out_path: str, dtype_choice: str) -> None:
    """Write an identity embedding to a gguf file.

    The file holds one metadata key (``pulid.version`` = 1) and one tensor
    named ``pulid_id``.

    Args:
        tensor: 2-D torch tensor laid out as (num_tokens, token_dim).
        out_path: Destination file; missing parent directories are created.
        dtype_choice: Storage dtype, one of "fp16", "fp32" or "bf16".

    Raises:
        ValueError: If ``tensor`` is not 2-D or ``dtype_choice`` is unknown.
            Both are checked before anything is imported or touched on disk,
            so a rejected call leaves no directory or partial file behind.
    """
    if tensor.ndim != 2:
        raise ValueError(f"expected (num_tokens, token_dim); got {tuple(tensor.shape)}")
    if dtype_choice not in ("fp16", "fp32", "bf16"):
        raise ValueError(f"unknown --dtype {dtype_choice}")
    num_tokens, token_dim = tensor.shape

    import gguf
    import torch

    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)

    writer = gguf.GGUFWriter(out_path, arch="pulid")
    try:
        writer.add_uint32("pulid.version", 1)

        if dtype_choice == "bf16":
            # The bf16 bit pattern is handed over as uint16; the real element
            # type and logical shape are passed through raw_dtype/raw_shape.
            raw = tensor.to(torch.bfloat16).contiguous().view(torch.uint16).cpu().numpy()
            writer.add_tensor("pulid_id", raw,
                              raw_shape=(int(num_tokens), int(token_dim)),
                              raw_dtype=gguf.GGMLQuantizationType.BF16)
        else:
            torch_dtype = torch.float16 if dtype_choice == "fp16" else torch.float32
            arr = tensor.to(torch_dtype).contiguous().cpu().numpy()
            writer.add_tensor("pulid_id", arr)

        writer.write_header_to_file()
        writer.write_kv_data_to_file()
        writer.write_tensors_to_file()
    finally:
        # Release the writer even when conversion or a write step fails.
        writer.close()

    print(f"wrote {out_path}: gguf, tensor pulid_id [{token_dim}, {num_tokens}] {dtype_choice}",
          flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Command-line entry point: extract an embedding and write it out.

    Returns:
        0 on success, 2 when ``--portrait`` is not an existing file, and
        3 when ``--pulid-weights`` is not an existing file.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--portrait", required=True,
                    help="Path to the source portrait image (JPG/PNG).")
    ap.add_argument("--pulid-weights", required=True,
                    help="Path to pulid_flux_v0.9.x.safetensors.")
    ap.add_argument("--out", required=True,
                    help="Output path for the .pulidembd binary.")
    ap.add_argument("--dtype", default="fp16",
                    choices=["fp16", "bf16", "fp32"],
                    help="Storage dtype (default fp16; produces ~131 KB).")
    args = ap.parse_args()

    # isfile rather than exists: a directory passed by mistake is reported
    # here with the documented exit code instead of failing later on.
    if not os.path.isfile(args.portrait):
        print(f"ERROR: portrait not found at {args.portrait}", file=sys.stderr)
        return 2
    if not os.path.isfile(args.pulid_weights):
        print(f"ERROR: PuLID weights not found at {args.pulid_weights}", file=sys.stderr)
        return 3

    embedding = extract(args.portrait, args.pulid_weights)
    write_embd(embedding, args.out, args.dtype)
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Run the CLI when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||||
@ -1,4 +1,4 @@
|
|||||||
#ifndef __SD_CONDITIONING_CONDITIONER_HPP__
|
#ifndef __SD_CONDITIONING_CONDITIONER_HPP__
|
||||||
#define __SD_CONDITIONING_CONDITIONER_HPP__
|
#define __SD_CONDITIONING_CONDITIONER_HPP__
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
@ -118,7 +118,6 @@ public:
|
|||||||
virtual void set_stream_layers_enabled(bool enabled) {}
|
virtual void set_stream_layers_enabled(bool enabled) {}
|
||||||
virtual void set_flash_attention_enabled(bool enabled) = 0;
|
virtual void set_flash_attention_enabled(bool enabled) = 0;
|
||||||
virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
|
virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
|
||||||
virtual void set_weight_manager(const std::shared_ptr<RunnerWeightManager>& manager) {}
|
|
||||||
virtual void runner_done() {}
|
virtual void runner_done() {}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -137,25 +136,24 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
|||||||
std::map<std::string, std::pair<int, int>> embedding_pos_map;
|
std::map<std::string, std::pair<int, int>> embedding_pos_map;
|
||||||
|
|
||||||
FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
|
FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map,
|
const String2TensorStorage& tensor_storage_map,
|
||||||
const std::map<std::string, std::string>& orig_embedding_map,
|
const std::map<std::string, std::string>& orig_embedding_map,
|
||||||
SDVersion version = VERSION_SD1)
|
SDVersion version = VERSION_SD1,
|
||||||
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
: version(version), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) {
|
: version(version), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) {
|
||||||
for (const auto& kv : orig_embedding_map) {
|
for (const auto& kv : orig_embedding_map) {
|
||||||
std::string name = kv.first;
|
std::string name = normalize_embedding_name(kv.first);
|
||||||
std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); });
|
|
||||||
embedding_map[name] = kv.second;
|
embedding_map[name] = kv.second;
|
||||||
tokenizer.add_special_token(name);
|
tokenizer.add_special_token(name);
|
||||||
}
|
}
|
||||||
bool force_clip_f32 = !embedding_map.empty();
|
bool force_clip_f32 = !embedding_map.empty();
|
||||||
if (sd_version_is_sd1(version)) {
|
if (sd_version_is_sd1(version)) {
|
||||||
text_model = std::make_shared<CLIPTextModelRunner>(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
|
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32, weight_manager);
|
||||||
} else if (sd_version_is_sd2(version)) {
|
} else if (sd_version_is_sd2(version)) {
|
||||||
text_model = std::make_shared<CLIPTextModelRunner>(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
|
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32, weight_manager);
|
||||||
} else if (sd_version_is_sdxl(version)) {
|
} else if (sd_version_is_sdxl(version)) {
|
||||||
text_model = std::make_shared<CLIPTextModelRunner>(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
|
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32, weight_manager);
|
||||||
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, params_backend, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
|
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32, weight_manager);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -194,13 +192,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Forwards the given weight manager to the text model runner(s) held by this
// conditioner. text_model always receives it; text_model2 receives it only
// when `version` is an SDXL version.
void set_weight_manager(const std::shared_ptr<RunnerWeightManager>& manager) override {
|
|
||||||
text_model->set_weight_manager(manager);
|
|
||||||
// text_model2 is only forwarded to for SDXL versions.
if (sd_version_is_sdxl(version)) {
|
|
||||||
text_model2->set_weight_manager(manager);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void runner_done() override {
|
void runner_done() override {
|
||||||
text_model->runner_done();
|
text_model->runner_done();
|
||||||
if (sd_version_is_sdxl(version)) {
|
if (sd_version_is_sdxl(version)) {
|
||||||
@ -286,17 +277,23 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<int> convert_token_to_id(std::string text) {
|
static std::string normalize_embedding_name(std::string name) {
|
||||||
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
|
std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); });
|
||||||
auto iter = embedding_map.find(str);
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool append_embedding_tokens(std::string str, std::vector<int32_t>& bpe_tokens) {
|
||||||
|
std::string name = normalize_embedding_name(std::move(str));
|
||||||
|
auto iter = embedding_map.find(name);
|
||||||
if (iter == embedding_map.end()) {
|
if (iter == embedding_map.end()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
std::string embedding_path = iter->second;
|
return load_embedding(name, iter->second, bpe_tokens);
|
||||||
if (load_embedding(str, embedding_path, bpe_tokens)) {
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
return false;
|
|
||||||
|
std::vector<int> convert_token_to_id(std::string text) {
|
||||||
|
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
|
||||||
|
return append_embedding_tokens(str, bpe_tokens);
|
||||||
};
|
};
|
||||||
std::vector<int> curr_tokens = tokenizer.encode(text, on_new_token_cb);
|
std::vector<int> curr_tokens = tokenizer.encode(text, on_new_token_cb);
|
||||||
return curr_tokens;
|
return curr_tokens;
|
||||||
@ -323,15 +320,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
|
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
|
||||||
auto iter = embedding_map.find(str);
|
return append_embedding_tokens(str, bpe_tokens);
|
||||||
if (iter == embedding_map.end()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
std::string embedding_path = iter->second;
|
|
||||||
if (load_embedding(str, embedding_path, bpe_tokens)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector<int> tokens;
|
std::vector<int> tokens;
|
||||||
@ -522,9 +511,9 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
|
|||||||
std::string weight_prefix = "cond_stage_model.transformer";
|
std::string weight_prefix = "cond_stage_model.transformer";
|
||||||
|
|
||||||
FrozenCLIPVisionEmbedder(ggml_backend_t backend,
|
FrozenCLIPVisionEmbedder(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const String2TensorStorage& tensor_storage_map = {})
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
: GGMLRunner(backend, params_backend) {
|
: GGMLRunner(backend, weight_manager) {
|
||||||
bool proj_in = false;
|
bool proj_in = false;
|
||||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||||
if (!starts_with(name, weight_prefix)) {
|
if (!starts_with(name, weight_prefix)) {
|
||||||
@ -580,8 +569,8 @@ struct SD3CLIPEmbedder : public Conditioner {
|
|||||||
std::shared_ptr<T5Runner> t5;
|
std::shared_ptr<T5Runner> t5;
|
||||||
|
|
||||||
SD3CLIPEmbedder(ggml_backend_t backend,
|
SD3CLIPEmbedder(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const String2TensorStorage& tensor_storage_map = {})
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
: clip_g_tokenizer(0) {
|
: clip_g_tokenizer(0) {
|
||||||
bool use_clip_l = false;
|
bool use_clip_l = false;
|
||||||
bool use_clip_g = false;
|
bool use_clip_g = false;
|
||||||
@ -600,13 +589,13 @@ struct SD3CLIPEmbedder : public Conditioner {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (use_clip_l) {
|
if (use_clip_l) {
|
||||||
clip_l = std::make_shared<CLIPTextModelRunner>(backend, params_backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
|
clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, false, weight_manager);
|
||||||
}
|
}
|
||||||
if (use_clip_g) {
|
if (use_clip_g) {
|
||||||
clip_g = std::make_shared<CLIPTextModelRunner>(backend, params_backend, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
|
clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, false, weight_manager);
|
||||||
}
|
}
|
||||||
if (use_t5) {
|
if (use_t5) {
|
||||||
t5 = std::make_shared<T5Runner>(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer");
|
t5 = std::make_shared<T5Runner>(backend, tensor_storage_map, "text_encoders.t5xxl.transformer", false, weight_manager);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -670,18 +659,6 @@ struct SD3CLIPEmbedder : public Conditioner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_weight_manager(const std::shared_ptr<RunnerWeightManager>& manager) override {
|
|
||||||
if (clip_l) {
|
|
||||||
clip_l->set_weight_manager(manager);
|
|
||||||
}
|
|
||||||
if (clip_g) {
|
|
||||||
clip_g->set_weight_manager(manager);
|
|
||||||
}
|
|
||||||
if (t5) {
|
|
||||||
t5->set_weight_manager(manager);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void runner_done() override {
|
void runner_done() override {
|
||||||
if (clip_l) {
|
if (clip_l) {
|
||||||
clip_l->runner_done();
|
clip_l->runner_done();
|
||||||
@ -961,8 +938,8 @@ struct FluxCLIPEmbedder : public Conditioner {
|
|||||||
size_t chunk_len = 256;
|
size_t chunk_len = 256;
|
||||||
|
|
||||||
FluxCLIPEmbedder(ggml_backend_t backend,
|
FluxCLIPEmbedder(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const String2TensorStorage& tensor_storage_map = {}) {
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr) {
|
||||||
bool use_clip_l = false;
|
bool use_clip_l = false;
|
||||||
bool use_t5 = false;
|
bool use_t5 = false;
|
||||||
for (auto pair : tensor_storage_map) {
|
for (auto pair : tensor_storage_map) {
|
||||||
@ -979,12 +956,12 @@ struct FluxCLIPEmbedder : public Conditioner {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (use_clip_l) {
|
if (use_clip_l) {
|
||||||
clip_l = std::make_shared<CLIPTextModelRunner>(backend, params_backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
|
clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, false, weight_manager);
|
||||||
} else {
|
} else {
|
||||||
LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded.");
|
LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded.");
|
||||||
}
|
}
|
||||||
if (use_t5) {
|
if (use_t5) {
|
||||||
t5 = std::make_shared<T5Runner>(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer");
|
t5 = std::make_shared<T5Runner>(backend, tensor_storage_map, "text_encoders.t5xxl.transformer", false, weight_manager);
|
||||||
} else {
|
} else {
|
||||||
LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded.");
|
LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded.");
|
||||||
}
|
}
|
||||||
@ -1035,15 +1012,6 @@ struct FluxCLIPEmbedder : public Conditioner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_weight_manager(const std::shared_ptr<RunnerWeightManager>& manager) override {
|
|
||||||
if (clip_l) {
|
|
||||||
clip_l->set_weight_manager(manager);
|
|
||||||
}
|
|
||||||
if (t5) {
|
|
||||||
t5->set_weight_manager(manager);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void runner_done() override {
|
void runner_done() override {
|
||||||
if (clip_l) {
|
if (clip_l) {
|
||||||
clip_l->runner_done();
|
clip_l->runner_done();
|
||||||
@ -1219,11 +1187,11 @@ struct T5CLIPEmbedder : public Conditioner {
|
|||||||
bool is_umt5 = false;
|
bool is_umt5 = false;
|
||||||
|
|
||||||
T5CLIPEmbedder(ggml_backend_t backend,
|
T5CLIPEmbedder(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
bool use_mask = false,
|
bool use_mask = false,
|
||||||
int mask_pad = 0,
|
int mask_pad = 0,
|
||||||
bool is_umt5 = false)
|
bool is_umt5 = false,
|
||||||
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
: use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) {
|
: use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) {
|
||||||
bool use_t5 = false;
|
bool use_t5 = false;
|
||||||
for (auto pair : tensor_storage_map) {
|
for (auto pair : tensor_storage_map) {
|
||||||
@ -1236,7 +1204,7 @@ struct T5CLIPEmbedder : public Conditioner {
|
|||||||
LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
|
LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
|
||||||
return;
|
return;
|
||||||
} else {
|
} else {
|
||||||
t5 = std::make_shared<T5Runner>(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5);
|
t5 = std::make_shared<T5Runner>(backend, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5, weight_manager);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1270,12 +1238,6 @@ struct T5CLIPEmbedder : public Conditioner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_weight_manager(const std::shared_ptr<RunnerWeightManager>& manager) override {
|
|
||||||
if (t5) {
|
|
||||||
t5->set_weight_manager(manager);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void runner_done() override {
|
void runner_done() override {
|
||||||
if (t5) {
|
if (t5) {
|
||||||
t5->runner_done();
|
t5->runner_done();
|
||||||
@ -1422,15 +1384,15 @@ struct AnimaConditioner : public Conditioner {
|
|||||||
std::shared_ptr<LLM::LLMRunner> llm;
|
std::shared_ptr<LLM::LLMRunner> llm;
|
||||||
|
|
||||||
AnimaConditioner(ggml_backend_t backend,
|
AnimaConditioner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const String2TensorStorage& tensor_storage_map = {}) {
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr) {
|
||||||
qwen_tokenizer = std::make_shared<Qwen2Tokenizer>();
|
qwen_tokenizer = std::make_shared<Qwen2Tokenizer>();
|
||||||
llm = std::make_shared<LLM::LLMRunner>(LLM::LLMArch::QWEN3,
|
llm = std::make_shared<LLM::LLMRunner>(LLM::LLMArch::QWEN3,
|
||||||
backend,
|
backend,
|
||||||
params_backend,
|
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
"text_encoders.llm",
|
"text_encoders.llm",
|
||||||
false);
|
false,
|
||||||
|
weight_manager);
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||||
@ -1453,10 +1415,6 @@ struct AnimaConditioner : public Conditioner {
|
|||||||
llm->set_weight_adapter(adapter);
|
llm->set_weight_adapter(adapter);
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_weight_manager(const std::shared_ptr<RunnerWeightManager>& manager) override {
|
|
||||||
llm->set_weight_manager(manager);
|
|
||||||
}
|
|
||||||
|
|
||||||
void runner_done() override {
|
void runner_done() override {
|
||||||
llm->runner_done();
|
llm->runner_done();
|
||||||
}
|
}
|
||||||
@ -1545,11 +1503,11 @@ struct LLMEmbedder : public Conditioner {
|
|||||||
std::shared_ptr<LLM::LLMRunner> llm;
|
std::shared_ptr<LLM::LLMRunner> llm;
|
||||||
|
|
||||||
LLMEmbedder(ggml_backend_t backend,
|
LLMEmbedder(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
SDVersion version = VERSION_QWEN_IMAGE,
|
SDVersion version = VERSION_QWEN_IMAGE,
|
||||||
const std::string prefix = "",
|
const std::string prefix = "",
|
||||||
bool enable_vision = false)
|
bool enable_vision = false,
|
||||||
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
: version(version) {
|
: version(version) {
|
||||||
LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL;
|
LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL;
|
||||||
if (version == VERSION_FLUX2) {
|
if (version == VERSION_FLUX2) {
|
||||||
@ -1560,7 +1518,7 @@ struct LLMEmbedder : public Conditioner {
|
|||||||
arch = LLM::LLMArch::GPT_OSS_20B;
|
arch = LLM::LLMArch::GPT_OSS_20B;
|
||||||
} else if (sd_version_is_pid(version)) {
|
} else if (sd_version_is_pid(version)) {
|
||||||
arch = LLM::LLMArch::GEMMA2_2B;
|
arch = LLM::LLMArch::GEMMA2_2B;
|
||||||
} else if (sd_version_is_ideogram4(version)) {
|
} else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version)) {
|
||||||
arch = LLM::LLMArch::QWEN3_VL;
|
arch = LLM::LLMArch::QWEN3_VL;
|
||||||
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
|
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
|
||||||
arch = LLM::LLMArch::QWEN3;
|
arch = LLM::LLMArch::QWEN3;
|
||||||
@ -1576,10 +1534,10 @@ struct LLMEmbedder : public Conditioner {
|
|||||||
}
|
}
|
||||||
llm = std::make_shared<LLM::LLMRunner>(arch,
|
llm = std::make_shared<LLM::LLMRunner>(arch,
|
||||||
backend,
|
backend,
|
||||||
params_backend,
|
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
"text_encoders.llm",
|
"text_encoders.llm",
|
||||||
enable_vision);
|
enable_vision,
|
||||||
|
weight_manager);
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||||
@ -1604,12 +1562,6 @@ struct LLMEmbedder : public Conditioner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_weight_manager(const std::shared_ptr<RunnerWeightManager>& manager) override {
|
|
||||||
if (llm) {
|
|
||||||
llm->set_weight_manager(manager);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void runner_done() override {
|
void runner_done() override {
|
||||||
if (llm) {
|
if (llm) {
|
||||||
llm->runner_done();
|
llm->runner_done();
|
||||||
@ -1826,6 +1778,65 @@ struct LLMEmbedder : public Conditioner {
|
|||||||
|
|
||||||
prompt += "<|im_end|>\n<|im_start|>assistant\n";
|
prompt += "<|im_end|>\n<|im_start|>assistant\n";
|
||||||
}
|
}
|
||||||
|
} else if (sd_version_is_boogu_image(version)) {
|
||||||
|
prompt_template_encode_start_idx = 0;
|
||||||
|
|
||||||
|
const std::string t2i_system_prompt =
|
||||||
|
"You are a helpful assistant that generates high-quality images based on user instructions. The instructions are as follows.";
|
||||||
|
const std::string edit_system_prompt =
|
||||||
|
"Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.";
|
||||||
|
const bool has_ref_images = llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty();
|
||||||
|
const bool text_empty = conditioner_params.text.find_first_not_of(" \t\r\n") == std::string::npos;
|
||||||
|
|
||||||
|
if (has_ref_images) {
|
||||||
|
LOG_INFO("BooguImageEditPipeline");
|
||||||
|
const std::string prompt_prefix = "<|im_start|>system\n" + edit_system_prompt + "<|im_end|>\n<|im_start|>user\n";
|
||||||
|
std::string img_prompt;
|
||||||
|
const std::string placeholder = "<|image_pad|>";
|
||||||
|
|
||||||
|
for (int i = 0; i < conditioner_params.ref_images->size(); i++) {
|
||||||
|
const auto& image = (*conditioner_params.ref_images)[i];
|
||||||
|
double factor = llm->config.vision.patch_size * llm->config.vision.spatial_merge_size;
|
||||||
|
int height = static_cast<int>(image.shape()[1]);
|
||||||
|
int width = static_cast<int>(image.shape()[0]);
|
||||||
|
double beta = std::sqrt((384.0 * 384.0) / (static_cast<double>(height) * static_cast<double>(width)));
|
||||||
|
int h_bar = std::max(static_cast<int>(factor),
|
||||||
|
static_cast<int>(std::round(height * beta / factor)) * static_cast<int>(factor));
|
||||||
|
int w_bar = std::max(static_cast<int>(factor),
|
||||||
|
static_cast<int>(std::round(width * beta / factor)) * static_cast<int>(factor));
|
||||||
|
|
||||||
|
LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar);
|
||||||
|
|
||||||
|
auto resized_image = clip_preprocess(image, w_bar, h_bar);
|
||||||
|
auto image_embed = llm->encode_image(n_threads, resized_image, false, true, true);
|
||||||
|
GGML_ASSERT(!image_embed.empty());
|
||||||
|
|
||||||
|
std::string image_prefix = prompt_prefix + img_prompt + "<|vision_start|>";
|
||||||
|
int image_embed_idx = static_cast<int>(tokenizer->encode(image_prefix, nullptr).size());
|
||||||
|
image_embeds.emplace_back(image_embed_idx, image_embed);
|
||||||
|
|
||||||
|
img_prompt += "<|vision_start|>";
|
||||||
|
int64_t num_image_tokens = image_embed.shape()[1];
|
||||||
|
img_prompt.reserve(img_prompt.size() + static_cast<size_t>(num_image_tokens) * placeholder.size() + 32);
|
||||||
|
for (int j = 0; j < num_image_tokens; j++) {
|
||||||
|
img_prompt += placeholder;
|
||||||
|
}
|
||||||
|
img_prompt += "<|vision_end|>";
|
||||||
|
}
|
||||||
|
|
||||||
|
prompt = prompt_prefix + img_prompt;
|
||||||
|
prompt_attn_range.first = static_cast<int>(prompt.size());
|
||||||
|
prompt += conditioner_params.text;
|
||||||
|
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||||
|
prompt += "<|im_end|>\n";
|
||||||
|
} else {
|
||||||
|
const std::string& system_prompt = text_empty ? edit_system_prompt : t2i_system_prompt;
|
||||||
|
prompt = "<|im_start|>system\n" + system_prompt + "<|im_end|>\n<|im_start|>user\n";
|
||||||
|
prompt_attn_range.first = static_cast<int>(prompt.size());
|
||||||
|
prompt += conditioner_params.text;
|
||||||
|
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||||
|
prompt += "<|im_end|>\n";
|
||||||
|
}
|
||||||
} else if (sd_version_is_longcat(version)) {
|
} else if (sd_version_is_longcat(version)) {
|
||||||
spell_quotes = true;
|
spell_quotes = true;
|
||||||
|
|
||||||
@ -2106,10 +2117,10 @@ struct LTXAVTextProjectionRunner : public GGMLRunner {
|
|||||||
LTXAVTextProjection model;
|
LTXAVTextProjection model;
|
||||||
|
|
||||||
LTXAVTextProjectionRunner(ggml_backend_t backend,
|
LTXAVTextProjectionRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string& prefix = "")
|
const std::string& prefix = "",
|
||||||
: GGMLRunner(backend, params_backend),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: GGMLRunner(backend, weight_manager),
|
||||||
model(tensor_storage_map.find(prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end()) {
|
model(tensor_storage_map.find(prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end()) {
|
||||||
model.init(params_ctx, tensor_storage_map, prefix);
|
model.init(params_ctx, tensor_storage_map, prefix);
|
||||||
}
|
}
|
||||||
@ -2154,22 +2165,22 @@ struct LTXAVEmbedder : public Conditioner {
|
|||||||
bool dual_projection = false;
|
bool dual_projection = false;
|
||||||
|
|
||||||
LTXAVEmbedder(ggml_backend_t backend,
|
LTXAVEmbedder(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string& llm_prefix = "text_encoders.llm",
|
const std::string& llm_prefix = "text_encoders.llm",
|
||||||
const std::string& projector_prefix = "text_embedding_projection") {
|
const std::string& projector_prefix = "text_embedding_projection",
|
||||||
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr) {
|
||||||
tokenizer = std::make_shared<GemmaTokenizer>();
|
tokenizer = std::make_shared<GemmaTokenizer>();
|
||||||
llm = std::make_shared<LLM::LLMRunner>(LLM::LLMArch::GEMMA3_12B,
|
llm = std::make_shared<LLM::LLMRunner>(LLM::LLMArch::GEMMA3_12B,
|
||||||
backend,
|
backend,
|
||||||
params_backend,
|
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
llm_prefix,
|
llm_prefix,
|
||||||
false);
|
false,
|
||||||
|
weight_manager);
|
||||||
dual_projection = tensor_storage_map.find(projector_prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end();
|
dual_projection = tensor_storage_map.find(projector_prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end();
|
||||||
projector = std::make_shared<LTXAVTextProjectionRunner>(backend,
|
projector = std::make_shared<LTXAVTextProjectionRunner>(backend,
|
||||||
params_backend,
|
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
projector_prefix);
|
projector_prefix,
|
||||||
|
weight_manager);
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||||
@ -2192,11 +2203,6 @@ struct LTXAVEmbedder : public Conditioner {
|
|||||||
projector->set_weight_adapter(adapter);
|
projector->set_weight_adapter(adapter);
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_weight_manager(const std::shared_ptr<RunnerWeightManager>& manager) override {
|
|
||||||
llm->set_weight_manager(manager);
|
|
||||||
projector->set_weight_manager(manager);
|
|
||||||
}
|
|
||||||
|
|
||||||
void runner_done() override {
|
void runner_done() override {
|
||||||
llm->runner_done();
|
llm->runner_done();
|
||||||
projector->runner_done();
|
projector->runner_done();
|
||||||
|
|||||||
@ -99,7 +99,7 @@ bool convert(const char* input_path,
|
|||||||
model_loader.convert_tensors_name();
|
model_loader.convert_tensors_name();
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_type type = (ggml_type)output_type;
|
ggml_type type = sd_type_to_ggml_type(output_type);
|
||||||
bool output_is_safetensors = ends_with(output_path, ".safetensors");
|
bool output_is_safetensors = ends_with(output_path, ".safetensors");
|
||||||
TensorTypeRules type_rules = parse_tensor_type_rules(tensor_type_rules);
|
TensorTypeRules type_rules = parse_tensor_type_rules(tensor_type_rules);
|
||||||
|
|
||||||
|
|||||||
@ -1696,11 +1696,9 @@ protected:
|
|||||||
using GraphCutSegment = sd::ggml_graph_cut::Segment;
|
using GraphCutSegment = sd::ggml_graph_cut::Segment;
|
||||||
using GraphCutPlan = sd::ggml_graph_cut::Plan;
|
using GraphCutPlan = sd::ggml_graph_cut::Plan;
|
||||||
|
|
||||||
ggml_backend_t params_backend = nullptr;
|
|
||||||
ggml_backend_t runtime_backend = nullptr;
|
ggml_backend_t runtime_backend = nullptr;
|
||||||
|
|
||||||
ggml_context* params_ctx = nullptr;
|
ggml_context* params_ctx = nullptr;
|
||||||
ggml_backend_buffer_t params_buffer = nullptr;
|
|
||||||
|
|
||||||
ggml_context* cache_ctx = nullptr;
|
ggml_context* cache_ctx = nullptr;
|
||||||
ggml_backend_buffer_t cache_buffer = nullptr;
|
ggml_backend_buffer_t cache_buffer = nullptr;
|
||||||
@ -1880,9 +1878,6 @@ protected:
|
|||||||
auto manager = weight_manager.lock();
|
auto manager = weight_manager.lock();
|
||||||
if (manager == nullptr) {
|
if (manager == nullptr) {
|
||||||
if (!params_to_prepare.empty()) {
|
if (!params_to_prepare.empty()) {
|
||||||
if (params_buffer != nullptr) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
LOG_ERROR("%s weight manager is not set for graph params", get_desc().c_str());
|
LOG_ERROR("%s weight manager is not set for graph params", get_desc().c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -2012,6 +2007,10 @@ protected:
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool copy_cache_tensors_to_cache_buffer(const std::unordered_set<std::string>* cache_keep_names = nullptr) {
|
bool copy_cache_tensors_to_cache_buffer(const std::unordered_set<std::string>* cache_keep_names = nullptr) {
|
||||||
|
if (cache_tensor_map.empty() && cache_keep_names == nullptr) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
ggml_context* old_cache_ctx = cache_ctx;
|
ggml_context* old_cache_ctx = cache_ctx;
|
||||||
ggml_backend_buffer_t old_cache_buffer = cache_buffer;
|
ggml_backend_buffer_t old_cache_buffer = cache_buffer;
|
||||||
cache_ctx = nullptr;
|
cache_ctx = nullptr;
|
||||||
@ -2194,13 +2193,11 @@ protected:
|
|||||||
plan.valid &&
|
plan.valid &&
|
||||||
max_graph_vram_bytes > 0 &&
|
max_graph_vram_bytes > 0 &&
|
||||||
plan.segments.size() > 1 &&
|
plan.segments.size() > 1 &&
|
||||||
params_backend != runtime_backend &&
|
|
||||||
!sd_backend_is_cpu(runtime_backend);
|
!sd_backend_is_cpu(runtime_backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool can_attempt_graph_cut_segmented_compute() const {
|
bool can_attempt_graph_cut_segmented_compute() const {
|
||||||
return max_graph_vram_bytes > 0 &&
|
return max_graph_vram_bytes > 0 &&
|
||||||
params_backend != runtime_backend &&
|
|
||||||
!sd_backend_is_cpu(runtime_backend);
|
!sd_backend_is_cpu(runtime_backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2631,16 +2628,15 @@ public:
|
|||||||
public:
|
public:
|
||||||
virtual std::string get_desc() = 0;
|
virtual std::string get_desc() = 0;
|
||||||
|
|
||||||
GGMLRunner(ggml_backend_t backend, ggml_backend_t params_backend)
|
GGMLRunner(ggml_backend_t backend,
|
||||||
: params_backend(params_backend),
|
std::shared_ptr<RunnerWeightManager> manager = nullptr)
|
||||||
runtime_backend(backend) {
|
: runtime_backend(backend),
|
||||||
|
weight_manager(manager) {
|
||||||
GGML_ASSERT(runtime_backend != nullptr);
|
GGML_ASSERT(runtime_backend != nullptr);
|
||||||
GGML_ASSERT(params_backend != nullptr);
|
|
||||||
alloc_params_ctx();
|
alloc_params_ctx();
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual ~GGMLRunner() {
|
virtual ~GGMLRunner() {
|
||||||
free_params_buffer();
|
|
||||||
free_compute_buffer();
|
free_compute_buffer();
|
||||||
free_params_ctx();
|
free_params_ctx();
|
||||||
free_compute_ctx();
|
free_compute_ctx();
|
||||||
@ -2674,73 +2670,6 @@ public:
|
|||||||
alloc_compute_ctx();
|
alloc_compute_ctx();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool alloc_params_buffer() {
|
|
||||||
size_t num_tensors = ggml_tensor_num(params_ctx);
|
|
||||||
if (num_tensors > 0) {
|
|
||||||
// ggml_backend_alloc_ctx_tensors fails when all tensors are already allocated
|
|
||||||
// (typical for memory-mapped weights). See ggml-alloc.c n_buffers==0 branch.
|
|
||||||
bool all_have_data = true;
|
|
||||||
for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
|
|
||||||
if (t->data == nullptr) {
|
|
||||||
all_have_data = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (all_have_data) {
|
|
||||||
LOG_DEBUG("%s all params already mmap-allocated (no separate buffer needed)", get_desc().c_str());
|
|
||||||
params_buffer = nullptr;
|
|
||||||
rebuild_params_tensor_set();
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str());
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// Pinned host buffer when CPU-offloaded for DMA-direct H2D.
|
|
||||||
ggml_backend_buffer_type_t params_buft = nullptr;
|
|
||||||
if (params_backend != runtime_backend) {
|
|
||||||
ggml_backend_dev_t runtime_dev = ggml_backend_get_device(runtime_backend);
|
|
||||||
if (runtime_dev != nullptr) {
|
|
||||||
params_buft = ggml_backend_dev_host_buffer_type(runtime_dev);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (params_buft == nullptr) {
|
|
||||||
params_buft = ggml_backend_get_default_buffer_type(params_backend);
|
|
||||||
}
|
|
||||||
params_buffer = ggml_backend_alloc_ctx_tensors_from_buft(params_ctx, params_buft);
|
|
||||||
if (params_buffer == nullptr) {
|
|
||||||
LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
|
|
||||||
get_desc().c_str(),
|
|
||||||
num_tensors);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
rebuild_params_tensor_set();
|
|
||||||
ggml_backend_buffer_set_usage(params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
|
||||||
size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
|
|
||||||
LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
|
|
||||||
get_desc().c_str(),
|
|
||||||
params_buffer_size / (1024.f * 1024.f),
|
|
||||||
sd_backend_is_cpu(params_backend) ? "RAM" : "VRAM",
|
|
||||||
num_tensors);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected:
|
|
||||||
void free_params_buffer() {
|
|
||||||
if (params_buffer != nullptr) {
|
|
||||||
ggml_backend_buffer_free(params_buffer);
|
|
||||||
params_buffer = nullptr;
|
|
||||||
}
|
|
||||||
observed_max_effective_budget_ = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t get_params_buffer_size() {
|
|
||||||
if (params_buffer != nullptr) {
|
|
||||||
return ggml_backend_buffer_get_size(params_buffer);
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
void free_cache_ctx_and_buffer() {
|
void free_cache_ctx_and_buffer() {
|
||||||
free_cache_buffer();
|
free_cache_buffer();
|
||||||
@ -2886,15 +2815,6 @@ public:
|
|||||||
weight_adapter = adapter;
|
weight_adapter = adapter;
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_weight_manager(const std::shared_ptr<RunnerWeightManager>& manager) {
|
|
||||||
weight_manager = manager;
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_weight_manager(const std::shared_ptr<RunnerWeightManager>& manager,
|
|
||||||
const std::string&) {
|
|
||||||
set_weight_manager(manager);
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_max_graph_vram_bytes(size_t max_vram_bytes) {
|
void set_max_graph_vram_bytes(size_t max_vram_bytes) {
|
||||||
max_graph_vram_bytes = max_vram_bytes;
|
max_graph_vram_bytes = max_vram_bytes;
|
||||||
}
|
}
|
||||||
@ -2902,14 +2822,6 @@ public:
|
|||||||
void set_stream_layers_enabled(bool enabled) {
|
void set_stream_layers_enabled(bool enabled) {
|
||||||
stream_layers_enabled = enabled;
|
stream_layers_enabled = enabled;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_t get_runtime_backend() {
|
|
||||||
return runtime_backend;
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_t get_params_backend() {
|
|
||||||
return params_backend;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class GGMLBlock {
|
class GGMLBlock {
|
||||||
|
|||||||
@ -45,6 +45,10 @@ static bool is_default_backend_token(const std::string& name) {
|
|||||||
return lower.empty() || lower == "default" || lower == "auto";
|
return lower.empty() || lower == "default" || lower == "auto";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool is_disk_backend_token(const std::string& name) {
|
||||||
|
return lower_copy(trim_copy(name)) == "disk";
|
||||||
|
}
|
||||||
|
|
||||||
static bool parse_backend_module(const std::string& raw_name, SDBackendModule* module) {
|
static bool parse_backend_module(const std::string& raw_name, SDBackendModule* module) {
|
||||||
std::string name = lower_copy(trim_copy(raw_name));
|
std::string name = lower_copy(trim_copy(raw_name));
|
||||||
name.erase(std::remove(name.begin(), name.end(), '-'), name.end());
|
name.erase(std::remove(name.begin(), name.end(), '-'), name.end());
|
||||||
@ -200,6 +204,36 @@ void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool add_rpc_devices(const std::string& servers) {
|
||||||
|
const std::string in = trim_copy(servers);
|
||||||
|
if (in.empty()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
auto rpc_servers = split_copy(in, ',');
|
||||||
|
if (rpc_servers.empty()) {
|
||||||
|
LOG_ERROR("invalid RPC servers specification: '%s'", servers.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
|
||||||
|
if (!rpc_reg) {
|
||||||
|
LOG_ERROR("RPC backend not found, cannot add RPC servers");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char* endpoint);
|
||||||
|
ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t)ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
|
||||||
|
if (!ggml_backend_rpc_add_server_fn) {
|
||||||
|
LOG_ERROR("RPC backend does not have ggml_backend_rpc_add_server function, cannot add RPC servers");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
for (const auto& server : rpc_servers) {
|
||||||
|
LOG_INFO("Adding RPC server: %s", server.c_str());
|
||||||
|
auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
|
||||||
|
// no return value to check for success but should print errors from the RPC backend if it fails to add the server
|
||||||
|
ggml_backend_register(reg);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
static void ggml_backend_load_all_once() {
|
static void ggml_backend_load_all_once() {
|
||||||
// If the registry already has devices and the CPU backend is present,
|
// If the registry already has devices and the CPU backend is present,
|
||||||
// assume either static registration or explicit host-side preloading has
|
// assume either static registration or explicit host-side preloading has
|
||||||
@ -246,7 +280,7 @@ static std::string get_default_backend_name() {
|
|||||||
return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string sd_resolve_backend_name(const std::string& name) {
|
std::string sd_backend_resolve_name(const std::string& name) {
|
||||||
ggml_backend_load_all_once();
|
ggml_backend_load_all_once();
|
||||||
std::string requested = trim_copy(name);
|
std::string requested = trim_copy(name);
|
||||||
std::string lower = lower_copy(requested);
|
std::string lower = lower_copy(requested);
|
||||||
@ -284,7 +318,7 @@ static std::string sd_resolve_backend_name(const std::string& name) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static bool backend_name_exists(const std::string& name) {
|
static bool backend_name_exists(const std::string& name) {
|
||||||
return !sd_resolve_backend_name(name).empty();
|
return !sd_backend_resolve_name(name).empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_t init_named_backend(const std::string& name) {
|
static ggml_backend_t init_named_backend(const std::string& name) {
|
||||||
@ -294,7 +328,7 @@ static ggml_backend_t init_named_backend(const std::string& name) {
|
|||||||
return ggml_backend_init_best();
|
return ggml_backend_init_best();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string resolved = sd_resolve_backend_name(name);
|
std::string resolved = sd_backend_resolve_name(name);
|
||||||
if (resolved.empty()) {
|
if (resolved.empty()) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
@ -504,6 +538,9 @@ ggml_backend_t SDBackendManager::params_backend(SDBackendModule module) {
|
|||||||
if (name.empty()) {
|
if (name.empty()) {
|
||||||
return runtime_backend(module);
|
return runtime_backend(module);
|
||||||
}
|
}
|
||||||
|
if (is_disk_backend_token(name)) {
|
||||||
|
return runtime_backend(module);
|
||||||
|
}
|
||||||
return init_cached_backend(name);
|
return init_cached_backend(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -515,6 +552,10 @@ bool SDBackendManager::params_backend_is_cpu(SDBackendModule module) {
|
|||||||
return sd_backend_is_cpu(params_backend(module));
|
return sd_backend_is_cpu(params_backend(module));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reports whether the params assignment for `module` is the "disk" token.
bool SDBackendManager::params_backend_is_disk(SDBackendModule module) const {
    const auto& assigned_name = params_assignment_.get(module);
    return is_disk_backend_token(assigned_name);
}
|
||||||
|
|
||||||
bool SDBackendManager::runtime_backend_supports_host_buffer(SDBackendModule module) {
|
bool SDBackendManager::runtime_backend_supports_host_buffer(SDBackendModule module) {
|
||||||
ggml_backend_t backend = runtime_backend(module);
|
ggml_backend_t backend = runtime_backend(module);
|
||||||
if (backend == nullptr) {
|
if (backend == nullptr) {
|
||||||
@ -534,10 +575,6 @@ bool SDBackendManager::runtime_backend_supports_host_buffer(SDBackendModule modu
|
|||||||
|
|
||||||
bool SDBackendManager::init(const char* backend_spec,
|
bool SDBackendManager::init(const char* backend_spec,
|
||||||
const char* params_backend_spec,
|
const char* params_backend_spec,
|
||||||
bool offload_params_to_cpu,
|
|
||||||
bool keep_clip_on_cpu,
|
|
||||||
bool keep_vae_on_cpu,
|
|
||||||
bool keep_control_net_on_cpu,
|
|
||||||
std::string* error) {
|
std::string* error) {
|
||||||
reset();
|
reset();
|
||||||
|
|
||||||
@ -548,31 +585,21 @@ bool SDBackendManager::init(const char* backend_spec,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (runtime_assignment_.empty()) {
|
|
||||||
if (keep_clip_on_cpu) {
|
|
||||||
runtime_assignment_.set_module(SDBackendModule::TE, "cpu");
|
|
||||||
}
|
|
||||||
if (keep_vae_on_cpu) {
|
|
||||||
runtime_assignment_.set_module(SDBackendModule::VAE, "cpu");
|
|
||||||
}
|
|
||||||
if (keep_control_net_on_cpu) {
|
|
||||||
runtime_assignment_.set_module(SDBackendModule::CONTROL_NET, "cpu");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params_assignment_.empty() && offload_params_to_cpu) {
|
|
||||||
params_assignment_.set_default("cpu");
|
|
||||||
}
|
|
||||||
|
|
||||||
return validate(error);
|
return validate(error);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool SDBackendManager::validate(std::string* error) const {
|
bool SDBackendManager::validate(std::string* error) const {
|
||||||
auto validate_name = [&](const std::string& name) -> bool {
|
auto validate_runtime_name = [&](const std::string& name) -> bool {
|
||||||
if (is_default_backend_token(name)) {
|
if (is_default_backend_token(name)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (!sd_resolve_backend_name(name).empty()) {
|
if (is_disk_backend_token(name)) {
|
||||||
|
if (error != nullptr) {
|
||||||
|
*error = "backend 'disk' is only supported by params_backend";
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (!sd_backend_resolve_name(name).empty()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (error != nullptr) {
|
if (error != nullptr) {
|
||||||
@ -580,18 +607,24 @@ bool SDBackendManager::validate(std::string* error) const {
|
|||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
|
auto validate_params_name = [&](const std::string& name) -> bool {
|
||||||
|
if (is_disk_backend_token(name)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return validate_runtime_name(name);
|
||||||
|
};
|
||||||
|
|
||||||
if (!validate_name(runtime_assignment_.default_name) ||
|
if (!validate_runtime_name(runtime_assignment_.default_name) ||
|
||||||
!validate_name(params_assignment_.default_name)) {
|
!validate_params_name(params_assignment_.default_name)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for (const auto& kv : runtime_assignment_.module_names) {
|
for (const auto& kv : runtime_assignment_.module_names) {
|
||||||
if (!validate_name(kv.second)) {
|
if (!validate_runtime_name(kv.second)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (const auto& kv : params_assignment_.module_names) {
|
for (const auto& kv : params_assignment_.module_names) {
|
||||||
if (!validate_name(kv.second)) {
|
if (!validate_params_name(kv.second)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -599,7 +632,7 @@ bool SDBackendManager::validate(std::string* error) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_t SDBackendManager::init_cached_backend(const std::string& name) {
|
ggml_backend_t SDBackendManager::init_cached_backend(const std::string& name) {
|
||||||
std::string resolved = sd_resolve_backend_name(name);
|
std::string resolved = sd_backend_resolve_name(name);
|
||||||
std::string key = lower_copy(resolved);
|
std::string key = lower_copy(resolved);
|
||||||
ggml_backend_t backend = nullptr;
|
ggml_backend_t backend = nullptr;
|
||||||
|
|
||||||
|
|||||||
@ -51,10 +51,6 @@ public:
|
|||||||
|
|
||||||
bool init(const char* backend_spec,
|
bool init(const char* backend_spec,
|
||||||
const char* params_backend_spec,
|
const char* params_backend_spec,
|
||||||
bool offload_params_to_cpu,
|
|
||||||
bool keep_clip_on_cpu,
|
|
||||||
bool keep_vae_on_cpu,
|
|
||||||
bool keep_control_net_on_cpu,
|
|
||||||
std::string* error);
|
std::string* error);
|
||||||
void reset();
|
void reset();
|
||||||
|
|
||||||
@ -63,6 +59,7 @@ public:
|
|||||||
|
|
||||||
bool runtime_backend_is_cpu(SDBackendModule module);
|
bool runtime_backend_is_cpu(SDBackendModule module);
|
||||||
bool params_backend_is_cpu(SDBackendModule module);
|
bool params_backend_is_cpu(SDBackendModule module);
|
||||||
|
bool params_backend_is_disk(SDBackendModule module) const;
|
||||||
bool runtime_backend_supports_host_buffer(SDBackendModule module);
|
bool runtime_backend_supports_host_buffer(SDBackendModule module);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -74,6 +71,8 @@ bool sd_backend_is(ggml_backend_t backend, const std::string& name);
|
|||||||
bool sd_backend_is_cpu(ggml_backend_t backend);
|
bool sd_backend_is_cpu(ggml_backend_t backend);
|
||||||
ggml_backend_t sd_backend_cpu_init();
|
ggml_backend_t sd_backend_cpu_init();
|
||||||
bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
|
bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
|
||||||
|
std::string sd_backend_resolve_name(const std::string& name);
|
||||||
const char* sd_backend_module_name(SDBackendModule module);
|
const char* sd_backend_module_name(SDBackendModule module);
|
||||||
void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value);
|
void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value);
|
||||||
|
bool add_rpc_devices(const std::string& servers);
|
||||||
#endif // __SD_CORE_GGML_EXTEND_BACKEND_H__
|
#endif // __SD_CORE_GGML_EXTEND_BACKEND_H__
|
||||||
|
|||||||
@ -1,6 +1,8 @@
|
|||||||
#include "core/ggml_graph_cut.h"
|
#include "core/ggml_graph_cut.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <cctype>
|
||||||
|
#include <cmath>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <set>
|
#include <set>
|
||||||
@ -8,6 +10,7 @@
|
|||||||
#include <stack>
|
#include <stack>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
|
||||||
|
#include "core/ggml_extend_backend.h"
|
||||||
#include "core/util.h"
|
#include "core/util.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
@ -83,6 +86,157 @@ namespace sd::ggml_graph_cut {
|
|||||||
segment.output_bytes;
|
segment.output_bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns `value` with every byte passed through std::tolower.
static std::string lower_ascii_copy(std::string value) {
    for (char& ch : value) {
        // Go through unsigned char: std::tolower is undefined for negative char values.
        ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
    }
    return value;
}
|
||||||
|
|
||||||
|
static std::string normalize_backend_budget_key(const std::string& value) {
|
||||||
|
return lower_ascii_copy(trim(value));
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool is_default_max_vram_key(const std::string& key) {
|
||||||
|
std::string normalized = normalize_backend_budget_key(key);
|
||||||
|
return normalized == "all" || normalized == "default" || normalized == "*";
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool parse_max_vram_budget_value(const std::string& text, float* value, std::string* error) {
|
||||||
|
float parsed = 0.f;
|
||||||
|
if (!parse_strict_float(text, parsed) || !std::isfinite(parsed)) {
|
||||||
|
if (error != nullptr) {
|
||||||
|
*error = "invalid --max-vram value '" + text + "'";
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
*value = parsed;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collects the normalized names a budget may be keyed by for `backend`:
// the device name first (when the backend has a device), then the backend name.
// A null backend yields an empty list.
static std::vector<std::string> backend_budget_keys(ggml_backend_t backend) {
    std::vector<std::string> keys;
    if (backend != nullptr) {
        if (ggml_backend_dev_t dev = ggml_backend_get_device(backend)) {
            keys.push_back(normalize_backend_budget_key(ggml_backend_dev_name(dev)));
        }
        if (const char* backend_name = ggml_backend_name(backend)) {
            keys.push_back(normalize_backend_budget_key(backend_name));
        }
    }
    return keys;
}
|
||||||
|
|
||||||
|
// Drops all per-backend budgets and cached byte counts, and sets the default budget
// to `fallback_gib`.
void MaxVramAssignment::reset(float fallback_gib) {
    resolved_backend_bytes.clear();
    backend_gib.clear();
    default_gib = fallback_gib;
}
|
||||||
|
|
||||||
|
bool MaxVramAssignment::parse(const std::string& raw_spec, std::string* error) {
|
||||||
|
const std::string in = trim(raw_spec);
|
||||||
|
if (in.empty()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const std::string& raw_part : split_string(in, ',')) {
|
||||||
|
const std::string part = trim(raw_part);
|
||||||
|
if (part.empty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t eq = part.find('=');
|
||||||
|
if (eq == std::string::npos) {
|
||||||
|
float value = 0.f;
|
||||||
|
if (!parse_max_vram_budget_value(part, &value, error)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
default_gib = value;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string key = trim(part.substr(0, eq));
|
||||||
|
const std::string value_text = trim(part.substr(eq + 1));
|
||||||
|
if (key.empty() || value_text.empty()) {
|
||||||
|
if (error != nullptr) {
|
||||||
|
*error = "invalid --max-vram assignment '" + part + "'";
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
float value = 0.f;
|
||||||
|
if (!parse_max_vram_budget_value(value_text, &value, error)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (is_default_max_vram_key(key)) {
|
||||||
|
default_gib = value;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string backend_key = trim(key);
|
||||||
|
if (backend_key.empty()) {
|
||||||
|
if (error != nullptr) {
|
||||||
|
*error = "invalid --max-vram backend key in '" + part + "'";
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
backend_gib[backend_key] = value;
|
||||||
|
}
|
||||||
|
resolved_backend_bytes.clear();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool MaxVramAssignment::canonicalize_backend_keys(std::string* error) {
|
||||||
|
if (backend_gib.empty()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unordered_map<std::string, float> normalized;
|
||||||
|
for (const auto& kv : backend_gib) {
|
||||||
|
std::string resolved = sd_backend_resolve_name(kv.first);
|
||||||
|
if (resolved.empty()) {
|
||||||
|
if (error != nullptr) {
|
||||||
|
*error = "unknown --max-vram backend '" + kv.first + "'";
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
normalized[normalize_backend_budget_key(resolved)] = kv.second;
|
||||||
|
}
|
||||||
|
backend_gib = std::move(normalized);
|
||||||
|
resolved_backend_bytes.clear();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the budget for `backend` in bytes, caching the result.
// The cache is keyed by the first budget key of the backend ("<none>" when it has none).
// The budget is the first per-backend entry matching any of the backend's keys, otherwise the
// default; it is then passed through resolve_max_vram_gib and max_vram_gib_to_bytes.
size_t MaxVramAssignment::bytes_for_backend(ggml_backend_t backend) {
    const std::vector<std::string> keys = backend_budget_keys(backend);
    const std::string cache_key         = keys.empty() ? std::string("<none>") : keys.front();

    const auto hit = resolved_backend_bytes.find(cache_key);
    if (hit != resolved_backend_bytes.end()) {
        return hit->second;
    }

    float budget_gib = default_gib;
    for (const std::string& key : keys) {
        const auto override_it = backend_gib.find(key);
        if (override_it != backend_gib.end()) {
            budget_gib = override_it->second;
            break;
        }
    }

    const float resolved_gib = resolve_max_vram_gib(budget_gib, backend);
    const size_t bytes       = max_vram_gib_to_bytes(resolved_gib);

    resolved_backend_bytes[cache_key] = bytes;
    return bytes;
}
|
||||||
|
|
||||||
size_t max_vram_gib_to_bytes(float max_vram) {
|
size_t max_vram_gib_to_bytes(float max_vram) {
|
||||||
if (max_vram <= 0.f) {
|
if (max_vram <= 0.f) {
|
||||||
return 0;
|
return 0;
|
||||||
|
|||||||
@ -4,6 +4,7 @@
|
|||||||
#include <array>
|
#include <array>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <unordered_map>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
@ -68,6 +69,17 @@ namespace sd::ggml_graph_cut {
|
|||||||
|
|
||||||
static constexpr const char* GGML_RUNNER_CUT_PREFIX = "ggml_runner_cut:";
|
static constexpr const char* GGML_RUNNER_CUT_PREFIX = "ggml_runner_cut:";
|
||||||
|
|
||||||
|
struct MaxVramAssignment {
|
||||||
|
float default_gib = 0.f;
|
||||||
|
std::unordered_map<std::string, float> backend_gib;
|
||||||
|
std::unordered_map<std::string, size_t> resolved_backend_bytes;
|
||||||
|
|
||||||
|
void reset(float fallback_gib);
|
||||||
|
bool parse(const std::string& raw_spec, std::string* error);
|
||||||
|
bool canonicalize_backend_keys(std::string* error);
|
||||||
|
size_t bytes_for_backend(ggml_backend_t backend);
|
||||||
|
};
|
||||||
|
|
||||||
bool is_graph_cut_tensor(const ggml_tensor* tensor);
|
bool is_graph_cut_tensor(const ggml_tensor* tensor);
|
||||||
std::string make_graph_cut_name(const std::string& group, const std::string& output);
|
std::string make_graph_cut_name(const std::string& group, const std::string& output);
|
||||||
void mark_graph_cut(ggml_tensor* tensor, const std::string& group, const std::string& output);
|
void mark_graph_cut(ggml_tensor* tensor, const std::string& group, const std::string& output);
|
||||||
|
|||||||
@ -406,6 +406,15 @@ std::vector<std::string> split_string(const std::string& str, char delimiter) {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Converts an sd_type_t to the ggml_type with the same numeric value.
// Returns GGML_TYPE_COUNT when the value lies outside [0, min(SD_TYPE_COUNT, GGML_TYPE_COUNT)),
// i.e. when it is not a type both enums define. Negative values are rejected too, so a bogus
// enum value coming in through the C API cannot be cast to an invalid ggml_type.
ggml_type sd_type_to_ggml_type(sd_type_t sdtype) {
    const int type_value = static_cast<int>(sdtype);
    const int type_limit = std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT);
    if (type_value < 0 || type_value >= type_limit) {
        return GGML_TYPE_COUNT;
    }
    return static_cast<ggml_type>(type_value);
}
|
||||||
|
|
||||||
KeyValueArgs parse_key_value_args(const char* args, const char* context) {
|
KeyValueArgs parse_key_value_args(const char* args, const char* context) {
|
||||||
KeyValueArgs pairs;
|
KeyValueArgs pairs;
|
||||||
|
|
||||||
|
|||||||
@ -80,6 +80,8 @@ void pretty_bytes_progress(int step, int steps, uint64_t bytes_processed, float
|
|||||||
|
|
||||||
void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...);
|
void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...);
|
||||||
|
|
||||||
|
ggml_type sd_type_to_ggml_type(sd_type_t sdtype);
|
||||||
|
|
||||||
std::string trim(const std::string& s);
|
std::string trim(const std::string& s);
|
||||||
|
|
||||||
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);
|
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);
|
||||||
|
|||||||
@ -10,6 +10,7 @@
|
|||||||
|
|
||||||
#include "conditioning/conditioner.hpp"
|
#include "conditioning/conditioner.hpp"
|
||||||
#include "core/ggml_extend_backend.h"
|
#include "core/ggml_extend_backend.h"
|
||||||
|
#include "model/diffusion/model.hpp"
|
||||||
#include "model_loader.h"
|
#include "model_loader.h"
|
||||||
#include "model_manager.h"
|
#include "model_manager.h"
|
||||||
#include "stable-diffusion.h"
|
#include "stable-diffusion.h"
|
||||||
@ -19,6 +20,7 @@ struct GenerationExtensionInitContext {
|
|||||||
SDVersion version;
|
SDVersion version;
|
||||||
const String2TensorStorage& tensor_storage_map;
|
const String2TensorStorage& tensor_storage_map;
|
||||||
ModelLoader& model_loader;
|
ModelLoader& model_loader;
|
||||||
|
std::shared_ptr<ModelManager> model_manager;
|
||||||
int n_threads;
|
int n_threads;
|
||||||
std::function<bool(SDBackendModule)> ensure_backend_pair;
|
std::function<bool(SDBackendModule)> ensure_backend_pair;
|
||||||
std::function<ggml_backend_t(SDBackendModule)> backend_for;
|
std::function<ggml_backend_t(SDBackendModule)> backend_for;
|
||||||
@ -29,6 +31,7 @@ struct GenerationExtensionConditionContext {
|
|||||||
Conditioner* conditioner;
|
Conditioner* conditioner;
|
||||||
ConditionerParams& condition_params;
|
ConditionerParams& condition_params;
|
||||||
const sd_pm_params_t& pm_params;
|
const sd_pm_params_t& pm_params;
|
||||||
|
const sd_pulid_params_t& pulid_params;
|
||||||
int n_threads;
|
int n_threads;
|
||||||
int total_steps;
|
int total_steps;
|
||||||
};
|
};
|
||||||
@ -46,7 +49,6 @@ struct GenerationExtension {
|
|||||||
virtual void get_param_tensors(std::map<std::string, ggml_tensor*>&) {}
|
virtual void get_param_tensors(std::map<std::string, ggml_tensor*>&) {}
|
||||||
virtual void collect_loras(std::vector<ModelManager::LoraSpec>&) {}
|
virtual void collect_loras(std::vector<ModelManager::LoraSpec>&) {}
|
||||||
virtual void add_ignore_tensors(std::set<std::string>&) const {}
|
virtual void add_ignore_tensors(std::set<std::string>&) const {}
|
||||||
virtual void set_weight_manager(const std::shared_ptr<RunnerWeightManager>&) {}
|
|
||||||
virtual void runner_done() {}
|
virtual void runner_done() {}
|
||||||
virtual void reset_runtime_condition() {}
|
virtual void reset_runtime_condition() {}
|
||||||
virtual bool prepare_condition(GenerationExtensionConditionContext&) {
|
virtual bool prepare_condition(GenerationExtensionConditionContext&) {
|
||||||
@ -56,8 +58,20 @@ struct GenerationExtension {
|
|||||||
const SDCondition& condition) const {
|
const SDCondition& condition) const {
|
||||||
return condition;
|
return condition;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Called in the denoise loop for each enabled extension, after the per-step
|
||||||
|
// DiffusionParams (including its version-specific `extra`) has been built,
|
||||||
|
// but before diffusion_model->compute(). Lets an extension feed data into
|
||||||
|
// the diffusion forward that the conditioning-side hooks can't reach -- it
|
||||||
|
// can set/override fields on `params` (typically the architecture-specific
|
||||||
|
// `params.extra`, e.g. a guidance tensor, control payload, or an identity
|
||||||
|
// embedding for an adapter that injects inside the model's blocks). The
|
||||||
|
// extension targets whichever `extra` variant matches the active model.
|
||||||
|
// Mutates `params` only, never the extension. Default no-op.
|
||||||
|
virtual void before_diffusion(DiffusionParams& /*params*/, int /*step*/) const {}
|
||||||
};
|
};
|
||||||
|
|
||||||
std::shared_ptr<GenerationExtension> create_photomaker_extension();
|
std::shared_ptr<GenerationExtension> create_photomaker_extension();
|
||||||
|
std::shared_ptr<GenerationExtension> create_pulid_extension();
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -134,11 +134,12 @@ struct PhotoMakerExtension : public GenerationExtension {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pmid_model = std::make_shared<PhotoMakerIDEncoder>(ctx.backend_for(SDBackendModule::PHOTOMAKER),
|
pmid_model = std::make_shared<PhotoMakerIDEncoder>(ctx.backend_for(SDBackendModule::PHOTOMAKER),
|
||||||
ctx.params_backend_for(SDBackendModule::PHOTOMAKER),
|
|
||||||
ctx.tensor_storage_map,
|
ctx.tensor_storage_map,
|
||||||
"pmid",
|
"pmid",
|
||||||
ctx.version,
|
ctx.version,
|
||||||
pm_version);
|
pm_version,
|
||||||
|
20.f,
|
||||||
|
ctx.model_manager);
|
||||||
if (pm_version == PM_VERSION_2) {
|
if (pm_version == PM_VERSION_2) {
|
||||||
LOG_INFO("using PhotoMaker Version 2");
|
LOG_INFO("using PhotoMaker Version 2");
|
||||||
}
|
}
|
||||||
@ -174,12 +175,6 @@ struct PhotoMakerExtension : public GenerationExtension {
|
|||||||
ignore_tensors.insert("pmid.unet.");
|
ignore_tensors.insert("pmid.unet.");
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_weight_manager(const std::shared_ptr<RunnerWeightManager>& manager) override {
|
|
||||||
if (pmid_model != nullptr) {
|
|
||||||
pmid_model->set_weight_manager(manager);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void runner_done() override {
|
void runner_done() override {
|
||||||
if (pmid_model != nullptr) {
|
if (pmid_model != nullptr) {
|
||||||
pmid_model->runner_done();
|
pmid_model->runner_done();
|
||||||
|
|||||||
123
src/extensions/pulid_extension.cpp
Normal file
123
src/extensions/pulid_extension.cpp
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
#include "extensions/generation_extension.h"
|
||||||
|
|
||||||
|
#include <cstring>
|
||||||
|
#include <variant>
|
||||||
|
|
||||||
|
#include "core/tensor_ggml.hpp"
|
||||||
|
#include "core/util.h"
|
||||||
|
#include "gguf.h"
|
||||||
|
|
||||||
|
// Loads the tensor named "pulid_id" from the gguf file at `path` as a float tensor of shape
// [token_dim, num_tokens, 1].
//
// Returns an empty tensor when `path` is null or empty, when the file cannot be read, when the
// tensor is missing, when its shape is implausible (each of the first two dims must be positive,
// token_dim <= 65536, num_tokens <= 1024, trailing dims == 1), or when its type is not
// F32 / F16 / BF16. Every failure except a missing path is logged as a warning.
static sd::Tensor<float> load_pulid_id_embedding(const char* path) {
    sd::Tensor<float> empty;
    if (path == nullptr || path[0] == '\0') {
        return empty;
    }

    // Owns both handles so that every exit path, including an exception thrown while the output
    // tensor is built, releases them exactly once (gguf context first, then the data context).
    struct GgufHandles {
        struct gguf_context* gguf = nullptr;
        struct ggml_context* data = nullptr;

        GgufHandles()                              = default;
        GgufHandles(const GgufHandles&)            = delete;
        GgufHandles& operator=(const GgufHandles&) = delete;
        ~GgufHandles() {
            if (gguf != nullptr) {
                gguf_free(gguf);
            }
            if (data != nullptr) {
                ggml_free(data);
            }
        }
    } handles;

    struct gguf_init_params gp = {/*.no_alloc =*/false, /*.ctx =*/&handles.data};
    handles.gguf               = gguf_init_from_file(path, gp);
    if (handles.gguf == nullptr || handles.data == nullptr) {
        LOG_WARN("PuLID id-embedding: cannot read gguf '%s'", path);
        return empty;
    }

    struct ggml_tensor* t = ggml_get_tensor(handles.data, "pulid_id");
    if (t == nullptr) {
        LOG_WARN("PuLID id-embedding: no 'pulid_id' tensor in '%s'", path);
        return empty;
    }

    const int64_t token_dim  = t->ne[0];
    const int64_t num_tokens = t->ne[1];
    // The bounds keep the element count small enough that n_elem below cannot overflow.
    if (token_dim <= 0 || num_tokens <= 0 || token_dim > 65536 || num_tokens > 1024 ||
        t->ne[2] != 1 || t->ne[3] != 1) {
        LOG_WARN("PuLID id-embedding: implausible shape [%lld, %lld] in '%s'",
                 (long long)token_dim, (long long)num_tokens, path);
        return empty;
    }

    const size_t n_elem = (size_t)token_dim * (size_t)num_tokens;
    sd::Tensor<float> out({token_dim, num_tokens, 1});
    float* dst = out.data();
    if (t->type == GGML_TYPE_F32) {
        memcpy(dst, t->data, n_elem * sizeof(float));
    } else if (t->type == GGML_TYPE_F16) {
        const ggml_fp16_t* src = reinterpret_cast<const ggml_fp16_t*>(t->data);
        for (size_t i = 0; i < n_elem; i++) {
            dst[i] = ggml_fp16_to_fp32(src[i]);
        }
    } else if (t->type == GGML_TYPE_BF16) {
        const ggml_bf16_t* src = reinterpret_cast<const ggml_bf16_t*>(t->data);
        for (size_t i = 0; i < n_elem; i++) {
            dst[i] = ggml_bf16_to_fp32(src[i]);
        }
    } else {
        LOG_WARN("PuLID id-embedding: unsupported tensor type %s in '%s'",
                 ggml_type_name(t->type), path);
        return empty;
    }

    LOG_INFO("PuLID id-embedding: loaded [%lld, %lld] type=%s from '%s'",
             (long long)token_dim, (long long)num_tokens, ggml_type_name(t->type), path);
    return out;
}
|
||||||
|
|
||||||
|
struct PuLIDExtension : public GenerationExtension {
|
||||||
|
bool enabled = false;
|
||||||
|
sd::Tensor<float> id_embedding;
|
||||||
|
float id_weight = 1.0f;
|
||||||
|
|
||||||
|
const char* name() const override {
|
||||||
|
return "pulid";
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_enabled() const override {
|
||||||
|
return enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool init(const GenerationExtensionInitContext& ctx) override {
|
||||||
|
enabled = strlen(SAFE_STR(ctx.params->pulid_weights_path)) > 0;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void reset_runtime_condition() override {
|
||||||
|
id_embedding = {};
|
||||||
|
id_weight = 1.0f;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool prepare_condition(GenerationExtensionConditionContext& ctx) override {
|
||||||
|
reset_runtime_condition();
|
||||||
|
if (!enabled) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
id_embedding = load_pulid_id_embedding(ctx.pulid_params.id_embedding_path);
|
||||||
|
id_weight = ctx.pulid_params.id_weight;
|
||||||
|
return false; // PuLID does not modify the conditioning
|
||||||
|
}
|
||||||
|
|
||||||
|
void before_diffusion(DiffusionParams& params, int /*step*/) const override {
|
||||||
|
if (!enabled || id_embedding.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (auto* flux_extra = std::get_if<FluxDiffusionExtra>(¶ms.extra)) {
|
||||||
|
flux_extra->pulid_id = &id_embedding;
|
||||||
|
flux_extra->pulid_id_weight = id_weight;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::shared_ptr<GenerationExtension> create_pulid_extension() {
|
||||||
|
return std::make_shared<PuLIDExtension>();
|
||||||
|
}
|
||||||
16
src/model.h
16
src/model.h
@ -42,6 +42,7 @@ enum SDVersion {
|
|||||||
VERSION_LTXAV,
|
VERSION_LTXAV,
|
||||||
VERSION_HIDREAM_O1,
|
VERSION_HIDREAM_O1,
|
||||||
VERSION_Z_IMAGE,
|
VERSION_Z_IMAGE,
|
||||||
|
VERSION_BOOGU_IMAGE,
|
||||||
VERSION_OVIS_IMAGE,
|
VERSION_OVIS_IMAGE,
|
||||||
VERSION_ERNIE_IMAGE,
|
VERSION_ERNIE_IMAGE,
|
||||||
VERSION_LENS,
|
VERSION_LENS,
|
||||||
@ -143,6 +144,13 @@ static inline bool sd_version_is_z_image(SDVersion version) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// True only for VERSION_BOOGU_IMAGE.
static inline bool sd_version_is_boogu_image(SDVersion version) {
    return version == VERSION_BOOGU_IMAGE;
}
|
||||||
|
|
||||||
static inline bool sd_version_is_longcat(SDVersion version) {
|
static inline bool sd_version_is_longcat(SDVersion version) {
|
||||||
if (version == VERSION_LONGCAT) {
|
if (version == VERSION_LONGCAT) {
|
||||||
return true;
|
return true;
|
||||||
@ -178,6 +186,13 @@ static inline bool sd_version_is_ideogram4(SDVersion version) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// True for the model families listed here as using the Flux VAE:
// Flux, Z-Image, Boogu-Image and LongCat.
static inline bool sd_version_uses_flux_vae(SDVersion version) {
    const bool uses_flux_vae = sd_version_is_flux(version) ||
                               sd_version_is_z_image(version) ||
                               sd_version_is_boogu_image(version) ||
                               sd_version_is_longcat(version);
    return uses_flux_vae;
}
|
||||||
|
|
||||||
static inline bool sd_version_uses_flux2_vae(SDVersion version) {
|
static inline bool sd_version_uses_flux2_vae(SDVersion version) {
|
||||||
if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version)) {
|
if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version)) {
|
||||||
return true;
|
return true;
|
||||||
@ -206,6 +221,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
|
|||||||
version == VERSION_HIDREAM_O1 ||
|
version == VERSION_HIDREAM_O1 ||
|
||||||
sd_version_is_anima(version) ||
|
sd_version_is_anima(version) ||
|
||||||
sd_version_is_z_image(version) ||
|
sd_version_is_z_image(version) ||
|
||||||
|
sd_version_is_boogu_image(version) ||
|
||||||
sd_version_is_ernie_image(version) ||
|
sd_version_is_ernie_image(version) ||
|
||||||
sd_version_is_lens(version) ||
|
sd_version_is_lens(version) ||
|
||||||
sd_version_is_longcat(version) ||
|
sd_version_is_longcat(version) ||
|
||||||
|
|||||||
@ -4,6 +4,7 @@
|
|||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include "core/ggml_extend.hpp"
|
#include "core/ggml_extend.hpp"
|
||||||
#include "model_loader.h"
|
#include "model_loader.h"
|
||||||
|
#include "model_manager.h"
|
||||||
|
|
||||||
#define LORA_GRAPH_BASE_SIZE 10240
|
#define LORA_GRAPH_BASE_SIZE 10240
|
||||||
|
|
||||||
@ -14,7 +15,8 @@ struct LoraModel : public GGMLRunner {
|
|||||||
std::map<ggml_tensor*, ggml_tensor*> original_tensor_to_final_tensor;
|
std::map<ggml_tensor*, ggml_tensor*> original_tensor_to_final_tensor;
|
||||||
std::set<std::string> applied_lora_tensors;
|
std::set<std::string> applied_lora_tensors;
|
||||||
std::string file_path;
|
std::string file_path;
|
||||||
ModelLoader model_loader;
|
std::shared_ptr<ModelManager> model_manager;
|
||||||
|
ggml_backend_t params_backend = nullptr;
|
||||||
bool load_failed = false;
|
bool load_failed = false;
|
||||||
bool applied = false;
|
bool applied = false;
|
||||||
bool tensor_preprocessed = false;
|
bool tensor_preprocessed = false;
|
||||||
@ -23,13 +25,14 @@ struct LoraModel : public GGMLRunner {
|
|||||||
|
|
||||||
LoraModel(const std::string& lora_id,
|
LoraModel(const std::string& lora_id,
|
||||||
ggml_backend_t backend,
|
ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
ggml_backend_t params_backend_,
|
||||||
const std::string& file_path = "",
|
const std::string& file_path = "",
|
||||||
std::string prefix = "",
|
std::string prefix = "",
|
||||||
SDVersion version = VERSION_COUNT)
|
SDVersion version = VERSION_COUNT,
|
||||||
: lora_id(lora_id), file_path(file_path), GGMLRunner(backend, params_backend) {
|
std::shared_ptr<ModelManager> manager = std::make_shared<ModelManager>())
|
||||||
|
: GGMLRunner(backend, manager), lora_id(lora_id), file_path(file_path), model_manager(std::move(manager)), params_backend(params_backend_) {
|
||||||
prefix = "lora." + prefix;
|
prefix = "lora." + prefix;
|
||||||
if (!model_loader.init_from_file_and_convert_name(file_path, prefix, version)) {
|
if (model_manager == nullptr || !model_manager->loader().init_from_file_and_convert_name(file_path, prefix, version)) {
|
||||||
load_failed = true;
|
load_failed = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -71,7 +74,10 @@ struct LoraModel : public GGMLRunner {
|
|||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
model_loader.set_n_threads(n_threads);
|
if (model_manager != nullptr) {
|
||||||
|
model_manager->set_n_threads(n_threads);
|
||||||
|
}
|
||||||
|
ModelLoader& model_loader = model_manager->loader();
|
||||||
model_loader.load_tensors(on_new_tensor_cb);
|
model_loader.load_tensors(on_new_tensor_cb);
|
||||||
|
|
||||||
if (tensors_to_create.empty()) {
|
if (tensors_to_create.empty()) {
|
||||||
@ -88,23 +94,42 @@ struct LoraModel : public GGMLRunner {
|
|||||||
lora_tensors[name] = real;
|
lora_tensors[name] = real;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!alloc_params_buffer()) {
|
std::map<std::string, ggml_tensor*> tensors;
|
||||||
LOG_ERROR("lora model buffer allocation failed");
|
for (const auto& pair : lora_tensors) {
|
||||||
|
tensors[pair.first] = pair.second;
|
||||||
|
}
|
||||||
|
if (model_manager == nullptr ||
|
||||||
|
!model_manager->register_param_tensors("LoRA",
|
||||||
|
std::move(tensors),
|
||||||
|
ModelManager::ResidencyMode::ParamBackend,
|
||||||
|
runtime_backend,
|
||||||
|
params_backend) ||
|
||||||
|
!model_manager->validate_registered_tensors()) {
|
||||||
|
LOG_ERROR("lora model manager registration failed");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
std::vector<ggml_tensor*> lora_params;
|
||||||
|
lora_params.reserve(lora_tensors.size());
|
||||||
|
for (const auto& pair : lora_tensors) {
|
||||||
|
lora_params.push_back(pair.second);
|
||||||
|
}
|
||||||
|
if (!model_manager->prepare_params(lora_params)) {
|
||||||
|
LOG_ERROR("lora model manager prepare params failed");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
dry_run = false;
|
|
||||||
model_loader.load_tensors(on_new_tensor_cb);
|
|
||||||
|
|
||||||
LOG_DEBUG("finished loaded lora");
|
LOG_DEBUG("finished loaded lora");
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void release_loaded_tensors() {
|
void release_loaded_tensors() {
|
||||||
|
runner_done();
|
||||||
free_compute_buffer();
|
free_compute_buffer();
|
||||||
free_params_buffer();
|
model_manager.reset();
|
||||||
free_params_ctx();
|
free_params_ctx();
|
||||||
alloc_params_ctx();
|
alloc_params_ctx();
|
||||||
|
model_manager = std::make_shared<ModelManager>();
|
||||||
|
weight_manager = model_manager;
|
||||||
lora_tensors.clear();
|
lora_tensors.clear();
|
||||||
original_tensor_to_final_tensor.clear();
|
original_tensor_to_final_tensor.clear();
|
||||||
applied_lora_tensors.clear();
|
applied_lora_tensors.clear();
|
||||||
@ -633,7 +658,7 @@ struct LoraModel : public GGMLRunner {
|
|||||||
if (lokr_w2)
|
if (lokr_w2)
|
||||||
applied_lora_tensors.insert(lokr_w2_name);
|
applied_lora_tensors.insert(lokr_w2_name);
|
||||||
if (lokr_w2_a)
|
if (lokr_w2_a)
|
||||||
applied_lora_tensors.insert(lokr_w2_name);
|
applied_lora_tensors.insert(lokr_w2_a_name);
|
||||||
if (lokr_w2_b)
|
if (lokr_w2_b)
|
||||||
applied_lora_tensors.insert(lokr_w2_b_name);
|
applied_lora_tensors.insert(lokr_w2_b_name);
|
||||||
applied_lora_tensors.insert(alpha_name);
|
applied_lora_tensors.insert(alpha_name);
|
||||||
|
|||||||
@ -413,13 +413,13 @@ public:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
PhotoMakerIDEncoder(ggml_backend_t backend,
|
PhotoMakerIDEncoder(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map,
|
const String2TensorStorage& tensor_storage_map,
|
||||||
const std::string prefix,
|
const std::string prefix,
|
||||||
SDVersion version = VERSION_SDXL,
|
SDVersion version = VERSION_SDXL,
|
||||||
PMVersion pm_v = PM_VERSION_1,
|
PMVersion pm_v = PM_VERSION_1,
|
||||||
float sty = 20.f)
|
float sty = 20.f,
|
||||||
: GGMLRunner(backend, params_backend),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: GGMLRunner(backend, weight_manager),
|
||||||
version(version),
|
version(version),
|
||||||
pm_version(pm_v),
|
pm_version(pm_v),
|
||||||
style_strength(sty) {
|
style_strength(sty) {
|
||||||
@ -565,17 +565,18 @@ public:
|
|||||||
struct PhotoMakerIDEmbed : public GGMLRunner {
|
struct PhotoMakerIDEmbed : public GGMLRunner {
|
||||||
std::map<std::string, ggml_tensor*> tensors;
|
std::map<std::string, ggml_tensor*> tensors;
|
||||||
std::string file_path;
|
std::string file_path;
|
||||||
ModelLoader* model_loader;
|
std::shared_ptr<ModelManager> model_manager;
|
||||||
|
ggml_backend_t params_backend = nullptr;
|
||||||
bool load_failed = false;
|
bool load_failed = false;
|
||||||
bool applied = false;
|
bool applied = false;
|
||||||
|
|
||||||
PhotoMakerIDEmbed(ggml_backend_t backend,
|
PhotoMakerIDEmbed(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
ggml_backend_t params_backend_,
|
||||||
ModelLoader* ml,
|
std::shared_ptr<ModelManager> manager = std::make_shared<ModelManager>(),
|
||||||
const std::string& file_path = "",
|
const std::string& file_path = "",
|
||||||
const std::string& prefix = "")
|
const std::string& prefix = "")
|
||||||
: file_path(file_path), GGMLRunner(backend, params_backend), model_loader(ml) {
|
: GGMLRunner(backend, manager), file_path(file_path), model_manager(std::move(manager)), params_backend(params_backend_) {
|
||||||
if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) {
|
if (model_manager == nullptr || !model_manager->loader().init_from_file_and_convert_name(file_path, prefix)) {
|
||||||
load_failed = true;
|
load_failed = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -616,15 +617,27 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
|
|||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
model_loader->set_n_threads(n_threads);
|
model_manager->set_n_threads(n_threads);
|
||||||
model_loader->load_tensors(on_new_tensor_cb);
|
ModelLoader& model_loader = model_manager->loader();
|
||||||
if (!alloc_params_buffer()) {
|
model_loader.load_tensors(on_new_tensor_cb);
|
||||||
LOG_ERROR("PhotoMaker ID embeds buffer allocation failed");
|
if (!model_manager->register_param_tensors("PhotoMaker ID embeds",
|
||||||
|
tensors,
|
||||||
|
ModelManager::ResidencyMode::ParamBackend,
|
||||||
|
runtime_backend,
|
||||||
|
params_backend) ||
|
||||||
|
!model_manager->validate_registered_tensors()) {
|
||||||
|
LOG_ERROR("PhotoMaker ID embeds model manager registration failed");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
std::vector<ggml_tensor*> id_embed_params;
|
||||||
|
id_embed_params.reserve(tensors.size());
|
||||||
|
for (const auto& pair : tensors) {
|
||||||
|
id_embed_params.push_back(pair.second);
|
||||||
|
}
|
||||||
|
if (!model_manager->prepare_params(id_embed_params)) {
|
||||||
|
LOG_ERROR("PhotoMaker ID embeds model manager prepare params failed");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
dry_run = false;
|
|
||||||
model_loader->load_tensors(on_new_tensor_cb);
|
|
||||||
|
|
||||||
LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
|
LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
|
||||||
return true;
|
return true;
|
||||||
|
|||||||
76
src/model/adapter/pulid.hpp
Normal file
76
src/model/adapter/pulid.hpp
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
#ifndef __PULID_HPP__
|
||||||
|
#define __PULID_HPP__
|
||||||
|
|
||||||
|
#include "core/ggml_extend.hpp"
|
||||||
|
#include "model/common/block.hpp"
|
||||||
|
|
||||||
|
class PuLIDPerceiverAttentionCA : public GGMLBlock {
|
||||||
|
public:
|
||||||
|
static constexpr int64_t DEFAULT_DIM = 3072; // Flux hidden size
|
||||||
|
static constexpr int64_t DEFAULT_DIM_HEAD = 128;
|
||||||
|
static constexpr int64_t DEFAULT_HEADS = 16;
|
||||||
|
static constexpr int64_t DEFAULT_KV_DIM = 2048; // PuLID ID-embedding dim
|
||||||
|
|
||||||
|
protected:
|
||||||
|
int64_t dim;
|
||||||
|
int64_t dim_head;
|
||||||
|
int64_t heads;
|
||||||
|
int64_t kv_dim;
|
||||||
|
int64_t inner_dim;
|
||||||
|
|
||||||
|
public:
|
||||||
|
PuLIDPerceiverAttentionCA(int64_t dim = DEFAULT_DIM,
|
||||||
|
int64_t dim_head = DEFAULT_DIM_HEAD,
|
||||||
|
int64_t heads = DEFAULT_HEADS,
|
||||||
|
int64_t kv_dim = DEFAULT_KV_DIM)
|
||||||
|
: dim(dim),
|
||||||
|
dim_head(dim_head),
|
||||||
|
heads(heads),
|
||||||
|
kv_dim(kv_dim),
|
||||||
|
inner_dim(dim_head * heads) {
|
||||||
|
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(kv_dim));
|
||||||
|
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||||
|
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(dim, inner_dim, /*bias=*/false));
|
||||||
|
blocks["to_kv"] = std::shared_ptr<GGMLBlock>(new Linear(kv_dim, inner_dim * 2, /*bias=*/false));
|
||||||
|
blocks["to_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim, /*bias=*/false));
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* id_embedding,
|
||||||
|
ggml_tensor* image_tokens) {
|
||||||
|
auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
|
||||||
|
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
|
||||||
|
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
|
||||||
|
auto to_kv = std::dynamic_pointer_cast<Linear>(blocks["to_kv"]);
|
||||||
|
auto to_out = std::dynamic_pointer_cast<Linear>(blocks["to_out"]);
|
||||||
|
|
||||||
|
ggml_tensor* x_normed = norm1->forward(ctx, id_embedding);
|
||||||
|
ggml_tensor* lat_normed = norm2->forward(ctx, image_tokens);
|
||||||
|
|
||||||
|
ggml_tensor* q = to_q->forward(ctx, lat_normed); // [N, T_img, 2048]
|
||||||
|
ggml_tensor* kv = to_kv->forward(ctx, x_normed); // [N, T_img, 3072]
|
||||||
|
|
||||||
|
ggml_tensor* k = ggml_view_3d(ctx->ggml_ctx, kv,
|
||||||
|
inner_dim, kv->ne[1], kv->ne[2],
|
||||||
|
kv->nb[1], kv->nb[2],
|
||||||
|
/*offset=*/0);
|
||||||
|
ggml_tensor* v = ggml_view_3d(ctx->ggml_ctx, kv,
|
||||||
|
inner_dim, kv->ne[1], kv->ne[2],
|
||||||
|
kv->nb[1], kv->nb[2],
|
||||||
|
/*offset=*/inner_dim * ggml_element_size(kv));
|
||||||
|
k = ggml_cont(ctx->ggml_ctx, k);
|
||||||
|
v = ggml_cont(ctx->ggml_ctx, v);
|
||||||
|
|
||||||
|
ggml_tensor* attn_out = ggml_ext_attention_ext(
|
||||||
|
ctx->ggml_ctx, ctx->backend,
|
||||||
|
q, k, v,
|
||||||
|
heads,
|
||||||
|
/*mask=*/nullptr,
|
||||||
|
/*diag_mask_inf=*/false);
|
||||||
|
|
||||||
|
ggml_tensor* out = to_out->forward(ctx, attn_out);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // __PULID_HPP__
|
||||||
@ -560,11 +560,11 @@ protected:
|
|||||||
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
|
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
float get_alpha() {
|
ggml_tensor* get_alpha(GGMLRunnerContext* ctx) {
|
||||||
// image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
|
// image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
|
||||||
// so learned_with_images is same as learned
|
// so learned_with_images is same as learned
|
||||||
float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
|
auto mix_factor = ggml_ext_cast_f32(ctx->ggml_ctx, ctx->backend, params["mix_factor"]);
|
||||||
return sigmoid(alpha);
|
return ggml_sigmoid(ctx->ggml_ctx, mix_factor);
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@ -578,11 +578,12 @@ public:
|
|||||||
ggml_tensor* x_spatial,
|
ggml_tensor* x_spatial,
|
||||||
ggml_tensor* x_temporal) {
|
ggml_tensor* x_temporal) {
|
||||||
// image_only_indicator is always tensor([0.])
|
// image_only_indicator is always tensor([0.])
|
||||||
float alpha = get_alpha();
|
auto alpha = get_alpha(ctx);
|
||||||
auto x = ggml_add(ctx->ggml_ctx,
|
return ggml_add(ctx->ggml_ctx,
|
||||||
ggml_ext_scale(ctx->ggml_ctx, x_spatial, alpha),
|
x_temporal,
|
||||||
ggml_ext_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
|
ggml_mul(ctx->ggml_ctx,
|
||||||
return x;
|
ggml_sub(ctx->ggml_ctx, x_spatial, x_temporal),
|
||||||
|
alpha));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -899,10 +899,12 @@ namespace Rope {
|
|||||||
// q,k,v: [N, L, n_head, d_head]
|
// q,k,v: [N, L, n_head, d_head]
|
||||||
// pe: [L, d_head/2, 2, 2]
|
// pe: [L, d_head/2, 2, 2]
|
||||||
// return: [N, L, n_head*d_head]
|
// return: [N, L, n_head*d_head]
|
||||||
|
int64_t n_head = q->ne[1];
|
||||||
|
|
||||||
q = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved); // [N*n_head, L, d_head]
|
q = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved); // [N*n_head, L, d_head]
|
||||||
k = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved); // [N*n_head, L, d_head]
|
k = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved); // [N*n_head, L, d_head]
|
||||||
|
|
||||||
auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, v->ne[1], mask, true, ctx->flash_attn_enabled, kv_scale); // [N, L, n_head*d_head]
|
auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, mask, true, ctx->flash_attn_enabled, kv_scale); // [N, L, n_head*d_head]
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
}; // namespace Rope
|
}; // namespace Rope
|
||||||
|
|||||||
@ -227,6 +227,7 @@ namespace Anima {
|
|||||||
k4 = k_norm->forward(ctx, k4);
|
k4 = k_norm->forward(ctx, k4);
|
||||||
|
|
||||||
ggml_tensor* attn_out = nullptr;
|
ggml_tensor* attn_out = nullptr;
|
||||||
|
float scale = (sd_backend_is(ctx->backend, "Vulkan") && ctx->flash_attn_enabled) ? 1.0f / 32.0f : 1.0f;
|
||||||
if (pe_q != nullptr || pe_k != nullptr) {
|
if (pe_q != nullptr || pe_k != nullptr) {
|
||||||
if (pe_q == nullptr) {
|
if (pe_q == nullptr) {
|
||||||
pe_q = pe_k;
|
pe_q = pe_k;
|
||||||
@ -244,7 +245,8 @@ namespace Anima {
|
|||||||
num_heads,
|
num_heads,
|
||||||
nullptr,
|
nullptr,
|
||||||
true,
|
true,
|
||||||
ctx->flash_attn_enabled);
|
ctx->flash_attn_enabled,
|
||||||
|
scale);
|
||||||
} else {
|
} else {
|
||||||
auto q_flat = ggml_reshape_3d(ctx->ggml_ctx, q4, head_dim * num_heads, L_q, N);
|
auto q_flat = ggml_reshape_3d(ctx->ggml_ctx, q4, head_dim * num_heads, L_q, N);
|
||||||
auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k4, head_dim * num_heads, L_k, N);
|
auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k4, head_dim * num_heads, L_k, N);
|
||||||
@ -256,7 +258,8 @@ namespace Anima {
|
|||||||
num_heads,
|
num_heads,
|
||||||
nullptr,
|
nullptr,
|
||||||
false,
|
false,
|
||||||
ctx->flash_attn_enabled);
|
ctx->flash_attn_enabled,
|
||||||
|
scale);
|
||||||
}
|
}
|
||||||
|
|
||||||
return out_proj->forward(ctx, attn_out);
|
return out_proj->forward(ctx, attn_out);
|
||||||
@ -561,10 +564,10 @@ namespace Anima {
|
|||||||
AnimaNet net;
|
AnimaNet net;
|
||||||
|
|
||||||
AnimaRunner(ggml_backend_t backend,
|
AnimaRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string prefix = "model.diffusion_model")
|
const std::string prefix = "model.diffusion_model",
|
||||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: DiffusionModelRunner(backend, prefix, weight_manager),
|
||||||
config(AnimaConfig::detect_from_weights(tensor_storage_map, prefix + ".net")) {
|
config(AnimaConfig::detect_from_weights(tensor_storage_map, prefix + ".net")) {
|
||||||
net = AnimaNet(config);
|
net = AnimaNet(config);
|
||||||
net.init(params_ctx, tensor_storage_map, prefix + ".net");
|
net.init(params_ctx, tensor_storage_map, prefix + ".net");
|
||||||
|
|||||||
835
src/model/diffusion/boogu.hpp
Normal file
835
src/model/diffusion/boogu.hpp
Normal file
@ -0,0 +1,835 @@
|
|||||||
|
#ifndef __SD_MODEL_DIFFUSION_BOOGU_HPP__
|
||||||
|
#define __SD_MODEL_DIFFUSION_BOOGU_HPP__
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "core/ggml_extend.hpp"
|
||||||
|
#include "model/common/rope.hpp"
|
||||||
|
#include "model/diffusion/dit.hpp"
|
||||||
|
#include "model/diffusion/model.hpp"
|
||||||
|
#include "model/diffusion/qwen_image.hpp"
|
||||||
|
#include "model_loader.h"
|
||||||
|
|
||||||
|
namespace Boogu {
|
||||||
|
constexpr int BOOGU_GRAPH_SIZE = 65536;
|
||||||
|
|
||||||
|
struct BooguConfig {
|
||||||
|
int patch_size = 2;
|
||||||
|
int64_t in_channels = 16;
|
||||||
|
int64_t out_channels = 16;
|
||||||
|
int64_t hidden_size = 3360;
|
||||||
|
int64_t num_layers = 32;
|
||||||
|
int64_t num_double_stream_layers = 8;
|
||||||
|
int64_t num_refiner_layers = 2;
|
||||||
|
int64_t num_attention_heads = 28;
|
||||||
|
int64_t num_kv_heads = 7;
|
||||||
|
int64_t head_dim = 120;
|
||||||
|
int64_t multiple_of = 256;
|
||||||
|
int64_t instruction_feat_dim = 4096;
|
||||||
|
int64_t timestep_embed_dim = 1024;
|
||||||
|
int theta = 10000;
|
||||||
|
float timestep_scale = 1000.0f;
|
||||||
|
float norm_eps = 1e-5f;
|
||||||
|
std::vector<int> axes_dim = {40, 40, 40};
|
||||||
|
int64_t axes_dim_sum = 120;
|
||||||
|
|
||||||
|
static int64_t count_blocks(const String2TensorStorage& tensor_storage_map,
|
||||||
|
const std::string& prefix,
|
||||||
|
const std::string& block_prefix) {
|
||||||
|
int64_t count = 0;
|
||||||
|
for (const auto& [name, _] : tensor_storage_map) {
|
||||||
|
if (!starts_with(name, prefix)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
size_t pos = name.find(block_prefix);
|
||||||
|
if (pos == std::string::npos) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
auto items = split_string(name.substr(pos), '.');
|
||||||
|
if (items.size() > 1) {
|
||||||
|
count = std::max<int64_t>(count, atoi(items[1].c_str()) + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
static BooguConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
|
||||||
|
BooguConfig config;
|
||||||
|
int64_t detected_head_dim = 0;
|
||||||
|
int64_t detected_kv_dim = 0;
|
||||||
|
|
||||||
|
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||||
|
if (!starts_with(name, prefix)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (ends_with(name, "x_embedder.weight") && tensor_storage.n_dims == 2) {
|
||||||
|
int64_t patch_area = config.patch_size * config.patch_size;
|
||||||
|
config.in_channels = tensor_storage.ne[0] / patch_area;
|
||||||
|
config.hidden_size = tensor_storage.ne[1];
|
||||||
|
} else if (ends_with(name, "time_caption_embed.caption_embedder.1.weight") && tensor_storage.n_dims == 2) {
|
||||||
|
config.instruction_feat_dim = tensor_storage.ne[0];
|
||||||
|
config.hidden_size = tensor_storage.ne[1];
|
||||||
|
} else if (ends_with(name, "single_stream_layers.0.attn.norm_q.weight") && tensor_storage.n_dims == 1) {
|
||||||
|
detected_head_dim = tensor_storage.ne[0];
|
||||||
|
} else if (ends_with(name, "double_stream_layers.0.img_self_attn.norm_q.weight") && tensor_storage.n_dims == 1) {
|
||||||
|
detected_head_dim = tensor_storage.ne[0];
|
||||||
|
} else if (ends_with(name, "single_stream_layers.0.attn.to_k.weight") && tensor_storage.n_dims == 2) {
|
||||||
|
detected_kv_dim = tensor_storage.ne[1];
|
||||||
|
} else if (ends_with(name, "double_stream_layers.0.img_instruct_attn.processor.img_to_k.weight") && tensor_storage.n_dims == 2) {
|
||||||
|
detected_kv_dim = tensor_storage.ne[1];
|
||||||
|
} else if (ends_with(name, "norm_out.linear_2.weight") && tensor_storage.n_dims == 2) {
|
||||||
|
int64_t patch_area = config.patch_size * config.patch_size;
|
||||||
|
config.out_channels = tensor_storage.ne[1] / patch_area;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
config.num_layers = std::max<int64_t>(1, count_blocks(tensor_storage_map, prefix, "single_stream_layers."));
|
||||||
|
config.num_double_stream_layers = std::max<int64_t>(0, count_blocks(tensor_storage_map, prefix, "double_stream_layers."));
|
||||||
|
int64_t noise_refiner_layers = count_blocks(tensor_storage_map, prefix, "noise_refiner.");
|
||||||
|
int64_t ref_refiner_layers = count_blocks(tensor_storage_map, prefix, "ref_image_refiner.");
|
||||||
|
int64_t context_refiner_layers = count_blocks(tensor_storage_map, prefix, "context_refiner.");
|
||||||
|
config.num_refiner_layers = std::max<int64_t>(1, std::max(noise_refiner_layers, std::max(ref_refiner_layers, context_refiner_layers)));
|
||||||
|
|
||||||
|
if (detected_head_dim > 0) {
|
||||||
|
config.head_dim = detected_head_dim;
|
||||||
|
config.num_attention_heads = config.hidden_size / config.head_dim;
|
||||||
|
config.axes_dim_sum = config.head_dim;
|
||||||
|
if (detected_kv_dim > 0) {
|
||||||
|
config.num_kv_heads = detected_kv_dim / config.head_dim;
|
||||||
|
}
|
||||||
|
if (config.axes_dim_sum == 120) {
|
||||||
|
config.axes_dim = {40, 40, 40};
|
||||||
|
} else if (config.axes_dim_sum % 3 == 0) {
|
||||||
|
int axis = static_cast<int>(config.axes_dim_sum / 3);
|
||||||
|
config.axes_dim = {axis, axis, axis};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
config.timestep_embed_dim = std::min<int64_t>(config.hidden_size, 1024);
|
||||||
|
|
||||||
|
LOG_DEBUG("boogu_image: layers=%" PRId64 ", double_stream_layers=%" PRId64 ", refiner_layers=%" PRId64 ", hidden=%" PRId64 ", heads=%" PRId64 ", kv_heads=%" PRId64 ", head_dim=%" PRId64 ", in_channels=%" PRId64 ", out_channels=%" PRId64,
|
||||||
|
config.num_layers,
|
||||||
|
config.num_double_stream_layers,
|
||||||
|
config.num_refiner_layers,
|
||||||
|
config.hidden_size,
|
||||||
|
config.num_attention_heads,
|
||||||
|
config.num_kv_heads,
|
||||||
|
config.head_dim,
|
||||||
|
config.in_channels,
|
||||||
|
config.out_channels);
|
||||||
|
return config;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
__STATIC_INLINE__ ggml_tensor* scale_modulate(ggml_context* ctx, ggml_tensor* x, ggml_tensor* scale) {
|
||||||
|
scale = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]);
|
||||||
|
return ggml_add(ctx, x, ggml_mul(ctx, x, scale));
|
||||||
|
}
|
||||||
|
|
||||||
|
__STATIC_INLINE__ ggml_tensor* gate_residual(ggml_context* ctx, ggml_tensor* residual, ggml_tensor* x, ggml_tensor* gate) {
|
||||||
|
gate = ggml_tanh(ctx, gate);
|
||||||
|
gate = ggml_reshape_3d(ctx, gate, gate->ne[0], 1, gate->ne[1]);
|
||||||
|
x = ggml_mul(ctx, x, gate);
|
||||||
|
return ggml_add(ctx, residual, x);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct LuminaCombinedTimestepCaptionEmbedding : public GGMLBlock {
|
||||||
|
int64_t frequency_embedding_size;
|
||||||
|
float timestep_scale;
|
||||||
|
|
||||||
|
LuminaCombinedTimestepCaptionEmbedding(int64_t hidden_size,
|
||||||
|
int64_t instruction_feat_dim,
|
||||||
|
int64_t frequency_embedding_size,
|
||||||
|
float norm_eps,
|
||||||
|
float timestep_scale)
|
||||||
|
: frequency_embedding_size(frequency_embedding_size),
|
||||||
|
timestep_scale(timestep_scale) {
|
||||||
|
blocks["timestep_embedder"] = std::make_shared<Qwen::TimestepEmbedding>(frequency_embedding_size, std::min<int64_t>(hidden_size, 1024));
|
||||||
|
blocks["caption_embedder.0"] = std::make_shared<RMSNorm>(instruction_feat_dim, norm_eps);
|
||||||
|
blocks["caption_embedder.1"] = std::make_shared<Linear>(instruction_feat_dim, hidden_size, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx, ggml_tensor* timestep, ggml_tensor* text_hidden_states) {
|
||||||
|
auto timestep_embedder = std::dynamic_pointer_cast<Qwen::TimestepEmbedding>(blocks["timestep_embedder"]);
|
||||||
|
auto caption_embedder_0 = std::dynamic_pointer_cast<RMSNorm>(blocks["caption_embedder.0"]);
|
||||||
|
auto caption_embedder_1 = std::dynamic_pointer_cast<Linear>(blocks["caption_embedder.1"]);
|
||||||
|
|
||||||
|
auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(frequency_embedding_size), 10000, timestep_scale);
|
||||||
|
auto time_embed = timestep_embedder->forward(ctx, timestep_proj);
|
||||||
|
auto caption_embed = caption_embedder_1->forward(ctx, caption_embedder_0->forward(ctx, text_hidden_states));
|
||||||
|
return {time_embed, caption_embed};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct LuminaRMSNormZero : public GGMLBlock {
|
||||||
|
LuminaRMSNormZero(int64_t embedding_dim, int64_t conditioning_embedding_dim, float norm_eps) {
|
||||||
|
blocks["linear"] = std::make_shared<Linear>(conditioning_embedding_dim, 4 * embedding_dim, true);
|
||||||
|
blocks["norm"] = std::make_shared<RMSNorm>(embedding_dim, norm_eps);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* emb) {
|
||||||
|
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
|
||||||
|
auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
|
||||||
|
|
||||||
|
emb = linear->forward(ctx, ggml_silu(ctx->ggml_ctx, emb));
|
||||||
|
auto mods = ggml_ext_chunk(ctx->ggml_ctx, emb, 4, 0);
|
||||||
|
|
||||||
|
auto scale_msa = mods[0];
|
||||||
|
auto gate_msa = mods[1];
|
||||||
|
auto scale_mlp = mods[2];
|
||||||
|
auto gate_mlp = mods[3];
|
||||||
|
|
||||||
|
x = scale_modulate(ctx->ggml_ctx, norm->forward(ctx, x), scale_msa);
|
||||||
|
return {x, gate_msa, scale_mlp, gate_mlp};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct LuminaFeedForward : public GGMLBlock {
|
||||||
|
LuminaFeedForward(int64_t dim, int64_t inner_dim, int64_t multiple_of) {
|
||||||
|
inner_dim = multiple_of * ((inner_dim + multiple_of - 1) / multiple_of);
|
||||||
|
blocks["linear_1"] = std::make_shared<Linear>(dim, inner_dim, false);
|
||||||
|
blocks["linear_2"] = std::make_shared<Linear>(inner_dim, dim, false);
|
||||||
|
blocks["linear_3"] = std::make_shared<Linear>(dim, inner_dim, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
||||||
|
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
|
||||||
|
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
|
||||||
|
auto linear_3 = std::dynamic_pointer_cast<Linear>(blocks["linear_3"]);
|
||||||
|
|
||||||
|
if (sd_backend_is(ctx->backend, "Vulkan")) {
|
||||||
|
linear_2->set_force_prec_f32(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto h1 = linear_1->forward(ctx, x);
|
||||||
|
auto h2 = linear_3->forward(ctx, x);
|
||||||
|
x = ggml_swiglu_split(ctx->ggml_ctx, h1, h2);
|
||||||
|
x = linear_2->forward(ctx, x);
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct LuminaLayerNormContinuous : public GGMLBlock {
|
||||||
|
LuminaLayerNormContinuous(int64_t embedding_dim,
|
||||||
|
int64_t conditioning_embedding_dim,
|
||||||
|
int64_t out_dim) {
|
||||||
|
blocks["linear_1"] = std::make_shared<Linear>(conditioning_embedding_dim, embedding_dim, true);
|
||||||
|
blocks["norm"] = std::make_shared<LayerNorm>(embedding_dim, 1e-6f, false);
|
||||||
|
blocks["linear_2"] = std::make_shared<Linear>(embedding_dim, out_dim, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* conditioning_embedding) {
|
||||||
|
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
|
||||||
|
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
|
||||||
|
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
|
||||||
|
|
||||||
|
auto emb = linear_1->forward(ctx, ggml_silu(ctx->ggml_ctx, conditioning_embedding));
|
||||||
|
x = scale_modulate(ctx->ggml_ctx, norm->forward(ctx, x), emb);
|
||||||
|
x = linear_2->forward(ctx, x);
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Attention : public GGMLBlock {
|
||||||
|
int64_t dim_head;
|
||||||
|
int64_t heads;
|
||||||
|
int64_t kv_heads;
|
||||||
|
|
||||||
|
Attention(int64_t query_dim, int64_t dim_head, int64_t heads, int64_t kv_heads, float eps = 1e-5f)
|
||||||
|
: dim_head(dim_head), heads(heads), kv_heads(kv_heads) {
|
||||||
|
blocks["to_q"] = std::make_shared<Linear>(query_dim, heads * dim_head, false);
|
||||||
|
blocks["to_k"] = std::make_shared<Linear>(query_dim, kv_heads * dim_head, false);
|
||||||
|
blocks["to_v"] = std::make_shared<Linear>(query_dim, kv_heads * dim_head, false);
|
||||||
|
blocks["norm_q"] = std::make_shared<RMSNorm>(dim_head, eps);
|
||||||
|
blocks["norm_k"] = std::make_shared<RMSNorm>(dim_head, eps);
|
||||||
|
blocks["to_out.0"] = std::make_shared<Linear>(heads * dim_head, query_dim, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* hidden_states,
|
||||||
|
ggml_tensor* encoder_hidden_states,
|
||||||
|
ggml_tensor* rotary_emb,
|
||||||
|
ggml_tensor* attention_mask = nullptr) {
|
||||||
|
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
|
||||||
|
auto to_k = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
|
||||||
|
auto to_v = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
|
||||||
|
auto norm_q = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
|
||||||
|
auto norm_k = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
|
||||||
|
auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
|
||||||
|
|
||||||
|
if (sd_backend_is(ctx->backend, "Vulkan")) {
|
||||||
|
to_out_0->set_force_prec_f32(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t N = hidden_states->ne[2];
|
||||||
|
int64_t Lq = hidden_states->ne[1];
|
||||||
|
int64_t Lk = encoder_hidden_states->ne[1];
|
||||||
|
|
||||||
|
auto q = to_q->forward(ctx, hidden_states);
|
||||||
|
q = ggml_reshape_4d(ctx->ggml_ctx, q, dim_head, heads, Lq, N);
|
||||||
|
auto k = to_k->forward(ctx, encoder_hidden_states);
|
||||||
|
k = ggml_reshape_4d(ctx->ggml_ctx, k, dim_head, kv_heads, Lk, N);
|
||||||
|
auto v = to_v->forward(ctx, encoder_hidden_states);
|
||||||
|
v = ggml_reshape_4d(ctx->ggml_ctx, v, dim_head, kv_heads, Lk, N);
|
||||||
|
|
||||||
|
q = norm_q->forward(ctx, q);
|
||||||
|
k = norm_k->forward(ctx, k);
|
||||||
|
|
||||||
|
auto out = Rope::attention(ctx, q, k, v, rotary_emb, attention_mask);
|
||||||
|
out = to_out_0->forward(ctx, out);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct BooguImageTransformerBlock : public GGMLBlock {
|
||||||
|
bool modulation;
|
||||||
|
|
||||||
|
BooguImageTransformerBlock(int64_t dim,
|
||||||
|
int64_t num_attention_heads,
|
||||||
|
int64_t num_kv_heads,
|
||||||
|
int64_t multiple_of,
|
||||||
|
float norm_eps,
|
||||||
|
bool modulation)
|
||||||
|
: modulation(modulation) {
|
||||||
|
int64_t head_dim = dim / num_attention_heads;
|
||||||
|
blocks["attn"] = std::make_shared<Attention>(dim, head_dim, num_attention_heads, num_kv_heads, 1e-5f);
|
||||||
|
blocks["feed_forward"] = std::make_shared<LuminaFeedForward>(dim, 4 * dim, multiple_of);
|
||||||
|
if (modulation) {
|
||||||
|
blocks["norm1"] = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
|
||||||
|
} else {
|
||||||
|
blocks["norm1"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
}
|
||||||
|
blocks["ffn_norm1"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["norm2"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["ffn_norm2"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* hidden_states,
|
||||||
|
ggml_tensor* rotary_emb,
|
||||||
|
ggml_tensor* temb = nullptr,
|
||||||
|
ggml_tensor* attention_mask = nullptr) {
|
||||||
|
auto attn = std::dynamic_pointer_cast<Attention>(blocks["attn"]);
|
||||||
|
auto feed_forward = std::dynamic_pointer_cast<LuminaFeedForward>(blocks["feed_forward"]);
|
||||||
|
auto ffn_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm1"]);
|
||||||
|
auto norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["norm2"]);
|
||||||
|
auto ffn_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm2"]);
|
||||||
|
|
||||||
|
if (modulation) {
|
||||||
|
auto norm1 = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["norm1"]);
|
||||||
|
auto mods = norm1->forward(ctx, hidden_states, temb);
|
||||||
|
|
||||||
|
auto norm_hidden_states = std::get<0>(mods);
|
||||||
|
auto gate_msa = std::get<1>(mods);
|
||||||
|
auto scale_mlp = std::get<2>(mods);
|
||||||
|
auto gate_mlp = std::get<3>(mods);
|
||||||
|
|
||||||
|
auto attn_output = attn->forward(ctx, norm_hidden_states, norm_hidden_states, rotary_emb, attention_mask);
|
||||||
|
hidden_states = gate_residual(ctx->ggml_ctx, hidden_states, norm2->forward(ctx, attn_output), gate_msa);
|
||||||
|
|
||||||
|
auto mlp_input = scale_modulate(ctx->ggml_ctx, ffn_norm1->forward(ctx, hidden_states), scale_mlp);
|
||||||
|
auto mlp_output = feed_forward->forward(ctx, mlp_input);
|
||||||
|
hidden_states = gate_residual(ctx->ggml_ctx, hidden_states, ffn_norm2->forward(ctx, mlp_output), gate_mlp);
|
||||||
|
} else {
|
||||||
|
auto norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["norm1"]);
|
||||||
|
|
||||||
|
auto norm_hidden_states = norm1->forward(ctx, hidden_states);
|
||||||
|
auto attn_output = attn->forward(ctx, norm_hidden_states, norm_hidden_states, rotary_emb, attention_mask);
|
||||||
|
hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, norm2->forward(ctx, attn_output));
|
||||||
|
|
||||||
|
auto mlp_output = feed_forward->forward(ctx, ffn_norm1->forward(ctx, hidden_states));
|
||||||
|
hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, ffn_norm2->forward(ctx, mlp_output));
|
||||||
|
}
|
||||||
|
return hidden_states;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct BooguImageJointAttention : public GGMLBlock {
|
||||||
|
int64_t dim_head;
|
||||||
|
int64_t heads;
|
||||||
|
int64_t kv_heads;
|
||||||
|
|
||||||
|
BooguImageJointAttention(int64_t dim, int64_t dim_head, int64_t heads, int64_t kv_heads)
|
||||||
|
: dim_head(dim_head), heads(heads), kv_heads(kv_heads) {
|
||||||
|
blocks["norm_q"] = std::make_shared<RMSNorm>(dim_head, 1e-5f);
|
||||||
|
blocks["norm_k"] = std::make_shared<RMSNorm>(dim_head, 1e-5f);
|
||||||
|
blocks["to_out.0"] = std::make_shared<Linear>(heads * dim_head, dim, false);
|
||||||
|
blocks["processor.img_to_q"] = std::make_shared<Linear>(dim, heads * dim_head, false);
|
||||||
|
blocks["processor.img_to_k"] = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
|
||||||
|
blocks["processor.img_to_v"] = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
|
||||||
|
blocks["processor.instruct_to_q"] = std::make_shared<Linear>(dim, heads * dim_head, false);
|
||||||
|
blocks["processor.instruct_to_k"] = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
|
||||||
|
blocks["processor.instruct_to_v"] = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
|
||||||
|
blocks["processor.instruct_out"] = std::make_shared<Linear>(heads * dim_head, dim, false);
|
||||||
|
blocks["processor.img_out"] = std::make_shared<Linear>(heads * dim_head, dim, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* img_hidden_states,
|
||||||
|
ggml_tensor* instruct_hidden_states,
|
||||||
|
ggml_tensor* rotary_emb,
|
||||||
|
ggml_tensor* attention_mask = nullptr) {
|
||||||
|
auto norm_q = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
|
||||||
|
auto norm_k = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
|
||||||
|
auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
|
||||||
|
auto img_to_q = std::dynamic_pointer_cast<Linear>(blocks["processor.img_to_q"]);
|
||||||
|
auto img_to_k = std::dynamic_pointer_cast<Linear>(blocks["processor.img_to_k"]);
|
||||||
|
auto img_to_v = std::dynamic_pointer_cast<Linear>(blocks["processor.img_to_v"]);
|
||||||
|
auto instruct_to_q = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_to_q"]);
|
||||||
|
auto instruct_to_k = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_to_k"]);
|
||||||
|
auto instruct_to_v = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_to_v"]);
|
||||||
|
auto instruct_out = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_out"]);
|
||||||
|
auto img_out = std::dynamic_pointer_cast<Linear>(blocks["processor.img_out"]);
|
||||||
|
|
||||||
|
if (sd_backend_is(ctx->backend, "Vulkan")) {
|
||||||
|
to_out_0->set_force_prec_f32(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t N = img_hidden_states->ne[2];
|
||||||
|
int64_t L_img = img_hidden_states->ne[1];
|
||||||
|
int64_t L_instruct = instruct_hidden_states->ne[1];
|
||||||
|
|
||||||
|
auto img_q = img_to_q->forward(ctx, img_hidden_states);
|
||||||
|
img_q = ggml_reshape_4d(ctx->ggml_ctx, img_q, dim_head, heads, L_img, N);
|
||||||
|
auto img_k = img_to_k->forward(ctx, img_hidden_states);
|
||||||
|
img_k = ggml_reshape_4d(ctx->ggml_ctx, img_k, dim_head, kv_heads, L_img, N);
|
||||||
|
auto img_v = img_to_v->forward(ctx, img_hidden_states);
|
||||||
|
img_v = ggml_reshape_4d(ctx->ggml_ctx, img_v, dim_head, kv_heads, L_img, N);
|
||||||
|
|
||||||
|
auto instruct_q = instruct_to_q->forward(ctx, instruct_hidden_states);
|
||||||
|
instruct_q = ggml_reshape_4d(ctx->ggml_ctx, instruct_q, dim_head, heads, L_instruct, N);
|
||||||
|
auto instruct_k = instruct_to_k->forward(ctx, instruct_hidden_states);
|
||||||
|
instruct_k = ggml_reshape_4d(ctx->ggml_ctx, instruct_k, dim_head, kv_heads, L_instruct, N);
|
||||||
|
auto instruct_v = instruct_to_v->forward(ctx, instruct_hidden_states);
|
||||||
|
instruct_v = ggml_reshape_4d(ctx->ggml_ctx, instruct_v, dim_head, kv_heads, L_instruct, N);
|
||||||
|
|
||||||
|
auto q = ggml_concat(ctx->ggml_ctx, instruct_q, img_q, 2);
|
||||||
|
auto k = ggml_concat(ctx->ggml_ctx, instruct_k, img_k, 2);
|
||||||
|
auto v = ggml_concat(ctx->ggml_ctx, instruct_v, img_v, 2);
|
||||||
|
q = norm_q->forward(ctx, q);
|
||||||
|
k = norm_k->forward(ctx, k);
|
||||||
|
|
||||||
|
auto hidden_states = Rope::attention(ctx, q, k, v, rotary_emb, attention_mask);
|
||||||
|
auto instruct_attn = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, 0, L_instruct);
|
||||||
|
auto img_attn = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, L_instruct, L_instruct + L_img);
|
||||||
|
|
||||||
|
instruct_attn = instruct_out->forward(ctx, instruct_attn);
|
||||||
|
img_attn = img_out->forward(ctx, img_attn);
|
||||||
|
hidden_states = ggml_concat(ctx->ggml_ctx, instruct_attn, img_attn, 1);
|
||||||
|
hidden_states = to_out_0->forward(ctx, hidden_states);
|
||||||
|
return hidden_states;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct BooguImageDoubleStreamBlock : public GGMLBlock {
|
||||||
|
BooguImageDoubleStreamBlock(int64_t dim,
|
||||||
|
int64_t num_attention_heads,
|
||||||
|
int64_t num_kv_heads,
|
||||||
|
int64_t multiple_of,
|
||||||
|
float norm_eps) {
|
||||||
|
int64_t head_dim = dim / num_attention_heads;
|
||||||
|
blocks["img_instruct_attn"] = std::make_shared<BooguImageJointAttention>(dim, head_dim, num_attention_heads, num_kv_heads);
|
||||||
|
blocks["img_self_attn"] = std::make_shared<Attention>(dim, head_dim, num_attention_heads, num_kv_heads, 1e-5f);
|
||||||
|
blocks["img_feed_forward"] = std::make_shared<LuminaFeedForward>(dim, 4 * dim, multiple_of);
|
||||||
|
blocks["instruct_feed_forward"] = std::make_shared<LuminaFeedForward>(dim, 4 * dim, multiple_of);
|
||||||
|
blocks["img_norm1"] = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
|
||||||
|
blocks["img_norm2"] = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
|
||||||
|
blocks["img_norm3"] = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
|
||||||
|
blocks["instruct_norm1"] = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
|
||||||
|
blocks["instruct_norm2"] = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
|
||||||
|
blocks["img_attn_norm"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["img_self_attn_norm"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["img_ffn_norm1"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["img_ffn_norm2"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["instruct_attn_norm"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["instruct_ffn_norm1"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
blocks["instruct_ffn_norm2"] = std::make_shared<RMSNorm>(dim, norm_eps);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* img_hidden_states,
|
||||||
|
ggml_tensor* instruct_hidden_states,
|
||||||
|
ggml_tensor* joint_rotary_emb,
|
||||||
|
ggml_tensor* img_rotary_emb,
|
||||||
|
ggml_tensor* temb) {
|
||||||
|
auto img_instruct_attn = std::dynamic_pointer_cast<BooguImageJointAttention>(blocks["img_instruct_attn"]);
|
||||||
|
auto img_self_attn = std::dynamic_pointer_cast<Attention>(blocks["img_self_attn"]);
|
||||||
|
auto img_feed_forward = std::dynamic_pointer_cast<LuminaFeedForward>(blocks["img_feed_forward"]);
|
||||||
|
auto instruct_feed_forward = std::dynamic_pointer_cast<LuminaFeedForward>(blocks["instruct_feed_forward"]);
|
||||||
|
auto img_norm1 = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["img_norm1"]);
|
||||||
|
auto img_norm2 = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["img_norm2"]);
|
||||||
|
auto img_norm3 = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["img_norm3"]);
|
||||||
|
auto instruct_norm1 = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["instruct_norm1"]);
|
||||||
|
auto instruct_norm2 = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["instruct_norm2"]);
|
||||||
|
auto img_attn_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["img_attn_norm"]);
|
||||||
|
auto img_self_attn_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["img_self_attn_norm"]);
|
||||||
|
auto img_ffn_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["img_ffn_norm1"]);
|
||||||
|
auto img_ffn_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["img_ffn_norm2"]);
|
||||||
|
auto instruct_attn_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["instruct_attn_norm"]);
|
||||||
|
auto instruct_ffn_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["instruct_ffn_norm1"]);
|
||||||
|
auto instruct_ffn_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["instruct_ffn_norm2"]);
|
||||||
|
|
||||||
|
int64_t L_instruct = instruct_hidden_states->ne[1];
|
||||||
|
|
||||||
|
auto img_norm1_out_vec = img_norm1->forward(ctx, img_hidden_states, temb);
|
||||||
|
auto img_norm2_out_vec = img_norm2->forward(ctx, img_hidden_states, temb);
|
||||||
|
auto img_norm3_out_vec = img_norm3->forward(ctx, img_hidden_states, temb);
|
||||||
|
auto instruct_norm1_out_vec = instruct_norm1->forward(ctx, instruct_hidden_states, temb);
|
||||||
|
auto instruct_norm2_out_vec = instruct_norm2->forward(ctx, instruct_hidden_states, temb);
|
||||||
|
|
||||||
|
auto img_norm1_out = std::get<0>(img_norm1_out_vec);
|
||||||
|
auto img_gate_msa = std::get<1>(img_norm1_out_vec);
|
||||||
|
auto img_scale_mlp = std::get<2>(img_norm1_out_vec);
|
||||||
|
auto img_gate_mlp = std::get<3>(img_norm1_out_vec);
|
||||||
|
|
||||||
|
auto img_norm2_out = std::get<0>(img_norm2_out_vec);
|
||||||
|
auto img_shift_mlp = std::get<1>(img_norm2_out_vec);
|
||||||
|
|
||||||
|
auto img_norm3_out = std::get<0>(img_norm3_out_vec);
|
||||||
|
auto img_gate_self = std::get<1>(img_norm3_out_vec);
|
||||||
|
|
||||||
|
auto instruct_norm1_out = std::get<0>(instruct_norm1_out_vec);
|
||||||
|
auto instruct_gate_msa = std::get<1>(instruct_norm1_out_vec);
|
||||||
|
auto instruct_scale_mlp = std::get<2>(instruct_norm1_out_vec);
|
||||||
|
auto instruct_gate_mlp = std::get<3>(instruct_norm1_out_vec);
|
||||||
|
|
||||||
|
auto instruct_norm2_out = std::get<0>(instruct_norm2_out_vec);
|
||||||
|
auto instruct_shift_mlp = std::get<1>(instruct_norm2_out_vec);
|
||||||
|
|
||||||
|
auto joint_attn_out = img_instruct_attn->forward(ctx, img_norm1_out, instruct_norm1_out, joint_rotary_emb);
|
||||||
|
auto instruct_attn_out = ggml_ext_slice(ctx->ggml_ctx, joint_attn_out, 1, 0, L_instruct);
|
||||||
|
auto img_attn_out = ggml_ext_slice(ctx->ggml_ctx, joint_attn_out, 1, L_instruct, joint_attn_out->ne[1]);
|
||||||
|
|
||||||
|
auto img_self_attn_out = img_self_attn->forward(ctx, img_norm3_out, img_norm3_out, img_rotary_emb);
|
||||||
|
|
||||||
|
img_hidden_states = gate_residual(ctx->ggml_ctx, img_hidden_states, img_attn_norm->forward(ctx, img_attn_out), img_gate_msa);
|
||||||
|
img_hidden_states = gate_residual(ctx->ggml_ctx, img_hidden_states, img_self_attn_norm->forward(ctx, img_self_attn_out), img_gate_self);
|
||||||
|
|
||||||
|
auto img_mlp_input = scale_modulate(ctx->ggml_ctx, img_norm2_out, img_scale_mlp);
|
||||||
|
img_shift_mlp = ggml_reshape_3d(ctx->ggml_ctx, img_shift_mlp, img_shift_mlp->ne[0], 1, img_shift_mlp->ne[1]);
|
||||||
|
img_mlp_input = ggml_add(ctx->ggml_ctx, img_mlp_input, img_shift_mlp);
|
||||||
|
auto img_mlp_out = img_feed_forward->forward(ctx, img_ffn_norm1->forward(ctx, img_mlp_input));
|
||||||
|
img_hidden_states = gate_residual(ctx->ggml_ctx, img_hidden_states, img_ffn_norm2->forward(ctx, img_mlp_out), img_gate_mlp);
|
||||||
|
|
||||||
|
instruct_hidden_states = gate_residual(ctx->ggml_ctx, instruct_hidden_states, instruct_attn_norm->forward(ctx, instruct_attn_out), instruct_gate_msa);
|
||||||
|
auto instruct_mlp_input = scale_modulate(ctx->ggml_ctx, instruct_norm2_out, instruct_scale_mlp);
|
||||||
|
instruct_shift_mlp = ggml_reshape_3d(ctx->ggml_ctx, instruct_shift_mlp, instruct_shift_mlp->ne[0], 1, instruct_shift_mlp->ne[1]);
|
||||||
|
instruct_mlp_input = ggml_add(ctx->ggml_ctx, instruct_mlp_input, instruct_shift_mlp);
|
||||||
|
auto instruct_mlp_out = instruct_feed_forward->forward(ctx, instruct_ffn_norm1->forward(ctx, instruct_mlp_input));
|
||||||
|
instruct_hidden_states = gate_residual(ctx->ggml_ctx, instruct_hidden_states, instruct_ffn_norm2->forward(ctx, instruct_mlp_out), instruct_gate_mlp);
|
||||||
|
|
||||||
|
return {img_hidden_states, instruct_hidden_states};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct BooguImageModel : public GGMLBlock {
|
||||||
|
BooguConfig config;
|
||||||
|
|
||||||
|
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
|
||||||
|
GGML_UNUSED(tensor_storage_map);
|
||||||
|
GGML_UNUSED(prefix);
|
||||||
|
params["image_index_embedding"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, config.hidden_size, 5);
|
||||||
|
}
|
||||||
|
|
||||||
|
BooguImageModel() = default;
|
||||||
|
BooguImageModel(BooguConfig config)
|
||||||
|
: config(std::move(config)) {
|
||||||
|
blocks["x_embedder"] = std::make_shared<Linear>(this->config.patch_size * this->config.patch_size * this->config.in_channels, this->config.hidden_size, true);
|
||||||
|
blocks["ref_image_patch_embedder"] = std::make_shared<Linear>(this->config.patch_size * this->config.patch_size * this->config.in_channels, this->config.hidden_size, true);
|
||||||
|
blocks["time_caption_embed"] = std::make_shared<LuminaCombinedTimestepCaptionEmbedding>(this->config.hidden_size,
|
||||||
|
this->config.instruction_feat_dim,
|
||||||
|
256,
|
||||||
|
this->config.norm_eps,
|
||||||
|
this->config.timestep_scale);
|
||||||
|
|
||||||
|
for (int i = 0; i < this->config.num_refiner_layers; i++) {
|
||||||
|
blocks["noise_refiner." + std::to_string(i)] = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
|
||||||
|
this->config.num_attention_heads,
|
||||||
|
this->config.num_kv_heads,
|
||||||
|
this->config.multiple_of,
|
||||||
|
this->config.norm_eps,
|
||||||
|
true);
|
||||||
|
blocks["ref_image_refiner." + std::to_string(i)] = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
|
||||||
|
this->config.num_attention_heads,
|
||||||
|
this->config.num_kv_heads,
|
||||||
|
this->config.multiple_of,
|
||||||
|
this->config.norm_eps,
|
||||||
|
true);
|
||||||
|
blocks["context_refiner." + std::to_string(i)] = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
|
||||||
|
this->config.num_attention_heads,
|
||||||
|
this->config.num_kv_heads,
|
||||||
|
this->config.multiple_of,
|
||||||
|
this->config.norm_eps,
|
||||||
|
false);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < this->config.num_double_stream_layers; i++) {
|
||||||
|
blocks["double_stream_layers." + std::to_string(i)] = std::make_shared<BooguImageDoubleStreamBlock>(this->config.hidden_size,
|
||||||
|
this->config.num_attention_heads,
|
||||||
|
this->config.num_kv_heads,
|
||||||
|
this->config.multiple_of,
|
||||||
|
this->config.norm_eps);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < this->config.num_layers; i++) {
|
||||||
|
blocks["single_stream_layers." + std::to_string(i)] = std::make_shared<BooguImageTransformerBlock>(this->config.hidden_size,
|
||||||
|
this->config.num_attention_heads,
|
||||||
|
this->config.num_kv_heads,
|
||||||
|
this->config.multiple_of,
|
||||||
|
this->config.norm_eps,
|
||||||
|
true);
|
||||||
|
}
|
||||||
|
|
||||||
|
blocks["norm_out"] = std::make_shared<LuminaLayerNormContinuous>(this->config.hidden_size,
|
||||||
|
this->config.timestep_embed_dim,
|
||||||
|
this->config.patch_size * this->config.patch_size * this->config.out_channels);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* image_index_embedding(GGMLRunnerContext* ctx, int index) {
|
||||||
|
GGML_ASSERT(index >= 0 && index < 5);
|
||||||
|
auto embedding = params["image_index_embedding"];
|
||||||
|
auto out = ggml_view_1d(ctx->ggml_ctx,
|
||||||
|
embedding,
|
||||||
|
config.hidden_size,
|
||||||
|
index * config.hidden_size * ggml_element_size(embedding));
|
||||||
|
out = ggml_reshape_3d(ctx->ggml_ctx, out, config.hidden_size, 1, 1);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* embed_refs(GGMLRunnerContext* ctx, const std::vector<ggml_tensor*>& ref_latents) {
|
||||||
|
if (ref_latents.empty()) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
auto ref_image_patch_embedder = std::dynamic_pointer_cast<Linear>(blocks["ref_image_patch_embedder"]);
|
||||||
|
|
||||||
|
ggml_tensor* ref_img = nullptr;
|
||||||
|
for (int i = 0; i < static_cast<int>(ref_latents.size()); i++) {
|
||||||
|
auto ref = DiT::pad_and_patchify(ctx, ref_latents[i], config.patch_size, config.patch_size, false);
|
||||||
|
ref = ref_image_patch_embedder->forward(ctx, ref);
|
||||||
|
ref = ggml_add(ctx->ggml_ctx, ref, image_index_embedding(ctx, std::min(i, 4)));
|
||||||
|
ref_img = ref_img == nullptr ? ref : ggml_concat(ctx->ggml_ctx, ref_img, ref, 1);
|
||||||
|
}
|
||||||
|
return ref_img;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* x,
|
||||||
|
ggml_tensor* timesteps,
|
||||||
|
ggml_tensor* context,
|
||||||
|
ggml_tensor* pe,
|
||||||
|
std::vector<ggml_tensor*> ref_latents = {}) {
|
||||||
|
int64_t W = x->ne[0];
|
||||||
|
int64_t H = x->ne[1];
|
||||||
|
int64_t N = x->ne[3];
|
||||||
|
GGML_ASSERT(N == 1);
|
||||||
|
|
||||||
|
auto x_embedder = std::dynamic_pointer_cast<Linear>(blocks["x_embedder"]);
|
||||||
|
auto time_caption_embed = std::dynamic_pointer_cast<LuminaCombinedTimestepCaptionEmbedding>(blocks["time_caption_embed"]);
|
||||||
|
auto norm_out = std::dynamic_pointer_cast<LuminaLayerNormContinuous>(blocks["norm_out"]);
|
||||||
|
|
||||||
|
auto timestep = ggml_sub(ctx->ggml_ctx, ggml_ext_ones_like(ctx->ggml_ctx, timesteps), timesteps);
|
||||||
|
auto embeds = time_caption_embed->forward(ctx, timestep, context);
|
||||||
|
auto temb = embeds.first;
|
||||||
|
auto txt = embeds.second;
|
||||||
|
|
||||||
|
auto img = DiT::pad_and_patchify(ctx, x, config.patch_size, config.patch_size, false);
|
||||||
|
int64_t img_len = img->ne[1];
|
||||||
|
img = x_embedder->forward(ctx, img);
|
||||||
|
auto ref_img = embed_refs(ctx, ref_latents);
|
||||||
|
int64_t ref_len = ref_img != nullptr ? ref_img->ne[1] : 0;
|
||||||
|
int64_t txt_len = txt->ne[1];
|
||||||
|
|
||||||
|
GGML_ASSERT(pe->ne[3] == txt_len + ref_len + img_len);
|
||||||
|
auto txt_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, 0, txt_len);
|
||||||
|
auto noise_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len + ref_len, txt_len + ref_len + img_len);
|
||||||
|
|
||||||
|
for (int i = 0; i < config.num_refiner_layers; i++) {
|
||||||
|
auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["context_refiner." + std::to_string(i)]);
|
||||||
|
txt = block->forward(ctx, txt, txt_pe);
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(txt, "boogu.context_refiner." + std::to_string(i), "txt");
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < config.num_refiner_layers; i++) {
|
||||||
|
auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["noise_refiner." + std::to_string(i)]);
|
||||||
|
img = block->forward(ctx, img, noise_pe, temb);
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(img, "boogu.noise_refiner." + std::to_string(i), "img");
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* combined_img = img;
|
||||||
|
if (ref_img != nullptr) {
|
||||||
|
auto ref_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len, txt_len + ref_len);
|
||||||
|
for (int i = 0; i < config.num_refiner_layers; i++) {
|
||||||
|
auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["ref_image_refiner." + std::to_string(i)]);
|
||||||
|
ref_img = block->forward(ctx, ref_img, ref_pe, temb);
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(ref_img, "boogu.ref_image_refiner." + std::to_string(i), "ref_img");
|
||||||
|
}
|
||||||
|
combined_img = ggml_concat(ctx->ggml_ctx, ref_img, img, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto img_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len, txt_len + combined_img->ne[1]);
|
||||||
|
for (int i = 0; i < config.num_double_stream_layers; i++) {
|
||||||
|
auto block = std::dynamic_pointer_cast<BooguImageDoubleStreamBlock>(blocks["double_stream_layers." + std::to_string(i)]);
|
||||||
|
auto result = block->forward(ctx, combined_img, txt, pe, img_pe, temb);
|
||||||
|
combined_img = result.first;
|
||||||
|
txt = result.second;
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(combined_img, "boogu.double_stream_layers." + std::to_string(i), "img");
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(txt, "boogu.double_stream_layers." + std::to_string(i), "txt");
|
||||||
|
}
|
||||||
|
|
||||||
|
auto hidden_states = ggml_concat(ctx->ggml_ctx, txt, combined_img, 1);
|
||||||
|
for (int i = 0; i < config.num_layers; i++) {
|
||||||
|
auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["single_stream_layers." + std::to_string(i)]);
|
||||||
|
hidden_states = block->forward(ctx, hidden_states, pe, temb);
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(hidden_states, "boogu.single_stream_layers." + std::to_string(i), "hidden_states");
|
||||||
|
}
|
||||||
|
|
||||||
|
hidden_states = norm_out->forward(ctx, hidden_states, temb);
|
||||||
|
hidden_states = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, hidden_states->ne[1] - img_len, hidden_states->ne[1]);
|
||||||
|
hidden_states = DiT::unpatchify_and_crop(ctx->ggml_ctx, hidden_states, H, W, config.patch_size, config.patch_size, false);
|
||||||
|
hidden_states = ggml_ext_scale(ctx->ggml_ctx, hidden_states, -1.f);
|
||||||
|
return hidden_states;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Number of patches needed to cover `size` elements with patches of
// `patch_size` elements, i.e. `size` is padded up to the next multiple of
// `patch_size` before dividing. `size` is narrowed to int first.
__STATIC_INLINE__ int patched_token_count(int64_t size, int patch_size) {
    const int extent    = static_cast<int>(size);
    const int remainder = extent % patch_size;
    // Padding that brings `extent` up to a multiple of patch_size (0 if already aligned).
    const int padding = (patch_size - remainder) % patch_size;
    return (extent + padding) / patch_size;
}
|
||||||
|
|
||||||
|
// Appends position ids for an h_tokens x w_tokens grid to `ids`, repeated `bs`
// times. Each id is {pe_shift, row, column}; the grid is emitted row by row.
__STATIC_INLINE__ void append_spatial_ids(std::vector<std::vector<float>>& ids,
                                          int bs,
                                          int pe_shift,
                                          int h_tokens,
                                          int w_tokens) {
    // One copy of the grid, zero-initialised and then filled in row-major order.
    std::vector<std::vector<float>> grid(h_tokens * w_tokens, std::vector<float>(3, 0.0f));
    const float shift = static_cast<float>(pe_shift);
    for (int row = 0; row < h_tokens; ++row) {
        for (int col = 0; col < w_tokens; ++col) {
            std::vector<float>& cell = grid[row * w_tokens + col];
            cell[0]                  = shift;
            cell[1]                  = static_cast<float>(row);
            cell[2]                  = static_cast<float>(col);
        }
    }

    // Emit the same grid once per batch entry.
    int remaining = bs;
    while (remaining > 0) {
        ids.insert(ids.end(), grid.begin(), grid.end());
        --remaining;
    }
}
|
||||||
|
|
||||||
|
// Builds the rotary position table for a Boogu forward pass.
//
// Position ids are 3-component vectors, appended in this order:
//   1. context tokens: {i, i, i} for i in [0, context_len)
//   2. each reference latent: a spatial grid whose first component is a running
//      shift; the shift starts at context_len and grows by max(grid_h, grid_w)
//      after every reference
//   3. the target latent of size h x w: a spatial grid using the final shift
// The ids are then converted by Rope::embed_nd using `theta` and `axes_dim`.
//
// NOTE(review): for bs > 1 the ids are grouped per segment (all batch copies of
// the context, then all copies of each grid) rather than per batch item —
// confirm this matches the layout Rope::embed_nd expects. The runner in this
// file only ever calls this with a batch of 1.
__STATIC_INLINE__ std::vector<float> gen_boogu_pe(int h,
                                                  int w,
                                                  int patch_size,
                                                  int bs,
                                                  int context_len,
                                                  const std::vector<ggml_tensor*>& ref_latents,
                                                  int theta,
                                                  const std::vector<int>& axes_dim) {
    std::vector<std::vector<float>> ids;
    ids.reserve(static_cast<size_t>(bs) * context_len);

    // 1. Context tokens share the same index on all three axes.
    for (int batch = 0; batch < bs; ++batch) {
        for (int token = 0; token < context_len; ++token) {
            const float position = static_cast<float>(token);
            ids.push_back(std::vector<float>(3, position));
        }
    }

    // 2. Reference latents, each offset past everything that came before it.
    int pe_shift = context_len;
    for (ggml_tensor* ref : ref_latents) {
        const int grid_h = patched_token_count(ref->ne[1], patch_size);
        const int grid_w = patched_token_count(ref->ne[0], patch_size);
        append_spatial_ids(ids, bs, pe_shift, grid_h, grid_w);
        pe_shift += std::max(grid_h, grid_w);
    }

    // 3. Target latent grid.
    const int target_h = patched_token_count(h, patch_size);
    const int target_w = patched_token_count(w, patch_size);
    append_spatial_ids(ids, bs, pe_shift, target_h, target_w);

    return Rope::embed_nd(ids, bs, static_cast<float>(theta), axes_dim);
}
|
||||||
|
|
||||||
|
struct BooguImageRunner : public DiffusionModelRunner {
|
||||||
|
BooguConfig config;
|
||||||
|
BooguImageModel boogu;
|
||||||
|
std::vector<float> pe_vec;
|
||||||
|
|
||||||
|
BooguImageRunner(ggml_backend_t backend,
|
||||||
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
|
const std::string prefix = "",
|
||||||
|
SDVersion version = VERSION_BOOGU_IMAGE,
|
||||||
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: DiffusionModelRunner(backend, prefix, weight_manager),
|
||||||
|
config(BooguConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
||||||
|
boogu = BooguImageModel(config);
|
||||||
|
boogu.init(params_ctx, tensor_storage_map, prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string get_desc() override {
|
||||||
|
return "boogu_image";
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
|
||||||
|
boogu.get_param_tensors(tensors, prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
|
||||||
|
const sd::Tensor<float>& timesteps_tensor,
|
||||||
|
const sd::Tensor<float>& context_tensor,
|
||||||
|
const std::vector<sd::Tensor<float>>& ref_latents_tensor = {}) {
|
||||||
|
ggml_cgraph* gf = new_graph_custom(BOOGU_GRAPH_SIZE);
|
||||||
|
ggml_tensor* x = make_input(x_tensor);
|
||||||
|
ggml_tensor* timesteps = make_input(timesteps_tensor);
|
||||||
|
GGML_ASSERT(x->ne[3] == 1);
|
||||||
|
GGML_ASSERT(!context_tensor.empty());
|
||||||
|
ggml_tensor* context = make_input(context_tensor);
|
||||||
|
|
||||||
|
std::vector<ggml_tensor*> ref_latents;
|
||||||
|
ref_latents.reserve(ref_latents_tensor.size());
|
||||||
|
for (const auto& ref_latent_tensor : ref_latents_tensor) {
|
||||||
|
ref_latents.push_back(make_input(ref_latent_tensor));
|
||||||
|
}
|
||||||
|
|
||||||
|
pe_vec = gen_boogu_pe(static_cast<int>(x->ne[1]),
|
||||||
|
static_cast<int>(x->ne[0]),
|
||||||
|
config.patch_size,
|
||||||
|
static_cast<int>(x->ne[3]),
|
||||||
|
static_cast<int>(context->ne[1]),
|
||||||
|
ref_latents,
|
||||||
|
config.theta,
|
||||||
|
config.axes_dim);
|
||||||
|
int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
|
||||||
|
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
|
||||||
|
set_backend_tensor_data(pe, pe_vec.data());
|
||||||
|
|
||||||
|
auto runner_ctx = get_context();
|
||||||
|
ggml_tensor* out = boogu.forward(&runner_ctx, x, timesteps, context, pe, ref_latents);
|
||||||
|
ggml_build_forward_expand(gf, out);
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
sd::Tensor<float> compute(int n_threads,
|
||||||
|
const sd::Tensor<float>& x,
|
||||||
|
const sd::Tensor<float>& timesteps,
|
||||||
|
const sd::Tensor<float>& context,
|
||||||
|
const std::vector<sd::Tensor<float>>& ref_latents = {}) {
|
||||||
|
auto get_graph = [&]() -> ggml_cgraph* {
|
||||||
|
return build_graph(x, timesteps, context, ref_latents);
|
||||||
|
};
|
||||||
|
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
|
||||||
|
}
|
||||||
|
|
||||||
|
sd::Tensor<float> compute(int n_threads,
|
||||||
|
const DiffusionParams& diffusion_params) override {
|
||||||
|
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||||
|
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||||
|
static const std::vector<sd::Tensor<float>> empty_ref_latents;
|
||||||
|
return compute(n_threads,
|
||||||
|
*diffusion_params.x,
|
||||||
|
*diffusion_params.timesteps,
|
||||||
|
tensor_or_empty(diffusion_params.context),
|
||||||
|
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
} // namespace Boogu
|
||||||
|
|
||||||
|
#endif // __SD_MODEL_DIFFUSION_BOOGU_HPP__
|
||||||
@ -1,8 +1,9 @@
|
|||||||
#ifndef __SD_MODEL_DIFFUSION_CONTROL_HPP__
|
#ifndef __SD_MODEL_DIFFUSION_CONTROL_HPP__
|
||||||
#define __SD_MODEL_DIFFUSION_CONTROL_HPP__
|
#define __SD_MODEL_DIFFUSION_CONTROL_HPP__
|
||||||
|
|
||||||
#include "model/common/block.hpp"
|
#include "model/common/block.hpp"
|
||||||
#include "model_loader.h"
|
#include "model_loader.h"
|
||||||
|
#include "model_manager.h"
|
||||||
|
|
||||||
#define CONTROL_NET_GRAPH_SIZE 1536
|
#define CONTROL_NET_GRAPH_SIZE 1536
|
||||||
|
|
||||||
@ -311,20 +312,24 @@ struct ControlNet : public GGMLRunner {
|
|||||||
ControlNetBlock control_net;
|
ControlNetBlock control_net;
|
||||||
std::string weight_prefix;
|
std::string weight_prefix;
|
||||||
|
|
||||||
ggml_backend_buffer_t control_buffer = nullptr;
|
|
||||||
ggml_context* control_ctx = nullptr;
|
|
||||||
std::vector<ggml_tensor*> control_outputs_ggml;
|
std::vector<ggml_tensor*> control_outputs_ggml;
|
||||||
ggml_tensor* guided_hint_output_ggml = nullptr;
|
ggml_tensor* guided_hint_output_ggml = nullptr;
|
||||||
std::vector<sd::Tensor<float>> controls;
|
std::vector<sd::Tensor<float>> controls;
|
||||||
sd::Tensor<float> guided_hint;
|
|
||||||
bool guided_hint_cached = false;
|
bool guided_hint_cached = false;
|
||||||
|
std::shared_ptr<ModelManager> owned_model_manager;
|
||||||
|
ggml_backend_t params_backend = nullptr;
|
||||||
|
|
||||||
|
static const char* guided_hint_cache_name() {
|
||||||
|
return "controlnet.guided_hint";
|
||||||
|
}
|
||||||
|
|
||||||
ControlNet(ggml_backend_t backend,
|
ControlNet(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
ggml_backend_t params_backend_,
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
SDVersion version = VERSION_SD1,
|
SDVersion version = VERSION_SD1,
|
||||||
const std::string& prefix = "")
|
const std::string& prefix = "",
|
||||||
: GGMLRunner(backend, params_backend), version(version), control_net(version), weight_prefix(prefix) {
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: GGMLRunner(backend, weight_manager), version(version), control_net(version), weight_prefix(prefix), params_backend(params_backend_) {
|
||||||
control_net.init(params_ctx, tensor_storage_map, prefix);
|
control_net.init(params_ctx, tensor_storage_map, prefix);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -332,44 +337,12 @@ struct ControlNet : public GGMLRunner {
|
|||||||
free_control_ctx();
|
free_control_ctx();
|
||||||
}
|
}
|
||||||
|
|
||||||
void alloc_control_ctx(std::vector<ggml_tensor*> outs) {
|
|
||||||
ggml_init_params params;
|
|
||||||
params.mem_size = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
|
|
||||||
params.mem_buffer = nullptr;
|
|
||||||
params.no_alloc = true;
|
|
||||||
control_ctx = ggml_init(params);
|
|
||||||
|
|
||||||
control_outputs_ggml.resize(outs.size() - 1);
|
|
||||||
|
|
||||||
size_t control_buffer_size = 0;
|
|
||||||
|
|
||||||
guided_hint_output_ggml = ggml_dup_tensor(control_ctx, outs[0]);
|
|
||||||
control_buffer_size += ggml_nbytes(guided_hint_output_ggml);
|
|
||||||
|
|
||||||
for (int i = 0; i < outs.size() - 1; i++) {
|
|
||||||
control_outputs_ggml[i] = ggml_dup_tensor(control_ctx, outs[i + 1]);
|
|
||||||
control_buffer_size += ggml_nbytes(control_outputs_ggml[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend);
|
|
||||||
|
|
||||||
LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
|
|
||||||
}
|
|
||||||
|
|
||||||
void free_control_ctx() {
|
void free_control_ctx() {
|
||||||
if (control_buffer != nullptr) {
|
|
||||||
ggml_backend_buffer_free(control_buffer);
|
|
||||||
control_buffer = nullptr;
|
|
||||||
}
|
|
||||||
if (control_ctx != nullptr) {
|
|
||||||
ggml_free(control_ctx);
|
|
||||||
control_ctx = nullptr;
|
|
||||||
}
|
|
||||||
guided_hint_output_ggml = nullptr;
|
guided_hint_output_ggml = nullptr;
|
||||||
guided_hint_cached = false;
|
guided_hint_cached = false;
|
||||||
guided_hint = {};
|
|
||||||
control_outputs_ggml.clear();
|
control_outputs_ggml.clear();
|
||||||
controls.clear();
|
controls.clear();
|
||||||
|
free_cache_ctx_and_buffer();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string get_desc() override {
|
std::string get_desc() override {
|
||||||
@ -393,11 +366,17 @@ struct ControlNet : public GGMLRunner {
|
|||||||
ggml_tensor* context = make_optional_input(context_tensor);
|
ggml_tensor* context = make_optional_input(context_tensor);
|
||||||
ggml_tensor* y = make_optional_input(y_tensor);
|
ggml_tensor* y = make_optional_input(y_tensor);
|
||||||
|
|
||||||
|
guided_hint_output_ggml = nullptr;
|
||||||
|
control_outputs_ggml.clear();
|
||||||
|
|
||||||
ggml_tensor* guided_hint_input = nullptr;
|
ggml_tensor* guided_hint_input = nullptr;
|
||||||
if (guided_hint_cached && !guided_hint.empty()) {
|
if (guided_hint_cached) {
|
||||||
guided_hint_input = make_input(guided_hint);
|
guided_hint_input = get_cache_tensor_by_name(guided_hint_cache_name());
|
||||||
hint = nullptr;
|
if (guided_hint_input == nullptr) {
|
||||||
} else {
|
guided_hint_cached = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (guided_hint_input == nullptr) {
|
||||||
hint = make_input(hint_tensor);
|
hint = make_input(hint_tensor);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -411,13 +390,19 @@ struct ControlNet : public GGMLRunner {
|
|||||||
context,
|
context,
|
||||||
y);
|
y);
|
||||||
|
|
||||||
if (control_ctx == nullptr) {
|
if (guided_hint_input == nullptr && !outs.empty()) {
|
||||||
alloc_control_ctx(outs);
|
guided_hint_output_ggml = outs[0];
|
||||||
|
ggml_set_output(guided_hint_output_ggml);
|
||||||
|
cache(guided_hint_cache_name(), guided_hint_output_ggml);
|
||||||
|
ggml_build_forward_expand(gf, guided_hint_output_ggml);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint_output_ggml));
|
control_outputs_ggml.reserve(outs.size() > 0 ? outs.size() - 1 : 0);
|
||||||
for (int i = 0; i < outs.size() - 1; i++) {
|
for (size_t i = 1; i < outs.size(); i++) {
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], control_outputs_ggml[i]));
|
ggml_tensor* control_output = outs[i];
|
||||||
|
ggml_set_output(control_output);
|
||||||
|
ggml_build_forward_expand(gf, control_output);
|
||||||
|
control_outputs_ggml.push_back(control_output);
|
||||||
}
|
}
|
||||||
|
|
||||||
return gf;
|
return gf;
|
||||||
@ -437,15 +422,12 @@ struct ControlNet : public GGMLRunner {
|
|||||||
return build_graph(x, hint, timesteps, context, y);
|
return build_graph(x, hint, timesteps, context, y);
|
||||||
};
|
};
|
||||||
|
|
||||||
auto compute_result = GGMLRunner::compute<float>(get_graph, n_threads, false, false, false);
|
auto compute_result = GGMLRunner::compute<float>(get_graph, n_threads, false, false, false, true);
|
||||||
if (!compute_result.has_value()) {
|
if (!compute_result.has_value()) {
|
||||||
return std::nullopt;
|
return std::nullopt;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (guided_hint_output_ggml != nullptr) {
|
guided_hint_cached = get_cache_tensor_by_name(guided_hint_cache_name()) != nullptr;
|
||||||
guided_hint = restore_trailing_singleton_dims(sd::make_sd_tensor_from_ggml<float>(guided_hint_output_ggml),
|
|
||||||
4);
|
|
||||||
}
|
|
||||||
controls.clear();
|
controls.clear();
|
||||||
controls.reserve(control_outputs_ggml.size());
|
controls.reserve(control_outputs_ggml.size());
|
||||||
for (ggml_tensor* control : control_outputs_ggml) {
|
for (ggml_tensor* control : control_outputs_ggml) {
|
||||||
@ -453,37 +435,40 @@ struct ControlNet : public GGMLRunner {
|
|||||||
GGML_ASSERT(!control_host.empty());
|
GGML_ASSERT(!control_host.empty());
|
||||||
controls.push_back(std::move(control_host));
|
controls.push_back(std::move(control_host));
|
||||||
}
|
}
|
||||||
guided_hint_cached = true;
|
|
||||||
return controls;
|
return controls;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool load_from_file(const std::string& file_path, int n_threads) {
|
bool load_from_file(const std::string& file_path, int n_threads) {
|
||||||
LOG_INFO("loading control net from '%s'", file_path.c_str());
|
LOG_INFO("loading control net from '%s'", file_path.c_str());
|
||||||
if (!alloc_params_buffer()) {
|
|
||||||
LOG_ERROR("control net model buffer allocation failed");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::map<std::string, ggml_tensor*> tensors;
|
std::map<std::string, ggml_tensor*> tensors;
|
||||||
control_net.get_param_tensors(tensors);
|
control_net.get_param_tensors(tensors);
|
||||||
std::set<std::string> ignore_tensors;
|
|
||||||
|
|
||||||
ModelLoader model_loader;
|
auto manager = std::dynamic_pointer_cast<ModelManager>(weight_manager.lock());
|
||||||
|
if (manager == nullptr) {
|
||||||
|
owned_model_manager = std::make_shared<ModelManager>();
|
||||||
|
weight_manager = owned_model_manager;
|
||||||
|
manager = owned_model_manager;
|
||||||
|
}
|
||||||
|
|
||||||
|
ModelLoader& model_loader = manager->loader();
|
||||||
if (!model_loader.init_from_file_and_convert_name(file_path)) {
|
if (!model_loader.init_from_file_and_convert_name(file_path)) {
|
||||||
LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
|
LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
model_loader.set_n_threads(n_threads);
|
manager->set_n_threads(n_threads);
|
||||||
bool success = model_loader.load_tensors(tensors, ignore_tensors);
|
if (!manager->register_param_tensors("ControlNet",
|
||||||
|
std::move(tensors),
|
||||||
if (!success) {
|
ModelManager::ResidencyMode::ParamBackend,
|
||||||
LOG_ERROR("load control net tensors from model loader failed");
|
runtime_backend,
|
||||||
|
params_backend) ||
|
||||||
|
!manager->validate_registered_tensors()) {
|
||||||
|
LOG_ERROR("register control net tensors with model manager failed");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_INFO("control net model loaded");
|
LOG_INFO("control net model loaded");
|
||||||
return success;
|
return true;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -162,6 +162,8 @@ namespace ErnieImage {
|
|||||||
int64_t S = x->ne[1];
|
int64_t S = x->ne[1];
|
||||||
int64_t N = x->ne[2];
|
int64_t N = x->ne[2];
|
||||||
|
|
||||||
|
float scale = (sd_backend_is(ctx->backend, "Vulkan") && ctx->flash_attn_enabled) ? 1.0f / 32.0f : 1.0f;
|
||||||
|
|
||||||
auto q = to_q->forward(ctx, x);
|
auto q = to_q->forward(ctx, x);
|
||||||
auto k = to_k->forward(ctx, x);
|
auto k = to_k->forward(ctx, x);
|
||||||
auto v = to_v->forward(ctx, x);
|
auto v = to_v->forward(ctx, x);
|
||||||
@ -182,7 +184,7 @@ namespace ErnieImage {
|
|||||||
k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 0, 2, 1, 3)); // [N, heads, S, head_dim]
|
k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 0, 2, 1, 3)); // [N, heads, S, head_dim]
|
||||||
k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);
|
k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);
|
||||||
|
|
||||||
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled); // [N, S, hidden_size]
|
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled, scale); // [N, S, hidden_size]
|
||||||
x = to_out_0->forward(ctx, x);
|
x = to_out_0->forward(ctx, x);
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
@ -387,10 +389,10 @@ namespace ErnieImage {
|
|||||||
std::vector<float> pe_vec;
|
std::vector<float> pe_vec;
|
||||||
|
|
||||||
ErnieImageRunner(ggml_backend_t backend,
|
ErnieImageRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string prefix = "")
|
const std::string prefix = "",
|
||||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: DiffusionModelRunner(backend, prefix, weight_manager),
|
||||||
config(ErnieImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
config(ErnieImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
||||||
ernie_image = ErnieImageModel(config);
|
ernie_image = ErnieImageModel(config);
|
||||||
ernie_image.init(params_ctx, tensor_storage_map, prefix);
|
ernie_image.init(params_ctx, tensor_storage_map, prefix);
|
||||||
|
|||||||
@ -4,6 +4,7 @@
|
|||||||
#include <memory>
|
#include <memory>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "model/adapter/pulid.hpp"
|
||||||
#include "model/common/rope.hpp"
|
#include "model/common/rope.hpp"
|
||||||
#include "model/diffusion/dit.hpp"
|
#include "model/diffusion/dit.hpp"
|
||||||
#include "model/diffusion/model.hpp"
|
#include "model/diffusion/model.hpp"
|
||||||
@ -49,6 +50,10 @@ namespace Flux {
|
|||||||
float ref_index_scale = 1.f;
|
float ref_index_scale = 1.f;
|
||||||
ChromaRadianceConfig chroma_radiance_params;
|
ChromaRadianceConfig chroma_radiance_params;
|
||||||
|
|
||||||
|
bool pulid_enabled = false;
|
||||||
|
int pulid_double_interval = 2;
|
||||||
|
int pulid_single_interval = 4;
|
||||||
|
|
||||||
static FluxConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
|
static FluxConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
|
||||||
const std::string& prefix,
|
const std::string& prefix,
|
||||||
SDVersion version = VERSION_FLUX) {
|
SDVersion version = VERSION_FLUX) {
|
||||||
@ -138,6 +143,9 @@ namespace Flux {
|
|||||||
if (ends_with(name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
|
if (ends_with(name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
|
||||||
head_dim = tensor_storage.ne[0];
|
head_dim = tensor_storage.ne[0];
|
||||||
}
|
}
|
||||||
|
if (name.find("pulid_ca.") != std::string::npos) {
|
||||||
|
config.pulid_enabled = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (actual_radiance_patch_size > 0 && actual_radiance_patch_size != config.patch_size) {
|
if (actual_radiance_patch_size > 0 && actual_radiance_patch_size != config.patch_size) {
|
||||||
GGML_ASSERT(config.patch_size == 2 * actual_radiance_patch_size);
|
GGML_ASSERT(config.patch_size == 2 * actual_radiance_patch_size);
|
||||||
@ -957,6 +965,20 @@ namespace Flux {
|
|||||||
blocks["double_stream_modulation_txt"] = std::make_shared<Modulation>(config.hidden_size, true, !config.disable_bias);
|
blocks["double_stream_modulation_txt"] = std::make_shared<Modulation>(config.hidden_size, true, !config.disable_bias);
|
||||||
blocks["single_stream_modulation"] = std::make_shared<Modulation>(config.hidden_size, false, !config.disable_bias);
|
blocks["single_stream_modulation"] = std::make_shared<Modulation>(config.hidden_size, false, !config.disable_bias);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (config.pulid_enabled) {
|
||||||
|
int num_double_ca = (config.depth + config.pulid_double_interval - 1) / config.pulid_double_interval;
|
||||||
|
int num_single_ca = (config.depth_single_blocks + config.pulid_single_interval - 1) / config.pulid_single_interval;
|
||||||
|
int num_ca = num_double_ca + num_single_ca;
|
||||||
|
for (int i = 0; i < num_ca; i++) {
|
||||||
|
blocks["pulid_ca." + std::to_string(i)] =
|
||||||
|
std::shared_ptr<GGMLBlock>(new PuLIDPerceiverAttentionCA(
|
||||||
|
/*dim=*/config.hidden_size,
|
||||||
|
/*dim_head=*/PuLIDPerceiverAttentionCA::DEFAULT_DIM_HEAD,
|
||||||
|
/*heads=*/PuLIDPerceiverAttentionCA::DEFAULT_HEADS,
|
||||||
|
/*kv_dim=*/PuLIDPerceiverAttentionCA::DEFAULT_KV_DIM));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
|
ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
|
||||||
@ -967,7 +989,9 @@ namespace Flux {
|
|||||||
ggml_tensor* guidance,
|
ggml_tensor* guidance,
|
||||||
ggml_tensor* pe,
|
ggml_tensor* pe,
|
||||||
ggml_tensor* mod_index_arange = nullptr,
|
ggml_tensor* mod_index_arange = nullptr,
|
||||||
std::vector<int> skip_layers = {}) {
|
std::vector<int> skip_layers = {},
|
||||||
|
ggml_tensor* pulid_id = nullptr,
|
||||||
|
float pulid_id_weight = 1.0f) {
|
||||||
auto img_in = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
|
auto img_in = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
|
||||||
auto txt_in = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
|
auto txt_in = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
|
||||||
auto final_layer = std::dynamic_pointer_cast<LastLayer>(blocks["final_layer"]);
|
auto final_layer = std::dynamic_pointer_cast<LastLayer>(blocks["final_layer"]);
|
||||||
@ -1044,6 +1068,13 @@ namespace Flux {
|
|||||||
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt");
|
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt");
|
||||||
sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec");
|
sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec");
|
||||||
|
|
||||||
|
const bool pulid_active = config.pulid_enabled && pulid_id != nullptr;
|
||||||
|
if (pulid_active && !skip_layers.empty()) {
|
||||||
|
LOG_WARN("PuLID + skip_layers is not supported; disabling PuLID for this generation.");
|
||||||
|
}
|
||||||
|
const bool pulid_run = pulid_active && skip_layers.empty();
|
||||||
|
int ca_idx = 0;
|
||||||
|
|
||||||
for (int i = 0; i < config.depth; i++) {
|
for (int i = 0; i < config.depth; i++) {
|
||||||
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
|
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
|
||||||
continue;
|
continue;
|
||||||
@ -1056,9 +1087,19 @@ namespace Flux {
|
|||||||
txt = img_txt.second; // [N, n_txt_token, hidden_size]
|
txt = img_txt.second; // [N, n_txt_token, hidden_size]
|
||||||
sd::ggml_graph_cut::mark_graph_cut(img, "flux.double_blocks." + std::to_string(i), "img");
|
sd::ggml_graph_cut::mark_graph_cut(img, "flux.double_blocks." + std::to_string(i), "img");
|
||||||
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.double_blocks." + std::to_string(i), "txt");
|
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.double_blocks." + std::to_string(i), "txt");
|
||||||
|
|
||||||
|
if (pulid_run && (i % config.pulid_double_interval == 0)) {
|
||||||
|
auto pulid_ca = std::dynamic_pointer_cast<PuLIDPerceiverAttentionCA>(
|
||||||
|
blocks["pulid_ca." + std::to_string(ca_idx)]);
|
||||||
|
ggml_tensor* ca_out = pulid_ca->forward(ctx, pulid_id, img); // [N, n_img_token, hidden_size]
|
||||||
|
img = ggml_add(ctx->ggml_ctx, img, ggml_scale(ctx->ggml_ctx, ca_out, pulid_id_weight));
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(img, "flux.pulid_ca." + std::to_string(ca_idx), "img");
|
||||||
|
ca_idx++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_img_token, hidden_size]
|
auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_img_token, hidden_size]
|
||||||
|
const int64_t n_txt_tok = txt->ne[1];
|
||||||
for (int i = 0; i < config.depth_single_blocks; i++) {
|
for (int i = 0; i < config.depth_single_blocks; i++) {
|
||||||
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + config.depth) != skip_layers.end()) {
|
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + config.depth) != skip_layers.end()) {
|
||||||
continue;
|
continue;
|
||||||
@ -1067,6 +1108,29 @@ namespace Flux {
|
|||||||
|
|
||||||
txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask, ss_mods);
|
txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask, ss_mods);
|
||||||
sd::ggml_graph_cut::mark_graph_cut(txt_img, "flux.single_blocks." + std::to_string(i), "txt_img");
|
sd::ggml_graph_cut::mark_graph_cut(txt_img, "flux.single_blocks." + std::to_string(i), "txt_img");
|
||||||
|
|
||||||
|
if (pulid_run && (i % config.pulid_single_interval == 0)) {
|
||||||
|
auto pulid_ca = std::dynamic_pointer_cast<PuLIDPerceiverAttentionCA>(
|
||||||
|
blocks["pulid_ca." + std::to_string(ca_idx)]);
|
||||||
|
ggml_tensor* txt_part = ggml_view_3d(ctx->ggml_ctx, txt_img,
|
||||||
|
txt_img->ne[0], n_txt_tok, txt_img->ne[2],
|
||||||
|
txt_img->nb[1], txt_img->nb[2],
|
||||||
|
0);
|
||||||
|
ggml_tensor* img_part = ggml_view_3d(ctx->ggml_ctx, txt_img,
|
||||||
|
txt_img->ne[0],
|
||||||
|
txt_img->ne[1] - n_txt_tok,
|
||||||
|
txt_img->ne[2],
|
||||||
|
txt_img->nb[1],
|
||||||
|
txt_img->nb[2],
|
||||||
|
n_txt_tok * txt_img->nb[1]);
|
||||||
|
txt_part = ggml_cont(ctx->ggml_ctx, txt_part);
|
||||||
|
img_part = ggml_cont(ctx->ggml_ctx, img_part);
|
||||||
|
ggml_tensor* ca_out = pulid_ca->forward(ctx, pulid_id, img_part);
|
||||||
|
img_part = ggml_add(ctx->ggml_ctx, img_part, ggml_scale(ctx->ggml_ctx, ca_out, pulid_id_weight));
|
||||||
|
txt_img = ggml_concat(ctx->ggml_ctx, txt_part, img_part, 1);
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(txt_img, "flux.pulid_ca." + std::to_string(ca_idx), "txt_img");
|
||||||
|
ca_idx++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
img = ggml_view_3d(ctx->ggml_ctx,
|
img = ggml_view_3d(ctx->ggml_ctx,
|
||||||
@ -1105,7 +1169,9 @@ namespace Flux {
|
|||||||
ggml_tensor* mod_index_arange = nullptr,
|
ggml_tensor* mod_index_arange = nullptr,
|
||||||
ggml_tensor* dct = nullptr,
|
ggml_tensor* dct = nullptr,
|
||||||
std::vector<ggml_tensor*> ref_latents = {},
|
std::vector<ggml_tensor*> ref_latents = {},
|
||||||
std::vector<int> skip_layers = {}) {
|
std::vector<int> skip_layers = {},
|
||||||
|
ggml_tensor* pulid_id = nullptr,
|
||||||
|
float pulid_id_weight = 1.0f) {
|
||||||
GGML_ASSERT(x->ne[3] == 1);
|
GGML_ASSERT(x->ne[3] == 1);
|
||||||
|
|
||||||
int64_t W = x->ne[0];
|
int64_t W = x->ne[0];
|
||||||
@ -1131,7 +1197,8 @@ namespace Flux {
|
|||||||
img = ggml_reshape_3d(ctx->ggml_ctx, img, img->ne[0] * img->ne[1], img->ne[2], img->ne[3]); // [N, hidden_size, H/patch_size*W/patch_size]
|
img = ggml_reshape_3d(ctx->ggml_ctx, img, img->ne[0] * img->ne[1], img->ne[2], img->ne[3]); // [N, hidden_size, H/patch_size*W/patch_size]
|
||||||
img = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3)); // [N, H/patch_size*W/patch_size, hidden_size]
|
img = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3)); // [N, H/patch_size*W/patch_size, hidden_size]
|
||||||
|
|
||||||
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers); // [N, n_img_token, hidden_size]
|
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers,
|
||||||
|
pulid_id, pulid_id_weight); // [N, n_img_token, hidden_size]
|
||||||
|
|
||||||
// nerf decode
|
// nerf decode
|
||||||
auto nerf_image_embedder = std::dynamic_pointer_cast<NerfEmbedder>(blocks["nerf_image_embedder"]);
|
auto nerf_image_embedder = std::dynamic_pointer_cast<NerfEmbedder>(blocks["nerf_image_embedder"]);
|
||||||
@ -1179,7 +1246,9 @@ namespace Flux {
|
|||||||
ggml_tensor* mod_index_arange = nullptr,
|
ggml_tensor* mod_index_arange = nullptr,
|
||||||
ggml_tensor* dct = nullptr,
|
ggml_tensor* dct = nullptr,
|
||||||
std::vector<ggml_tensor*> ref_latents = {},
|
std::vector<ggml_tensor*> ref_latents = {},
|
||||||
std::vector<int> skip_layers = {}) {
|
std::vector<int> skip_layers = {},
|
||||||
|
ggml_tensor* pulid_id = nullptr,
|
||||||
|
float pulid_id_weight = 1.0f) {
|
||||||
GGML_ASSERT(x->ne[3] == 1);
|
GGML_ASSERT(x->ne[3] == 1);
|
||||||
|
|
||||||
int64_t W = x->ne[0];
|
int64_t W = x->ne[0];
|
||||||
@ -1226,7 +1295,8 @@ namespace Flux {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers); // [N, num_tokens, C * patch_size * patch_size]
|
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers,
|
||||||
|
pulid_id, pulid_id_weight); // [N, num_tokens, C * patch_size * patch_size]
|
||||||
|
|
||||||
if (out->ne[1] > img_tokens) {
|
if (out->ne[1] > img_tokens) {
|
||||||
out = ggml_view_3d(ctx->ggml_ctx, out, out->ne[0], img_tokens, out->ne[2], out->nb[1], out->nb[2], 0);
|
out = ggml_view_3d(ctx->ggml_ctx, out, out->ne[0], img_tokens, out->ne[2], out->nb[1], out->nb[2], 0);
|
||||||
@ -1248,7 +1318,9 @@ namespace Flux {
|
|||||||
ggml_tensor* mod_index_arange = nullptr,
|
ggml_tensor* mod_index_arange = nullptr,
|
||||||
ggml_tensor* dct = nullptr,
|
ggml_tensor* dct = nullptr,
|
||||||
std::vector<ggml_tensor*> ref_latents = {},
|
std::vector<ggml_tensor*> ref_latents = {},
|
||||||
std::vector<int> skip_layers = {}) {
|
std::vector<int> skip_layers = {},
|
||||||
|
ggml_tensor* pulid_id = nullptr,
|
||||||
|
float pulid_id_weight = 1.0f) {
|
||||||
// Forward pass of DiT.
|
// Forward pass of DiT.
|
||||||
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
|
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
|
||||||
// timestep: (N,) tensor of diffusion timesteps
|
// timestep: (N,) tensor of diffusion timesteps
|
||||||
@ -1271,7 +1343,9 @@ namespace Flux {
|
|||||||
mod_index_arange,
|
mod_index_arange,
|
||||||
dct,
|
dct,
|
||||||
ref_latents,
|
ref_latents,
|
||||||
skip_layers);
|
skip_layers,
|
||||||
|
pulid_id,
|
||||||
|
pulid_id_weight);
|
||||||
} else {
|
} else {
|
||||||
return forward_flux_chroma(ctx,
|
return forward_flux_chroma(ctx,
|
||||||
x,
|
x,
|
||||||
@ -1284,7 +1358,9 @@ namespace Flux {
|
|||||||
mod_index_arange,
|
mod_index_arange,
|
||||||
dct,
|
dct,
|
||||||
ref_latents,
|
ref_latents,
|
||||||
skip_layers);
|
skip_layers,
|
||||||
|
pulid_id,
|
||||||
|
pulid_id_weight);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -1301,12 +1377,12 @@ namespace Flux {
|
|||||||
bool use_mask = false;
|
bool use_mask = false;
|
||||||
|
|
||||||
FluxRunner(ggml_backend_t backend,
|
FluxRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string prefix = "",
|
const std::string prefix = "",
|
||||||
SDVersion version = VERSION_FLUX,
|
SDVersion version = VERSION_FLUX,
|
||||||
bool use_mask = false)
|
bool use_mask = false,
|
||||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: DiffusionModelRunner(backend, prefix, weight_manager),
|
||||||
config(FluxConfig::detect_from_weights(tensor_storage_map, prefix, version)),
|
config(FluxConfig::detect_from_weights(tensor_storage_map, prefix, version)),
|
||||||
version(version),
|
version(version),
|
||||||
use_mask(use_mask) {
|
use_mask(use_mask) {
|
||||||
@ -1384,7 +1460,9 @@ namespace Flux {
|
|||||||
const sd::Tensor<float>& guidance_tensor = {},
|
const sd::Tensor<float>& guidance_tensor = {},
|
||||||
const std::vector<sd::Tensor<float>>& ref_latents_tensor = {},
|
const std::vector<sd::Tensor<float>>& ref_latents_tensor = {},
|
||||||
bool increase_ref_index = false,
|
bool increase_ref_index = false,
|
||||||
std::vector<int> skip_layers = {}) {
|
std::vector<int> skip_layers = {},
|
||||||
|
const sd::Tensor<float>& pulid_id_tensor = {},
|
||||||
|
float pulid_id_weight = 1.0f) {
|
||||||
ggml_tensor* x = make_input(x_tensor);
|
ggml_tensor* x = make_input(x_tensor);
|
||||||
ggml_tensor* timesteps = make_input(timesteps_tensor);
|
ggml_tensor* timesteps = make_input(timesteps_tensor);
|
||||||
ggml_tensor* context = make_optional_input(context_tensor);
|
ggml_tensor* context = make_optional_input(context_tensor);
|
||||||
@ -1461,6 +1539,10 @@ namespace Flux {
|
|||||||
set_backend_tensor_data(dct, dct_vec.data());
|
set_backend_tensor_data(dct, dct_vec.data());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ggml_tensor* pulid_id = pulid_id_tensor.empty()
|
||||||
|
? nullptr
|
||||||
|
: make_input(pulid_id_tensor);
|
||||||
|
|
||||||
auto runner_ctx = get_context();
|
auto runner_ctx = get_context();
|
||||||
|
|
||||||
ggml_tensor* out = flux.forward(&runner_ctx,
|
ggml_tensor* out = flux.forward(&runner_ctx,
|
||||||
@ -1474,7 +1556,9 @@ namespace Flux {
|
|||||||
mod_index_arange,
|
mod_index_arange,
|
||||||
dct,
|
dct,
|
||||||
ref_latents,
|
ref_latents,
|
||||||
skip_layers);
|
skip_layers,
|
||||||
|
pulid_id,
|
||||||
|
pulid_id_weight);
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, out);
|
ggml_build_forward_expand(gf, out);
|
||||||
|
|
||||||
@ -1490,14 +1574,17 @@ namespace Flux {
|
|||||||
const sd::Tensor<float>& guidance = {},
|
const sd::Tensor<float>& guidance = {},
|
||||||
const std::vector<sd::Tensor<float>>& ref_latents = {},
|
const std::vector<sd::Tensor<float>>& ref_latents = {},
|
||||||
bool increase_ref_index = false,
|
bool increase_ref_index = false,
|
||||||
std::vector<int> skip_layers = std::vector<int>()) {
|
std::vector<int> skip_layers = std::vector<int>(),
|
||||||
|
const sd::Tensor<float>& pulid_id = {},
|
||||||
|
float pulid_id_weight = 1.0f) {
|
||||||
// x: [N, in_channels, h, w]
|
// x: [N, in_channels, h, w]
|
||||||
// timesteps: [N, ]
|
// timesteps: [N, ]
|
||||||
// context: [N, max_position, hidden_size]
|
// context: [N, max_position, hidden_size]
|
||||||
// y: [N, adm_in_channels] or [1, adm_in_channels]
|
// y: [N, adm_in_channels] or [1, adm_in_channels]
|
||||||
// guidance: [N, ]
|
// guidance: [N, ]
|
||||||
|
// pulid_id: empty (no injection) or [N, num_id_tokens=32, kv_dim=2048]
|
||||||
auto get_graph = [&]() -> ggml_cgraph* {
|
auto get_graph = [&]() -> ggml_cgraph* {
|
||||||
return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers);
|
return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers, pulid_id, pulid_id_weight);
|
||||||
};
|
};
|
||||||
|
|
||||||
auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
|
auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
|
||||||
@ -1520,7 +1607,9 @@ namespace Flux {
|
|||||||
tensor_or_empty(extra->guidance),
|
tensor_or_empty(extra->guidance),
|
||||||
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
|
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
|
||||||
diffusion_params.increase_ref_index,
|
diffusion_params.increase_ref_index,
|
||||||
extra->skip_layers ? *extra->skip_layers : empty_skip_layers);
|
extra->skip_layers ? *extra->skip_layers : empty_skip_layers,
|
||||||
|
tensor_or_empty(extra->pulid_id),
|
||||||
|
extra->pulid_id_weight);
|
||||||
}
|
}
|
||||||
|
|
||||||
void test() {
|
void test() {
|
||||||
@ -1583,7 +1672,8 @@ namespace Flux {
|
|||||||
ggml_backend_t backend = sd_backend_cpu_init();
|
ggml_backend_t backend = sd_backend_cpu_init();
|
||||||
ggml_type model_data_type = GGML_TYPE_COUNT;
|
ggml_type model_data_type = GGML_TYPE_COUNT;
|
||||||
|
|
||||||
ModelLoader model_loader;
|
auto model_manager = std::make_shared<ModelManager>();
|
||||||
|
ModelLoader& model_loader = model_manager->loader();
|
||||||
if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
|
if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
|
||||||
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
||||||
return;
|
return;
|
||||||
@ -1599,24 +1689,20 @@ namespace Flux {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::shared_ptr<FluxRunner> flux = std::make_shared<FluxRunner>(backend,
|
std::shared_ptr<FluxRunner> flux = std::make_shared<FluxRunner>(backend,
|
||||||
backend,
|
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
"model.diffusion_model",
|
"model.diffusion_model",
|
||||||
VERSION_FLUX2,
|
VERSION_FLUX2,
|
||||||
false);
|
false,
|
||||||
|
model_manager);
|
||||||
|
|
||||||
if (!flux->alloc_params_buffer()) {
|
if (!model_manager->register_runner_params("Flux test",
|
||||||
LOG_ERROR("flux model allocation failed");
|
*flux,
|
||||||
return;
|
"model.diffusion_model",
|
||||||
}
|
ModelManager::ResidencyMode::ParamBackend,
|
||||||
|
backend,
|
||||||
std::map<std::string, ggml_tensor*> tensors;
|
backend) ||
|
||||||
flux->get_param_tensors(tensors, "model.diffusion_model");
|
!model_manager->validate_registered_tensors()) {
|
||||||
|
LOG_ERROR("register flux tensors with model manager failed");
|
||||||
bool success = model_loader.load_tensors(tensors);
|
|
||||||
|
|
||||||
if (!success) {
|
|
||||||
LOG_ERROR("load tensors from model loader failed");
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
#ifndef __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
|
#ifndef __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
|
||||||
#define __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
|
#define __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
@ -282,10 +282,10 @@ namespace HiDreamO1 {
|
|||||||
std::array<std::vector<float>, 4> pos_embed_weight_data_;
|
std::array<std::vector<float>, 4> pos_embed_weight_data_;
|
||||||
|
|
||||||
HiDreamO1VisionRunner(ggml_backend_t backend,
|
HiDreamO1VisionRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string& prefix = "model.visual")
|
const std::string& prefix = "model.visual",
|
||||||
: GGMLRunner(backend, params_backend),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: GGMLRunner(backend, weight_manager),
|
||||||
config(HiDreamO1Config::detect_from_weights(tensor_storage_map, prefix)),
|
config(HiDreamO1Config::detect_from_weights(tensor_storage_map, prefix)),
|
||||||
model(std::make_shared<LLM::VisionModel>(false, config.llm.vision)) {
|
model(std::make_shared<LLM::VisionModel>(false, config.llm.vision)) {
|
||||||
model->init(params_ctx, tensor_storage_map, prefix);
|
model->init(params_ctx, tensor_storage_map, prefix);
|
||||||
@ -343,10 +343,10 @@ namespace HiDreamO1 {
|
|||||||
std::vector<float> attention_mask_vec;
|
std::vector<float> attention_mask_vec;
|
||||||
|
|
||||||
HiDreamO1Runner(ggml_backend_t backend,
|
HiDreamO1Runner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string& prefix = "model")
|
const std::string& prefix = "model",
|
||||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: DiffusionModelRunner(backend, prefix, weight_manager),
|
||||||
config(HiDreamO1Config::detect_from_weights(tensor_storage_map, prefix)) {
|
config(HiDreamO1Config::detect_from_weights(tensor_storage_map, prefix)) {
|
||||||
model = HiDreamO1Model(config);
|
model = HiDreamO1Model(config);
|
||||||
model.init(params_ctx, tensor_storage_map, prefix);
|
model.init(params_ctx, tensor_storage_map, prefix);
|
||||||
@ -490,9 +490,9 @@ namespace HiDreamO1 {
|
|||||||
std::shared_ptr<HiDreamO1VisionRunner> vision_runner;
|
std::shared_ptr<HiDreamO1VisionRunner> vision_runner;
|
||||||
|
|
||||||
HiDreamO1Conditioner(ggml_backend_t backend,
|
HiDreamO1Conditioner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const String2TensorStorage& tensor_storage_map = {})
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
: vision_runner(std::make_shared<HiDreamO1VisionRunner>(backend, params_backend, tensor_storage_map)) {}
|
: vision_runner(std::make_shared<HiDreamO1VisionRunner>(backend, tensor_storage_map, "model.visual", weight_manager)) {}
|
||||||
|
|
||||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||||
vision_runner->get_param_tensors(tensors);
|
vision_runner->get_param_tensors(tensors);
|
||||||
@ -510,10 +510,6 @@ namespace HiDreamO1 {
|
|||||||
vision_runner->set_weight_adapter(adapter);
|
vision_runner->set_weight_adapter(adapter);
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_weight_manager(const std::shared_ptr<RunnerWeightManager>& manager) override {
|
|
||||||
vision_runner->set_weight_manager(manager);
|
|
||||||
}
|
|
||||||
|
|
||||||
void runner_done() override {
|
void runner_done() override {
|
||||||
vision_runner->runner_done();
|
vision_runner->runner_done();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -449,10 +449,10 @@ namespace Ideogram4 {
|
|||||||
std::vector<int32_t> image_indicator_vec;
|
std::vector<int32_t> image_indicator_vec;
|
||||||
|
|
||||||
Ideogram4Runner(ggml_backend_t backend,
|
Ideogram4Runner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string prefix = "")
|
const std::string prefix = "",
|
||||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: DiffusionModelRunner(backend, prefix, weight_manager),
|
||||||
config(Ideogram4Config::detect_from_weights(tensor_storage_map, prefix)),
|
config(Ideogram4Config::detect_from_weights(tensor_storage_map, prefix)),
|
||||||
uncond_prefix(prefix + ".uncond") {
|
uncond_prefix(prefix + ".uncond") {
|
||||||
model = Ideogram4Transformer(config);
|
model = Ideogram4Transformer(config);
|
||||||
|
|||||||
@ -356,10 +356,10 @@ namespace Lens {
|
|||||||
std::vector<float> pe_vec;
|
std::vector<float> pe_vec;
|
||||||
|
|
||||||
LensRunner(ggml_backend_t backend,
|
LensRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string prefix = "")
|
const std::string prefix = "",
|
||||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: DiffusionModelRunner(backend, prefix, weight_manager),
|
||||||
config(LensConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
config(LensConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
||||||
lens = LensModel(config);
|
lens = LensModel(config);
|
||||||
lens.init(params_ctx, tensor_storage_map, prefix);
|
lens.init(params_ctx, tensor_storage_map, prefix);
|
||||||
|
|||||||
@ -1686,10 +1686,10 @@ namespace LTXV {
|
|||||||
sd::Tensor<float> ax_input_cache;
|
sd::Tensor<float> ax_input_cache;
|
||||||
|
|
||||||
LTXAVRunner(ggml_backend_t backend,
|
LTXAVRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string& prefix = "model.diffusion_model")
|
const std::string& prefix = "model.diffusion_model",
|
||||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: DiffusionModelRunner(backend, prefix, weight_manager),
|
||||||
config(LTXAVConfig::detect_from_weights(tensor_storage_map, prefix)),
|
config(LTXAVConfig::detect_from_weights(tensor_storage_map, prefix)),
|
||||||
model(config) {
|
model(config) {
|
||||||
model.init(params_ctx, tensor_storage_map, prefix);
|
model.init(params_ctx, tensor_storage_map, prefix);
|
||||||
@ -2025,7 +2025,8 @@ namespace LTXV {
|
|||||||
ggml_backend_t backend = sd_backend_cpu_init();
|
ggml_backend_t backend = sd_backend_cpu_init();
|
||||||
LOG_INFO("loading ltxav from '%s'", model_path.c_str());
|
LOG_INFO("loading ltxav from '%s'", model_path.c_str());
|
||||||
|
|
||||||
ModelLoader model_loader;
|
auto model_manager = std::make_shared<ModelManager>();
|
||||||
|
ModelLoader& model_loader = model_manager->loader();
|
||||||
if (!model_loader.init_from_file_and_convert_name(model_path, "model.diffusion_model.")) {
|
if (!model_loader.init_from_file_and_convert_name(model_path, "model.diffusion_model.")) {
|
||||||
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
|
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
|
||||||
return;
|
return;
|
||||||
@ -2040,19 +2041,18 @@ namespace LTXV {
|
|||||||
|
|
||||||
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
|
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
|
||||||
std::shared_ptr<LTXAVRunner> ltxav = std::make_shared<LTXAVRunner>(backend,
|
std::shared_ptr<LTXAVRunner> ltxav = std::make_shared<LTXAVRunner>(backend,
|
||||||
backend,
|
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
"model.diffusion_model");
|
"model.diffusion_model",
|
||||||
|
model_manager);
|
||||||
|
|
||||||
if (!ltxav->alloc_params_buffer()) {
|
if (!model_manager->register_runner_params("LTXAV test",
|
||||||
LOG_ERROR("ltxav buffer allocation failed");
|
*ltxav,
|
||||||
return;
|
"model.diffusion_model",
|
||||||
}
|
ModelManager::ResidencyMode::ParamBackend,
|
||||||
std::map<std::string, ggml_tensor*> tensors;
|
backend,
|
||||||
ltxav->get_param_tensors(tensors, "model.diffusion_model");
|
backend) ||
|
||||||
|
!model_manager->validate_registered_tensors()) {
|
||||||
if (!model_loader.load_tensors(tensors)) {
|
LOG_ERROR("register ltxav tensors with model manager failed");
|
||||||
LOG_ERROR("load tensors from model loader failed");
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -879,10 +879,10 @@ struct MMDiTRunner : public DiffusionModelRunner {
|
|||||||
MMDiT mmdit;
|
MMDiT mmdit;
|
||||||
|
|
||||||
MMDiTRunner(ggml_backend_t backend,
|
MMDiTRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string prefix = "")
|
const std::string prefix = "",
|
||||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: DiffusionModelRunner(backend, prefix, weight_manager),
|
||||||
config(MMDiTConfig::detect_from_weights(tensor_storage_map, prefix)),
|
config(MMDiTConfig::detect_from_weights(tensor_storage_map, prefix)),
|
||||||
mmdit(config) {
|
mmdit(config) {
|
||||||
mmdit.init(params_ctx, tensor_storage_map, prefix);
|
mmdit.init(params_ctx, tensor_storage_map, prefix);
|
||||||
@ -1001,28 +1001,25 @@ struct MMDiTRunner : public DiffusionModelRunner {
|
|||||||
// ggml_backend_t backend = ggml_backend_cuda_init(0);
|
// ggml_backend_t backend = ggml_backend_cuda_init(0);
|
||||||
ggml_backend_t backend = sd_backend_cpu_init();
|
ggml_backend_t backend = sd_backend_cpu_init();
|
||||||
ggml_type model_data_type = GGML_TYPE_F16;
|
ggml_type model_data_type = GGML_TYPE_F16;
|
||||||
std::shared_ptr<MMDiTRunner> mmdit = std::make_shared<MMDiTRunner>(backend, backend);
|
auto model_manager = std::make_shared<ModelManager>();
|
||||||
|
std::shared_ptr<MMDiTRunner> mmdit = std::make_shared<MMDiTRunner>(backend, String2TensorStorage{}, "", model_manager);
|
||||||
{
|
{
|
||||||
LOG_INFO("loading from '%s'", file_path.c_str());
|
LOG_INFO("loading from '%s'", file_path.c_str());
|
||||||
|
|
||||||
if (!mmdit->alloc_params_buffer()) {
|
ModelLoader& model_loader = model_manager->loader();
|
||||||
LOG_ERROR("mmdit embeds buffer allocation failed");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::map<std::string, ggml_tensor*> tensors;
|
|
||||||
mmdit->get_param_tensors(tensors, "model.diffusion_model");
|
|
||||||
|
|
||||||
ModelLoader model_loader;
|
|
||||||
if (!model_loader.init_from_file_and_convert_name(file_path)) {
|
if (!model_loader.init_from_file_and_convert_name(file_path)) {
|
||||||
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool success = model_loader.load_tensors(tensors);
|
if (!model_manager->register_runner_params("MMDiT test",
|
||||||
|
*mmdit,
|
||||||
if (!success) {
|
"model.diffusion_model",
|
||||||
LOG_ERROR("load tensors from model loader failed");
|
ModelManager::ResidencyMode::ParamBackend,
|
||||||
|
backend,
|
||||||
|
backend) ||
|
||||||
|
!model_manager->validate_registered_tensors()) {
|
||||||
|
LOG_ERROR("register mmdit tensors with model manager failed");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
#ifndef __SD_MODEL_DIFFUSION_MODEL_HPP__
|
#ifndef __SD_MODEL_DIFFUSION_MODEL_HPP__
|
||||||
#define __SD_MODEL_DIFFUSION_MODEL_HPP__
|
#define __SD_MODEL_DIFFUSION_MODEL_HPP__
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
@ -7,6 +7,7 @@
|
|||||||
|
|
||||||
#include "core/ggml_extend.hpp"
|
#include "core/ggml_extend.hpp"
|
||||||
#include "core/tensor_ggml.hpp"
|
#include "core/tensor_ggml.hpp"
|
||||||
|
#include "model_manager.h"
|
||||||
|
|
||||||
struct UNetDiffusionExtra {
|
struct UNetDiffusionExtra {
|
||||||
int num_video_frames = -1;
|
int num_video_frames = -1;
|
||||||
@ -21,6 +22,8 @@ struct SkipLayerDiffusionExtra {
|
|||||||
struct FluxDiffusionExtra {
|
struct FluxDiffusionExtra {
|
||||||
const sd::Tensor<float>* guidance = nullptr;
|
const sd::Tensor<float>* guidance = nullptr;
|
||||||
const std::vector<int>* skip_layers = nullptr;
|
const std::vector<int>* skip_layers = nullptr;
|
||||||
|
const sd::Tensor<float>* pulid_id = nullptr;
|
||||||
|
float pulid_id_weight = 1.0f;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct AnimaDiffusionExtra {
|
struct AnimaDiffusionExtra {
|
||||||
@ -88,9 +91,9 @@ protected:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
DiffusionModelRunner(ggml_backend_t backend,
|
DiffusionModelRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
const std::string& prefix,
|
||||||
const std::string& prefix)
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
: GGMLRunner(backend, params_backend),
|
: GGMLRunner(backend, weight_manager),
|
||||||
prefix(prefix) {}
|
prefix(prefix) {}
|
||||||
|
|
||||||
virtual sd::Tensor<float> compute(int n_threads,
|
virtual sd::Tensor<float> compute(int n_threads,
|
||||||
|
|||||||
@ -710,10 +710,10 @@ namespace Pid {
|
|||||||
std::vector<float> pixel_pos_comp_vec;
|
std::vector<float> pixel_pos_comp_vec;
|
||||||
|
|
||||||
PiDRunner(ggml_backend_t backend,
|
PiDRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map,
|
const String2TensorStorage& tensor_storage_map,
|
||||||
const std::string prefix = "model.diffusion_model")
|
const std::string prefix = "model.diffusion_model",
|
||||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: DiffusionModelRunner(backend, prefix, weight_manager),
|
||||||
config(PixelDiTConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
config(PixelDiTConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
||||||
model = PixelDiT(config);
|
model = PixelDiT(config);
|
||||||
model.init(params_ctx, tensor_storage_map, prefix);
|
model.init(params_ctx, tensor_storage_map, prefix);
|
||||||
|
|||||||
@ -518,12 +518,12 @@ namespace Qwen {
|
|||||||
SDVersion version;
|
SDVersion version;
|
||||||
|
|
||||||
QwenImageRunner(ggml_backend_t backend,
|
QwenImageRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string prefix = "",
|
const std::string prefix = "",
|
||||||
SDVersion version = VERSION_QWEN_IMAGE,
|
SDVersion version = VERSION_QWEN_IMAGE,
|
||||||
bool zero_cond_t = false)
|
bool zero_cond_t = false,
|
||||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: DiffusionModelRunner(backend, prefix, weight_manager),
|
||||||
config(QwenImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
config(QwenImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
||||||
config.zero_cond_t = config.zero_cond_t || zero_cond_t;
|
config.zero_cond_t = config.zero_cond_t || zero_cond_t;
|
||||||
qwen_image = QwenImageModel(config);
|
qwen_image = QwenImageModel(config);
|
||||||
@ -691,7 +691,8 @@ namespace Qwen {
|
|||||||
ggml_backend_t backend = sd_backend_cpu_init();
|
ggml_backend_t backend = sd_backend_cpu_init();
|
||||||
ggml_type model_data_type = GGML_TYPE_Q8_0;
|
ggml_type model_data_type = GGML_TYPE_Q8_0;
|
||||||
|
|
||||||
ModelLoader model_loader;
|
auto model_manager = std::make_shared<ModelManager>();
|
||||||
|
ModelLoader& model_loader = model_manager->loader();
|
||||||
if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
|
if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
|
||||||
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
||||||
return;
|
return;
|
||||||
@ -705,23 +706,20 @@ namespace Qwen {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::shared_ptr<QwenImageRunner> qwen_image = std::make_shared<QwenImageRunner>(backend,
|
std::shared_ptr<QwenImageRunner> qwen_image = std::make_shared<QwenImageRunner>(backend,
|
||||||
backend,
|
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
"model.diffusion_model",
|
"model.diffusion_model",
|
||||||
VERSION_QWEN_IMAGE);
|
VERSION_QWEN_IMAGE,
|
||||||
|
false,
|
||||||
|
model_manager);
|
||||||
|
|
||||||
if (!qwen_image->alloc_params_buffer()) {
|
if (!model_manager->register_runner_params("Qwen image test",
|
||||||
LOG_ERROR("qwen_image buffer allocation failed");
|
*qwen_image,
|
||||||
return;
|
"model.diffusion_model",
|
||||||
}
|
ModelManager::ResidencyMode::ParamBackend,
|
||||||
|
backend,
|
||||||
std::map<std::string, ggml_tensor*> tensors;
|
backend) ||
|
||||||
qwen_image->get_param_tensors(tensors, "model.diffusion_model");
|
!model_manager->validate_registered_tensors()) {
|
||||||
|
LOG_ERROR("register qwen_image tensors with model manager failed");
|
||||||
bool success = model_loader.load_tensors(tensors);
|
|
||||||
|
|
||||||
if (!success) {
|
|
||||||
LOG_ERROR("load tensors from model loader failed");
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -694,11 +694,11 @@ struct UNetModelRunner : public DiffusionModelRunner {
|
|||||||
UnetModelBlock unet;
|
UnetModelBlock unet;
|
||||||
|
|
||||||
UNetModelRunner(ggml_backend_t backend,
|
UNetModelRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map,
|
const String2TensorStorage& tensor_storage_map,
|
||||||
const std::string prefix,
|
const std::string prefix,
|
||||||
SDVersion version = VERSION_SD1)
|
SDVersion version = VERSION_SD1,
|
||||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: DiffusionModelRunner(backend, prefix, weight_manager),
|
||||||
config(UNetConfig::detect_from_weights(tensor_storage_map, prefix, version)),
|
config(UNetConfig::detect_from_weights(tensor_storage_map, prefix, version)),
|
||||||
unet(config) {
|
unet(config) {
|
||||||
unet.init(params_ctx, tensor_storage_map, prefix);
|
unet.init(params_ctx, tensor_storage_map, prefix);
|
||||||
|
|||||||
@ -799,11 +799,11 @@ namespace WAN {
|
|||||||
SDVersion version;
|
SDVersion version;
|
||||||
|
|
||||||
WanRunner(ggml_backend_t backend,
|
WanRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string prefix = "",
|
const std::string prefix = "",
|
||||||
SDVersion version = VERSION_WAN2)
|
SDVersion version = VERSION_WAN2,
|
||||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: DiffusionModelRunner(backend, prefix, weight_manager),
|
||||||
config(WanConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
config(WanConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
||||||
if (config.num_layers == 30) {
|
if (config.num_layers == 30) {
|
||||||
if (version == VERSION_WAN2_2_TI2V) {
|
if (version == VERSION_WAN2_2_TI2V) {
|
||||||
@ -1017,7 +1017,8 @@ namespace WAN {
|
|||||||
ggml_type model_data_type = GGML_TYPE_F16;
|
ggml_type model_data_type = GGML_TYPE_F16;
|
||||||
LOG_INFO("loading from '%s'", file_path.c_str());
|
LOG_INFO("loading from '%s'", file_path.c_str());
|
||||||
|
|
||||||
ModelLoader model_loader;
|
auto model_manager = std::make_shared<ModelManager>();
|
||||||
|
ModelLoader& model_loader = model_manager->loader();
|
||||||
if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
|
if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
|
||||||
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
||||||
return;
|
return;
|
||||||
@ -1031,23 +1032,19 @@ namespace WAN {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::shared_ptr<WanRunner> wan = std::make_shared<WanRunner>(backend,
|
std::shared_ptr<WanRunner> wan = std::make_shared<WanRunner>(backend,
|
||||||
backend,
|
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
"model.diffusion_model",
|
"model.diffusion_model",
|
||||||
VERSION_WAN2_2_TI2V);
|
VERSION_WAN2_2_TI2V,
|
||||||
|
model_manager);
|
||||||
|
|
||||||
if (!wan->alloc_params_buffer()) {
|
if (!model_manager->register_runner_params("Wan test",
|
||||||
LOG_ERROR("wan buffer allocation failed");
|
*wan,
|
||||||
return;
|
"model.diffusion_model",
|
||||||
}
|
ModelManager::ResidencyMode::ParamBackend,
|
||||||
|
backend,
|
||||||
std::map<std::string, ggml_tensor*> tensors;
|
backend) ||
|
||||||
wan->get_param_tensors(tensors, "model.diffusion_model");
|
!model_manager->validate_registered_tensors()) {
|
||||||
|
LOG_ERROR("register wan tensors with model manager failed");
|
||||||
bool success = model_loader.load_tensors(tensors);
|
|
||||||
|
|
||||||
if (!success) {
|
|
||||||
LOG_ERROR("load tensors from model loader failed");
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -553,11 +553,11 @@ namespace ZImage {
|
|||||||
SDVersion version;
|
SDVersion version;
|
||||||
|
|
||||||
ZImageRunner(ggml_backend_t backend,
|
ZImageRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string prefix = "",
|
const std::string prefix = "",
|
||||||
SDVersion version = VERSION_Z_IMAGE)
|
SDVersion version = VERSION_Z_IMAGE,
|
||||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: DiffusionModelRunner(backend, prefix, weight_manager),
|
||||||
config(ZImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
config(ZImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
|
||||||
z_image = ZImageModel(config);
|
z_image = ZImageModel(config);
|
||||||
z_image.init(params_ctx, tensor_storage_map, prefix);
|
z_image.init(params_ctx, tensor_storage_map, prefix);
|
||||||
@ -698,7 +698,8 @@ namespace ZImage {
|
|||||||
ggml_backend_t backend = sd_backend_cpu_init();
|
ggml_backend_t backend = sd_backend_cpu_init();
|
||||||
ggml_type model_data_type = GGML_TYPE_Q8_0;
|
ggml_type model_data_type = GGML_TYPE_Q8_0;
|
||||||
|
|
||||||
ModelLoader model_loader;
|
auto model_manager = std::make_shared<ModelManager>();
|
||||||
|
ModelLoader& model_loader = model_manager->loader();
|
||||||
if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
|
if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
|
||||||
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
||||||
return;
|
return;
|
||||||
@ -714,22 +715,19 @@ namespace ZImage {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::shared_ptr<ZImageRunner> z_image = std::make_shared<ZImageRunner>(backend,
|
std::shared_ptr<ZImageRunner> z_image = std::make_shared<ZImageRunner>(backend,
|
||||||
backend,
|
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
"model.diffusion_model",
|
"model.diffusion_model",
|
||||||
VERSION_QWEN_IMAGE);
|
VERSION_QWEN_IMAGE,
|
||||||
|
model_manager);
|
||||||
|
|
||||||
if (!z_image->alloc_params_buffer()) {
|
if (!model_manager->register_runner_params("ZImage test",
|
||||||
LOG_ERROR("z_image buffer allocation failed");
|
*z_image,
|
||||||
return;
|
"model.diffusion_model",
|
||||||
}
|
ModelManager::ResidencyMode::ParamBackend,
|
||||||
std::map<std::string, ggml_tensor*> tensors;
|
backend,
|
||||||
z_image->get_param_tensors(tensors, "model.diffusion_model");
|
backend) ||
|
||||||
|
!model_manager->validate_registered_tensors()) {
|
||||||
bool success = model_loader.load_tensors(tensors);
|
LOG_ERROR("register z_image tensors with model manager failed");
|
||||||
|
|
||||||
if (!success) {
|
|
||||||
LOG_ERROR("load tensors from model loader failed");
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
#ifndef __SD_MODEL_TE_CLIP_HPP__
|
#ifndef __SD_MODEL_TE_CLIP_HPP__
|
||||||
#define __SD_MODEL_TE_CLIP_HPP__
|
#define __SD_MODEL_TE_CLIP_HPP__
|
||||||
|
|
||||||
#include "core/ggml_extend.hpp"
|
#include "core/ggml_extend.hpp"
|
||||||
@ -469,13 +469,13 @@ struct CLIPTextModelRunner : public GGMLRunner {
|
|||||||
std::vector<float> attention_mask_vec;
|
std::vector<float> attention_mask_vec;
|
||||||
|
|
||||||
CLIPTextModelRunner(ggml_backend_t backend,
|
CLIPTextModelRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map,
|
const String2TensorStorage& tensor_storage_map,
|
||||||
const std::string prefix,
|
const std::string prefix,
|
||||||
CLIPVersion version = OPENAI_CLIP_VIT_L_14,
|
CLIPVersion version = OPENAI_CLIP_VIT_L_14,
|
||||||
bool with_final_ln = true,
|
bool with_final_ln = true,
|
||||||
bool force_clip_f32 = false)
|
bool force_clip_f32 = false,
|
||||||
: GGMLRunner(backend, params_backend) {
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: GGMLRunner(backend, weight_manager) {
|
||||||
bool proj_in = false;
|
bool proj_in = false;
|
||||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||||
if (!starts_with(name, prefix)) {
|
if (!starts_with(name, prefix)) {
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
#ifndef __SD_MODEL_TE_LLM_HPP__
|
#ifndef __SD_MODEL_TE_LLM_HPP__
|
||||||
#define __SD_MODEL_TE_LLM_HPP__
|
#define __SD_MODEL_TE_LLM_HPP__
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
@ -22,6 +22,7 @@
|
|||||||
#include "json.hpp"
|
#include "json.hpp"
|
||||||
#include "model/common/rope.hpp"
|
#include "model/common/rope.hpp"
|
||||||
#include "model_loader.h"
|
#include "model_loader.h"
|
||||||
|
#include "model_manager.h"
|
||||||
#include "tokenizers/bpe_tokenizer.h"
|
#include "tokenizers/bpe_tokenizer.h"
|
||||||
#include "tokenizers/gemma_tokenizer.h"
|
#include "tokenizers/gemma_tokenizer.h"
|
||||||
#include "tokenizers/gpt_oss_tokenizer.h"
|
#include "tokenizers/gpt_oss_tokenizer.h"
|
||||||
@ -78,6 +79,7 @@ namespace LLM {
|
|||||||
int window_size = 112;
|
int window_size = 112;
|
||||||
int num_position_embeddings = 0;
|
int num_position_embeddings = 0;
|
||||||
std::set<int> fullatt_block_indexes = {7, 15, 23, 31};
|
std::set<int> fullatt_block_indexes = {7, 15, 23, 31};
|
||||||
|
bool split_patch_embed = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LLMConfig {
|
struct LLMConfig {
|
||||||
@ -179,6 +181,7 @@ namespace LLM {
|
|||||||
}
|
}
|
||||||
|
|
||||||
config.num_layers = 0;
|
config.num_layers = 0;
|
||||||
|
int detected_vision_layers = 0;
|
||||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||||
if (!starts_with(name, prefix)) {
|
if (!starts_with(name, prefix)) {
|
||||||
continue;
|
continue;
|
||||||
@ -189,6 +192,38 @@ namespace LLM {
|
|||||||
if (contains(name, "attn.q_proj")) {
|
if (contains(name, "attn.q_proj")) {
|
||||||
config.llama_cpp_style = true;
|
config.llama_cpp_style = true;
|
||||||
}
|
}
|
||||||
|
if (contains(name, "visual.patch_embed.proj.1.weight")) {
|
||||||
|
config.vision.split_patch_embed = true;
|
||||||
|
}
|
||||||
|
if (contains(name, "visual.patch_embed.proj.0.weight")) {
|
||||||
|
config.vision.patch_size = static_cast<int>(tensor_storage.ne[0]);
|
||||||
|
config.vision.in_channels = tensor_storage.ne[2];
|
||||||
|
config.vision.hidden_size = tensor_storage.ne[3];
|
||||||
|
}
|
||||||
|
if (contains(name, "visual.patch_embed.bias")) {
|
||||||
|
config.vision.hidden_size = tensor_storage.ne[0];
|
||||||
|
}
|
||||||
|
if (contains(name, "visual.pos_embed.weight")) {
|
||||||
|
config.vision.hidden_size = tensor_storage.ne[0];
|
||||||
|
config.vision.num_position_embeddings = static_cast<int>(tensor_storage.ne[1]);
|
||||||
|
}
|
||||||
|
if (contains(name, "visual.blocks.")) {
|
||||||
|
auto items = split_string(name.substr(pos), '.');
|
||||||
|
if (items.size() > 2) {
|
||||||
|
int block_index = atoi(items[2].c_str());
|
||||||
|
if (block_index + 1 > detected_vision_layers) {
|
||||||
|
detected_vision_layers = block_index + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (contains(name, "visual.blocks.0.mlp.linear_fc1.weight") ||
|
||||||
|
contains(name, "visual.blocks.0.mlp.gate_proj.weight")) {
|
||||||
|
config.vision.intermediate_size = tensor_storage.ne[1];
|
||||||
|
}
|
||||||
|
if (contains(name, "visual.merger.linear_fc2.weight") ||
|
||||||
|
contains(name, "visual.merger.mlp.2.weight")) {
|
||||||
|
config.vision.out_hidden_size = tensor_storage.ne[1];
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
pos = name.find("layers.");
|
pos = name.find("layers.");
|
||||||
@ -218,6 +253,9 @@ namespace LLM {
|
|||||||
if (arch == LLMArch::QWEN3 && config.num_layers == 28) {
|
if (arch == LLMArch::QWEN3 && config.num_layers == 28) {
|
||||||
config.num_heads = 16;
|
config.num_heads = 16;
|
||||||
}
|
}
|
||||||
|
if (detected_vision_layers > 0) {
|
||||||
|
config.vision.num_layers = detected_vision_layers;
|
||||||
|
}
|
||||||
LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
|
LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
|
||||||
config.num_layers,
|
config.num_layers,
|
||||||
config.vocab_size,
|
config.vocab_size,
|
||||||
@ -538,40 +576,51 @@ namespace LLM {
|
|||||||
|
|
||||||
struct VisionPatchEmbed : public GGMLBlock {
|
struct VisionPatchEmbed : public GGMLBlock {
|
||||||
protected:
|
protected:
|
||||||
bool llama_cpp_style;
|
bool split_patch_embed;
|
||||||
|
bool bias;
|
||||||
int patch_size;
|
int patch_size;
|
||||||
int temporal_patch_size;
|
int temporal_patch_size;
|
||||||
int64_t in_channels;
|
int64_t in_channels;
|
||||||
int64_t embed_dim;
|
int64_t embed_dim;
|
||||||
|
|
||||||
|
void init_params(ggml_context* ctx,
|
||||||
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
|
const std::string prefix = "") override {
|
||||||
|
GGML_UNUSED(tensor_storage_map);
|
||||||
|
GGML_UNUSED(prefix);
|
||||||
|
if (split_patch_embed && bias) {
|
||||||
|
params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, embed_dim);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
VisionPatchEmbed(bool llama_cpp_style,
|
VisionPatchEmbed(bool split_patch_embed,
|
||||||
LLMVisionArch arch,
|
LLMVisionArch arch,
|
||||||
int patch_size = 14,
|
int patch_size = 14,
|
||||||
int temporal_patch_size = 2,
|
int temporal_patch_size = 2,
|
||||||
int64_t in_channels = 3,
|
int64_t in_channels = 3,
|
||||||
int64_t embed_dim = 1152)
|
int64_t embed_dim = 1152)
|
||||||
: llama_cpp_style(llama_cpp_style),
|
: split_patch_embed(split_patch_embed),
|
||||||
|
bias(arch == LLMVisionArch::QWEN3_VL),
|
||||||
patch_size(patch_size),
|
patch_size(patch_size),
|
||||||
temporal_patch_size(temporal_patch_size),
|
temporal_patch_size(temporal_patch_size),
|
||||||
in_channels(in_channels),
|
in_channels(in_channels),
|
||||||
embed_dim(embed_dim) {
|
embed_dim(embed_dim) {
|
||||||
bool bias = arch == LLMVisionArch::QWEN3_VL;
|
if (split_patch_embed) {
|
||||||
if (llama_cpp_style) {
|
|
||||||
blocks["proj.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
|
blocks["proj.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
|
||||||
embed_dim,
|
embed_dim,
|
||||||
{patch_size, patch_size},
|
{patch_size, patch_size},
|
||||||
{patch_size, patch_size},
|
{patch_size, patch_size},
|
||||||
{0, 0},
|
{0, 0},
|
||||||
{1, 1},
|
{1, 1},
|
||||||
bias));
|
false));
|
||||||
blocks["proj.1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
|
blocks["proj.1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
|
||||||
embed_dim,
|
embed_dim,
|
||||||
{patch_size, patch_size},
|
{patch_size, patch_size},
|
||||||
{patch_size, patch_size},
|
{patch_size, patch_size},
|
||||||
{0, 0},
|
{0, 0},
|
||||||
{1, 1},
|
{1, 1},
|
||||||
bias));
|
false));
|
||||||
} else {
|
} else {
|
||||||
std::tuple<int, int, int> kernel_size = {(int)temporal_patch_size, (int)patch_size, (int)patch_size};
|
std::tuple<int, int, int> kernel_size = {(int)temporal_patch_size, (int)patch_size, (int)patch_size};
|
||||||
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels,
|
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels,
|
||||||
@ -592,7 +641,7 @@ namespace LLM {
|
|||||||
temporal_patch_size,
|
temporal_patch_size,
|
||||||
ggml_nelements(x) / (temporal_patch_size * patch_size * patch_size));
|
ggml_nelements(x) / (temporal_patch_size * patch_size * patch_size));
|
||||||
|
|
||||||
if (llama_cpp_style) {
|
if (split_patch_embed) {
|
||||||
auto proj_0 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.0"]);
|
auto proj_0 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.0"]);
|
||||||
auto proj_1 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.1"]);
|
auto proj_1 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.1"]);
|
||||||
|
|
||||||
@ -605,6 +654,10 @@ namespace LLM {
|
|||||||
x1 = proj_1->forward(ctx, x1);
|
x1 = proj_1->forward(ctx, x1);
|
||||||
|
|
||||||
x = ggml_add(ctx->ggml_ctx, x0, x1);
|
x = ggml_add(ctx->ggml_ctx, x0, x1);
|
||||||
|
if (bias) {
|
||||||
|
auto b = ggml_reshape_4d(ctx->ggml_ctx, params["bias"], 1, 1, embed_dim, 1);
|
||||||
|
x = ggml_add_inplace(ctx->ggml_ctx, x, b);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
auto proj = std::dynamic_pointer_cast<Conv3d>(blocks["proj"]);
|
auto proj = std::dynamic_pointer_cast<Conv3d>(blocks["proj"]);
|
||||||
|
|
||||||
@ -797,7 +850,7 @@ namespace LLM {
|
|||||||
spatial_merge_size(vision_params.spatial_merge_size),
|
spatial_merge_size(vision_params.spatial_merge_size),
|
||||||
num_grid_per_side(vision_params.num_position_embeddings > 0 ? static_cast<int>(std::sqrt(vision_params.num_position_embeddings)) : 0),
|
num_grid_per_side(vision_params.num_position_embeddings > 0 ? static_cast<int>(std::sqrt(vision_params.num_position_embeddings)) : 0),
|
||||||
fullatt_block_indexes(vision_params.fullatt_block_indexes) {
|
fullatt_block_indexes(vision_params.fullatt_block_indexes) {
|
||||||
blocks["patch_embed"] = std::shared_ptr<GGMLBlock>(new VisionPatchEmbed(llama_cpp_style,
|
blocks["patch_embed"] = std::shared_ptr<GGMLBlock>(new VisionPatchEmbed(vision_params.split_patch_embed,
|
||||||
arch_,
|
arch_,
|
||||||
vision_params.patch_size,
|
vision_params.patch_size,
|
||||||
vision_params.temporal_patch_size,
|
vision_params.temporal_patch_size,
|
||||||
@ -1571,11 +1624,11 @@ namespace LLM {
|
|||||||
public:
|
public:
|
||||||
LLMRunner(LLMArch arch,
|
LLMRunner(LLMArch arch,
|
||||||
ggml_backend_t backend,
|
ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map,
|
const String2TensorStorage& tensor_storage_map,
|
||||||
const std::string prefix,
|
const std::string prefix,
|
||||||
bool enable_vision_ = false)
|
bool enable_vision_ = false,
|
||||||
: GGMLRunner(backend, params_backend),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: GGMLRunner(backend, weight_manager),
|
||||||
config(LLMConfig::detect_from_weights(tensor_storage_map, prefix, arch)),
|
config(LLMConfig::detect_from_weights(tensor_storage_map, prefix, arch)),
|
||||||
enable_vision(enable_vision_) {
|
enable_vision(enable_vision_) {
|
||||||
if (enable_vision && !config.have_vision_weight) {
|
if (enable_vision && !config.have_vision_weight) {
|
||||||
@ -1822,11 +1875,11 @@ namespace LLM {
|
|||||||
|
|
||||||
LLMEmbedder(LLMArch arch,
|
LLMEmbedder(LLMArch arch,
|
||||||
ggml_backend_t backend,
|
ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string prefix = "",
|
const std::string prefix = "",
|
||||||
bool enable_vision = false)
|
bool enable_vision = false,
|
||||||
: model(arch, backend, params_backend, tensor_storage_map, prefix, enable_vision) {
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: model(arch, backend, tensor_storage_map, prefix, enable_vision, weight_manager) {
|
||||||
if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) {
|
if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) {
|
||||||
tokenizer = std::make_shared<MistralTokenizer>();
|
tokenizer = std::make_shared<MistralTokenizer>();
|
||||||
} else if (arch == LLMArch::GPT_OSS_20B) {
|
} else if (arch == LLMArch::GPT_OSS_20B) {
|
||||||
@ -1840,13 +1893,6 @@ namespace LLM {
|
|||||||
model.get_param_tensors(tensors, prefix);
|
model.get_param_tensors(tensors, prefix);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool alloc_params_buffer() {
|
|
||||||
if (!model.alloc_params_buffer()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
|
std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
|
||||||
std::pair<int, int> attn_range,
|
std::pair<int, int> attn_range,
|
||||||
size_t max_length = 0,
|
size_t max_length = 0,
|
||||||
@ -2062,7 +2108,8 @@ namespace LLM {
|
|||||||
ggml_backend_t backend = sd_backend_cpu_init();
|
ggml_backend_t backend = sd_backend_cpu_init();
|
||||||
ggml_type model_data_type = GGML_TYPE_COUNT;
|
ggml_type model_data_type = GGML_TYPE_COUNT;
|
||||||
|
|
||||||
ModelLoader model_loader;
|
auto model_manager = std::make_shared<ModelManager>();
|
||||||
|
ModelLoader& model_loader = model_manager->loader();
|
||||||
if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.llm.")) {
|
if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.llm.")) {
|
||||||
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
||||||
return;
|
return;
|
||||||
@ -2080,24 +2127,20 @@ namespace LLM {
|
|||||||
LLMArch arch = LLMArch::QWEN3;
|
LLMArch arch = LLMArch::QWEN3;
|
||||||
|
|
||||||
std::shared_ptr<LLMEmbedder> llm = std::make_shared<LLMEmbedder>(arch,
|
std::shared_ptr<LLMEmbedder> llm = std::make_shared<LLMEmbedder>(arch,
|
||||||
backend,
|
|
||||||
backend,
|
backend,
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
"text_encoders.llm",
|
"text_encoders.llm",
|
||||||
true);
|
true,
|
||||||
|
model_manager);
|
||||||
|
|
||||||
if (!llm->alloc_params_buffer()) {
|
if (!model_manager->register_runner_params("LLM test",
|
||||||
LOG_ERROR("llm model allocation failed");
|
*llm,
|
||||||
return;
|
"text_encoders.llm",
|
||||||
}
|
ModelManager::ResidencyMode::ParamBackend,
|
||||||
|
backend,
|
||||||
std::map<std::string, ggml_tensor*> tensors;
|
backend) ||
|
||||||
llm->get_param_tensors(tensors, "text_encoders.llm");
|
!model_manager->validate_registered_tensors()) {
|
||||||
|
LOG_ERROR("register llm tensors with model manager failed");
|
||||||
bool success = model_loader.load_tensors(tensors);
|
|
||||||
|
|
||||||
if (!success) {
|
|
||||||
LOG_ERROR("load tensors from model loader failed");
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
#ifndef __SD_MODEL_TE_T5_HPP__
|
#ifndef __SD_MODEL_TE_T5_HPP__
|
||||||
#define __SD_MODEL_TE_T5_HPP__
|
#define __SD_MODEL_TE_T5_HPP__
|
||||||
|
|
||||||
#include <cfloat>
|
#include <cfloat>
|
||||||
@ -12,6 +12,7 @@
|
|||||||
|
|
||||||
#include "core/ggml_extend.hpp"
|
#include "core/ggml_extend.hpp"
|
||||||
#include "model_loader.h"
|
#include "model_loader.h"
|
||||||
|
#include "model_manager.h"
|
||||||
#include "tokenizers/t5_unigram_tokenizer.h"
|
#include "tokenizers/t5_unigram_tokenizer.h"
|
||||||
|
|
||||||
struct T5Config {
|
struct T5Config {
|
||||||
@ -334,11 +335,11 @@ struct T5Runner : public GGMLRunner {
|
|||||||
std::vector<int> relative_position_bucket_vec;
|
std::vector<int> relative_position_bucket_vec;
|
||||||
|
|
||||||
T5Runner(ggml_backend_t backend,
|
T5Runner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map,
|
const String2TensorStorage& tensor_storage_map,
|
||||||
const std::string prefix,
|
const std::string prefix,
|
||||||
bool is_umt5 = false)
|
bool is_umt5 = false,
|
||||||
: GGMLRunner(backend, params_backend),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: GGMLRunner(backend, weight_manager),
|
||||||
config(T5Config::detect_from_weights(tensor_storage_map, prefix, is_umt5)) {
|
config(T5Config::detect_from_weights(tensor_storage_map, prefix, is_umt5)) {
|
||||||
model = T5(config);
|
model = T5(config);
|
||||||
model.init(params_ctx, tensor_storage_map, prefix);
|
model.init(params_ctx, tensor_storage_map, prefix);
|
||||||
@ -477,24 +478,17 @@ struct T5Embedder {
|
|||||||
T5Runner model;
|
T5Runner model;
|
||||||
|
|
||||||
T5Embedder(ggml_backend_t backend,
|
T5Embedder(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string prefix = "",
|
const std::string prefix = "",
|
||||||
bool is_umt5 = false)
|
bool is_umt5 = false,
|
||||||
: model(backend, params_backend, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) {
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: model(backend, tensor_storage_map, prefix, is_umt5, weight_manager), tokenizer(is_umt5) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
|
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
|
||||||
model.get_param_tensors(tensors, prefix);
|
model.get_param_tensors(tensors, prefix);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool alloc_params_buffer() {
|
|
||||||
if (!model.alloc_params_buffer()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
|
std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
|
||||||
size_t max_length = 0,
|
size_t max_length = 0,
|
||||||
bool padding = false) {
|
bool padding = false) {
|
||||||
@ -579,7 +573,8 @@ struct T5Embedder {
|
|||||||
ggml_backend_t backend = sd_backend_cpu_init();
|
ggml_backend_t backend = sd_backend_cpu_init();
|
||||||
ggml_type model_data_type = GGML_TYPE_F16;
|
ggml_type model_data_type = GGML_TYPE_F16;
|
||||||
|
|
||||||
ModelLoader model_loader;
|
auto model_manager = std::make_shared<ModelManager>();
|
||||||
|
ModelLoader& model_loader = model_manager->loader();
|
||||||
if (!model_loader.init_from_file_and_convert_name(file_path)) {
|
if (!model_loader.init_from_file_and_convert_name(file_path)) {
|
||||||
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
||||||
return;
|
return;
|
||||||
@ -592,19 +587,16 @@ struct T5Embedder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::shared_ptr<T5Embedder> t5 = std::make_shared<T5Embedder>(backend, backend, tensor_storage_map, "", true);
|
std::shared_ptr<T5Embedder> t5 = std::make_shared<T5Embedder>(backend, tensor_storage_map, "", true, model_manager);
|
||||||
|
|
||||||
if (!t5->alloc_params_buffer()) {
|
if (!model_manager->register_runner_params("T5 test",
|
||||||
LOG_ERROR("t5 params buffer allocation failed");
|
*t5,
|
||||||
return;
|
"",
|
||||||
}
|
ModelManager::ResidencyMode::ParamBackend,
|
||||||
std::map<std::string, ggml_tensor*> tensors;
|
backend,
|
||||||
t5->get_param_tensors(tensors, "");
|
backend) ||
|
||||||
|
!model_manager->validate_registered_tensors()) {
|
||||||
bool success = model_loader.load_tensors(tensors);
|
LOG_ERROR("register t5 tensors with model manager failed");
|
||||||
|
|
||||||
if (!success) {
|
|
||||||
LOG_ERROR("load tensors from model loader failed");
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
#ifndef __SD_MODEL_UPSCALER_ESRGAN_HPP__
|
#ifndef __SD_MODEL_UPSCALER_ESRGAN_HPP__
|
||||||
#define __SD_MODEL_UPSCALER_ESRGAN_HPP__
|
#define __SD_MODEL_UPSCALER_ESRGAN_HPP__
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
@ -229,9 +229,9 @@ struct ESRGAN : public GGMLRunner {
|
|||||||
std::unique_ptr<RRDBNet> rrdb_net;
|
std::unique_ptr<RRDBNet> rrdb_net;
|
||||||
|
|
||||||
ESRGAN(ggml_backend_t backend,
|
ESRGAN(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const String2TensorStorage& tensor_storage_map = {})
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
: GGMLRunner(backend, params_backend),
|
: GGMLRunner(backend, weight_manager),
|
||||||
config(ESRGANConfig::detect_from_weights(tensor_storage_map)),
|
config(ESRGANConfig::detect_from_weights(tensor_storage_map)),
|
||||||
rrdb_net(std::make_unique<RRDBNet>(config)) {
|
rrdb_net(std::make_unique<RRDBNet>(config)) {
|
||||||
rrdb_net->init(params_ctx, tensor_storage_map, "");
|
rrdb_net->init(params_ctx, tensor_storage_map, "");
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
#ifndef __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__
|
#ifndef __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__
|
||||||
#define __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__
|
#define __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
@ -433,9 +433,9 @@ namespace LTXVUpsampler {
|
|||||||
std::unique_ptr<LatentUpsampler> model;
|
std::unique_ptr<LatentUpsampler> model;
|
||||||
|
|
||||||
LatentUpsamplerRunner(ggml_backend_t backend,
|
LatentUpsamplerRunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
const String2TensorStorage& tensor_storage_map,
|
||||||
const String2TensorStorage& tensor_storage_map)
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
: GGMLRunner(backend, params_backend),
|
: GGMLRunner(backend, weight_manager),
|
||||||
config(LatentUpsamplerConfig::detect_from_weights(tensor_storage_map)) {
|
config(LatentUpsamplerConfig::detect_from_weights(tensor_storage_map)) {
|
||||||
if (config.dims != 3 || (!config.spatial_upsample && !config.temporal_upsample) ||
|
if (config.dims != 3 || (!config.spatial_upsample && !config.temporal_upsample) ||
|
||||||
config.spatial_up_num < 1 || config.spatial_down_den < 1 || config.temporal_up_factor < 1) {
|
config.spatial_up_num < 1 || config.spatial_down_den < 1 || config.temporal_up_factor < 1) {
|
||||||
|
|||||||
@ -213,9 +213,9 @@ protected:
|
|||||||
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
|
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
float get_alpha() {
|
ggml_tensor* get_alpha(GGMLRunnerContext* ctx) {
|
||||||
float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
|
auto mix_factor = ggml_ext_cast_f32(ctx->ggml_ctx, ctx->backend, params["mix_factor"]);
|
||||||
return sigmoid(alpha);
|
return ggml_sigmoid(ctx->ggml_ctx, mix_factor);
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@ -250,10 +250,12 @@ public:
|
|||||||
|
|
||||||
x = time_stack->forward(ctx, x); // b t c (h w)
|
x = time_stack->forward(ctx, x); // b t c (h w)
|
||||||
|
|
||||||
float alpha = get_alpha();
|
auto alpha = get_alpha(ctx);
|
||||||
x = ggml_add(ctx->ggml_ctx,
|
x = ggml_add(ctx->ggml_ctx,
|
||||||
ggml_ext_scale(ctx->ggml_ctx, x, alpha),
|
x_mix,
|
||||||
ggml_ext_scale(ctx->ggml_ctx, x_mix, 1.0f - alpha));
|
ggml_mul(ctx->ggml_ctx,
|
||||||
|
ggml_sub(ctx->ggml_ctx, x, x_mix),
|
||||||
|
alpha));
|
||||||
|
|
||||||
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
|
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
|
||||||
x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
|
x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
|
||||||
@ -664,13 +666,13 @@ struct AutoEncoderKL : public VAE {
|
|||||||
AutoEncoderKLModel ae;
|
AutoEncoderKLModel ae;
|
||||||
|
|
||||||
AutoEncoderKL(ggml_backend_t backend,
|
AutoEncoderKL(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map,
|
const String2TensorStorage& tensor_storage_map,
|
||||||
const std::string prefix,
|
const std::string prefix,
|
||||||
bool decode_only = false,
|
bool decode_only = false,
|
||||||
bool use_video_decoder = false,
|
bool use_video_decoder = false,
|
||||||
SDVersion version = VERSION_SD1)
|
SDVersion version = VERSION_SD1,
|
||||||
: VAE(version, backend, params_backend, prefix), decode_only(decode_only) {
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: VAE(version, backend, prefix, weight_manager), decode_only(decode_only) {
|
||||||
if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
|
if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
|
||||||
scale_factor = 0.18215f;
|
scale_factor = 0.18215f;
|
||||||
shift_factor = 0.f;
|
shift_factor = 0.f;
|
||||||
@ -680,7 +682,7 @@ struct AutoEncoderKL : public VAE {
|
|||||||
} else if (sd_version_is_sd3(version)) {
|
} else if (sd_version_is_sd3(version)) {
|
||||||
scale_factor = 1.5305f;
|
scale_factor = 1.5305f;
|
||||||
shift_factor = 0.0609f;
|
shift_factor = 0.0609f;
|
||||||
} else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) {
|
} else if (sd_version_uses_flux_vae(version)) {
|
||||||
scale_factor = 0.3611f;
|
scale_factor = 0.3611f;
|
||||||
shift_factor = 0.1159f;
|
shift_factor = 0.1159f;
|
||||||
} else if (sd_version_uses_flux2_vae(version)) {
|
} else if (sd_version_uses_flux2_vae(version)) {
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
#ifndef __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__
|
#ifndef __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__
|
||||||
#define __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__
|
#define __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
@ -9,6 +9,7 @@
|
|||||||
|
|
||||||
#include "core/ggml_extend.hpp"
|
#include "core/ggml_extend.hpp"
|
||||||
#include "model_loader.h"
|
#include "model_loader.h"
|
||||||
|
#include "model_manager.h"
|
||||||
|
|
||||||
namespace LTXV {
|
namespace LTXV {
|
||||||
|
|
||||||
@ -1001,10 +1002,10 @@ namespace LTXV {
|
|||||||
sd::Tensor<float> bwe_skip_filter_tensor;
|
sd::Tensor<float> bwe_skip_filter_tensor;
|
||||||
|
|
||||||
LTXAudioVAERunner(ggml_backend_t backend,
|
LTXAudioVAERunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map,
|
const String2TensorStorage& tensor_storage_map,
|
||||||
const std::string& prefix = "")
|
const std::string& prefix = "",
|
||||||
: GGMLRunner(backend, params_backend),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: GGMLRunner(backend, weight_manager),
|
||||||
weight_prefix(prefix),
|
weight_prefix(prefix),
|
||||||
config(LTXAudioVAEConfig::detect_from_weights(tensor_storage_map)),
|
config(LTXAudioVAEConfig::detect_from_weights(tensor_storage_map)),
|
||||||
model(config) {
|
model(config) {
|
||||||
@ -1019,7 +1020,7 @@ namespace LTXV {
|
|||||||
model.get_param_tensors(tensors, weight_prefix);
|
model.get_param_tensors(tensors, weight_prefix);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t get_params_buffer_size() {
|
size_t get_params_mem_size() {
|
||||||
return model.get_params_mem_size();
|
return model.get_params_mem_size();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1066,7 +1067,8 @@ namespace LTXV {
|
|||||||
// ggml_backend_t backend = ggml_backend_cuda_init(0);
|
// ggml_backend_t backend = ggml_backend_cuda_init(0);
|
||||||
LOG_INFO("loading ltx audio vae from '%s'", model_path.c_str());
|
LOG_INFO("loading ltx audio vae from '%s'", model_path.c_str());
|
||||||
|
|
||||||
ModelLoader model_loader;
|
auto model_manager = std::make_shared<ModelManager>();
|
||||||
|
ModelLoader& model_loader = model_manager->loader();
|
||||||
if (!model_loader.init_from_file(model_path)) {
|
if (!model_loader.init_from_file(model_path)) {
|
||||||
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
|
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
|
||||||
return;
|
return;
|
||||||
@ -1074,20 +1076,17 @@ namespace LTXV {
|
|||||||
|
|
||||||
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
|
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
|
||||||
auto ltx_audio_vae = std::make_shared<LTXAudioVAERunner>(backend,
|
auto ltx_audio_vae = std::make_shared<LTXAudioVAERunner>(backend,
|
||||||
backend,
|
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
prefix);
|
prefix,
|
||||||
|
model_manager);
|
||||||
|
|
||||||
if (!ltx_audio_vae->alloc_params_buffer()) {
|
if (!model_manager->register_runner_params("LTX audio VAE test",
|
||||||
LOG_ERROR("ltx audio vae buffer allocation failed");
|
*ltx_audio_vae,
|
||||||
return;
|
ModelManager::ResidencyMode::ParamBackend,
|
||||||
}
|
backend,
|
||||||
|
backend) ||
|
||||||
std::map<std::string, ggml_tensor*> tensors;
|
!model_manager->validate_registered_tensors()) {
|
||||||
ltx_audio_vae->get_param_tensors(tensors);
|
LOG_ERROR("register ltx audio vae tensors with model manager failed");
|
||||||
|
|
||||||
if (!model_loader.load_tensors(tensors)) {
|
|
||||||
LOG_ERROR("load tensors from model loader failed");
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -957,8 +957,8 @@ namespace LTXVAE {
|
|||||||
|
|
||||||
ggml_tensor* scaled_timestep = timestep;
|
ggml_tensor* scaled_timestep = timestep;
|
||||||
if (timestep_conditioning) {
|
if (timestep_conditioning) {
|
||||||
auto multiplier = ggml_ext_backend_tensor_get_f32(params["timestep_scale_multiplier"]);
|
auto multiplier = ggml_ext_cast_f32(ctx->ggml_ctx, ctx->backend, params["timestep_scale_multiplier"]);
|
||||||
scaled_timestep = ggml_ext_scale(ctx->ggml_ctx, timestep, multiplier);
|
scaled_timestep = ggml_mul(ctx->ggml_ctx, timestep, multiplier);
|
||||||
}
|
}
|
||||||
|
|
||||||
x = conv_in->forward(ctx, x, causal_decoder);
|
x = conv_in->forward(ctx, x, causal_decoder);
|
||||||
@ -1008,8 +1008,8 @@ namespace LTXVAE {
|
|||||||
|
|
||||||
ggml_tensor* scaled_timestep = timestep;
|
ggml_tensor* scaled_timestep = timestep;
|
||||||
if (timestep_conditioning && timestep != nullptr) {
|
if (timestep_conditioning && timestep != nullptr) {
|
||||||
auto multiplier = ggml_ext_backend_tensor_get_f32(params["timestep_scale_multiplier"]);
|
auto multiplier = ggml_ext_cast_f32(ctx->ggml_ctx, ctx->backend, params["timestep_scale_multiplier"]);
|
||||||
scaled_timestep = ggml_ext_scale(ctx->ggml_ctx, timestep, multiplier);
|
scaled_timestep = ggml_mul(ctx->ggml_ctx, timestep, multiplier);
|
||||||
}
|
}
|
||||||
|
|
||||||
// conv_in with feat_map for left temporal context
|
// conv_in with feat_map for left temporal context
|
||||||
@ -1223,11 +1223,11 @@ struct LTXVideoVAE : public VAE {
|
|||||||
LTXVAE::VideoVAE vae;
|
LTXVAE::VideoVAE vae;
|
||||||
|
|
||||||
LTXVideoVAE(ggml_backend_t backend,
|
LTXVideoVAE(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map,
|
const String2TensorStorage& tensor_storage_map,
|
||||||
const std::string& prefix,
|
const std::string& prefix,
|
||||||
bool decode_only = true,
|
bool decode_only = true,
|
||||||
SDVersion version = VERSION_LTXAV)
|
SDVersion version = VERSION_LTXAV,
|
||||||
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
: decode_only(decode_only),
|
: decode_only(decode_only),
|
||||||
ltx_vae_version(LTXVAE::detect_ltx_vae_version(tensor_storage_map, prefix)),
|
ltx_vae_version(LTXVAE::detect_ltx_vae_version(tensor_storage_map, prefix)),
|
||||||
timestep_conditioning(LTXVAE::detect_ltx_vae_timestep_conditioning(tensor_storage_map, prefix)),
|
timestep_conditioning(LTXVAE::detect_ltx_vae_timestep_conditioning(tensor_storage_map, prefix)),
|
||||||
@ -1239,7 +1239,7 @@ struct LTXVideoVAE : public VAE {
|
|||||||
patch_size,
|
patch_size,
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
prefix),
|
prefix),
|
||||||
VAE(version, backend, params_backend, prefix) {
|
VAE(version, backend, prefix, weight_manager) {
|
||||||
vae.init(params_ctx, tensor_storage_map, prefix);
|
vae.init(params_ctx, tensor_storage_map, prefix);
|
||||||
decode_timestep_tensor.values()[0] = vae.decode_timestep;
|
decode_timestep_tensor.values()[0] = vae.decode_timestep;
|
||||||
}
|
}
|
||||||
@ -1426,7 +1426,7 @@ struct LTXVideoVAE : public VAE {
|
|||||||
const sd::Tensor<float>& z,
|
const sd::Tensor<float>& z,
|
||||||
bool decode_graph) override {
|
bool decode_graph) override {
|
||||||
if (!decode_graph && decode_only) {
|
if (!decode_graph && decode_only) {
|
||||||
LOG_ERROR("LTX video VAE encode requires encoder weights; create the context with vae_decode_only=false");
|
LOG_ERROR("LTX video VAE encode requires encoder weights");
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
sd::Tensor<float> input = z;
|
sd::Tensor<float> input = z;
|
||||||
@ -1521,7 +1521,8 @@ struct LTXVideoVAE : public VAE {
|
|||||||
ggml_backend_t backend = sd_backend_cpu_init();
|
ggml_backend_t backend = sd_backend_cpu_init();
|
||||||
LOG_INFO("loading ltx vae from '%s'", model_path.c_str());
|
LOG_INFO("loading ltx vae from '%s'", model_path.c_str());
|
||||||
|
|
||||||
ModelLoader model_loader;
|
auto model_manager = std::make_shared<ModelManager>();
|
||||||
|
ModelLoader& model_loader = model_manager->loader();
|
||||||
if (!model_loader.init_from_file_and_convert_name(model_path, "vae.")) {
|
if (!model_loader.init_from_file_and_convert_name(model_path, "vae.")) {
|
||||||
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
|
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
|
||||||
return;
|
return;
|
||||||
@ -1529,22 +1530,19 @@ struct LTXVideoVAE : public VAE {
|
|||||||
|
|
||||||
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
|
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
|
||||||
std::shared_ptr<LTXVideoVAE> vae = std::make_shared<LTXVideoVAE>(backend,
|
std::shared_ptr<LTXVideoVAE> vae = std::make_shared<LTXVideoVAE>(backend,
|
||||||
backend,
|
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
"first_stage_model",
|
"first_stage_model",
|
||||||
true,
|
true,
|
||||||
VERSION_LTXAV);
|
VERSION_LTXAV,
|
||||||
|
model_manager);
|
||||||
|
|
||||||
if (!vae->alloc_params_buffer()) {
|
if (!model_manager->register_runner_params("LTX VAE test",
|
||||||
LOG_ERROR("vae buffer allocation failed");
|
*vae,
|
||||||
return;
|
ModelManager::ResidencyMode::ParamBackend,
|
||||||
}
|
backend,
|
||||||
|
backend) ||
|
||||||
std::map<std::string, ggml_tensor*> tensors;
|
!model_manager->validate_registered_tensors()) {
|
||||||
vae->get_param_tensors(tensors);
|
LOG_ERROR("register ltx vae tensors with model manager failed");
|
||||||
|
|
||||||
if (!model_loader.load_tensors(tensors)) {
|
|
||||||
LOG_ERROR("load tensors from model loader failed");
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -623,12 +623,12 @@ struct TinyImageAutoEncoder : public VAE {
|
|||||||
bool decode_only = false;
|
bool decode_only = false;
|
||||||
|
|
||||||
TinyImageAutoEncoder(ggml_backend_t backend,
|
TinyImageAutoEncoder(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map,
|
const String2TensorStorage& tensor_storage_map,
|
||||||
const std::string prefix,
|
const std::string prefix,
|
||||||
bool decoder_only = true,
|
bool decoder_only = true,
|
||||||
SDVersion version = VERSION_SD1)
|
SDVersion version = VERSION_SD1,
|
||||||
: VAE(version, backend, params_backend, "tae"),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: VAE(version, backend, "tae", weight_manager),
|
||||||
decode_only(decoder_only),
|
decode_only(decoder_only),
|
||||||
taesd(decoder_only, version) {
|
taesd(decoder_only, version) {
|
||||||
scale_input = false;
|
scale_input = false;
|
||||||
@ -686,12 +686,12 @@ struct TinyVideoAutoEncoder : public VAE {
|
|||||||
bool is_wide = false;
|
bool is_wide = false;
|
||||||
|
|
||||||
TinyVideoAutoEncoder(ggml_backend_t backend,
|
TinyVideoAutoEncoder(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map,
|
const String2TensorStorage& tensor_storage_map,
|
||||||
const std::string prefix,
|
const std::string prefix,
|
||||||
bool decoder_only = true,
|
bool decoder_only = true,
|
||||||
SDVersion version = VERSION_WAN2)
|
SDVersion version = VERSION_WAN2,
|
||||||
: VAE(version, backend, params_backend, "tae"),
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: VAE(version, backend, "tae", weight_manager),
|
||||||
decode_only(decoder_only) {
|
decode_only(decoder_only) {
|
||||||
for (auto tensor_storage : tensor_storage_map) {
|
for (auto tensor_storage : tensor_storage_map) {
|
||||||
if (tensor_storage.first.find(prefix + ".3.conv.6.weight") != std::string::npos) {
|
if (tensor_storage.first.find(prefix + ".3.conv.6.weight") != std::string::npos) {
|
||||||
|
|||||||
@ -1,8 +1,9 @@
|
|||||||
#ifndef __SD_MODEL_VAE_VAE_HPP__
|
#ifndef __SD_MODEL_VAE_VAE_HPP__
|
||||||
#define __SD_MODEL_VAE_VAE_HPP__
|
#define __SD_MODEL_VAE_VAE_HPP__
|
||||||
|
|
||||||
#include "core/tensor_ggml.hpp"
|
#include "core/tensor_ggml.hpp"
|
||||||
#include "model/common/block.hpp"
|
#include "model/common/block.hpp"
|
||||||
|
#include "model_manager.h"
|
||||||
|
|
||||||
struct VAE : public GGMLRunner {
|
struct VAE : public GGMLRunner {
|
||||||
protected:
|
protected:
|
||||||
@ -63,8 +64,11 @@ protected:
|
|||||||
}
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
VAE(SDVersion version, ggml_backend_t backend, ggml_backend_t params_backend, const std::string& weight_prefix = "")
|
VAE(SDVersion version,
|
||||||
: version(version), weight_prefix(weight_prefix), GGMLRunner(backend, params_backend) {}
|
ggml_backend_t backend,
|
||||||
|
const std::string& weight_prefix = "",
|
||||||
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: version(version), weight_prefix(weight_prefix), GGMLRunner(backend, weight_manager) {}
|
||||||
|
|
||||||
int get_scale_factor() {
|
int get_scale_factor() {
|
||||||
int scale_factor = 8;
|
int scale_factor = 8;
|
||||||
@ -224,8 +228,10 @@ public:
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct FakeVAE : public VAE {
|
struct FakeVAE : public VAE {
|
||||||
FakeVAE(SDVersion version, ggml_backend_t backend, ggml_backend_t params_backend)
|
FakeVAE(SDVersion version,
|
||||||
: VAE(version, backend, params_backend) {}
|
ggml_backend_t backend,
|
||||||
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: VAE(version, backend, "", weight_manager) {}
|
||||||
|
|
||||||
int get_encoder_output_channels(int input_channels) {
|
int get_encoder_output_channels(int input_channels) {
|
||||||
return input_channels;
|
return input_channels;
|
||||||
|
|||||||
@ -1124,12 +1124,12 @@ namespace WAN {
|
|||||||
WanVAE ae;
|
WanVAE ae;
|
||||||
|
|
||||||
WanVAERunner(ggml_backend_t backend,
|
WanVAERunner(ggml_backend_t backend,
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
const String2TensorStorage& tensor_storage_map = {},
|
||||||
const std::string prefix = "",
|
const std::string prefix = "",
|
||||||
bool decode_only = false,
|
bool decode_only = false,
|
||||||
SDVersion version = VERSION_WAN2)
|
SDVersion version = VERSION_WAN2,
|
||||||
: VAE(version, backend, params_backend, prefix), decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V) {
|
std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
|
||||||
|
: VAE(version, backend, prefix, weight_manager), decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V) {
|
||||||
ae.init(params_ctx, tensor_storage_map, prefix);
|
ae.init(params_ctx, tensor_storage_map, prefix);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1327,27 +1327,24 @@ namespace WAN {
|
|||||||
// ggml_backend_t backend = ggml_backend_cuda_init(0);
|
// ggml_backend_t backend = ggml_backend_cuda_init(0);
|
||||||
ggml_backend_t backend = sd_backend_cpu_init();
|
ggml_backend_t backend = sd_backend_cpu_init();
|
||||||
ggml_type model_data_type = GGML_TYPE_F16;
|
ggml_type model_data_type = GGML_TYPE_F16;
|
||||||
std::shared_ptr<WanVAERunner> vae = std::make_shared<WanVAERunner>(backend, backend, String2TensorStorage{}, "first_stage_model", false, VERSION_WAN2_2_TI2V);
|
auto model_manager = std::make_shared<ModelManager>();
|
||||||
|
std::shared_ptr<WanVAERunner> vae = std::make_shared<WanVAERunner>(backend, String2TensorStorage{}, "first_stage_model", false, VERSION_WAN2_2_TI2V, model_manager);
|
||||||
{
|
{
|
||||||
LOG_INFO("loading from '%s'", file_path.c_str());
|
LOG_INFO("loading from '%s'", file_path.c_str());
|
||||||
|
|
||||||
if (!vae->alloc_params_buffer()) {
|
ModelLoader& model_loader = model_manager->loader();
|
||||||
LOG_ERROR("vae buffer allocation failed");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
std::map<std::string, ggml_tensor*> tensors;
|
|
||||||
vae->get_param_tensors(tensors);
|
|
||||||
|
|
||||||
ModelLoader model_loader;
|
|
||||||
if (!model_loader.init_from_file_and_convert_name(file_path, "vae.")) {
|
if (!model_loader.init_from_file_and_convert_name(file_path, "vae.")) {
|
||||||
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool success = model_loader.load_tensors(tensors);
|
if (!model_manager->register_runner_params("Wan VAE test",
|
||||||
|
*vae,
|
||||||
if (!success) {
|
ModelManager::ResidencyMode::ParamBackend,
|
||||||
LOG_ERROR("load tensors from model loader failed");
|
backend,
|
||||||
|
backend) ||
|
||||||
|
!model_manager->validate_registered_tensors()) {
|
||||||
|
LOG_ERROR("register wan vae tensors with model manager failed");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -485,6 +485,9 @@ SDVersion ModelLoader::get_sd_version() {
|
|||||||
if (tensor_storage.name.find("model.diffusion_model.cap_embedder.0.weight") != std::string::npos) {
|
if (tensor_storage.name.find("model.diffusion_model.cap_embedder.0.weight") != std::string::npos) {
|
||||||
return VERSION_Z_IMAGE;
|
return VERSION_Z_IMAGE;
|
||||||
}
|
}
|
||||||
|
if (tensor_storage.name.find("double_stream_layers.0.img_instruct_attn.processor.img_to_q.weight") != std::string::npos) {
|
||||||
|
return VERSION_BOOGU_IMAGE;
|
||||||
|
}
|
||||||
if (tensor_storage.name.find("model.diffusion_model.layers.0.adaLN_sa_ln.weight") != std::string::npos) {
|
if (tensor_storage.name.find("model.diffusion_model.layers.0.adaLN_sa_ln.weight") != std::string::npos) {
|
||||||
return VERSION_ERNIE_IMAGE;
|
return VERSION_ERNIE_IMAGE;
|
||||||
}
|
}
|
||||||
@ -1002,6 +1005,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb,
|
|||||||
std::atomic<size_t> tensor_idx(0);
|
std::atomic<size_t> tensor_idx(0);
|
||||||
std::atomic<bool> failed(false);
|
std::atomic<bool> failed(false);
|
||||||
std::vector<std::thread> workers;
|
std::vector<std::thread> workers;
|
||||||
|
std::mutex rpc_backend_mutex;
|
||||||
|
|
||||||
for (int i = 0; i < n_threads; ++i) {
|
for (int i = 0; i < n_threads; ++i) {
|
||||||
workers.emplace_back([&, file_path, is_zip]() {
|
workers.emplace_back([&, file_path, is_zip]() {
|
||||||
@ -1158,7 +1162,19 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb,
|
|||||||
|
|
||||||
if (dst_tensor->buffer != nullptr && !ggml_backend_buffer_is_host(dst_tensor->buffer)) {
|
if (dst_tensor->buffer != nullptr && !ggml_backend_buffer_is_host(dst_tensor->buffer)) {
|
||||||
t0 = ggml_time_ms();
|
t0 = ggml_time_ms();
|
||||||
|
|
||||||
|
// RPC backends require serialized access to prevent concurrency issues
|
||||||
|
const char* buffer_type_name = ggml_backend_buft_name(ggml_backend_buffer_get_type(dst_tensor->buffer));
|
||||||
|
bool is_rpc_buffer = buffer_type_name != nullptr &&
|
||||||
|
std::string(buffer_type_name).find("RPC") != std::string::npos;
|
||||||
|
|
||||||
|
if (is_rpc_buffer) {
|
||||||
|
std::lock_guard<std::mutex> lock(rpc_backend_mutex);
|
||||||
ggml_backend_tensor_set(dst_tensor, convert_buf, 0, ggml_nbytes(dst_tensor));
|
ggml_backend_tensor_set(dst_tensor, convert_buf, 0, ggml_nbytes(dst_tensor));
|
||||||
|
} else {
|
||||||
|
ggml_backend_tensor_set(dst_tensor, convert_buf, 0, ggml_nbytes(dst_tensor));
|
||||||
|
}
|
||||||
|
|
||||||
t1 = ggml_time_ms();
|
t1 = ggml_time_ms();
|
||||||
copy_to_backend_time_ms.fetch_add(t1 - t0);
|
copy_to_backend_time_ms.fetch_add(t1 - t0);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -147,6 +147,17 @@ bool ModelManager::register_param_tensors(const std::string& desc,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool ModelManager::load_all_params_eagerly() {
|
||||||
|
std::vector<TensorState*> all_states;
|
||||||
|
all_states.reserve(tensor_states_.size());
|
||||||
|
for (const auto& s : tensor_states_) {
|
||||||
|
if (s != nullptr) {
|
||||||
|
all_states.push_back(s.get());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return load_tensors_to_params_backend(all_states);
|
||||||
|
}
|
||||||
|
|
||||||
bool ModelManager::validate_registered_tensors() {
|
bool ModelManager::validate_registered_tensors() {
|
||||||
bool ok = true;
|
bool ok = true;
|
||||||
for (const auto& state : tensor_states_) {
|
for (const auto& state : tensor_states_) {
|
||||||
@ -469,7 +480,7 @@ bool ModelManager::mmap_params(const std::vector<TensorState*>& states,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto mmap_store = model_loader_.mmap_tensors(mmap_candidates, {}, true);
|
auto mmap_store = model_loader_.mmap_tensors(mmap_candidates, {}, writable_mmap_);
|
||||||
if (mmap_store.empty()) {
|
if (mmap_store.empty()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -492,7 +503,7 @@ bool ModelManager::mmap_params(const std::vector<TensorState*>& states,
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool ModelManager::can_mmap_storage(const TensorState& state) const {
|
bool ModelManager::can_mmap_storage(const TensorState& state) const {
|
||||||
if (!enable_mmap_ || state.residency_mode != ResidencyMode::Resident) {
|
if (!enable_mmap_ || state.residency_mode != ResidencyMode::ParamBackend) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (state.compute_backend == nullptr || state.params_backend == nullptr) {
|
if (state.compute_backend == nullptr || state.params_backend == nullptr) {
|
||||||
@ -577,13 +588,8 @@ bool ModelManager::alloc_params_buffers(const std::vector<TensorState*>& states,
|
|||||||
for (TensorState* state : states) {
|
for (TensorState* state : states) {
|
||||||
ggml_tensor* tensor = state->tensor;
|
ggml_tensor* tensor = state->tensor;
|
||||||
size_t tensor_size = GGML_PAD(ggml_backend_buft_get_alloc_size(params_buft, tensor), alignment);
|
size_t tensor_size = GGML_PAD(ggml_backend_buft_get_alloc_size(params_buft, tensor), alignment);
|
||||||
if (max_size > 0 && tensor_size > max_size) {
|
// Some backends, e.g. Vulkan, report a preferred chunk size here rather than a
|
||||||
LOG_ERROR("model manager tensor '%s' is too large for params buffer: %zu > %zu",
|
// hard per-tensor allocation limit. Oversized tensors are allocated alone.
|
||||||
ggml_get_name(tensor),
|
|
||||||
tensor_size,
|
|
||||||
max_size);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (!chunk.empty() && max_size > 0 && chunk_size + tensor_size > max_size) {
|
if (!chunk.empty() && max_size > 0 && chunk_size + tensor_size > max_size) {
|
||||||
if (!alloc_chunk(chunk, chunk_size)) {
|
if (!alloc_chunk(chunk, chunk_size)) {
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
@ -16,7 +16,7 @@ class ModelManager : public RunnerWeightManager {
|
|||||||
public:
|
public:
|
||||||
enum class ResidencyMode {
|
enum class ResidencyMode {
|
||||||
Disk,
|
Disk,
|
||||||
Resident,
|
ParamBackend,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LoraSpec {
|
struct LoraSpec {
|
||||||
@ -33,7 +33,7 @@ private:
|
|||||||
ggml_tensor* tensor = nullptr;
|
ggml_tensor* tensor = nullptr;
|
||||||
std::string desc;
|
std::string desc;
|
||||||
|
|
||||||
ResidencyMode residency_mode = ResidencyMode::Resident;
|
ResidencyMode residency_mode = ResidencyMode::ParamBackend;
|
||||||
ggml_backend_t compute_backend = nullptr;
|
ggml_backend_t compute_backend = nullptr;
|
||||||
ggml_backend_t params_backend = nullptr;
|
ggml_backend_t params_backend = nullptr;
|
||||||
bool metadata_validated = false;
|
bool metadata_validated = false;
|
||||||
@ -69,6 +69,7 @@ private:
|
|||||||
uint64_t current_lora_epoch_ = 0;
|
uint64_t current_lora_epoch_ = 0;
|
||||||
int n_threads_ = 0;
|
int n_threads_ = 0;
|
||||||
bool enable_mmap_ = false;
|
bool enable_mmap_ = false;
|
||||||
|
bool writable_mmap_ = false;
|
||||||
|
|
||||||
void finish_compute_backend_usage(const std::vector<TensorState*>& states);
|
void finish_compute_backend_usage(const std::vector<TensorState*>& states);
|
||||||
void release_all();
|
void release_all();
|
||||||
@ -110,6 +111,7 @@ public:
|
|||||||
model_loader_.set_n_threads(n_threads);
|
model_loader_.set_n_threads(n_threads);
|
||||||
}
|
}
|
||||||
void set_enable_mmap(bool enable_mmap) { enable_mmap_ = enable_mmap; }
|
void set_enable_mmap(bool enable_mmap) { enable_mmap_ = enable_mmap; }
|
||||||
|
void set_writable_mmap(bool writable_mmap) { writable_mmap_ = writable_mmap; }
|
||||||
void set_common_ignore_tensors(std::set<std::string> ignore_tensors);
|
void set_common_ignore_tensors(std::set<std::string> ignore_tensors);
|
||||||
void set_loras(std::vector<LoraSpec> loras, SDVersion version);
|
void set_loras(std::vector<LoraSpec> loras, SDVersion version);
|
||||||
|
|
||||||
@ -121,7 +123,44 @@ public:
|
|||||||
ggml_backend_t compute_backend,
|
ggml_backend_t compute_backend,
|
||||||
ggml_backend_t params_backend,
|
ggml_backend_t params_backend,
|
||||||
size_t* registered_tensor_size = nullptr);
|
size_t* registered_tensor_size = nullptr);
|
||||||
|
|
||||||
|
template <typename Runner>
|
||||||
|
bool register_runner_params(const std::string& desc,
|
||||||
|
Runner& runner,
|
||||||
|
ResidencyMode residency_mode,
|
||||||
|
ggml_backend_t compute_backend,
|
||||||
|
ggml_backend_t params_backend,
|
||||||
|
size_t* registered_tensor_size = nullptr) {
|
||||||
|
std::map<std::string, ggml_tensor*> tensors;
|
||||||
|
runner.get_param_tensors(tensors);
|
||||||
|
return register_param_tensors(desc,
|
||||||
|
std::move(tensors),
|
||||||
|
residency_mode,
|
||||||
|
compute_backend,
|
||||||
|
params_backend,
|
||||||
|
registered_tensor_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Runner>
|
||||||
|
bool register_runner_params(const std::string& desc,
|
||||||
|
Runner& runner,
|
||||||
|
const std::string& prefix,
|
||||||
|
ResidencyMode residency_mode,
|
||||||
|
ggml_backend_t compute_backend,
|
||||||
|
ggml_backend_t params_backend,
|
||||||
|
size_t* registered_tensor_size = nullptr) {
|
||||||
|
std::map<std::string, ggml_tensor*> tensors;
|
||||||
|
runner.get_param_tensors(tensors, prefix);
|
||||||
|
return register_param_tensors(desc,
|
||||||
|
std::move(tensors),
|
||||||
|
residency_mode,
|
||||||
|
compute_backend,
|
||||||
|
params_backend,
|
||||||
|
registered_tensor_size);
|
||||||
|
}
|
||||||
|
|
||||||
bool validate_registered_tensors();
|
bool validate_registered_tensors();
|
||||||
|
bool load_all_params_eagerly();
|
||||||
|
|
||||||
bool prepare_params(const std::vector<ggml_tensor*>& tensors) override;
|
bool prepare_params(const std::vector<ggml_tensor*>& tensors) override;
|
||||||
void release_compute_backend_params(const std::vector<ggml_tensor*>& tensors) override;
|
void release_compute_backend_params(const std::vector<ggml_tensor*>& tensors) override;
|
||||||
|
|||||||
@ -184,6 +184,27 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
|
|||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string convert_qwen3_vl_vision_name(std::string name) {
|
||||||
|
static const std::vector<std::pair<std::string, std::string>> qwen3_vl_vision_name_map{
|
||||||
|
{"mm.0.", "merger.linear_fc1."},
|
||||||
|
{"mm.2.", "merger.linear_fc2."},
|
||||||
|
{"v.post_ln.", "merger.norm."},
|
||||||
|
{"v.position_embd.weight", "pos_embed.weight"},
|
||||||
|
{"v.patch_embd.weight.1", "patch_embed.proj.1.weight"},
|
||||||
|
{"v.patch_embd.weight", "patch_embed.proj.0.weight"},
|
||||||
|
{"v.patch_embd.bias", "patch_embed.bias"},
|
||||||
|
{"v.blk.", "blocks."},
|
||||||
|
{"attn_qkv.", "attn.qkv."},
|
||||||
|
{"attn_out.", "attn.proj."},
|
||||||
|
{"ffn_up.", "mlp.linear_fc1."},
|
||||||
|
{"ffn_down.", "mlp.linear_fc2."},
|
||||||
|
{"ln1.", "norm1."},
|
||||||
|
{"ln2.", "norm2."},
|
||||||
|
};
|
||||||
|
replace_with_name_map(name, qwen3_vl_vision_name_map);
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
// ref: https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py
|
// ref: https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py
|
||||||
std::string convert_diffusers_unet_to_original_sd1(std::string name) {
|
std::string convert_diffusers_unet_to_original_sd1(std::string name) {
|
||||||
// (stable-diffusion, HF Diffusers)
|
// (stable-diffusion, HF Diffusers)
|
||||||
@ -1154,6 +1175,10 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
|
|||||||
|
|
||||||
replace_with_prefix_map(name, prefix_map);
|
replace_with_prefix_map(name, prefix_map);
|
||||||
|
|
||||||
|
if (sd_version_is_boogu_image(version) && starts_with(name, "text_encoders.llm.visual.")) {
|
||||||
|
name = convert_qwen3_vl_vision_name(std::move(name));
|
||||||
|
}
|
||||||
|
|
||||||
// diffusion model
|
// diffusion model
|
||||||
{
|
{
|
||||||
for (const auto& prefix : diffuison_model_prefix_vec) {
|
for (const auto& prefix : diffuison_model_prefix_vec) {
|
||||||
|
|||||||
@ -3,6 +3,7 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <optional>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
@ -63,6 +64,82 @@ namespace sd::guidance {
|
|||||||
return uncond;
|
return uncond;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<float> parse_guidance_schedule_from_spec(std::string spec) {
|
||||||
|
std::vector<float> schedule;
|
||||||
|
|
||||||
|
while (!spec.empty()) {
|
||||||
|
auto sep = spec.find('+');
|
||||||
|
auto segment = spec.substr(0, sep);
|
||||||
|
|
||||||
|
auto x = segment.find('x');
|
||||||
|
if (x == std::string::npos) {
|
||||||
|
LOG_ERROR("Invalid guidance schedule segment: '%s' (expected <guidance>x<count>)", segment.c_str());
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
float guidance;
|
||||||
|
int count;
|
||||||
|
|
||||||
|
auto guidance_str = segment.substr(0, x);
|
||||||
|
auto count_str = segment.substr(x + 1);
|
||||||
|
|
||||||
|
try {
|
||||||
|
size_t idx = 0;
|
||||||
|
guidance = std::stof(guidance_str, &idx);
|
||||||
|
if (idx != guidance_str.size()) {
|
||||||
|
LOG_ERROR("Invalid guidance value in guidance schedule: '%s'", guidance_str.c_str());
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
} catch (const std::exception&) {
|
||||||
|
LOG_ERROR("Invalid guidance value in guidance schedule: '%s'", guidance_str.c_str());
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
size_t idx = 0;
|
||||||
|
count = std::stoi(count_str, &idx);
|
||||||
|
if (idx != count_str.size()) {
|
||||||
|
LOG_ERROR("Invalid count in guidance schedule: '%s'", count_str.c_str());
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
} catch (const std::exception&) {
|
||||||
|
LOG_ERROR("Invalid count in guidance schedule: '%s'", count_str.c_str());
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (count <= 0) {
|
||||||
|
LOG_ERROR("Guidance schedule count must be positive");
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
schedule.insert(schedule.end(), count, guidance);
|
||||||
|
|
||||||
|
if (sep == std::string::npos) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
spec = spec.substr(sep + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
return schedule;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<float> parse_guidance_schedule(const char* extra_sample_args) {
|
||||||
|
std::vector<float> guidance_schedule;
|
||||||
|
std::string guidance_schedule_str = "";
|
||||||
|
for (const auto& [key, value] : parse_key_value_args(extra_sample_args, "extra sample arg")) {
|
||||||
|
float parsed = 0.0f;
|
||||||
|
if (key == "guidance_schedule") {
|
||||||
|
guidance_schedule_str = value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!guidance_schedule_str.empty()) {
|
||||||
|
guidance_schedule = parse_guidance_schedule_from_spec(guidance_schedule_str);
|
||||||
|
}
|
||||||
|
return guidance_schedule;
|
||||||
|
}
|
||||||
|
|
||||||
ClassifierFreeGuidance::ClassifierFreeGuidance(float guidance_scale,
|
ClassifierFreeGuidance::ClassifierFreeGuidance(float guidance_scale,
|
||||||
float image_guidance_scale)
|
float image_guidance_scale)
|
||||||
: guidance_scale_(guidance_scale),
|
: guidance_scale_(guidance_scale),
|
||||||
@ -70,8 +147,10 @@ namespace sd::guidance {
|
|||||||
}
|
}
|
||||||
|
|
||||||
GuiderOutput ClassifierFreeGuidance::forward(const GuidanceInput& input,
|
GuiderOutput ClassifierFreeGuidance::forward(const GuidanceInput& input,
|
||||||
GuiderOutput previous) const {
|
GuiderOutput previous,
|
||||||
|
std::optional<float> scale_override) const {
|
||||||
(void)previous;
|
(void)previous;
|
||||||
|
float guidance_scale = scale_override.value_or(guidance_scale_);
|
||||||
|
|
||||||
GuiderOutput output;
|
GuiderOutput output;
|
||||||
if (!has_tensor(input.pred_cond)) {
|
if (!has_tensor(input.pred_cond)) {
|
||||||
@ -86,14 +165,14 @@ namespace sd::guidance {
|
|||||||
const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
|
const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
|
||||||
output.pred = pred_img_uncond +
|
output.pred = pred_img_uncond +
|
||||||
image_guidance_scale_ * (pred_uncond - pred_img_uncond) +
|
image_guidance_scale_ * (pred_uncond - pred_img_uncond) +
|
||||||
guidance_scale_ * (pred_cond - pred_uncond);
|
guidance_scale * (pred_cond - pred_uncond);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
output.pred = pred_uncond + guidance_scale_ * (pred_cond - pred_uncond);
|
output.pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond);
|
||||||
}
|
}
|
||||||
} else if (has_tensor(input.pred_img_uncond)) {
|
} else if (has_tensor(input.pred_img_uncond)) {
|
||||||
const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
|
const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
|
||||||
output.pred = pred_img_uncond + guidance_scale_ * (pred_cond - pred_img_uncond);
|
output.pred = pred_img_uncond + guidance_scale * (pred_cond - pred_img_uncond);
|
||||||
}
|
}
|
||||||
|
|
||||||
return output;
|
return output;
|
||||||
@ -128,8 +207,10 @@ namespace sd::guidance {
|
|||||||
}
|
}
|
||||||
|
|
||||||
GuiderOutput AdaptiveProjectedGuidance::forward(const GuidanceInput& input,
|
GuiderOutput AdaptiveProjectedGuidance::forward(const GuidanceInput& input,
|
||||||
GuiderOutput previous) const {
|
GuiderOutput previous,
|
||||||
|
std::optional<float> scale_override) const {
|
||||||
(void)previous;
|
(void)previous;
|
||||||
|
float guidance_scale = scale_override.value_or(guidance_scale_);
|
||||||
|
|
||||||
GuiderOutput output;
|
GuiderOutput output;
|
||||||
if (!has_tensor(input.pred_cond)) {
|
if (!has_tensor(input.pred_cond)) {
|
||||||
@ -144,13 +225,13 @@ namespace sd::guidance {
|
|||||||
const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
|
const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
|
||||||
output.pred = pred_img_uncond +
|
output.pred = pred_img_uncond +
|
||||||
image_guidance_scale_ * (pred_uncond - pred_img_uncond) +
|
image_guidance_scale_ * (pred_uncond - pred_img_uncond) +
|
||||||
guidance_scale_ * (pred_cond - pred_uncond);
|
guidance_scale * (pred_cond - pred_uncond);
|
||||||
} else {
|
} else {
|
||||||
output.pred = pred_uncond + guidance_scale_ * (pred_cond - pred_uncond);
|
output.pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond);
|
||||||
}
|
}
|
||||||
} else if (has_tensor(input.pred_img_uncond)) {
|
} else if (has_tensor(input.pred_img_uncond)) {
|
||||||
const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
|
const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
|
||||||
output.pred = pred_img_uncond + guidance_scale_ * (pred_cond - pred_img_uncond);
|
output.pred = pred_img_uncond + guidance_scale * (pred_cond - pred_img_uncond);
|
||||||
}
|
}
|
||||||
if (!has_tensor(input.pred_uncond) && !has_tensor(input.pred_img_uncond)) {
|
if (!has_tensor(input.pred_uncond) && !has_tensor(input.pred_img_uncond)) {
|
||||||
return output;
|
return output;
|
||||||
@ -162,7 +243,7 @@ namespace sd::guidance {
|
|||||||
sd::Tensor<float> deltas = calculate_guidance_delta(pred_cond,
|
sd::Tensor<float> deltas = calculate_guidance_delta(pred_cond,
|
||||||
pred_uncond,
|
pred_uncond,
|
||||||
pred_img_uncond,
|
pred_img_uncond,
|
||||||
guidance_scale_,
|
guidance_scale,
|
||||||
image_guidance_scale_);
|
image_guidance_scale_);
|
||||||
if (params_.momentum != 0.0f) {
|
if (params_.momentum != 0.0f) {
|
||||||
if (momentum_buffer_.shape() != deltas.shape()) {
|
if (momentum_buffer_.shape() != deltas.shape()) {
|
||||||
@ -239,7 +320,8 @@ namespace sd::guidance {
|
|||||||
}
|
}
|
||||||
|
|
||||||
GuiderOutput SkipLayerGuidance::forward(const GuidanceInput& input,
|
GuiderOutput SkipLayerGuidance::forward(const GuidanceInput& input,
|
||||||
GuiderOutput output) const {
|
GuiderOutput output,
|
||||||
|
std::optional<float> /*scale_override*/) const {
|
||||||
if (scale_ == 0.0f || !is_enabled_for_step(input) || !input.predict_skip_layer) {
|
if (scale_ == 0.0f || !is_enabled_for_step(input) || !input.predict_skip_layer) {
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
|
#include <optional>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "core/tensor.hpp"
|
#include "core/tensor.hpp"
|
||||||
@ -27,6 +28,7 @@ namespace sd::guidance {
|
|||||||
AdaptiveProjectedGuidanceParams parse_adaptive_projected_guidance_args(const char* extra_sample_args);
|
AdaptiveProjectedGuidanceParams parse_adaptive_projected_guidance_args(const char* extra_sample_args);
|
||||||
bool is_adaptive_projected_guidance_enabled(const AdaptiveProjectedGuidanceParams& params);
|
bool is_adaptive_projected_guidance_enabled(const AdaptiveProjectedGuidanceParams& params);
|
||||||
bool parse_skip_layer_guidance_uncond_arg(const char* extra_sample_args);
|
bool parse_skip_layer_guidance_uncond_arg(const char* extra_sample_args);
|
||||||
|
std::vector<float> parse_guidance_schedule(const char* extra_sample_args);
|
||||||
|
|
||||||
struct GuidanceInput {
|
struct GuidanceInput {
|
||||||
int step = 0;
|
int step = 0;
|
||||||
@ -42,7 +44,8 @@ namespace sd::guidance {
|
|||||||
public:
|
public:
|
||||||
virtual ~BaseGuidance() = default;
|
virtual ~BaseGuidance() = default;
|
||||||
virtual GuiderOutput forward(const GuidanceInput& input,
|
virtual GuiderOutput forward(const GuidanceInput& input,
|
||||||
GuiderOutput previous) const = 0;
|
GuiderOutput previous,
|
||||||
|
std::optional<float> scale_override = std::nullopt) const = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
class ClassifierFreeGuidance : public BaseGuidance {
|
class ClassifierFreeGuidance : public BaseGuidance {
|
||||||
@ -54,7 +57,8 @@ namespace sd::guidance {
|
|||||||
float image_guidance_scale);
|
float image_guidance_scale);
|
||||||
|
|
||||||
GuiderOutput forward(const GuidanceInput& input,
|
GuiderOutput forward(const GuidanceInput& input,
|
||||||
GuiderOutput previous) const override;
|
GuiderOutput previous,
|
||||||
|
std::optional<float> scale_override = std::nullopt) const override;
|
||||||
};
|
};
|
||||||
|
|
||||||
class AdaptiveProjectedGuidance : public BaseGuidance {
|
class AdaptiveProjectedGuidance : public BaseGuidance {
|
||||||
@ -69,7 +73,8 @@ namespace sd::guidance {
|
|||||||
AdaptiveProjectedGuidanceParams params);
|
AdaptiveProjectedGuidanceParams params);
|
||||||
|
|
||||||
GuiderOutput forward(const GuidanceInput& input,
|
GuiderOutput forward(const GuidanceInput& input,
|
||||||
GuiderOutput previous) const override;
|
GuiderOutput previous,
|
||||||
|
std::optional<float> scale_override = std::nullopt) const override;
|
||||||
};
|
};
|
||||||
|
|
||||||
class SkipLayerGuidance : public BaseGuidance {
|
class SkipLayerGuidance : public BaseGuidance {
|
||||||
@ -88,7 +93,8 @@ namespace sd::guidance {
|
|||||||
const std::vector<int>& layers() const;
|
const std::vector<int>& layers() const;
|
||||||
|
|
||||||
GuiderOutput forward(const GuidanceInput& input,
|
GuiderOutput forward(const GuidanceInput& input,
|
||||||
GuiderOutput previous) const override;
|
GuiderOutput previous,
|
||||||
|
std::optional<float> scale_override = std::nullopt) const override;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace sd::guidance
|
} // namespace sd::guidance
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -134,7 +134,8 @@ std::vector<int> BPETokenizer::encode(const std::string& text, on_new_token_cb_t
|
|||||||
std::vector<int32_t> bpe_tokens;
|
std::vector<int32_t> bpe_tokens;
|
||||||
std::vector<std::string> token_strs;
|
std::vector<std::string> token_strs;
|
||||||
|
|
||||||
auto splited_texts = split_with_special_tokens(text, special_tokens);
|
std::string normalized_text = normalize_before_split ? normalize(text) : text;
|
||||||
|
auto splited_texts = split_with_special_tokens(normalized_text, special_tokens);
|
||||||
|
|
||||||
for (auto& splited_text : splited_texts) {
|
for (auto& splited_text : splited_texts) {
|
||||||
if (is_special_token(splited_text)) {
|
if (is_special_token(splited_text)) {
|
||||||
@ -159,7 +160,7 @@ std::vector<int> BPETokenizer::encode(const std::string& text, on_new_token_cb_t
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string token_str = normalize(token);
|
std::string token_str = normalize_before_split ? token : normalize(token);
|
||||||
std::u32string utf32_token;
|
std::u32string utf32_token;
|
||||||
if (byte_level_bpe) {
|
if (byte_level_bpe) {
|
||||||
for (int i = 0; i < token_str.length(); i++) {
|
for (int i = 0; i < token_str.length(); i++) {
|
||||||
|
|||||||
@ -25,6 +25,7 @@ CLIPTokenizer::CLIPTokenizer(int pad_token_id, const std::string& merges_utf8_st
|
|||||||
end_of_word_suffix = "</w>";
|
end_of_word_suffix = "</w>";
|
||||||
add_bos_token = true;
|
add_bos_token = true;
|
||||||
add_eos_token = true;
|
add_eos_token = true;
|
||||||
|
normalize_before_split = true;
|
||||||
|
|
||||||
if (merges_utf8_str.size() > 0) {
|
if (merges_utf8_str.size() > 0) {
|
||||||
load_from_merges(merges_utf8_str);
|
load_from_merges(merges_utf8_str);
|
||||||
|
|||||||
@ -15,6 +15,7 @@ protected:
|
|||||||
bool add_bos_token = false;
|
bool add_bos_token = false;
|
||||||
bool add_eos_token = false;
|
bool add_eos_token = false;
|
||||||
bool pad_left = false;
|
bool pad_left = false;
|
||||||
|
bool normalize_before_split = false;
|
||||||
std::string end_of_word_suffix;
|
std::string end_of_word_suffix;
|
||||||
|
|
||||||
virtual std::string decode_token(int token_id) const = 0;
|
virtual std::string decode_token(int token_id) const = 0;
|
||||||
|
|||||||
@ -39,17 +39,12 @@ void UpscalerGGML::set_stream_layers_enabled(bool enabled) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
|
bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
|
||||||
bool offload_params_to_cpu,
|
|
||||||
int n_threads) {
|
int n_threads) {
|
||||||
ggml_log_set(ggml_log_callback_default, nullptr);
|
ggml_log_set(ggml_log_callback_default, nullptr);
|
||||||
|
|
||||||
std::string error;
|
std::string error;
|
||||||
if (!backend_manager.init(backend_spec.c_str(),
|
if (!backend_manager.init(backend_spec.c_str(),
|
||||||
params_backend_spec.c_str(),
|
params_backend_spec.c_str(),
|
||||||
offload_params_to_cpu,
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
&error)) {
|
&error)) {
|
||||||
LOG_ERROR("upscaler backend config failed: %s", error.c_str());
|
LOG_ERROR("upscaler backend config failed: %s", error.c_str());
|
||||||
return false;
|
return false;
|
||||||
@ -90,8 +85,8 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
|
|||||||
model_loader.set_wtype_override(model_data_type);
|
model_loader.set_wtype_override(model_data_type);
|
||||||
LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
|
LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
|
||||||
esrgan_upscaler = std::make_shared<ESRGAN>(backend_for(SDBackendModule::UPSCALER),
|
esrgan_upscaler = std::make_shared<ESRGAN>(backend_for(SDBackendModule::UPSCALER),
|
||||||
params_backend_for(SDBackendModule::UPSCALER),
|
model_loader.get_tensor_storage_map(),
|
||||||
model_loader.get_tensor_storage_map());
|
model_manager);
|
||||||
if (esrgan_upscaler == nullptr || esrgan_upscaler->rrdb_net == nullptr) {
|
if (esrgan_upscaler == nullptr || esrgan_upscaler->rrdb_net == nullptr) {
|
||||||
LOG_ERROR("init esrgan model from metadata failed: '%s'", esrgan_path.c_str());
|
LOG_ERROR("init esrgan model from metadata failed: '%s'", esrgan_path.c_str());
|
||||||
return false;
|
return false;
|
||||||
@ -104,10 +99,9 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
|
|||||||
|
|
||||||
std::map<std::string, ggml_tensor*> tensors;
|
std::map<std::string, ggml_tensor*> tensors;
|
||||||
esrgan_upscaler->get_param_tensors(tensors);
|
esrgan_upscaler->get_param_tensors(tensors);
|
||||||
esrgan_upscaler->set_weight_manager(model_manager);
|
|
||||||
if (!model_manager->register_param_tensors("ESRGAN",
|
if (!model_manager->register_param_tensors("ESRGAN",
|
||||||
std::move(tensors),
|
std::move(tensors),
|
||||||
ModelManager::ResidencyMode::Resident,
|
backend_manager.params_backend_is_disk(SDBackendModule::UPSCALER) ? ModelManager::ResidencyMode::Disk : ModelManager::ResidencyMode::ParamBackend,
|
||||||
backend_for(SDBackendModule::UPSCALER),
|
backend_for(SDBackendModule::UPSCALER),
|
||||||
params_backend_for(SDBackendModule::UPSCALER)) ||
|
params_backend_for(SDBackendModule::UPSCALER)) ||
|
||||||
!model_manager->validate_registered_tensors()) {
|
!model_manager->validate_registered_tensors()) {
|
||||||
@ -179,7 +173,6 @@ struct upscaler_ctx_t {
|
|||||||
};
|
};
|
||||||
|
|
||||||
upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
|
upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
|
||||||
bool offload_params_to_cpu,
|
|
||||||
bool direct,
|
bool direct,
|
||||||
int n_threads,
|
int n_threads,
|
||||||
int tile_size,
|
int tile_size,
|
||||||
@ -196,7 +189,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) {
|
if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, n_threads)) {
|
||||||
delete upscaler_ctx->upscaler;
|
delete upscaler_ctx->upscaler;
|
||||||
upscaler_ctx->upscaler = nullptr;
|
upscaler_ctx->upscaler = nullptr;
|
||||||
free(upscaler_ctx);
|
free(upscaler_ctx);
|
||||||
|
|||||||
@ -32,7 +32,6 @@ struct UpscalerGGML {
|
|||||||
~UpscalerGGML();
|
~UpscalerGGML();
|
||||||
|
|
||||||
bool load_from_file(const std::string& esrgan_path,
|
bool load_from_file(const std::string& esrgan_path,
|
||||||
bool offload_params_to_cpu,
|
|
||||||
int n_threads);
|
int n_threads);
|
||||||
void set_max_graph_vram_bytes(size_t max_vram_bytes);
|
void set_max_graph_vram_bytes(size_t max_vram_bytes);
|
||||||
void set_stream_layers_enabled(bool enabled);
|
void set_stream_layers_enabled(bool enabled);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user