mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2025-12-13 05:48:56 +00:00
Compare commits
3 Commits
00b0a0053d
...
125acc845f
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
125acc845f | ||
|
|
29c61c8c29 | ||
|
|
2570565dfa |
2
.gitmodules
vendored
2
.gitmodules
vendored
@ -1,3 +1,3 @@
|
||||
[submodule "ggml"]
|
||||
path = ggml
|
||||
url = https://github.com/leejet/ggml.git
|
||||
url = https://github.com/ggml-org/ggml.git
|
||||
|
||||
68
README.md
68
README.md
@ -4,19 +4,33 @@
|
||||
|
||||
# stable-diffusion.cpp
|
||||
|
||||
Inference of Stable Diffusion and Flux in pure C/C++
|
||||
Diffusion model(SD,Flux,Wan,...) inference in pure C/C++
|
||||
|
||||
***Note that this project is under active development. \
|
||||
API and command-line parameters may change frequently.***
|
||||
|
||||
## Features
|
||||
|
||||
- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
|
||||
- Super lightweight and without external dependencies
|
||||
- SD1.x, SD2.x, SDXL and [SD3/SD3.5](./docs/sd3.md) support
|
||||
- !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
|
||||
- [Flux-dev/Flux-schnell Support](./docs/flux.md)
|
||||
- [FLUX.1-Kontext-dev](./docs/kontext.md)
|
||||
- [Chroma](./docs/chroma.md)
|
||||
- [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo) and [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) support
|
||||
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
|
||||
- Supported models
|
||||
- Image Models
|
||||
- SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
|
||||
- SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
|
||||
- !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
|
||||
- [SD3/SD3.5](./docs/sd3.md)
|
||||
- [Flux-dev/Flux-schnell](./docs/flux.md)
|
||||
- [Chroma](./docs/chroma.md)
|
||||
- Image Edit Models
|
||||
- [FLUX.1-Kontext-dev](./docs/kontext.md)
|
||||
- Video Models
|
||||
- [Wan2.1/Wan2.2](./docs/wan.md)
|
||||
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
|
||||
- Control Net support with SD 1.5
|
||||
- LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
|
||||
- Latent Consistency Models support (LCM/LCM-LoRA)
|
||||
- Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
|
||||
- Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
|
||||
- 16-bit, 32-bit float support
|
||||
- 2-bit, 3-bit, 4-bit, 5-bit and 8-bit integer quantization support
|
||||
- Accelerated memory-efficient CPU inference
|
||||
@ -26,15 +40,9 @@ Inference of Stable Diffusion and Flux in pure C/C++
|
||||
- Can load ckpt, safetensors and diffusers models/checkpoints, as well as standalone VAE models
|
||||
- No need to convert to `.ggml` or `.gguf` anymore!
|
||||
- Flash Attention for memory usage optimization
|
||||
- Original `txt2img` and `img2img` mode
|
||||
- Negative prompt
|
||||
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
|
||||
- LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
|
||||
- Latent Consistency Models support (LCM/LCM-LoRA)
|
||||
- Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
|
||||
- Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
|
||||
- VAE tiling processing to reduce memory usage
|
||||
- Control Net support with SD 1.5
|
||||
- Sampling method
|
||||
- `Euler A`
|
||||
- `Euler`
|
||||
@ -287,8 +295,10 @@ arguments:
|
||||
If threads <= 0, then threads will be set to the number of CPU physical cores
|
||||
-m, --model [MODEL] path to full model
|
||||
--diffusion-model path to the standalone diffusion model
|
||||
--high-noise-diffusion-model path to the standalone high noise diffusion model
|
||||
--clip_l path to the clip-l text encoder
|
||||
--clip_g path to the clip-g text encoder
|
||||
--clip_vision path to the clip-vision encoder
|
||||
--t5xxl path to the t5xxl text encoder
|
||||
--vae [VAE] path to vae
|
||||
--taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
|
||||
@ -303,8 +313,9 @@ arguments:
|
||||
If not specified, the default is the type of the weight file
|
||||
--tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
|
||||
--lora-model-dir [DIR] lora model directory
|
||||
-i, --init-img [IMAGE] path to the input image, required by img2img
|
||||
-i, --init-img [IMAGE] path to the init image, required by img2img
|
||||
--mask [MASK] path to the mask image, required by img2img with mask
|
||||
--end-img [IMAGE] path to the end image, required by flf2v
|
||||
--control-image [IMAGE] path to image condition, control net
|
||||
-r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
|
||||
-o, --output OUTPUT path to write result image to (default: ./output.png)
|
||||
@ -319,6 +330,23 @@ arguments:
|
||||
--skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])
|
||||
--skip-layer-start START SLG enabling point: (default: 0.01)
|
||||
--skip-layer-end END SLG disabling point: (default: 0.2)
|
||||
--scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)
|
||||
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
|
||||
sampling method (default: "euler_a")
|
||||
--steps STEPS number of sample steps (default: 20)
|
||||
--high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0)
|
||||
--high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
|
||||
--high-noise-guidance SCALE (high noise) distilled guidance scale for models with guidance input (default: 3.5)
|
||||
--high-noise-slg-scale SCALE (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
|
||||
0 means disabled, a value of 2.5 is nice for sd3.5 medium
|
||||
--high-noise-eta SCALE (high noise) eta in DDIM, only for DDIM and TCD: (default: 0)
|
||||
--high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9])
|
||||
--high-noise-skip-layer-start START (high noise) SLG enabling point: (default: 0.01)
|
||||
--high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2)
|
||||
--high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)
|
||||
--high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
|
||||
(high noise) sampling method (default: "euler_a")
|
||||
--high-noise-steps STEPS (high noise) number of sample steps (default: 20)
|
||||
SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
|
||||
--strength STRENGTH strength for noising/unnoising (default: 0.75)
|
||||
--style-ratio STYLE-RATIO strength for keeping input identity (default: 20)
|
||||
@ -326,14 +354,10 @@ arguments:
|
||||
1.0 corresponds to full destruction of information in init image
|
||||
-H, --height H image height, in pixel space (default: 512)
|
||||
-W, --width W image width, in pixel space (default: 512)
|
||||
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
|
||||
sampling method (default: "euler_a")
|
||||
--steps STEPS number of sample steps (default: 20)
|
||||
--rng {std_default, cuda} RNG (default: cuda)
|
||||
-s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
|
||||
-b, --batch-count COUNT number of images to generate
|
||||
--scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)
|
||||
--clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
|
||||
--clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
|
||||
<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
|
||||
--vae-tiling process vae in tiles to reduce memory usage
|
||||
--vae-on-cpu keep vae in cpu (for low vram)
|
||||
@ -351,6 +375,8 @@ arguments:
|
||||
--chroma-disable-dit-mask disable dit mask for chroma
|
||||
--chroma-enable-t5-mask enable t5 mask for chroma
|
||||
--chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma
|
||||
--video-frames video frames (default: 1)
|
||||
--fps fps (default: 24)
|
||||
-v, --verbose print extra info
|
||||
```
|
||||
|
||||
@ -438,3 +464,5 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
|
||||
- [latent-consistency-model](https://github.com/luosiallen/latent-consistency-model)
|
||||
- [generative-models](https://github.com/Stability-AI/generative-models/)
|
||||
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker)
|
||||
- [Wan2.1](https://github.com/Wan-Video/Wan2.1)
|
||||
- [Wan2.2](https://github.com/Wan-Video/Wan2.2)
|
||||
BIN
assets/wan/Wan2.1_1.3B_t2v.mp4
Normal file
BIN
assets/wan/Wan2.1_1.3B_t2v.mp4
Normal file
Binary file not shown.
BIN
assets/wan/Wan2.1_14B_flf2v.mp4
Normal file
BIN
assets/wan/Wan2.1_14B_flf2v.mp4
Normal file
Binary file not shown.
BIN
assets/wan/Wan2.1_14B_i2v.mp4
Normal file
BIN
assets/wan/Wan2.1_14B_i2v.mp4
Normal file
Binary file not shown.
BIN
assets/wan/Wan2.1_14B_t2v.mp4
Normal file
BIN
assets/wan/Wan2.1_14B_t2v.mp4
Normal file
Binary file not shown.
BIN
assets/wan/Wan2.2_14B_flf2v.mp4
Normal file
BIN
assets/wan/Wan2.2_14B_flf2v.mp4
Normal file
Binary file not shown.
BIN
assets/wan/Wan2.2_14B_i2v.mp4
Normal file
BIN
assets/wan/Wan2.2_14B_i2v.mp4
Normal file
Binary file not shown.
BIN
assets/wan/Wan2.2_14B_t2i.png
Normal file
BIN
assets/wan/Wan2.2_14B_t2i.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 594 KiB |
BIN
assets/wan/Wan2.2_14B_t2v.mp4
Normal file
BIN
assets/wan/Wan2.2_14B_t2v.mp4
Normal file
Binary file not shown.
BIN
assets/wan/Wan2.2_14B_t2v_lora.mp4
Normal file
BIN
assets/wan/Wan2.2_14B_t2v_lora.mp4
Normal file
Binary file not shown.
BIN
assets/wan/Wan2.2_5B_i2v.mp4
Normal file
BIN
assets/wan/Wan2.2_5B_i2v.mp4
Normal file
Binary file not shown.
BIN
assets/wan/Wan2.2_5B_t2v.mp4
Normal file
BIN
assets/wan/Wan2.2_5B_t2v.mp4
Normal file
Binary file not shown.
141
docs/wan.md
Normal file
141
docs/wan.md
Normal file
@ -0,0 +1,141 @@
|
||||
# How to Use
|
||||
|
||||
## Download weights
|
||||
|
||||
- Download Wan
|
||||
- Wan2.1
|
||||
- Wan2.1 T2V 1.3B
|
||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
|
||||
- Wan2.1 T2V 14B
|
||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
|
||||
- gguf: https://huggingface.co/city96/Wan2.1-T2V-14B-gguf/tree/main
|
||||
- Wan2.1 I2V 14B 480P
|
||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
|
||||
- gguf: https://huggingface.co/city96/Wan2.1-I2V-14B-480P-gguf/tree/main
|
||||
- Wan2.1 I2V 14B 720P
|
||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
|
||||
- gguf: https://huggingface.co/city96/Wan2.1-I2V-14B-720P-gguf/tree/main
|
||||
- Wan2.1 FLF2V 14B 720P
|
||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
|
||||
- gguf: https://huggingface.co/city96/Wan2.1-FLF2V-14B-720P-gguf/tree/main
|
||||
- Wan2.2
|
||||
- Wan2.2 TI2V 5B
|
||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
|
||||
- gguf: https://huggingface.co/QuantStack/Wan2.2-TI2V-5B-GGUF/tree/main
|
||||
- Wan2.2 T2V A14B
|
||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
|
||||
- gguf: https://huggingface.co/QuantStack/Wan2.2-T2V-A14B-GGUF/tree/main
|
||||
- Wan2.2 I2V A14B
|
||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
|
||||
- gguf: https://huggingface.co/QuantStack/Wan2.2-I2V-A14B-GGUF/tree/main
|
||||
- Download vae
|
||||
- wan_2.1_vae (for all Wan models except Wan2.2 TI2V 5B)
|
||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors
|
||||
- wan_2.2_vae (for Wan2.2 TI2V 5B only)
|
||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/blob/main/split_files/vae/wan2.2_vae.safetensors
|
||||
- Download umt5_xxl
|
||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/text_encoders/umt5_xxl_fp16.safetensors
|
||||
- gguf: https://huggingface.co/city96/umt5-xxl-encoder-gguf/tree/main
|
||||
|
||||
- Download clip_vision_h (for Wan2.1 I2V/FLF2V only)
|
||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/clip_vision/clip_vision_h.safetensors
|
||||
|
||||
|
||||
## Examples
|
||||
|
||||
Since GitHub does not support AVI files, the videos shown below were converted from AVI to MP4.
|
||||
|
||||
### Wan2.1 T2V 1.3B
|
||||
|
||||
```
|
||||
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1_t2v_1.3B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部, 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33
|
||||
```
|
||||
|
||||
<video src=../assets/wan/Wan2.1_1.3B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
||||
|
||||
### Wan2.1 T2V 14B
|
||||
|
||||
```
|
||||
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-t2v-14b-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33
|
||||
```
|
||||
|
||||
<video src=../assets/wan/Wan2.1_14B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
||||
|
||||
|
||||
|
||||
### Wan2.1 I2V 14B
|
||||
|
||||
```
|
||||
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-i2v-14b-480p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png
|
||||
```
|
||||
|
||||
<video src=../assets/wan/Wan2.1_14B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
||||
|
||||
### Wan2.2 T2V A14B
|
||||
|
||||
```
|
||||
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33
|
||||
```
|
||||
|
||||
<video src=../assets/wan/Wan2.2_14B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
||||
|
||||
### Wan2.2 I2V A14B
|
||||
|
||||
```
|
||||
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\cat_with_sd_cpp_42.png
|
||||
```
|
||||
|
||||
<video src=../assets/wan/Wan2.2_14B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
||||
|
||||
### Wan2.2 T2V A14B T2I
|
||||
|
||||
```
|
||||
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu
|
||||
```
|
||||
|
||||
<img width="832" height="480" alt="Wan2 2_14B_t2i" src="../assets/wan/Wan2.2_14B_t2i.png" />
|
||||
|
||||
### Wan2.2 T2V A14B with LoRA
|
||||
|
||||
```
|
||||
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat<lora:wan2.2_t2v_lightx2v_4steps_lora_v1.1_low_noise:1><lora:|high_noise|wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise:1>" --cfg-scale 3.5 --sampling-method euler --steps 4 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 4 -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --lora-model-dir ..\..\ComfyUI\models\loras --video-frames 33
|
||||
```
|
||||
|
||||
<video src=../assets/wan/Wan2.2_14B_t2v_lora.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
||||
|
||||
|
||||
|
||||
### Wan2.2 TI2V 5B
|
||||
|
||||
#### T2V
|
||||
|
||||
```
|
||||
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33
|
||||
```
|
||||
|
||||
<video src=../assets/wan/Wan2.2_5B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
||||
|
||||
#### I2V
|
||||
|
||||
```
|
||||
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\cat_with_sd_cpp_42.png
|
||||
```
|
||||
|
||||
<video src=../assets/wan/Wan2.2_5B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
||||
|
||||
### Wan2.1 FLF2V 14B
|
||||
|
||||
```
|
||||
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-flf2v-14b-720p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png
|
||||
```
|
||||
|
||||
|
||||
<video src=../assets/wan/Wan2.1_14B_flf2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
||||
|
||||
### Wan2.2 FLF2V 14B
|
||||
|
||||
```
|
||||
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -p "glass flower blossom" -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png
|
||||
```
|
||||
|
||||
<video src=../assets/wan/Wan2.2_14B_flf2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
||||
@ -262,9 +262,9 @@ void print_usage(int argc, const char* argv[]) {
|
||||
printf(" --diffusion-fa use flash attention in the diffusion model (for low vram)\n");
|
||||
printf(" Might lower quality, since it implies converting k and v to f16.\n");
|
||||
printf(" This might crash if it is not supported by the backend.\n");
|
||||
printf(" --diffusion-conv-direct use Conv2d direct in the diffusion model");
|
||||
printf(" --diffusion-conv-direct use Conv2d direct in the diffusion model\n");
|
||||
printf(" This might crash if it is not supported by the backend.\n");
|
||||
printf(" --vae-conv-direct use Conv2d direct in the vae model (should improve the performance)");
|
||||
printf(" --vae-conv-direct use Conv2d direct in the vae model (should improve the performance)\n");
|
||||
printf(" This might crash if it is not supported by the backend.\n");
|
||||
printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n");
|
||||
printf(" --canny apply canny preprocessor (edge detection)\n");
|
||||
|
||||
2
ggml
2
ggml
@ -1 +1 @@
|
||||
Subproject commit 70268874234fa06cf058739a221bfbe94129f330
|
||||
Subproject commit 5fdc78fff274094e2a1b155928131983362d8a71
|
||||
231
gguf_reader.hpp
Normal file
231
gguf_reader.hpp
Normal file
@ -0,0 +1,231 @@
|
||||
#ifndef __GGUF_READER_HPP__
|
||||
#define __GGUF_READER_HPP__
|
||||
|
||||
#include <cstdint>
#include <cstring>
#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>

#include "ggml.h"
#include "util.h"
|
||||
|
||||
struct GGUFTensorInfo {
|
||||
std::string name;
|
||||
ggml_type type;
|
||||
std::vector<int64_t> shape;
|
||||
size_t offset;
|
||||
};
|
||||
|
||||
// Type tag stored in front of every metadata value in a GGUF file.
// The tag is read from the file as a uint32_t and cast to this enum, so the
// numeric values below must stay exactly as they are.
enum class GGUFMetadataType : uint32_t {
    // fixed-width scalars
    UINT8   = 0,
    INT8    = 1,
    UINT16  = 2,
    INT16   = 3,
    UINT32  = 4,
    INT32   = 5,
    FLOAT32 = 6,
    BOOL    = 7,

    // variable-length values
    STRING = 8,
    ARRAY  = 9,

    // 64-bit scalars
    UINT64  = 10,
    INT64   = 11,
    FLOAT64 = 12,
};
|
||||
|
||||
class GGUFReader {
|
||||
private:
|
||||
std::vector<GGUFTensorInfo> tensors_;
|
||||
size_t data_offset_;
|
||||
size_t alignment_ = 32; // default alignment is 32
|
||||
|
||||
template <typename T>
|
||||
bool safe_read(std::ifstream& fin, T& value) {
|
||||
fin.read(reinterpret_cast<char*>(&value), sizeof(T));
|
||||
return fin.good();
|
||||
}
|
||||
|
||||
bool safe_read(std::ifstream& fin, char* buffer, size_t size) {
|
||||
fin.read(buffer, size);
|
||||
return fin.good();
|
||||
}
|
||||
|
||||
bool safe_seek(std::ifstream& fin, std::streamoff offset, std::ios::seekdir dir) {
|
||||
fin.seekg(offset, dir);
|
||||
return fin.good();
|
||||
}
|
||||
|
||||
bool read_metadata(std::ifstream& fin) {
|
||||
uint64_t key_len = 0;
|
||||
if (!safe_read(fin, key_len))
|
||||
return false;
|
||||
|
||||
std::string key(key_len, '\0');
|
||||
if (!safe_read(fin, (char*)key.data(), key_len))
|
||||
return false;
|
||||
|
||||
uint32_t type = 0;
|
||||
if (!safe_read(fin, type))
|
||||
return false;
|
||||
|
||||
if (key == "general.alignment") {
|
||||
uint32_t align_val = 0;
|
||||
if (!safe_read(fin, align_val))
|
||||
return false;
|
||||
|
||||
if (align_val != 0 && (align_val & (align_val - 1)) == 0) {
|
||||
alignment_ = align_val;
|
||||
LOG_DEBUG("Found alignment: %zu", alignment_);
|
||||
} else {
|
||||
LOG_ERROR("Invalid alignment value %u, fallback to default %zu", align_val, alignment_);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
switch (static_cast<GGUFMetadataType>(type)) {
|
||||
case GGUFMetadataType::UINT8:
|
||||
case GGUFMetadataType::INT8:
|
||||
case GGUFMetadataType::BOOL:
|
||||
return safe_seek(fin, 1, std::ios::cur);
|
||||
|
||||
case GGUFMetadataType::UINT16:
|
||||
case GGUFMetadataType::INT16:
|
||||
return safe_seek(fin, 2, std::ios::cur);
|
||||
|
||||
case GGUFMetadataType::UINT32:
|
||||
case GGUFMetadataType::INT32:
|
||||
case GGUFMetadataType::FLOAT32:
|
||||
return safe_seek(fin, 4, std::ios::cur);
|
||||
|
||||
case GGUFMetadataType::UINT64:
|
||||
case GGUFMetadataType::INT64:
|
||||
case GGUFMetadataType::FLOAT64:
|
||||
return safe_seek(fin, 8, std::ios::cur);
|
||||
|
||||
case GGUFMetadataType::STRING: {
|
||||
uint64_t len = 0;
|
||||
if (!safe_read(fin, len))
|
||||
return false;
|
||||
return safe_seek(fin, len, std::ios::cur);
|
||||
}
|
||||
|
||||
case GGUFMetadataType::ARRAY: {
|
||||
uint32_t elem_type = 0;
|
||||
uint64_t len = 0;
|
||||
if (!safe_read(fin, elem_type))
|
||||
return false;
|
||||
if (!safe_read(fin, len))
|
||||
return false;
|
||||
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
if (!read_metadata(fin))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
default:
|
||||
LOG_ERROR("Unknown metadata type=%u", type);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
GGUFTensorInfo read_tensor_info(std::ifstream& fin) {
|
||||
GGUFTensorInfo info;
|
||||
|
||||
uint64_t name_len;
|
||||
if (!safe_read(fin, name_len))
|
||||
throw std::runtime_error("read tensor name length failed");
|
||||
|
||||
info.name.resize(name_len);
|
||||
if (!safe_read(fin, (char*)info.name.data(), name_len))
|
||||
throw std::runtime_error("read tensor name failed");
|
||||
|
||||
uint32_t n_dims;
|
||||
if (!safe_read(fin, n_dims))
|
||||
throw std::runtime_error("read tensor dims failed");
|
||||
|
||||
info.shape.resize(n_dims);
|
||||
for (uint32_t i = 0; i < n_dims; i++) {
|
||||
if (!safe_read(fin, info.shape[i]))
|
||||
throw std::runtime_error("read tensor shape failed");
|
||||
}
|
||||
|
||||
if (n_dims > GGML_MAX_DIMS) {
|
||||
for (int i = GGML_MAX_DIMS; i < n_dims; i++) {
|
||||
info.shape[GGML_MAX_DIMS - 1] *= info.shape[i]; // stack to last dim;
|
||||
}
|
||||
info.shape.resize(GGML_MAX_DIMS);
|
||||
n_dims = GGML_MAX_DIMS;
|
||||
}
|
||||
|
||||
uint32_t type;
|
||||
if (!safe_read(fin, type))
|
||||
throw std::runtime_error("read tensor type failed");
|
||||
info.type = static_cast<ggml_type>(type);
|
||||
|
||||
if (!safe_read(fin, info.offset))
|
||||
throw std::runtime_error("read tensor offset failed");
|
||||
|
||||
return info;
|
||||
}
|
||||
|
||||
public:
|
||||
bool load(const std::string& file_path) {
|
||||
std::ifstream fin(file_path, std::ios::binary);
|
||||
if (!fin) {
|
||||
LOG_ERROR("failed to open '%s'", file_path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// --- Header ---
|
||||
char magic[4];
|
||||
if (!safe_read(fin, magic, 4) || strncmp(magic, "GGUF", 4) != 0) {
|
||||
LOG_ERROR("not a valid GGUF file");
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t version;
|
||||
if (!safe_read(fin, version))
|
||||
return false;
|
||||
|
||||
uint64_t tensor_count, metadata_kv_count;
|
||||
if (!safe_read(fin, tensor_count))
|
||||
return false;
|
||||
if (!safe_read(fin, metadata_kv_count))
|
||||
return false;
|
||||
|
||||
LOG_DEBUG("GGUF v%u, tensor_count=%llu, metadata_kv_count=%llu",
|
||||
version, (unsigned long long)tensor_count, (unsigned long long)metadata_kv_count);
|
||||
|
||||
// --- Read Metadata ---
|
||||
for (uint64_t i = 0; i < metadata_kv_count; i++) {
|
||||
if (!read_metadata(fin)) {
|
||||
LOG_ERROR("read meta data failed");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// --- Tensor Infos ---
|
||||
tensors_.clear();
|
||||
try {
|
||||
for (uint64_t i = 0; i < tensor_count; i++) {
|
||||
tensors_.push_back(read_tensor_info(fin));
|
||||
}
|
||||
} catch (const std::runtime_error& e) {
|
||||
LOG_ERROR("%s", e.what());
|
||||
return false;
|
||||
}
|
||||
|
||||
data_offset_ = static_cast<size_t>(fin.tellg());
|
||||
if ((data_offset_ % alignment_) != 0) {
|
||||
data_offset_ = ((data_offset_ + alignment_ - 1) / alignment_) * alignment_;
|
||||
}
|
||||
fin.close();
|
||||
return true;
|
||||
}
|
||||
|
||||
const std::vector<GGUFTensorInfo>& tensors() const { return tensors_; }
|
||||
size_t data_offset() const { return data_offset_; }
|
||||
};
|
||||
|
||||
#endif // __GGUF_READER_HPP__
|
||||
48
model.cpp
48
model.cpp
@ -6,6 +6,7 @@
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "gguf_reader.hpp"
|
||||
#include "model.h"
|
||||
#include "stable-diffusion.h"
|
||||
#include "util.h"
|
||||
@ -1055,24 +1056,37 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s
|
||||
gguf_context* ctx_gguf_ = NULL;
|
||||
ggml_context* ctx_meta_ = NULL;
|
||||
|
||||
auto on_tensor_shape_read = [](const int64_t* ne, uint32_t n_dims, struct gguf_tensor_shape* shape) -> bool {
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (i < n_dims) {
|
||||
shape->ne[i] = ne[i];
|
||||
} else {
|
||||
shape->ne[i] = 1;
|
||||
}
|
||||
}
|
||||
for (int i = GGML_MAX_DIMS; i < n_dims; i++) {
|
||||
shape->ne[GGML_MAX_DIMS - 1] *= ne[i]; // stack to last dim;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
ctx_gguf_ = gguf_init_from_file_ext(file_path.c_str(), {true, &ctx_meta_}, on_tensor_shape_read);
|
||||
ctx_gguf_ = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta_});
|
||||
if (!ctx_gguf_) {
|
||||
LOG_ERROR("failed to open '%s'", file_path.c_str());
|
||||
return false;
|
||||
LOG_ERROR("failed to open '%s' with gguf_init_from_file. Try to open it with GGUFReader.", file_path.c_str());
|
||||
GGUFReader gguf_reader;
|
||||
if (!gguf_reader.load(file_path)) {
|
||||
LOG_ERROR("failed to open '%s' with GGUFReader.", file_path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t data_offset = gguf_reader.data_offset();
|
||||
for (const auto& gguf_tensor_info : gguf_reader.tensors()) {
|
||||
std::string name = gguf_tensor_info.name;
|
||||
if (!starts_with(name, prefix)) {
|
||||
name = prefix + name;
|
||||
}
|
||||
|
||||
TensorStorage tensor_storage(
|
||||
name,
|
||||
gguf_tensor_info.type,
|
||||
gguf_tensor_info.shape.data(),
|
||||
gguf_tensor_info.shape.size(),
|
||||
file_index,
|
||||
data_offset + gguf_tensor_info.offset);
|
||||
|
||||
// LOG_DEBUG("%s %s", name.c_str(), tensor_storage.to_string().c_str());
|
||||
|
||||
tensor_storages.push_back(tensor_storage);
|
||||
add_preprocess_tensor_storage_types(tensor_storages_types, tensor_storage.name, tensor_storage.type);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int n_tensors = gguf_get_n_tensors(ctx_gguf_);
|
||||
|
||||
2
model.h
2
model.h
@ -123,7 +123,7 @@ struct TensorStorage {
|
||||
|
||||
TensorStorage() = default;
|
||||
|
||||
TensorStorage(const std::string& name, ggml_type type, int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
|
||||
TensorStorage(const std::string& name, ggml_type type, const int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
|
||||
: name(name), type(type), n_dims(n_dims), file_index(file_index), offset(offset) {
|
||||
for (int i = 0; i < n_dims; i++) {
|
||||
this->ne[i] = ne[i];
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user