Compare commits

..

327 Commits

Author SHA1 Message Date
leejet
545fac4f3f
refactor: simplify sample cache flow (#1350) 2026-03-17 00:28:03 +08:00
Tay
5265a5efa1
perf(z-image): switch to fused SwiGLU kernel (#1302) 2026-03-17 00:27:46 +08:00
leejet
84cbd88df1
style: remove redundant struct qualifiers for consistent C/C++ type usage (#1349) 2026-03-16 22:17:22 +08:00
Daniele
997bb11fb6
fix: correct encoder channels for flux2 (#1346) 2026-03-16 22:16:43 +08:00
leejet
862a6586cb
feat: add embedded WebUI (#1207) 2026-03-16 00:26:57 +08:00
leejet
61d8331ef3 ci: avoid cuda docker build timeout by using -j16 2026-03-15 18:39:29 +08:00
leejet
acc3bf1fdc
refactor: optimize the VAE architecture (#1345) 2026-03-15 16:57:42 +08:00
Kevin Nause
83eabd7c01
ci: add CUDA Dockerfile (#1314) 2026-03-15 16:46:01 +08:00
Wagner Bruna
630ee03f23
refactor: move all cache parameter defaults to the library (#1327) 2026-03-15 16:43:46 +08:00
Wagner Bruna
f6968bc589
chore: remove SD_FAST_SOFTMAX build flag (#1338) 2026-03-15 16:42:47 +08:00
rmatif
adfef62900
feat: add generic DiT support to spectrum cache (#1336) 2026-03-15 16:41:05 +08:00
JusteLeo
6fa7ca9317
docs: add Anima2 gguf download link to anima.md (#1335) 2026-03-15 16:40:14 +08:00
leejet
d6dd6d7b55
refactor: remove unused encode_video (#1332) 2026-03-10 00:36:09 +08:00
rmatif
dea4980f4e
feat: add spectrum caching method (#1322) 2026-03-10 00:35:32 +08:00
leejet
c8fb3d2458
fix: resolve SD1 Pix2Pix issue (#1329) 2026-03-08 00:28:05 +08:00
stduhpf
3d33caaef8
fix: make tiling work better when using circular (#1299) 2026-03-08 00:25:07 +08:00
WinkelCode
9b424db0f4
ci: change workflow owner of "actions-commit-hash" from "pr-mpt" to "prompt" (#1323) 2026-03-08 00:23:23 +08:00
rmatif
d95062737e
fix: ucache: normalize reuse error (#1313) 2026-03-04 23:50:45 +08:00
Korsar13
7c880f80c7
fix: avoid sd-server memory leak (#1316) 2026-03-04 23:47:38 +08:00
leejet
aaa8a51bd8 docs: update sd-cli/sd-server docs 2026-03-04 00:41:17 +08:00
leejet
ba35dd734e
refactor: introduce ggml_ext_zeros_like/ggml_ext_ones_like (#1312) 2026-03-04 00:36:52 +08:00
bssrdf
d41f5fff69
perf: improved flux attention qkv unpacking (#1306) 2026-03-04 00:36:32 +08:00
Korsar13
810ef0cf76
fix: reset weight adapter for models if no loras in request (#1307) 2026-03-04 00:34:07 +08:00
leejet
5792c66879
feat: support some non-standard Anima weight names (#1305) 2026-03-01 22:01:29 +08:00
Wagner Bruna
39d54702a6
feat: accept legacy image parameter on v1/images/edits (#1270) 2026-03-01 22:00:50 +08:00
Wagner Bruna
60889bc9a1
fix: correct sdapi LoRA file handling (#1276) 2026-03-01 21:57:06 +08:00
leejet
e64baa3611
refactor: reuse DiT's patchify/unpatchify functions (#1304) 2026-03-01 21:44:51 +08:00
leejet
cec4aedcfd docs: add anima docs 2026-03-01 15:32:25 +08:00
rmatif
4cdfff5ff2
feat: add Anima support (#1296) 2026-03-01 15:23:18 +08:00
leejet
0752cc9d3a
fix: resolve image quality degradation issue (#1297) 2026-02-26 00:26:21 +08:00
Wagner Bruna
b314d80ad0
feat: turn flow_shift into a generation parameter (#1289)
* feat: turn flow_shift into a generation parameter

* format code

* simplify set_shift/set_parameters

* fix sd_sample_params_to_str

* remove unused variable

* update docs

---------

Co-authored-by: leejet <leejet714@gmail.com>
2026-02-26 00:26:04 +08:00
leejet
c9cd49701a
fix: safely handle whitespace and consecutive newlines (#1288) 2026-02-19 20:54:42 +08:00
akleine
c5eb1e4137
fix: avoid black images if using an invalid VAE (for SDXL) (#1273) 2026-02-19 20:54:18 +08:00
leejet
636d3cb6ff
refactor: reorganize the vocab file structure (#1271) 2026-02-11 00:44:17 +08:00
Wagner Bruna
adea272225
feat(server): use image and command-line dimensions by default on server (#1262) 2026-02-11 00:42:50 +08:00
Mario Limonciello
45ce78a3ae
ci: correct rocm artifact of linux (#1269) 2026-02-10 23:19:28 +08:00
leejet
28ef93c0e1
refactor: reorganize the file structure (#1266) 2026-02-10 23:13:35 +08:00
leejet
3296545090
feat: add extra_c_crossattns support for llm embedder (#1265) 2026-02-10 00:00:17 +08:00
akleine
d60fb27560
fix: avoid unwanted file extension changes (#1257) 2026-02-09 23:59:43 +08:00
Wagner Bruna
c7ccafbd6f
fix: correct sdapi handling of cfg_scale and steps (#1260) 2026-02-09 23:34:19 +08:00
stduhpf
aa0b899397
fix: improve handling of VAE decode failures (#1222) 2026-02-09 23:29:41 +08:00
Mario Limonciello
5e264372ce
ci: add a github action to generate a Linux ROCm artifact (#1258) 2026-02-09 23:23:06 +08:00
leejet
f0f641a142
feat(server): add lora support to sdapi (#1256) 2026-02-08 00:11:16 +08:00
stduhpf
9f56833e14
feat: optimize LoKr at runtime (#1233) 2026-02-08 00:08:09 +08:00
Roj234
65891d74cc
fix: avoid the issue of NaN for qwen-image on certain devices (#1249) 2026-02-04 23:49:05 +08:00
leejet
f957fa3d2a
feat: add --fa option (#1242) 2026-02-01 21:44:54 +08:00
leejet
c252e03c6b sync: update ggml 2026-02-01 20:54:23 +08:00
rmatif
e63daba33d
feat: add res_multistep, res_2s sampler and bong tangent scheduler (#1234) 2026-02-01 20:05:27 +08:00
stduhpf
3959109281
fix: improve LoCon support with other naming conventions (#1239) 2026-02-01 20:00:16 +08:00
leejet
e411520407 docs: add z-image-base example 2026-01-28 21:47:36 +08:00
leejet
43e829f219
refactor: unify the processing of attention mask (#1230) 2026-01-26 00:33:34 +08:00
leejet
7837232631
perf: make dit faster (#1228) 2026-01-25 22:50:10 +08:00
Equious
4ccce027b2
fix: correct mask and control image loading in cli (#1229) 2026-01-25 22:47:52 +08:00
leejet
fa61ea744d
fix: set default lora_model_dir to . (#1224) 2026-01-23 22:13:59 +08:00
leejet
5e4579c11d
feat: use image width and height when not explicitly set (#1206) 2026-01-22 23:54:41 +08:00
Wagner Bruna
329571131d
chore: clarify warning about missing model files (#1219) 2026-01-21 22:34:11 +08:00
leejet
a48b4a3ade docs: add FLUX.2-klein support to news 2026-01-19 23:56:50 +08:00
stduhpf
b87fe13afd
feat: support new chroma radiance "x0_x32_proto" (#1209) 2026-01-19 23:51:26 +08:00
Oleg Skutte
e50e1f253d
feat: add taef2 support (#1211) 2026-01-19 23:39:36 +08:00
leejet
c6206fb351 fix: set VAE conv scale for all SDXL variants 2026-01-19 23:21:48 +08:00
akleine
639091fbe9
feat: add support for Segmind's Vega model (#1195) 2026-01-19 23:15:47 +08:00
leejet
9293016c9d docs: update esrgan.md 2026-01-19 23:00:50 +08:00
leejet
2efd19978d
fix: use Unix timestamp for field instead of ISO string (#1205) 2026-01-19 00:21:29 +08:00
Wagner Bruna
61659ef299
feat: add basic sdapi support to sd-server (#1197)
* feat: add basic sdapi support to sd-server

Compatible with AUTOMATIC1111 / Forge.

* fix img2img with no mask

* add more parameter validation

* eliminate MSVC warnings

---------

Co-authored-by: leejet <leejet714@gmail.com>
2026-01-19 00:21:11 +08:00
leejet
9565c7f6bd
add support for flux2 klein (#1193)
* add support for flux2 klein 4b

* add support for flux2 klein 8b

* use attention_mask in Flux.2 klein LLMEmbedder

* update docs
2026-01-18 01:17:33 +08:00
Wagner Bruna
fbce16e02d
fix: avoid undefined behavior on image mask allocation failure (#1198) 2026-01-18 01:14:56 +08:00
akleine
7010bb4dff
feat: support for SDXS-512 model (#1180)
* feat: add U-Net specials of SDXS

* docs: update distilled_sd.md for SDXS-512

* feat: for SDXS use AutoencoderTiny as the primary VAE

* docs: update distilled_sd.md for SDXS-512

* fix: SDXS code cleaning after review by stduhpf

* format code

* fix sdxs with --taesd-preview-only

---------

Co-authored-by: leejet <leejet714@gmail.com>
2026-01-14 01:14:57 +08:00
Wagner Bruna
48d3161a8d
feat: add sd-server API support for steps, sampler and scheduler (#1173) 2026-01-14 00:34:27 +08:00
Weiqi Gao
271b594e74
sync: update ggml (#1187) 2026-01-14 00:28:55 +08:00
leejet
885e62ea82
refactor: replace ggml_ext_attention with ggml_ext_attention_ext (#1185) 2026-01-11 16:34:13 +08:00
rmatif
0e52afc651
feat: enable vae tiling for vid gen (#1152)
* enable vae tiling for vid gen

* format code

* eliminate compilation warning

---------

Co-authored-by: leejet <leejet714@gmail.com>
2026-01-08 23:23:05 +08:00
leejet
27b5f17401 ci: only push Docker images on master or release 2026-01-08 23:03:32 +08:00
Flavio Bizzarri
dfe6d6c664
fix: missing newline after seed in sd_img_gen_params_to_str (#1183) 2026-01-08 22:52:22 +08:00
leejet
9be0b91927 docs: fix safetensors file extension notation 2026-01-06 23:31:03 +08:00
evanreichard
e7e83ed4d1
fix(server): use has_file for mask multipart detection (#1178) 2026-01-06 23:16:05 +08:00
Wagner Bruna
c5602a676c
feat: prioritize gguf and safetensors formats for embeddings and LoRAs (#1169) 2026-01-05 23:58:09 +08:00
Nuno
c34730d9b4
chore: downgrade ubuntu base image in musa container image (#1176)
Signed-off-by: rare-magma <rare-magma@posteo.eu>
2026-01-05 23:56:34 +08:00
Nuno
fdcacc1ebb
ci: cancel old github action runs (#1172)
* ci: cancel old github action runs

Signed-off-by: rare-magma <rare-magma@posteo.eu>

* ci: adjust concurrency to avoid canceling non-PR workflows

---------

Signed-off-by: rare-magma <rare-magma@posteo.eu>
Co-authored-by: leejet <leejet714@gmail.com>
2026-01-05 23:52:34 +08:00
Nuno
496ec9421e
chore: add Linux Vulkan build and Docker image workflows (#1164) 2026-01-05 23:42:12 +08:00
leejet
05006cd6e1
chore: use CMAKE_BUILD_TYPE (#1175) 2026-01-05 23:29:22 +08:00
leejet
b90b1ee9cf
chore: eliminate compilation warnings under MSVC (#1170) 2026-01-04 22:26:57 +08:00
leejet
2cef4badb8 chore: use Release build for windows-latest-cmake 2026-01-04 22:26:09 +08:00
Daniele
a119a4da9a
fix: avoid issues when sigma_min is close to 0 (#1138) 2026-01-04 22:05:01 +08:00
Jay4242
6eefd2d49a
feat: support random seed flag (#1163) 2026-01-04 21:57:50 +08:00
leejet
4ff2c8c74b
refactor: simplify logic for saving results (#1149) 2025-12-28 23:27:27 +08:00
leejet
51bd9c8004 chore: reformat named cache params description into single line 2025-12-28 22:53:07 +08:00
Wagner Bruna
d0d836ae74
feat: support mmap for model loading (#1059) 2025-12-28 22:38:29 +08:00
leejet
a2d83dd0c8
refactor: move pmid condition logic into get_pmid_condition (#1148) 2025-12-27 16:48:15 +08:00
Wagner Bruna
cc107714d7
fix: consistently pass 2nd-order samplers half steps as negatives (#1095) 2025-12-27 15:54:18 +08:00
leejet
37c9860b79
fix: handle redirected UTF-8 output correctly on Windows (#1147) 2025-12-27 15:43:19 +08:00
leejet
ccb6b0ac9d
feat: add __index_timestep_zero__ support (#1146) 2025-12-26 22:07:40 +08:00
Weiqi Gao
df4efe26bd
feat: add png sequence output for vid_gen (#1117) 2025-12-26 22:06:13 +08:00
leejet
860a78e248
fix: avoid crash when using taesd for preview only (#1141) 2025-12-24 23:30:12 +08:00
leejet
a0adcfb148
feat: add support for qwen image edit 2511 (#1096) 2025-12-24 23:00:08 +08:00
leejet
3d5fdd7b37
feat: add support for more underline loras (#1135) 2025-12-24 22:59:23 +08:00
Weiqi Gao
3e6c428c27
chore: use Ninja on Windows to speed up build process (#1120) 2025-12-24 22:53:17 +08:00
张春乔
96fcb13fc0
feat: add --serve-html-path option to example server (#1123) 2025-12-24 22:43:09 +08:00
leejet
3e812460cf
fix: correct ggml_pad_ext (#1133) 2025-12-23 21:37:07 +08:00
leejet
98916e8256 docs: update README.md 2025-12-22 23:58:28 +08:00
rmatif
298b11069f
feat: add more caching methods (#1066) 2025-12-22 23:52:11 +08:00
leejet
30a91138f8 fix: add the missing } 2025-12-21 21:53:38 +08:00
leejet
c6937ba44a fix: correct the parsing of --convert-name option 2025-12-21 21:47:50 +08:00
leejet
ca5b1969a8
feat: do not convert tensor names by default in convert mode (#1122) 2025-12-21 18:40:10 +08:00
Phylliida Dev
50ff966445
feat: add seamless texture generation support (#914)
* global bool

* reworked circular to global flag

* cleaner implementation of tiling support in sd cpp

* cleaned rope

* working simplified but still need wraps

* Further clean of rope

* resolve flux conflict

* switch to pad op circular only

* Set ggml to most recent

* Revert ggml temp

* Update ggml to most recent

* Revert unneeded flux change

* move circular flag to the GGMLRunnerContext

* Pass through circular param in all places where conv is called

* fix of constant and minor cleanup

* Added back --circular option

* Conv2d circular in vae and various models

* Fix temporal padding for qwen image and other vaes

* Z Image circular tiling

* x and y axis seamless only

* First attempt at chroma seamless x and y

* refactor into pure x and y, almost there

* Fix crash on chroma

* Refactor into cleaner variable choices

* Removed redundant set_circular_enabled

* Sync ggml

* simplify circular parameter

* format code

* no need to perform circular pad on the clip

* simplify circular_axes setting

* unify function naming

* remove unnecessary member variables

* simplify rope

---------

Co-authored-by: Phylliida <phylliidadev@gmail.com>
Co-authored-by: leejet <leejet714@gmail.com>
2025-12-21 18:06:47 +08:00
leejet
88ec9d30b1
feat: add scale_rope support (#1121) 2025-12-21 15:40:21 +08:00
stduhpf
60abda56e0
feat: select vulkan device with env variable (#629) 2025-12-21 15:35:38 +08:00
stduhpf
23fce0bd84
feat: add support for Chroma Radiance x0 (#1091)
* Add x0 Flux pred (+prepare for others)

* Fix convert models with empty tensors

* patch_32 exp support attempt

* improve support for patch_32

* follow official pipeline

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-12-20 00:55:57 +08:00
Wagner Bruna
7c88c4765c
chore: give feedback about cfg values smaller than 1 (#1088) 2025-12-19 23:41:52 +08:00
Weiqi Gao
1f77545cf8
docs: document usage of tae for VRAM reduction using wan (#1108) 2025-12-19 23:31:09 +08:00
leejet
8e9f3a4d9e
feat: add support for underline style lora of flux (#1103)
* feat: add support for underline style lora of flux

* add support for underline style lora of t5

* add more protected tokens
2025-12-18 21:44:16 +08:00
Wagner Bruna
78e15bd4af
feat: default to LCM scheduler for LCM sampling (#1109)
* feat: default to LCM scheduler for LCM sampling

* fix bug and attempt to get default scheduler for vid_gen when none is set

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-12-18 21:43:39 +08:00
Daniele
97cf2efe45
feat: add KL Optimal scheduler (#1098) 2025-12-18 21:02:55 +08:00
leejet
bda7fab9f2 chore: remove unused debug code 2025-12-17 23:43:37 +08:00
leejet
c2e18c86e8
fix: make flash attn work with high noise diffusion model (#1111) 2025-12-17 23:28:59 +08:00
leejet
c3ad6a13e1
refactor: optimize the printing of version log (#1102) 2025-12-16 23:11:27 +08:00
leejet
ebe9d26a72
feat: supports correct UTF-8 printing on windows (#1101) 2025-12-16 23:00:41 +08:00
stduhpf
9fa7f415df
feat: add taehv support for Wan/Qwen (#937) 2025-12-16 22:57:34 +08:00
akleine
a23262dfde
fix: added a clean exit in ModelLoader::load_tensors if OOM (#1097) 2025-12-16 22:45:10 +08:00
Wagner Bruna
e687913bf1
chore: remove lora_model_dir parameter (#1100) 2025-12-16 22:37:45 +08:00
Wagner Bruna
200cb6f2ca
fix: avoid crash with VAE tiling and certain image sizes (#1090) 2025-12-15 23:51:40 +08:00
leejet
43a70e819b
fix: add lora info to image metadata (#1086) 2025-12-14 01:24:15 +08:00
Kirill A. Korinsky
614f8736df
sync: update ggml (#1082) 2025-12-14 01:23:34 +08:00
stduhpf
d96b4152d6
perf: optimize ggml_ext_chunk (#1084) 2025-12-14 01:22:41 +08:00
rmatif
8f05f5bc6e
feat: add support for custom scheduler (#694)
---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-12-13 16:20:02 +08:00
leejet
15d0f82760
feat(server): do not parse lora from client-side prompts (#1083) 2025-12-13 14:27:47 +08:00
xxnuo
6888fcb581
feat: server add default_gen_params to override default args (#1050) 2025-12-13 14:22:32 +08:00
leejet
2aecdd57ca
feat: simple openai image generation api compatible server (#1037) 2025-12-13 13:53:21 +08:00
leejet
11ab095230
fix: resolve embedding loading issue when calling generate_image multiple times (#1078) 2025-12-12 23:08:12 +08:00
Wagner Bruna
a3a88fc9b2
fix: avoid crash loading LoRAs with bf16 weights (#1077) 2025-12-12 22:36:54 +08:00
leejet
8823dc48bc
feat: align the spatial size to the corresponding multiple (#1073) 2025-12-10 23:15:08 +08:00
Pedrito
1ac5a616de
feat: support custom upscale tile size (#896) 2025-12-10 22:25:19 +08:00
leejet
d939f6e86a
refactor: optimize the handling of LoRA models (#1070) 2025-12-10 00:26:07 +08:00
Wagner Bruna
e72aea796e
feat: embed version string and git commit hash (#1008) 2025-12-09 22:38:54 +08:00
wuhei
a908436729
docs: update download link for Stable Diffusion v1.5 (#1063) 2025-12-09 22:06:16 +08:00
stduhpf
583a02e29e
feat: add Flux.2 VAE proj matrix for previews (#1017) 2025-12-09 22:00:45 +08:00
leejet
96c3e64057
refactor: optimize the handling of embedding (#1068)
* optimize the handling of embedding

* support case-insensitive embedding names
2025-12-08 23:59:04 +08:00
Weiqi Gao
0392273e10
chore: add compute kernels to Windows CUDA build (#1062)
* Fix syntax for CUDA architecture definitions

* Extend CUDA support to GTX 10 Series to RTX 50 Series

* update cuda installer step version to install cuda 12.8.1

* Remove unsupported compute capability
2025-12-07 22:12:50 +08:00
leejet
bf1a388b44 docs: update logo 2025-12-07 15:09:32 +08:00
leejet
c9005337a8 docs: update logo 2025-12-07 14:56:21 +08:00
leejet
2f0bd31a84
feat: add ovis image support (#1057) 2025-12-07 12:32:56 +08:00
leejet
bfbb929790
feat: do not convert bf16 to f32 (#1055) 2025-12-06 23:55:51 +08:00
leejet
689e44c9a8
fix: correct ggml_ext_silu_act (#1056) 2025-12-06 23:55:28 +08:00
leejet
985aedda32
refactor: optimize the handling of pred type (#1048) 2025-12-04 23:31:55 +08:00
leejet
3f3610b5cd
chore: optimize lora log (#1047) 2025-12-04 22:44:58 +08:00
Wagner Bruna
118683de8a
fix: correct preview method selection (#1038) 2025-12-04 22:43:16 +08:00
stduhpf
bcc9c0d0b3
feat: handle ggml compute failures without crashing the program (#1003)
* Feat: handle compute failures more gracefully

* fix Unreachable code after return

Co-authored-by: idostyle <idostyl3@googlemail.com>

* adjust z_image.hpp

---------

Co-authored-by: idostyle <idostyl3@googlemail.com>
Co-authored-by: leejet <leejet714@gmail.com>
2025-12-04 22:04:27 +08:00
leejet
5865b5e703
refactor: split SDParams to SDCliParams/SDContextParams/SDGenerationParams (#1032) 2025-12-03 22:31:46 +08:00
stduhpf
edf2cb3846
fix: fix CosXL not being detected (#989) 2025-12-03 22:25:02 +08:00
Wagner Bruna
99e17232a4
fix: prevent NaN issues with Z-Image on certain ROCm setups (#1034) 2025-12-03 22:19:34 +08:00
leejet
710169df5c docs: update news 2025-12-01 22:46:15 +08:00
Wagner Bruna
e4c50f1de5
chore: add sd_ prefix to a few functions (#967) 2025-12-01 22:43:52 +08:00
rmatif
0743a1b3b5
fix: fix vae tiling for flux2 (#1025) 2025-12-01 22:41:56 +08:00
leejet
34a6fd4e60
feat: add z-image support (#1020)
* add z-image support

* use flux_latent_rgb_proj for z-image

* fix qwen3 rope type

* add support for qwen3 4b gguf

* add support for diffusers format lora

* fix nan issue that occurs when using CUDA with k-quants weights

* add z-image docs
2025-12-01 22:39:43 +08:00
leejet
3c1187ce83 docs: correct the time of adding flux2 support 2025-11-30 12:40:56 +08:00
leejet
20eb674100
fix: avoid crash when the lora file is not found using immediately mode (#1022) 2025-11-30 12:19:37 +08:00
leejet
bc80225336
fix: make the immediate LoRA apply mode work better when using Vulkan (#1021) 2025-11-30 12:08:25 +08:00
leejet
ab7e8d285e docs: update news 2025-11-30 11:51:23 +08:00
Wagner Bruna
673dbdda17
fix: add missing line cleanup for s/it progress display (#891) 2025-11-30 11:45:30 +08:00
Wagner Bruna
0249509a30
refactor: add user data pointer to the image preview callback (#1001) 2025-11-30 11:34:17 +08:00
leejet
52b67c538b
feat: add flux2 support (#1016)
* add flux2 support

* rename qwenvl to llm

* add Flux2FlowDenoiser

* update docs
2025-11-30 11:32:56 +08:00
leejet
20345888a3
refactor: optimize the handling of sample method (#999) 2025-11-22 14:00:25 +08:00
akleine
490c51d963
feat: report success/failure when saving PNG/JPG output (#912) 2025-11-22 13:57:44 +08:00
Wagner Bruna
45c46779af
feat: add LCM scheduler (#983) 2025-11-22 13:53:31 +08:00
leejet
869d023416
refactor: optimize the handling of scheduler (#998) 2025-11-22 12:48:53 +08:00
akleine
e9bc3b6c06
fix: check the PhotoMaker id_embeds tensor ONLY in PhotoMaker V2 mode (#987) 2025-11-22 12:47:40 +08:00
Wagner Bruna
b542894fb9
fix: avoid crash on default video preview path (#997)
Co-authored-by: masamaru-san
2025-11-22 12:46:27 +08:00
leejet
5498cc0d67
feat: add Wan2.1-I2V-1.3B(SkyReels) support (#988) 2025-11-19 23:56:46 +08:00
stduhpf
aa2b8e0ca5
fix: patch 1x1 conv weights at runtime (#986) 2025-11-19 23:27:23 +08:00
rmatif
a14e2b321d
feat: add easycache support (#940) 2025-11-19 23:19:32 +08:00
leejet
28ffb6c13d
fix: resolve issue with concat multiple LoRA output diffs at runtime (#985) 2025-11-17 22:56:07 +08:00
leejet
b88cc32346
fix: avoid using same type but diff instances for rng and sampler_rng (#982) 2025-11-16 23:37:14 +08:00
leejet
f532972d60
fix: avoid precision issues on vulkan backend (#980) 2025-11-16 20:57:08 +08:00
leejet
d5b05f70c6
feat: support independent sampler rng (#978) 2025-11-16 17:11:02 +08:00
akleine
6d6dc1b8ed
fix: make PhotoMakerV2 more robust by image count check (#970) 2025-11-16 17:10:48 +08:00
Wagner Bruna
199e675cc7
feat: support for --tensor-type-rules on generation modes (#932) 2025-11-16 17:07:32 +08:00
leejet
742a7333c3
feat: add cpu rng (#977) 2025-11-16 14:48:15 +08:00
Wagner Bruna
e8eb3791c8
fix: typo in --lora-apply-mode help (#972) 2025-11-16 14:48:00 +08:00
Wagner Bruna
aa44e06890
fix: avoid crash with LoRAs and type override (#974) 2025-11-16 14:47:36 +08:00
Daniele
6448430dbb
feat: add break pseudo token support (#422)
---------

Co-authored-by: Urs Ganse <urs.ganse@helsinki.fi>
2025-11-16 14:45:20 +08:00
leejet
347710f68f
feat: support applying LoRA at runtime (#969) 2025-11-13 21:48:44 +08:00
lcy
59ebdf0bb5
chore: enable Windows ROCm(HIP) build release (#956)
* build: fix missing commit sha in macOS and Ubuntu build zip name

The build workflows for macOS and Ubuntu incorrectly check for the
"main" branch instead of "master" when retrieving the commit hash for
naming the build artifacts.

* build: correct Vulkan SDK installation condition in build workflow

* build: Enable Windows ROCm(HIP) build release

Refer to the build workflow of llama.cpp to add a Windows ROCm (HIP)
build release to the workflow.
Since there are many differences between the HIP build and other
builds, this commit adds a separate "windows-latest-cmake-hip" job,
instead of enabling the ROCm matrix entry in the existing Windows
build job.

Main differences include:

- Install ROCm SDK from AMD official installer.
- Add a cache step for ROCm installation and a ccache step for build
  processing, since the HIP build takes much longer time than other
  builds.
- Include the ROCm/HIP artifact in the release assets.
2025-11-12 00:28:55 +08:00
Flavio Bizzarri
4ffcbcaed7
fix: specify enum modifier in sd_set_preview_callback signature (#959) 2025-11-12 00:27:23 +08:00
leejet
694f0d9235
refactor: optimize the logic for name conversion and the processing of the LoRA model (#955) 2025-11-10 00:12:20 +08:00
stduhpf
8ecdf053ac
feat: add image preview support (#522) 2025-11-10 00:12:02 +08:00
leejet
ee89afc878
fix: resolve issue with pmid (#957) 2025-11-09 22:47:53 +08:00
akleine
d2d3944f50
feat: add support for SD2.x with TINY U-Nets (#939) 2025-11-09 22:47:37 +08:00
akleine
0fa3e1a383
fix: prevent core dump in PM V2 in case of incomplete cmd line (#950) 2025-11-09 22:36:43 +08:00
leejet
c2d8ffc22c
fix: compatibility for models with modified tensor shapes (#951) 2025-11-07 23:04:41 +08:00
stduhpf
fb748bb8a4
fix: TAE encoding (#935) 2025-11-07 22:58:59 +08:00
leejet
8f6c5c217b
refactor: simplify the model loading logic (#933)
* remove String2GGMLType

* remove preprocess_tensor

* fix clip init

* simplify the logic for reading weights
2025-11-03 21:21:34 +08:00
leejet
6103d86e2c
refactor: introduce GGMLRunnerContext (#928)
* introduce GGMLRunnerContext

* add Flash Attention enable control through GGMLRunnerContext

* add conv2d_direct enable control through GGMLRunnerContext
2025-11-02 02:11:04 +08:00
stduhpf
c42826b77c
fix: resolve multiple inpainting issues (#926)
* Fix inpainting masked image being broken by side effect

* Fix unet inpainting concat not being set

* Fix Flex.2 inpaint mode crash (+ use scale factor)
2025-11-02 02:10:32 +08:00
Wagner Bruna
945d9a9ee3
docs: add Koboldcpp as an available UI (#930) 2025-11-02 02:03:01 +08:00
Wagner Bruna
353e708844
docs: update ggml and llama.cpp URLs (#931) 2025-11-02 02:02:44 +08:00
leejet
dd75fc081c
refactor: unify the naming style of ggml extension functions (#921) 2025-10-28 23:26:48 +08:00
stduhpf
77eb95f8e4
docs: fix taesd direct download link (#917) 2025-10-28 23:26:23 +08:00
Wagner Bruna
8a45d0ff7f
chore: clean up stb includes (#919) 2025-10-28 23:25:45 +08:00
leejet
9e28be6479
feat: add chroma radiance support (#910)
* add chroma radiance support

* fix ci

* simplify generate_init_latent

* workaround: avoid ggml cuda error

* format code

* add chroma radiance doc
2025-10-25 23:56:14 +08:00
akleine
062490aa7c
feat: add SSD1B and tiny-sd support (#897)
* feat: add code and doc for running SSD1B models

* Added some more lines to support SD1.x with TINY U-Nets too.

* support SSD-1B.safetensors

* fix sdv1.5 diffusers format loader

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-10-25 23:35:54 +08:00
stduhpf
faabc5ad3c
feat: allow models to run without all text encoder(s) (#645) 2025-10-25 22:00:56 +08:00
leejet
69b9511ce9 sync: update ggml 2025-10-24 00:32:45 +08:00
stduhpf
917f7bfe99
fix: support --flow-shift for flux models with default pred (#913) 2025-10-23 21:35:18 +08:00
leejet
48e0a28ddf
feat: add shift factor support (#903) 2025-10-23 01:20:29 +08:00
leejet
d05e46ca5e
chore: add .clang-tidy configuration and apply modernize checks (#902) 2025-10-18 23:23:40 +08:00
Wagner Bruna
64a7698347
chore: report number of Qwen layers as info (#901) 2025-10-18 23:22:01 +08:00
leejet
0723ee51c9
refactor: optimize option printing (#900) 2025-10-18 17:50:30 +08:00
leejet
90ef5f8246
feat: add auto-resize support for reference images (was Qwen-Image-Edit only) (#898) 2025-10-18 16:37:09 +08:00
leejet
db6f4791b4
feat: add wtype stat (#899) 2025-10-17 23:40:32 +08:00
leejet
b25785bc10 sync: update ggml 2025-10-17 21:46:39 +08:00
leejet
0585e2609d docs: split README sections (build, performance, etc.) into separate docs 2025-10-16 23:22:06 +08:00
leejet
683d6d08a8 chore: add github issue template 2025-10-16 21:04:41 +08:00
leejet
40a6a8710e
fix: resolve precision issues in SDXL VAE under fp16 (#888)
* fix: resolve precision issues in SDXL VAE under fp16

* add --force-sdxl-vae-conv-scale option

* update docs
2025-10-15 23:01:00 +08:00
Daniele
e3702585cb
feat: added prediction argument (#334) 2025-10-15 23:00:10 +08:00
cmdr2
a7d6d296c7
chore: allow building ggml as a separate shared lib (#468) 2025-10-15 22:10:26 +08:00
leejet
2e9242e37f
feat: add Qwen Image Edit support (#877)
* add ref latent support for qwen image

* optimize clip_preprocess and fix get_first_stage_encoding

* add qwen2vl vit support

* add qwen image edit support

* fix qwen image edit pipeline

* add mmproj file support

* support dynamic number of Qwen image transformer blocks

* set prompt_template_encode_start_idx every time

* to_add_out precision fix

* to_out.0 precision fix

* update docs
2025-10-13 23:17:18 +08:00
Wagner Bruna
c64994dc1d
fix: better progress display for second-order samplers (#834) 2025-10-13 22:12:48 +08:00
Wagner Bruna
5436f6b814
fix: correct canny preprocessor (#861) 2025-10-13 22:02:35 +08:00
leejet
1c32fa03bc
fix: avoid generating black images when running T5 on the GPU (#882) 2025-10-13 00:01:06 +08:00
Wagner Bruna
9727c6bb98
fix: resolve VAE tiling problem in Qwen Image (#873) 2025-10-12 23:45:53 +08:00
leejet
beb99a2de2
feat: add Qwen Image support (#851)
* add qwen tokenizer

* add qwen2.5 vl support

* mv qwen.hpp -> qwenvl.hpp

* add qwen image model

* add qwen image t2i pipeline

* fix qwen image flash attn

* add qwen image i2i pipeline

* change encoding of vocab_qwen.hpp to utf8

* fix get_first_stage_encoding

* apply jeffbolz f32 patch

https://github.com/leejet/stable-diffusion.cpp/pull/851#issuecomment-3335515302

* fix the issue that occurs when using CUDA with k-quants weights

* optimize the handling of the FeedForward precision fix

* to_add_out precision fix

* update docs
2025-10-12 23:23:19 +08:00
Wagner Bruna
aa68b875b9
refactor: deal with default img-cfg-scale at the library level (#869) 2025-10-12 23:17:52 +08:00
Wagner Bruna
5b261b9cee
feat: add a stand-alone upscale mode (#865)
* feat: add a stand-alone upscale mode

* fix prompt option check

* format code

* update README.md

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-10-12 23:10:02 +08:00
Pedrito
e70d0205ca
feat: add support for more esrgan models & x2 & x1 models (#855) 2025-10-12 22:53:31 +08:00
leejet
02af48a97f
chore: fix vulkan ci (#878) 2025-10-11 00:40:57 +08:00
leejet
e12d5e0aaf
fix: ensure directory iteration results are sorted by filename (#858) 2025-10-11 00:18:39 +08:00
Serkan Sahin
940a2018e1
chore: fix dockerfile libgomp1 dependency + improvements (#852) 2025-10-11 00:17:45 +08:00
Sharuzzaman Ahmat Raslan
b451728b2f
docs: update README.md (#866) 2025-10-11 00:11:10 +08:00
stduhpf
11f436c483
feat: add support for Flux Controls and Flex.2 (#692) 2025-10-11 00:06:57 +08:00
leejet
35843c77ea
fix: optimize the handling of embedding weight (#859) 2025-09-25 23:09:59 +08:00
leejet
6ad46bb700 sync: update ggml 2025-09-25 21:57:43 +08:00
leejet
1ba30ce005 sync: update ggml 2025-09-25 00:38:38 +08:00
leejet
2abe9451c4
fix: optimize the handling of CLIP embedding weight (#840) 2025-09-25 00:28:20 +08:00
Wagner Bruna
f3140eadbb
fix: tensor loading thread count (#854) 2025-09-25 00:26:38 +08:00
Stefan-Olt
98ba155fc6
docs: HipBLAS / ROCm build instruction fix (#843) 2025-09-25 00:03:05 +08:00
Wagner Bruna
513f36d495
docs: include Vulkan compatibility for LoRA quants (#845) 2025-09-25 00:01:10 +08:00
rmatif
1e0d2821bb
fix: correct tensor deduplication logic (#844) 2025-09-24 23:22:40 +08:00
leejet
fd693ac6a2
refactor: remove unused --normalize-input parameter (#835) 2025-09-18 00:12:53 +08:00
Wagner Bruna
171b2222a5
fix: avoid segfault for pix2pix models without reference images (#766)
* fix: avoid segfault for pix2pix models with no reference images

* fix: default to empty reference on pix2pix models to avoid segfault

* use resize instead of reserve

* format code

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-09-18 00:11:38 +08:00
leejet
567f9f14f0 fix: avoid multithreading issues in the model loader 2025-09-18 00:00:15 +08:00
leejet
1e5f207006
chore: fix workflow (#836) 2025-09-17 22:11:55 +08:00
leejet
79426d578e chore: set release tag by commit count 2025-09-16 23:24:36 +08:00
vmobilis
97ad3e7ff9
refactor: simplify DPM++ (2S) Ancestral (#667) 2025-09-16 23:05:25 +08:00
Erik Scholz
8909523e92
refactor: move tiling calc and debug print into the tiling code branch (#833) 2025-09-16 22:46:56 +08:00
rmatif
8376dfba2a
feat: add sgm_uniform scheduler, simple scheduler, and support for NitroFusion (#675)
* feat: Add timestep shift and two new schedulers

* update readme

* fix spaces

* format code

* simplify SGMUniformSchedule

* simplify shifted_timestep logic

* avoid conflict

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-09-16 22:42:09 +08:00
leejet
0ebe6fe118
refactor: simplify the logic of pm id image loading (#827) 2025-09-14 22:50:21 +08:00
rmatif
55c2e05d98
feat: optimize tensor loading time (#790)
* opt tensor loading

* fix build failure

* revert the changes

* allow the use of n_threads

* fix lora loading

* optimize lora loading

* add mutex

* use atomic

* fix build

* fix potential duplicate issue

* avoid duplicate lookup of lora tensor

* fix progeress bar

* remove unused remove_duplicates

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-09-14 22:48:35 +08:00
leejet
52a97b3ac1
feat: add vace support (#819)
* add wan vace t2v support

* add --vace-strength option

* add vace i2v support

* fix the processing of vace_context

* add vace v2v support

* update docs
2025-09-14 16:57:33 +08:00
stduhpf
2c9b1e2594
feat: add VAE encoding tiling support and adaptive overlap (#484)
* implement  tiling vae encode support

* Tiling (vae/upscale): adaptative overlap

* Tiling: fix edge case

* Tiling: fix crash when less than 2 tiles per dim

* remove extra dot

* Tiling: fix edge cases for adaptative overlap

* tiling: fix edge case

* set vae tile size via env var

* vae tiling: refactor again, base on smaller buffer for alignment

* Use bigger tiles for encode (to match compute buffer size)

* Fix edge case when tile is bigger than latent

* non-square VAE tiling (#3)

* refactor tile number calculation

* support non-square tiles

* add env var to change tile overlap

* add safeguards and better error messages for SD_TILE_OVERLAP

* add safeguards and include overlapping factor for SD_TILE_SIZE

* avoid rounding issues when specifying SD_TILE_SIZE as a factor

* lower SD_TILE_OVERLAP limit

* zero-init empty output buffer

* Fix decode latent size

* fix encode

* tile size params instead of env

* Tiled vae parameter validation (#6)

* avoid crash with invalid tile sizes, use 0 for default

* refactor default tile size, limit overlap factor

* remove explicit parameter for relative tile size

* limit encoding tile to latent size

* unify code style and format code

* update docs

* fix get_tile_sizes in decode_first_stage

---------

Co-authored-by: Wagner Bruna <wbruna@users.noreply.github.com>
Co-authored-by: leejet <leejet714@gmail.com>
2025-09-14 16:00:29 +08:00
leejet
288e2d63c0 docs: update docs 2025-09-14 14:24:24 +08:00
leejet
dc46993b55
feat: increase work_ctx memory buffer size (#814) 2025-09-14 13:19:20 +08:00
Richard Palethorpe
a6a8569ea0
feat: Add SYCL Dockerfile (#651) 2025-09-14 13:02:59 +08:00
Erik Scholz
9e7befa320
fix: harden for large files (#643) 2025-09-14 12:44:19 +08:00
Wagner Bruna
c607fc3ed4
feat: use Euler sampling by default for SD3 and Flux (#753)
Thank you for your contribution.
2025-09-14 12:34:41 +08:00
Wagner Bruna
b54bec3f18
fix: do not force VAE type to f32 on SDXL (#716)
This seems to be a leftover from the initial SDXL support: it's
not enough to avoid NaN issues, and it's not not needed for the
fixed sdxl-vae-fp16-fix .
2025-09-14 12:19:59 +08:00
Wagner Bruna
5869987fe4
fix: make weight override more robust against ggml changes (#760) 2025-09-14 12:15:53 +08:00
Wagner Bruna
48956ffb87
feat: reduce CLIP memory usage with no embeddings (#768) 2025-09-14 12:08:00 +08:00
Wagner Bruna
ddc4a18b92
fix: make tiled VAE reuse the compute buffer (#821) 2025-09-14 11:41:50 +08:00
leejet
fce6afcc6a
feat: add sd3 flash attn support (#815) 2025-09-11 23:24:29 +08:00
Erik Scholz
49d6570c43
feat: add SmoothStep Scheduler (#813) 2025-09-11 23:17:46 +08:00
clibdev
6bbaf161ad
chore: add install() support in CMakeLists.txt (#540) 2025-09-11 22:24:16 +08:00
clibdev
87cdbd5978
feat: use log_printf to print ggml logs (#545) 2025-09-11 22:16:05 +08:00
leejet
b017918106
chore: remove sd3 flash attention warn (#812) 2025-09-10 22:21:02 +08:00
Wagner Bruna
ac5a215998
fix: use {} for params init instead of memset (#781) 2025-09-10 21:49:29 +08:00
Wagner Bruna
abb36d66b5
chore: update flash attention warnings (#805) 2025-09-10 21:38:21 +08:00
Wagner Bruna
ff4fdbb88d
fix: accept NULL in sd_img_gen_params_t::input_id_images_path (#809) 2025-09-10 21:22:55 +08:00
Markus Hartung
abb115cd02
fix: clarify lora quant support and small fixes (#792) 2025-09-08 22:39:25 +08:00
leejet
c648001030
feat: add detailed tensor loading time stat (#793) 2025-09-07 22:51:44 +08:00
stduhpf
c587a43c99
feat: support incrementing ref image index (omni-kontext) (#755)
* kontext: support  ref images indices

* lora: support x_embedder

* update help message

* Support for negative indices

* support for OmniControl (offsets at index 0)

* c++11 compat

* add --increase-ref-index option

* simplify the logic and fix some issues

* update README.md

* remove unused variable

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-09-07 22:35:16 +08:00
leejet
f8fe4e7db9
fix: add flash attn support check (#803) 2025-09-07 21:29:06 +08:00
leejet
1c07fb6fb1 docs: update docs/wan.md 2025-09-07 12:07:20 +08:00
leejet
675208dcb6 chore: update to c++17 2025-09-07 12:04:17 +08:00
leejet
d7f430cd69 docs: update docs and help message 2025-09-07 02:26:44 +08:00
stduhpf
141a4b4113
feat: add flow shift parameter (for SD3 and Wan) (#780)
* Add flow shift parameter (for SD3 and Wan)

* unify code style and fix some issues

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-09-07 02:16:59 +08:00
stduhpf
21ce9fe2cf
feat: add support for timestep boundary based automatic expert routing in Wan MoE (#779)
* Wan MoE: Automatic expert routing based on timestep boundary

* unify code style and fix some issues

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-09-07 01:44:10 +08:00
leejet
cb1d975e96
feat: add wan2.1/2.2 support (#778)
* add wan vae suppport

* add wan model support

* add umt5 support

* add wan2.1 t2i support

* make flash attn work with wan

* make wan a little faster

* add wan2.1 t2v support

* add wan gguf support

* add offload params to cpu support

* add wan2.1 i2v support

* crop image before resize

* set default fps to 16

* add diff lora support

* fix wan2.1 i2v

* introduce sd_sample_params_t

* add wan2.2 t2v support

* add wan2.2 14B i2v support

* add wan2.2 ti2v support

* add high noise lora support

* sync: update ggml submodule url

* avoid build failure on linux

* avoid build failure

* update ggml

* update ggml

* fix sd_version_is_wan

* update ggml, fix cpu im2col_3d

* fix ggml_nn_attention_ext mask

* add cache support to ggml runner

* fix the issue of illegal memory access

* unify image loading processing

* add wan2.1/2.2 FLF2V support

* fix end_image mask

* update to latest ggml

* add GGUFReader

* update docs
2025-09-06 18:08:03 +08:00
Wagner Bruna
2eb3845df5
fix: typo in the verbose long flag (#783) 2025-09-04 00:49:01 +08:00
stduhpf
4c6475f917
feat: show usage on unknown arg (#767) 2025-09-01 21:38:34 +08:00
SmallAndSoft
f0fa7ddc40
docs: add compile option needed by Ninja (#770) 2025-09-01 21:35:25 +08:00
SmallAndSoft
a7c7905c6d
docs: add missing dash to docs/chroma.md (#771) 2025-09-01 21:34:34 +08:00
Wagner Bruna
eea77cbad9
feat: throttle model loading progress updates (#782)
Some terminals have slow display latency, so frequent output
during model loading can actually slow down the process.

Also, since tensor loading times can vary a lot, the progress
display now shows the average across past iterations instead
of just the last one.
2025-09-01 21:32:01 +08:00
NekopenDev
0e86d90ee4
chore: add Nvidia 30 series (cuda arch 86) to build 2025-09-01 21:21:34 +08:00
leejet
5900ef6605 sync: update ggml, make cuda im2col a little faster 2025-08-03 01:29:40 +08:00
Daniele
5b8996f74a
Conv2D direct support (#744)
* Conv2DDirect for VAE stage

* Enable only for Vulkan, reduced duplicated code

* Cmake option to use conv2d direct

* conv2d direct always on for opencl

* conv direct as a flag

* fix merge typo

* Align conv2d behavior to flash attention's

* fix readme

* add conv2d direct for controlnet

* add conv2d direct for esrgan

* clean code, use enable_conv2d_direct/get_all_blocks

* format code

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-08-03 01:25:17 +08:00
Wagner Bruna
f7f05fb185
chore: avoid setting GGML_MAX_NAME when building against external ggml (#751)
An external ggml will most likely have been built with the default
GGML_MAX_NAME value (64), which would be inconsistent with the value
set by our build (128). That would be an ODR violation, and it could
easily cause memory corruption issues due to the different
sizeof(struct ggml_tensor) values.

For now, when linking against an external ggml, we demand it has been
patched with a bigger GGML_MAX_NAME, since we can't check against a
value defined only at build time.
2025-08-03 01:24:40 +08:00
Seas0
6167e2927a
feat: support build against system installed GGML library (#749) 2025-08-02 11:03:18 +08:00
leejet
f6b9aa1a43 refector: optimize the usage of tensor_types 2025-07-28 23:18:29 +08:00
Wagner Bruna
7eb30d00e5
feat: add missing models and parameters to image metadata (#743)
* feat: add new scheduler types, clip skip and vae to image embedded params

- If a non default scheduler is set, include it in the 'Sampler' tag in the data
embedded into the final image.
- If a custom VAE path is set, include the vae name (without path and extension)
in embedded image params under a `VAE:` tag.
- If a custom Clip skip is set, include that Clip skip value in embedded image
params under a `Clip skip:` tag.

* feat: add separate diffusion and text models to metadata

---------

Co-authored-by: one-lithe-rune <skapusniak@lithe-runes.com>
2025-07-28 22:00:27 +08:00
stduhpf
59080d3ce1
feat: change image dimensions requirement for DiT models (#742) 2025-07-28 21:58:17 +08:00
R0CKSTAR
8c3c788f31
feat: upgrade musa sdk to rc4.2.0 (#732) 2025-07-28 21:51:11 +08:00
leejet
f54524f620 sync: update ggml 2025-07-28 21:50:12 +08:00
leejet
eed97a5e1d sync: update ggml 2025-07-24 23:04:08 +08:00
Ettore Di Giacinto
fb86bf4cb0
docs: add LocalAI to README's UIs (#741) 2025-07-24 22:39:26 +08:00
leejet
bd1eaef93e fix: convert f64 to f32 and i64 to i32 when loading weights 2025-07-24 00:59:38 +08:00
Erik Scholz
ab835f7d39
fix: correct head dim check and L_k padding of flash attention (#736) 2025-07-24 00:57:45 +08:00
Daniele
26f3f61d37
docs: add sd.cpp-webui as an available frontend (#738) 2025-07-23 23:51:57 +08:00
Oleg Skutte
1896b28ef2
fix: make --taesd work (#731) 2025-07-15 00:45:22 +08:00
leejet
0739361bfe fix: avoid macOS build failed 2025-07-13 20:18:10 +08:00
leejet
ca0bd9396e
refactor: update c api (#728) 2025-07-13 18:48:42 +08:00
stduhpf
a772dca27a
feat: add Instruct-Pix2pix/CosXL-Edit support (#679)
* Instruct-p2p support

* support 2 conditionings cfg

* Do not re-encode the exact same image twice

* fixes for 2-cfg

* Fix pix2pix latent inputs + improve inpainting a bit + fix naming

* prepare for other pix2pix-like models

* Support sdxl ip2p

* fix reference image embeddings

* Support 2-cond cfg properly in cli

* fix typo in help

* Support masks for ip2p models

* unify code style

* delete unused code

* use edit mode

* add img_cond

* format code

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-07-12 15:36:45 +08:00
Wagner Bruna
6d84a30c66
feat: overriding quant types for specific tensors on model conversion (#724) 2025-07-08 00:11:38 +08:00
stduhpf
dafc32d0dd
feat: add support for f64/i64 and clip_g diffusers model (#681) 2025-07-06 23:24:55 +08:00
idostyle
225162f270
fix: mark encoder.embed_tokens.weight as unused tensor (#721) 2025-07-06 23:10:10 +08:00
leejet
b9e4718fac fix: correct --chroma-enable-t5-mask argument 2025-07-06 11:11:47 +08:00
leejet
1ce1c1adca feat: make lora graph size variable 2025-07-05 22:44:22 +08:00
stduhpf
19fbfd8639
feat: override text encoders for unet models (#682) 2025-07-04 22:19:47 +08:00
Wagner Bruna
76c72628b1
fix: fix a few typos on cli help and error messages (#714) 2025-07-04 22:15:41 +08:00
vmobilis
3bae667f3d
fix: break the line after skipping tensors in VAE (#591) 2025-07-03 22:50:42 +08:00
stduhpf
8d0819c548
fix: actually use embeddings with SDXL (#657) 2025-07-03 22:39:57 +08:00
Binozo
7a8ff2e819
docs: add golang cgo bindings to README (#635) 2025-07-02 23:19:49 +08:00
rmatif
0927e8e322
docs: add Android app to README (#647) 2025-07-02 23:18:16 +08:00
stduhpf
83ef4e44ce
feat: add T5 with llama.cpp naming convention support (#654) 2025-07-02 23:13:00 +08:00
leejet
7dac89ad75 refector: reuse some code 2025-07-01 23:33:50 +08:00
stduhpf
9251756086
feat: add CosXL support (#683) 2025-07-01 23:13:04 +08:00
leejet
ecf5db97ae chore: fix windows build and release 2025-07-01 23:05:48 +08:00
stduhpf
ea46fd6948
fix: force zero-initialize output of tiling (#703) 2025-07-01 23:01:29 +08:00
leejet
23de7fc44a chore: avoid warnings when building on linux 2025-06-30 23:49:52 +08:00
rmatif
d42fd59464
feat: add OpenCL backend support (#680) 2025-06-30 23:32:23 +08:00
Wagner Bruna
0d8b39f0ba
fix: avoid crash on sdxl loras (#658)
Some SDXL LoRAs (eg. PCM) can exceed 12k nodes.
2025-06-30 23:29:32 +08:00
R0CKSTAR
539b5b9374
fix: fix musa docker build (#662)
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
2025-06-30 23:27:40 +08:00
Wagner Bruna
b1fc16b504
fix: allow resetting clip_skip to its default value (#697) 2025-06-30 23:23:21 +08:00
leejet
d6c87dce5c docs: add chroma doc 2025-06-29 23:58:15 +08:00
leejet
a28d04dd81 fix: fix the issue in parsing --chroma-disable-dit-mask 2025-06-29 23:52:36 +08:00
leejet
45d0ebb30c style: format code 2025-06-29 23:40:55 +08:00
stduhpf
b1cc40c35c
feat: add Chroma support (#696)
---------

Co-authored-by: Green Sky <Green-Sky@users.noreply.github.com>
Co-authored-by: leejet <leejet714@gmail.com>
2025-06-29 23:36:42 +08:00
leejet
884e23eeeb docs: add kontext doc 2025-06-29 10:35:31 +08:00
stduhpf
c9b5735116
feat: add FLUX.1 Kontext dev support (#707)
* Kontext support
* add edit mode

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-06-29 10:08:53 +08:00
vmobilis
10c6501bd0
fix missing argument in prototype of stbi_write_jpg (#613) 2025-03-09 12:30:10 +08:00
167 changed files with 1440819 additions and 13246 deletions

10
.clang-tidy Normal file
View File

@ -0,0 +1,10 @@
Checks: >
modernize-make-shared,
modernize-use-nullptr,
modernize-use-override,
modernize-pass-by-value,
modernize-return-braced-init-list,
modernize-deprecated-headers,
HeaderFilterRegex: '^$'
WarningsAsErrors: ''
FormatStyle: none

View File

@ -1,4 +1,5 @@
build*/
docs/
test/
.cache/

73
.github/ISSUE_TEMPLATE/bug_report.yml vendored Normal file
View File

@ -0,0 +1,73 @@
name: 🐞 Bug Report
description: Report a bug or unexpected behavior
title: "[Bug] "
labels: ["bug"]
body:
- type: markdown
attributes:
value: |
Please use this template and include as many details as possible to help us reproduce and fix the issue.
- type: textarea
id: commit
attributes:
label: Git commit
description: Which commit are you trying to compile?
placeholder: |
$git rev-parse HEAD
40a6a8710ec15b1b5db6b5a098409f6bc8f654a4
validations:
required: true
- type: input
id: os
attributes:
label: Operating System & Version
placeholder: e.g. “Ubuntu 22.04”, “Windows 11 23H2”, “macOS 14.3”
validations:
required: true
- type: dropdown
id: backends
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [CPU, CUDA, HIP, Metal, Musa, SYCL, Vulkan, OpenCL]
multiple: true
validations:
required: true
- type: input
id: cmd_arguments
attributes:
label: Command-line arguments used
placeholder: The full command line you ran (with all flags)
validations:
required: true
- type: textarea
id: steps_to_reproduce
attributes:
label: Steps to reproduce
placeholder: A step-by-step list of what you did
validations:
required: true
- type: textarea
id: expected_behavior
attributes:
label: What you expected to happen
placeholder: Describe the expected behavior or result
validations:
required: true
- type: textarea
id: actual_behavior
attributes:
label: What actually happened
placeholder: Describe what you saw instead (errors, logs, crash, etc.)
validations:
required: true
- type: textarea
id: logs_and_errors
attributes:
label: Logs / error messages / stack trace
placeholder: Paste complete logs or error output
- type: textarea
id: additional_info
attributes:
label: Additional context / environment details
placeholder: e.g. CPU model, GPU, RAM, model file versions, quantization type, etc.

View File

@ -0,0 +1,33 @@
name: 💡 Feature Request
description: Suggest a new feature or improvement
title: "[Feature] "
labels: ["enhancement"]
body:
- type: markdown
attributes:
value: |
Thank you for suggesting an improvement! Please fill in the fields below.
- type: input
id: summary
attributes:
label: Feature Summary
placeholder: A one-line summary of the feature you'd like
validations:
required: true
- type: textarea
id: description
attributes:
label: Detailed Description
placeholder: What problem does this solve? How do you expect it to work?
validations:
required: true
- type: textarea
id: alternatives
attributes:
label: Alternatives you considered
placeholder: Any alternative designs or workarounds you tried
- type: textarea
id: additional_context
attributes:
label: Additional context
placeholder: Any extra information (use cases, related functionalities, constraints)

View File

@ -21,11 +21,13 @@ on:
"**/*.c",
"**/*.cpp",
"**/*.cu",
"examples/server/frontend/**",
]
pull_request:
types: [opened, synchronize, reopened]
paths:
[
".github/workflows/**",
"**/CMakeLists.txt",
"**/Makefile",
"**/*.h",
@ -33,11 +35,16 @@ on:
"**/*.c",
"**/*.cpp",
"**/*.cu",
"examples/server/frontend/**",
]
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
ubuntu-latest-cmake:
runs-on: ubuntu-latest
@ -49,6 +56,16 @@ jobs:
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Dependencies
id: depends
run: |
@ -65,8 +82,8 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
@ -92,6 +109,143 @@ jobs:
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
ubuntu-latest-cmake-vulkan:
runs-on: ubuntu-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libvulkan-dev glslc
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DSD_BUILD_SHARED_LIBS=ON -DSD_VULKAN=ON
cmake --build . --config Release
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
run: |
echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip
build-and-push-docker-images:
name: Build and push container images
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
id-token: write
attestations: write
artifact-metadata: write
strategy:
matrix:
variant: [musa, sycl, vulkan, cuda]
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
steps:
- name: Checkout
uses: actions/checkout@v6
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to the container registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata for Docker
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@v1.3.1
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false
- name: Build and push Docker image
id: build-push
uses: docker/build-push-action@v6
with:
platforms: linux/amd64
push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
file: Dockerfile.${{ matrix.variant }}
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}
labels: ${{ steps.meta.outputs.labels }}
annotations: ${{ steps.meta.outputs.annotations }}
macOS-latest-cmake:
runs-on: macos-latest
@ -102,6 +256,16 @@ jobs:
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Dependencies
id: depends
run: |
@ -118,8 +282,8 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
@ -146,10 +310,10 @@ jobs:
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
windows-latest-cmake:
runs-on: windows-2019
runs-on: windows-2022
env:
VULKAN_VERSION: 1.3.261.1
VULKAN_VERSION: 1.4.328.1
strategy:
matrix:
@ -163,10 +327,8 @@ jobs:
- build: "avx512"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "cuda12"
defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;80;75"
# - build: "rocm5.5"
# defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
- build: 'vulkan'
defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120' -DCMAKE_CUDA_FLAGS='-Xcudafe \"--diag_suppress=177\" -Xcudafe \"--diag_suppress=550\"'"
- build: "vulkan"
defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
steps:
- name: Clone
@ -175,44 +337,45 @@ jobs:
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Install cuda-toolkit
id: cuda-toolkit
if: ${{ matrix.build == 'cuda12' }}
uses: Jimver/cuda-toolkit@v0.2.19
uses: Jimver/cuda-toolkit@v0.2.22
with:
cuda: "12.6.2"
cuda: "12.8.1"
method: "network"
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
- name: Install rocm-toolkit
id: rocm-toolkit
if: ${{ matrix.build == 'rocm5.5' }}
uses: Cyberhan123/rocm-toolkit@v0.1.0
with:
rocm: "5.5.0"
- name: Install Ninja
id: install-ninja
if: ${{ matrix.build == 'rocm5.5' }}
uses: urkle/action-get-ninja@v1
with:
version: 1.11.1
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'vulkan' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
- name: Activate MSVC environment
id: msvc_dev_cmd
uses: ilammy/msvc-dev-cmd@v1
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. ${{ matrix.defines }}
cmake --build . --config Release
cmake .. -DCMAKE_CXX_FLAGS='/bigobj' -G Ninja -DCMAKE_C_COMPILER=cl.exe -DCMAKE_CXX_COMPILER=cl.exe -DCMAKE_BUILD_TYPE=Release ${{ matrix.defines }}
cmake --build .
- name: Check AVX512F support
id: check_avx512f
@ -230,7 +393,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
uses: prompt/actions-commit-hash@v2
- name: Pack artifacts
id: pack_artifacts
@ -254,7 +417,7 @@ jobs:
- name: Copy and pack Cuda runtime
id: pack_cuda_runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
$dst='.\build\bin\cudart\'
@ -262,7 +425,7 @@ jobs:
7z a cudart-sd-bin-win-cu12-x64.zip $dst\*
- name: Upload Cuda runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
uses: actions/upload-artifact@v4
with:
name: sd-cudart-sd-bin-win-cu12-x64.zip
@ -277,6 +440,264 @@ jobs:
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
windows-latest-cmake-hip:
runs-on: windows-2022
env:
HIPSDK_INSTALLER_VERSION: "25.Q3"
GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Cache ROCm Installation
id: cache-rocm
uses: actions/cache@v4
with:
path: C:\Program Files\AMD\ROCm
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64
evict-old-files: 1d
- name: Install ROCm
if: steps.cache-rocm.outputs.cache-hit != 'true'
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
$completed = $proc.WaitForExit(600000)
if (-not $completed) {
Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
$proc.Kill()
exit 1
}
if ($proc.ExitCode -ne 0) {
Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
exit 1
}
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
run: |
# Find and test ROCm installation
$clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
if (-not $clangPath) {
Write-Error "ROCm installation not found"
exit 1
}
& $clangPath.FullName --version
# Set HIP_PATH environment variable for later steps
echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV
- name: Build
run: |
mkdir build
cd build
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake .. `
-G "Unix Makefiles" `
-DSD_HIPBLAS=ON `
-DSD_BUILD_SHARED_LIBS=ON `
-DGGML_NATIVE=OFF `
-DCMAKE_C_COMPILER=clang `
-DCMAKE_CXX_COMPILER=clang++ `
-DCMAKE_BUILD_TYPE=Release `
-DGPU_TARGETS="${{ env.GPU_TARGETS }}"
cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
- name: Pack artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
md "build\bin\rocblas\library\"
md "build\bin\hipblaslt\library"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
ubuntu-latest-rocm:
runs-on: ubuntu-latest
container: rocm/dev-ubuntu-24.04:7.2
env:
ROCM_VERSION: "7.2"
UBUNTU_VERSION: "24.04"
GPU_TARGETS: "gfx1151;gfx1150;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
steps:
- run: apt-get update && apt-get install -y git
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Free disk space
run: |
# Remove preinstalled SDKs and caches not needed for this job
sudo rm -rf /usr/share/dotnet || true
sudo rm -rf /usr/local/lib/android || true
sudo rm -rf /opt/ghc || true
sudo rm -rf /usr/local/.ghcup || true
sudo rm -rf /opt/hostedtoolcache || true
# Remove old package lists and caches
sudo rm -rf /var/lib/apt/lists/* || true
sudo apt clean
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt install -y \
cmake \
hip-dev \
hipblas-dev \
ninja-build \
rocm-dev \
zip
# Clean apt caches to recover disk space
sudo apt clean
sudo rm -rf /var/lib/apt/lists/* || true
- name: Setup ROCm Environment
run: |
# Add ROCm to PATH for current session
echo "/opt/rocm/bin" >> $GITHUB_PATH
# Build regex pattern from ${{ env.GPU_TARGETS }} (match target as substring)
TARGET_REGEX="($(printf '%s' "${{ env.GPU_TARGETS }}" | sed 's/;/|/g'))"
# Remove library files for architectures we're not building for to save disk space
echo "Cleaning up unneeded architecture files..."
cd /opt/rocm/lib/rocblas/library
# Keep only our target architectures
for file in *; do
if printf '%s' "$file" | grep -q 'gfx'; then
if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
echo "Removing $file" &&
sudo rm -f "$file";
fi
fi
done
cd /opt/rocm/lib/hipblaslt/library
for file in *; do
if printf '%s' "$file" | grep -q 'gfx'; then
if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
echo "Removing $file" &&
sudo rm -f "$file";
fi
fi
done
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -G Ninja \
-DCMAKE_CXX_COMPILER=amdclang++ \
-DCMAKE_C_COMPILER=amdclang \
-DCMAKE_BUILD_TYPE=Release \
-DSD_HIPBLAS=ON \
-DGPU_TARGETS="${{ env.GPU_TARGETS }}" \
-DAMDGPU_TARGETS="${{ env.GPU_TARGETS }}" \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DSD_BUILD_SHARED_LIBS=ON
cmake --build . --config Release
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
- name: Prepare artifacts
id: prepare_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
# Copy licenses
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
# Move ROCm runtime libraries (to avoid double space consumption)
sudo mv /opt/rocm/lib/librocsparse.so* ./build/bin/
sudo mv /opt/rocm/lib/libhsa-runtime64.so* ./build/bin/
sudo mv /opt/rocm/lib/libamdhip64.so* ./build/bin/
sudo mv /opt/rocm/lib/libhipblas.so* ./build/bin/
sudo mv /opt/rocm/lib/libhipblaslt.so* ./build/bin/
sudo mv /opt/rocm/lib/librocblas.so* ./build/bin/
sudo mv /opt/rocm/lib/rocblas/ ./build/bin/
sudo mv /opt/rocm/lib/hipblaslt/ ./build/bin/
- name: Fetch system info
id: system-info
run: |
echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip ./build/bin
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -284,10 +705,19 @@ jobs:
needs:
- ubuntu-latest-cmake
- ubuntu-latest-cmake-vulkan
- ubuntu-latest-rocm
- build-and-push-docker-images
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-hip
steps:
- name: Clone
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v4
@ -296,20 +726,27 @@ jobs:
pattern: sd-*
merge-multiple: true
- name: Get commit count
id: commit_count
run: |
echo "count=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
- name: Get commit hash
id: commit
uses: pr-mpt/actions-commit-hash@v2
uses: prompt/actions-commit-hash@v2
- name: Create release
id: create_release
if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
uses: anzz1/action-create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
tag_name: ${{ format('{0}-{1}-{2}', env.BRANCH_NAME, steps.commit_count.outputs.count, steps.commit.outputs.short) }}
- name: Upload release
id: upload_release
if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
uses: actions/github-script@v3
with:
github-token: ${{secrets.GITHUB_TOKEN}}

6
.gitignore vendored
View File

@ -1,13 +1,15 @@
build*/
cmake-build-*/
test/
.vscode/
.idea/
.cache/
*.swp
.vscode/
*.bat
*.bin
*.exe
*.gguf
output*.png
models*
*.log
*.log
preview.png

5
.gitmodules vendored
View File

@ -1,3 +1,6 @@
[submodule "ggml"]
path = ggml
url = https://github.com/ggerganov/ggml.git
url = https://github.com/ggml-org/ggml.git
[submodule "examples/server/frontend"]
path = examples/server/frontend
url = https://github.com/leejet/stable-ui.git

View File

@ -8,6 +8,11 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()
if (MSVC)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
endif()
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
@ -28,10 +33,12 @@ option(SD_CUDA "sd: cuda backend" OFF)
option(SD_HIPBLAS "sd: rocm backend" OFF)
option(SD_METAL "sd: metal backend" OFF)
option(SD_VULKAN "sd: vulkan backend" OFF)
option(SD_OPENCL "sd: opencl backend" OFF)
option(SD_SYCL "sd: sycl backend" OFF)
option(SD_MUSA "sd: musa backend" OFF)
option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF)
option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
#option(SD_BUILD_SERVER "sd: build server example" ON)
if(SD_CUDA)
@ -52,44 +59,81 @@ if (SD_VULKAN)
add_definitions(-DSD_USE_VULKAN)
endif ()
if (SD_OPENCL)
message("-- Use OpenCL as backend stable-diffusion")
set(GGML_OPENCL ON)
add_definitions(-DSD_USE_OPENCL)
endif ()
if (SD_HIPBLAS)
message("-- Use HIPBLAS as backend stable-diffusion")
set(GGML_HIP ON)
add_definitions(-DSD_USE_CUDA)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
endif ()
if(SD_MUSA)
message("-- Use MUSA as backend stable-diffusion")
set(GGML_MUSA ON)
add_definitions(-DSD_USE_CUDA)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
endif()
set(SD_LIB stable-diffusion)
file(GLOB SD_LIB_SOURCES
"*.h"
"*.cpp"
"*.hpp"
"src/*.h"
"src/*.cpp"
"src/*.hpp"
"src/vocab/*.h"
"src/vocab/*.cpp"
)
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
if(GIT_EXE)
execute_process(COMMAND ${GIT_EXE} describe --tags --abbrev=7 --dirty=+
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE SDCPP_BUILD_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE SDCPP_BUILD_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
endif()
if(NOT SDCPP_BUILD_VERSION)
set(SDCPP_BUILD_VERSION unknown)
endif()
message(STATUS "stable-diffusion.cpp version ${SDCPP_BUILD_VERSION}")
if(NOT SDCPP_BUILD_COMMIT)
set(SDCPP_BUILD_COMMIT unknown)
endif()
message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
set_property(
SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/version.cpp
APPEND PROPERTY COMPILE_DEFINITIONS
SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
)
# we can get only one share lib
if(SD_BUILD_SHARED_LIBS)
message("-- Build shared library")
message(${SD_LIB_SOURCES})
set(BUILD_SHARED_LIBS OFF)
if(NOT SD_BUILD_SHARED_GGML_LIB)
set(BUILD_SHARED_LIBS OFF)
endif()
add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
add_definitions(-DSD_BUILD_SHARED_LIB)
target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
else()
message("-- Build static library")
set(BUILD_SHARED_LIBS OFF)
if(NOT SD_BUILD_SHARED_GGML_LIB)
set(BUILD_SHARED_LIBS OFF)
endif()
add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
endif()
@ -111,23 +155,38 @@ endif()
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
# see https://github.com/ggerganov/ggml/pull/682
add_definitions(-DGGML_MAX_NAME=128)
if (NOT SD_USE_SYSTEM_GGML)
# see https://github.com/ggerganov/ggml/pull/682
add_definitions(-DGGML_MAX_NAME=128)
endif()
# deps
# Only add ggml if it hasn't been added yet
if (NOT TARGET ggml)
add_subdirectory(ggml)
if (SD_USE_SYSTEM_GGML)
find_package(ggml REQUIRED)
if (NOT ggml_FOUND)
message(FATAL_ERROR "System-installed GGML library not found.")
endif()
add_library(ggml ALIAS ggml::ggml)
else()
add_subdirectory(ggml)
endif()
endif()
add_subdirectory(thirdparty)
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
target_include_directories(${SD_LIB} PUBLIC . include)
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)
if (SD_BUILD_EXAMPLES)
add_subdirectory(examples)
endif()
set(SD_PUBLIC_HEADERS include/stable-diffusion.h)
set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}")
install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER)

View File

@ -1,17 +1,23 @@
ARG UBUNTU_VERSION=22.04
ARG UBUNTU_VERSION=24.04
FROM ubuntu:$UBUNTU_VERSION as build
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && apt-get install -y build-essential git cmake
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake
WORKDIR /sd.cpp
COPY . .
RUN mkdir build && cd build && cmake .. && cmake --build . --config Release
RUN cmake . -B ./build
RUN cmake --build ./build --config Release --parallel
FROM ubuntu:$UBUNTU_VERSION as runtime
FROM ubuntu:$UBUNTU_VERSION AS runtime
COPY --from=build /sd.cpp/build/bin/sd /sd
RUN apt-get update && \
apt-get install --yes --no-install-recommends libgomp1 && \
apt-get clean
ENTRYPOINT [ "/sd" ]
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
ENTRYPOINT [ "/sd-cli" ]

25
Dockerfile.cuda Normal file
View File

@ -0,0 +1,25 @@
# Multi-stage image for the CUDA backend.
# Both ARGs are declared before the first FROM so that each FROM line below can
# reference them when selecting the base image tag.
ARG CUDA_VERSION=12.6.3
ARG UBUNTU_VERSION=24.04

# Build stage: compile sd-cli and sd-server with the CUDA backend enabled.
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build

RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake

WORKDIR /sd.cpp

COPY . .

# Path of the CUDA compiler, exposed to the cmake RUN steps below.
ARG CUDACXX=/usr/local/cuda/bin/nvcc
RUN cmake . -B ./build -DSD_CUDA=ON
RUN cmake --build ./build --config Release -j$(nproc)

# Runtime stage: only the two executables and the libraries installed below.
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime

# Remove the apt package index in the same layer that created it; `apt-get clean`
# alone leaves /var/lib/apt/lists in the final image.
RUN apt-get update && \
    apt-get install --yes --no-install-recommends libgomp1 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server

ENTRYPOINT [ "/sd-cli" ]

View File

@ -1,19 +1,24 @@
ARG MUSA_VERSION=rc3.1.1
ARG MUSA_VERSION=rc4.2.0
ARG UBUNTU_VERSION=22.04
FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu22.04 as build
FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 as build
RUN apt-get update && apt-get install -y cmake
RUN apt-get update && apt-get install -y ccache cmake git
WORKDIR /sd.cpp
COPY . .
RUN mkdir build && cd build && \
cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
-DCMAKE_C_FLAGS="${CMAKE_C_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
-DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
-DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build . --config Release
FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu22.04 as runtime
FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 as runtime
COPY --from=build /sd.cpp/build/bin/sd /sd
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
ENTRYPOINT [ "/sd" ]
ENTRYPOINT [ "/sd-cli" ]

20
Dockerfile.sycl Normal file
View File

@ -0,0 +1,20 @@
# Multi-stage image for the SYCL backend.
# SYCL_VERSION selects the oneAPI base-kit image tag used by both stages.
ARG SYCL_VERSION=2025.1.0-0
# Build stage: compile sd-cli and sd-server with the icx/icpx compilers.
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS build
RUN apt-get update && apt-get install -y cmake
WORKDIR /sd.cpp
COPY . .
# Configure a Release build with SD_SYCL enabled, then build with one job per CPU.
RUN mkdir build && cd build && \
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build . --config Release -j$(nproc)
# Runtime stage: starts from the same devel image as the build stage and adds
# only the two executables.
# NOTE(review): presumably the devel image is reused to provide the SYCL runtime
# libraries; confirm whether a smaller runtime image would be sufficient.
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
# The CLI is the default entry point; sd-server is also present at /sd-server.
ENTRYPOINT [ "/sd-cli" ]

23
Dockerfile.vulkan Normal file
View File

@ -0,0 +1,23 @@
# Multi-stage image for the Vulkan backend.
# UBUNTU_VERSION is declared before the first FROM so that both FROM lines can
# reference it.
ARG UBUNTU_VERSION=24.04

# Build stage: compile sd-cli and sd-server with the Vulkan backend enabled.
FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake libvulkan-dev glslc

WORKDIR /sd.cpp

COPY . .

RUN cmake . -B ./build -DSD_VULKAN=ON
RUN cmake --build ./build --config Release --parallel

# Runtime stage: only the two executables and the libraries installed below.
FROM ubuntu:$UBUNTU_VERSION AS runtime

# Remove the apt package index in the same layer that created it; `apt-get clean`
# alone leaves /var/lib/apt/lists in the final image.
RUN apt-get update && \
    apt-get install --yes --no-install-recommends libgomp1 libvulkan1 mesa-vulkan-drivers && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server

ENTRYPOINT [ "/sd-cli" ]

383
README.md
View File

@ -1,39 +1,90 @@
<p align="center">
<img src="./assets/cat_with_sd_cpp_42.png" width="360x">
<img src="./assets/logo.png" width="360x">
</p>
# stable-diffusion.cpp
Inference of Stable Diffusion and Flux in pure C/C++
<div align="center">
<a href="https://trendshift.io/repositories/9714" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9714" alt="leejet%2Fstable-diffusion.cpp | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</div>
Diffusion model (SD, Flux, Wan, ...) inference in pure C/C++
***Note that this project is under active development. \
API and command-line options may change frequently.***
## 🔥Important News
* **2026/01/18** 🚀 stable-diffusion.cpp now supports **FLUX.2-klein**
👉 Details: [PR #1193](https://github.com/leejet/stable-diffusion.cpp/pull/1193)
* **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image**
👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020)
* **2025/11/30** 🚀 stable-diffusion.cpp now supports **FLUX.2-dev**
👉 Details: [PR #1016](https://github.com/leejet/stable-diffusion.cpp/pull/1016)
* **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**
👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)
* **2025/10/12** 🚀 stable-diffusion.cpp now supports **Qwen-Image**
👉 Details: [PR #851](https://github.com/leejet/stable-diffusion.cpp/pull/851)
* **2025/09/14** 🚀 stable-diffusion.cpp now supports **Wan2.1 Vace**
👉 Details: [PR #819](https://github.com/leejet/stable-diffusion.cpp/pull/819)
* **2025/09/06** 🚀 stable-diffusion.cpp now supports **Wan2.1 / Wan2.2**
👉 Details: [PR #778](https://github.com/leejet/stable-diffusion.cpp/pull/778)
## Features
- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
- Plain C/C++ implementation based on [ggml](https://github.com/ggml-org/ggml), working in the same way as [llama.cpp](https://github.com/ggml-org/llama.cpp)
- Super lightweight and without external dependencies
- SD1.x, SD2.x, SDXL and [SD3/SD3.5](./docs/sd3.md) support
- !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
- [Flux-dev/Flux-schnell Support](./docs/flux.md)
- [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo) and [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) support
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
- 16-bit, 32-bit float support
- 2-bit, 3-bit, 4-bit, 5-bit and 8-bit integer quantization support
- Accelerated memory-efficient CPU inference
- Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image, enabling Flash Attention just requires ~1.8GB.
- AVX, AVX2 and AVX512 support for x86 architectures
- Full CUDA, Metal, Vulkan and SYCL backend for GPU acceleration.
- Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs models
- No need to convert to `.ggml` or `.gguf` anymore!
- Supported models
- Image Models
- SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
- SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
- [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
- [SD3/SD3.5](./docs/sd3.md)
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
- [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
- [Chroma](./docs/chroma.md)
- [Chroma1-Radiance](./docs/chroma_radiance.md)
- [Qwen Image](./docs/qwen_image.md)
- [Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
- Video Models
- [Wan2.1/Wan2.2](./docs/wan.md)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
- Control Net support with SD 1.5
- LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
- Latent Consistency Models support (LCM/LCM-LoRA)
- Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
- Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
- Supported backends
- CPU (AVX, AVX2 and AVX512 support for x86 architectures)
- CUDA
- Vulkan
- Metal
- OpenCL
- SYCL
- Supported weight formats
- Pytorch checkpoint (`.ckpt` or `.pth`)
- Safetensors (`.safetensors`)
- GGUF (`.gguf`)
- Supported platforms
- Linux
- Mac OS
- Windows
- Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
- Flash Attention for memory usage optimization
- Original `txt2img` and `img2img` mode
- Negative prompt
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
- LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
- Latent Consistency Models support (LCM/LCM-LoRA)
- Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
- Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
- VAE tiling processing to reduce memory usage
- Control Net support with SD 1.5
- Sampling method
- `Euler A`
- `Euler`
@ -43,266 +94,53 @@ Inference of Stable Diffusion and Flux in pure C/C++
- [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
- `DPM++ 2S a`
- [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
- Cross-platform reproducibility
- `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
- `--rng cpu`, consistent with the `comfyui RNG`
- Embeds generation parameters into PNG output as a webui-compatible text string
- Supported platforms
- Linux
- Mac OS
- Windows
- Android (via Termux)
### TODO
## Quick Start
- [ ] More sampling methods
- [ ] Make inference faster
- The current implementation of ggml_conv_2d is slow and has high memory usage
- [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
- [ ] Implement Inpainting support
### Get the sd executable
## Usage
- Download pre-built binaries from the [releases page](https://github.com/leejet/stable-diffusion.cpp/releases)
- Or build from source by following the [build guide](./docs/build.md)
For most users, you can download the built executable program from the latest [release](https://github.com/leejet/stable-diffusion.cpp/releases/latest).
If the built product does not meet your requirements, you can choose to build it manually.
### Download model weights
### Get the Code
- Download weights (`.ckpt`, `.safetensors`, or `.gguf`). For example:
- Stable Diffusion v1.5 from https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5
```
git clone --recursive https://github.com/leejet/stable-diffusion.cpp
cd stable-diffusion.cpp
```
- If you have already cloned the repository, you can use the following command to update the repository to the latest code.
```
cd stable-diffusion.cpp
git pull origin master
git submodule init
git submodule update
```
### Download weights
- download original weights(.ckpt or .safetensors). For example
- Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
- Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
- Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
```shell
curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
# curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-nonema-pruned.safetensors
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors
```sh
curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
```
### Build
#### Build from scratch
```shell
mkdir build
cd build
cmake ..
cmake --build . --config Release
```
##### Using OpenBLAS
```
cmake .. -DGGML_OPENBLAS=ON
cmake --build . --config Release
```
##### Using CUDA
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
```
cmake .. -DSD_CUDA=ON
cmake --build . --config Release
```
##### Using HipBLAS
This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
Windows users: refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
```
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
cmake --build . --config Release
```
##### Using MUSA
This provides BLAS acceleration using the MUSA cores of your Moore Threads GPU. Make sure to have the MUSA toolkit installed.
```bash
cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
cmake --build . --config Release
```
##### Using Metal
Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
```
cmake .. -DSD_METAL=ON
cmake --build . --config Release
```
##### Using Vulkan
Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
```
cmake .. -DSD_VULKAN=ON
cmake --build . --config Release
```
##### Using SYCL
Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and the [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before you start. For more details and steps, refer to the [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux) documentation.
```
# Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
cmake --build . --config Release
```
Example of text2img by using SYCL backend:
- download `stable-diffusion` model weight, refer to [download-weight](#download-weights).
- run `./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors --cfg-scale 5 --steps 30 --sampling-method euler -H 1024 -W 1024 --seed 42 -p "fantasy medieval village world inside a glass sphere , high detail, fantasy, realistic, light effect, hyper detail, volumetric lighting, cinematic, macro, depth of field, blur, red light and clouds from the back, highly detailed epic cinematic concept art cg render made in maya, blender and photoshop, octane render, excellent composition, dynamic dramatic cinematic lighting, aesthetic, very inspirational, world inside a glass sphere by james gurney by artgerm with james jean, joe fenton and tristan eaton by ross tran, fine details, 4k resolution"`
<p align="center">
<img src="./assets/sycl_sd3_output.png" width="360x">
</p>
##### Using Flash Attention
Enabling flash attention for the diffusion model reduces memory usage by varying amounts of MB.
eg.:
- flux 768x768 ~600mb
- SD2 768x768 ~1400mb
For most backends, it slows things down, but for cuda it generally speeds it up too.
At the moment, it is only supported for some models and some backends (like cpu, cuda/rocm, metal).
Run by adding `--diffusion-fa` to the arguments and watch for:
```
[INFO ] stable-diffusion.cpp:312 - Using flash attention in the diffusion model
```
and the compute buffer shrink in the debug log:
```
[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
```
### Run
```
usage: ./bin/sd [arguments]
arguments:
-h, --help show this help message and exit
-M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)
-t, --threads N number of threads to use during computation (default: -1)
If threads <= 0, then threads will be set to the number of CPU physical cores
-m, --model [MODEL] path to full model
--diffusion-model path to the standalone diffusion model
--clip_l path to the clip-l text encoder
--clip_g path to the clip-g text encoder
--t5xxl path to the t5xxl text encoder
--vae [VAE] path to vae
--taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--control-net [CONTROL_PATH] path to control net model
--embd-dir [EMBEDDING_PATH] path to embeddings
--stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings
--input-id-images-dir [DIR] path to PHOTOMAKER input id images dir
--normalize-input normalize PHOTOMAKER input id images
--upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now
--upscale-repeats Run the ESRGAN upscaler this many times (default 1)
--type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_k, q3_k, q4_k)
If not specified, the default is the type of the weight file
--lora-model-dir [DIR] lora model directory
-i, --init-img [IMAGE] path to the input image, required by img2img
--control-image [IMAGE] path to image condition, control net
-o, --output OUTPUT path to write result image to (default: ./output.png)
-p, --prompt [PROMPT] the prompt to render
-n, --negative-prompt PROMPT the negative prompt (default: "")
--cfg-scale SCALE unconditional guidance scale: (default: 7.0)
--skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])
--skip-layer-start START SLG enabling point: (default: 0.01)
--skip-layer-end END SLG disabling point: (default: 0.2)
SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
--strength STRENGTH strength for noising/unnoising (default: 0.75)
--style-ratio STYLE-RATIO strength for keeping input identity (default: 20%)
--control-strength STRENGTH strength to apply Control Net (default: 0.9)
1.0 corresponds to full destruction of information in init image
-H, --height H image height, in pixel space (default: 512)
-W, --width W image width, in pixel space (default: 512)
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm}
sampling method (default: "euler_a")
--steps STEPS number of sample steps (default: 20)
--rng {std_default, cuda} RNG (default: cuda)
-s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
-b, --batch-count COUNT number of images to generate
--schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete)
--clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
--vae-tiling process vae in tiles to reduce memory usage
--vae-on-cpu keep vae in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--diffusion-fa use flash attention in the diffusion model (for low vram)
Might lower quality, since it implies converting k and v to f16.
This might crash if it is not supported by the backend.
--control-net-cpu keep controlnet in cpu (for low vram)
--canny apply canny preprocessor (edge detection)
--color Colors the logging tags according to level
-v, --verbose print extra info
```
#### txt2img example
### Generate an image with just one command
```sh
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v
# ./bin/sd --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
# ./bin/sd -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
```
Using formats of different precisions will yield results of varying quality.
***For detailed command-line arguments, check out [cli doc](./examples/cli/README.md).***
| f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
| ---- |---- |---- |---- |---- |---- |---- |
| ![](./assets/f32.png) |![](./assets/f16.png) |![](./assets/q8_0.png) |![](./assets/q5_0.png) |![](./assets/q5_1.png) |![](./assets/q4_0.png) |![](./assets/q4_1.png) |
## Performance
#### img2img example
- `./output.png` is the image generated from the above txt2img pipeline
```
./bin/sd --mode img2img -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
```
<p align="center">
<img src="./assets/img2img_output.png" width="256x">
</p>
If you want to improve performance or reduce VRAM/RAM usage, please refer to [performance guide](./docs/performance.md).
## More Guides
- [SD1.x/SD2.x/SDXL](./docs/sd.md)
- [SD3/SD3.5](./docs/sd3.md)
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
- [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Chroma](./docs/chroma.md)
- [🔥Qwen Image](./docs/qwen_image.md)
- [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
- [🔥Wan2.1/Wan2.2](./docs/wan.md)
- [🔥Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
@ -310,15 +148,18 @@ Using formats of different precisions will yield results of varying quality.
- [Using TAESD for faster decoding](./docs/taesd.md)
- [Docker](./docs/docker.md)
- [Quantization and GGUF](./docs/quantization_and_gguf.md)
- [Inference acceleration via caching](./docs/caching.md)
## Bindings
These projects wrap `stable-diffusion.cpp` for easier use in other languages/frameworks.
* Golang: [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
* Golang (non-cgo): [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
* Golang (cgo): [Binozo/GoStableDiffusion](https://github.com/Binozo/GoStableDiffusion)
* C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET)
* Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python)
* Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs)
* Flutter/Dart: [rmatif/Local-Diffusion](https://github.com/rmatif/Local-Diffusion)
## UIs
@ -327,6 +168,11 @@ These projects use `stable-diffusion.cpp` as a backend for their image generatio
- [Jellybox](https://jellybox.com)
- [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
- [Stable Diffusion CLI-GUI](https://github.com/piallai/stable-diffusion.cpp)
- [Local Diffusion](https://github.com/rmatif/Local-Diffusion)
- [sd.cpp-webui](https://github.com/daniandtheweb/sd.cpp-webui)
- [LocalAI](https://github.com/mudler/LocalAI)
- [Neural-Pixel](https://github.com/Luiz-Alcantara/Neural-Pixel)
- [KoboldCpp](https://github.com/LostRuins/koboldcpp)
## Contributors
@ -340,7 +186,8 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
## References
- [ggml](https://github.com/ggerganov/ggml)
- [ggml](https://github.com/ggml-org/ggml)
- [diffusers](https://github.com/huggingface/diffusers)
- [stable-diffusion](https://github.com/CompVis/stable-diffusion)
- [sd3-ref](https://github.com/Stability-AI/sd3-ref)
- [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
@ -350,3 +197,5 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
- [latent-consistency-model](https://github.com/luosiallen/latent-consistency-model)
- [generative-models](https://github.com/Stability-AI/generative-models/)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker)
- [Wan2.1](https://github.com/Wan-Video/Wan2.1)
- [Wan2.2](https://github.com/Wan-Video/Wan2.2)

BIN
assets/anima/example.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 230 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 477 KiB

BIN
assets/flux/chroma_v40.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 539 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 496 KiB

BIN
assets/flux2/example.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 556 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 510 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 455 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 511 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 491 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 464 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 552 KiB

BIN
assets/logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 401 KiB

BIN
assets/qwen/example.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.4 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 457 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 415 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 450 KiB

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 594 KiB

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 870 KiB

BIN
assets/z_image/bf16.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

BIN
assets/z_image/q2_K.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

BIN
assets/z_image/q3_K.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

BIN
assets/z_image/q4_0.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

BIN
assets/z_image/q4_K.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

BIN
assets/z_image/q5_0.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

BIN
assets/z_image/q6_K.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

BIN
assets/z_image/q8_0.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

File diff suppressed because it is too large Load Diff

View File

@ -1,182 +0,0 @@
#ifndef __DIFFUSION_MODEL_H__
#define __DIFFUSION_MODEL_H__
#include "flux.hpp"
#include "mmdit.hpp"
#include "unet.hpp"
/**
 * Abstract interface shared by the diffusion model wrappers declared in this
 * header (UNetModel, MMDiTModel, FluxModel).
 *
 * Each wrapper owns a backend-specific runner and forwards these calls to it.
 * `compute` takes the union of the arguments the runners accept; the
 * implementations in this file pass on only the subset their runner takes and
 * do not use the rest.
 */
struct DiffusionModel {
    // A polymorphic base needs a virtual destructor: without it, destroying a
    // derived model through a DiffusionModel pointer is undefined behaviour
    // and the derived runner would not be torn down.
    virtual ~DiffusionModel() = default;

    /**
     * Run the wrapped model.
     *
     * @param n_threads         forwarded to the runner by every implementation here
     * @param x                 forwarded by every implementation here
     * @param timesteps         forwarded by every implementation here
     * @param context           forwarded by every implementation here
     * @param c_concat          forwarded by UNetModel and FluxModel only
     * @param y                 forwarded by every implementation here
     * @param guidance          forwarded by FluxModel only
     * @param num_video_frames  forwarded by UNetModel only
     * @param controls          forwarded by UNetModel only
     * @param control_strength  forwarded by UNetModel only
     * @param output            forwarded by every implementation here; defaults to NULL
     * @param output_ctx        forwarded by every implementation here; defaults to NULL
     * @param skip_layers       forwarded by MMDiTModel and FluxModel; ignored by UNetModel
     */
    virtual void compute(int n_threads,
                         struct ggml_tensor* x,
                         struct ggml_tensor* timesteps,
                         struct ggml_tensor* context,
                         struct ggml_tensor* c_concat,
                         struct ggml_tensor* y,
                         struct ggml_tensor* guidance,
                         int num_video_frames                      = -1,
                         std::vector<struct ggml_tensor*> controls = {},
                         float control_strength                    = 0.f,
                         struct ggml_tensor** output               = NULL,
                         struct ggml_context* output_ctx           = NULL,
                         std::vector<int> skip_layers              = std::vector<int>()) = 0;

    // Buffer management; each implementation delegates to its runner.
    virtual void alloc_params_buffer() = 0;
    virtual void free_params_buffer()  = 0;
    virtual void free_compute_buffer() = 0;

    // Fills `tensors` with the runner's parameter tensors, keyed by name.
    virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;

    // Size reported by the runner's get_params_buffer_size().
    virtual size_t get_params_buffer_size() = 0;

    // Model-specific "adm_in_channels" value (read from the UNet for
    // UNetModel, a fixed constant for MMDiTModel and FluxModel).
    virtual int64_t get_adm_in_channels() = 0;
};
/**
 * DiffusionModel adapter around a UNetModelRunner.
 *
 * Every interface call is delegated to the owned `unet` runner. Parameter
 * tensors are collected under the "model.diffusion_model" prefix.
 */
struct UNetModel : public DiffusionModel {
    UNetModelRunner unet;

    UNetModel(ggml_backend_t backend,
              std::map<std::string, enum ggml_type>& tensor_types,
              SDVersion version = VERSION_SD1,
              bool flash_attn   = false)
        : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
    }

    // --- buffer management: straight pass-through to the runner ---
    void alloc_params_buffer() { unet.alloc_params_buffer(); }
    void free_params_buffer() { unet.free_params_buffer(); }
    void free_compute_buffer() { unet.free_compute_buffer(); }

    // Collects the runner's parameter tensors into `tensors`.
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
        unet.get_param_tensors(tensors, "model.diffusion_model");
    }

    size_t get_params_buffer_size() { return unet.get_params_buffer_size(); }

    // Read from the wrapped UNet rather than hard-coded.
    int64_t get_adm_in_channels() { return unet.unet.adm_in_channels; }

    /**
     * Forwards to UNetModelRunner::compute.
     *
     * `guidance` and `skip_layers` are accepted for interface compatibility
     * but are not passed to the runner.
     */
    void compute(int n_threads,
                 struct ggml_tensor* x,
                 struct ggml_tensor* timesteps,
                 struct ggml_tensor* context,
                 struct ggml_tensor* c_concat,
                 struct ggml_tensor* y,
                 struct ggml_tensor* guidance,
                 int num_video_frames                      = -1,
                 std::vector<struct ggml_tensor*> controls = {},
                 float control_strength                    = 0.f,
                 struct ggml_tensor** output               = NULL,
                 struct ggml_context* output_ctx           = NULL,
                 std::vector<int> skip_layers              = std::vector<int>()) {
        // SLG doesn't work with UNet models, so the list is deliberately dropped.
        (void)skip_layers;
        unet.compute(n_threads, x, timesteps, context, c_concat, y,
                     num_video_frames, controls, control_strength,
                     output, output_ctx);
    }
};
/**
 * DiffusionModel adapter around an MMDiTRunner.
 *
 * Every interface call is delegated to the owned `mmdit` runner. Parameter
 * tensors are collected under the "model.diffusion_model" prefix.
 */
struct MMDiTModel : public DiffusionModel {
    MMDiTRunner mmdit;

    MMDiTModel(ggml_backend_t backend,
               std::map<std::string, enum ggml_type>& tensor_types)
        : mmdit(backend, tensor_types, "model.diffusion_model") {
    }

    // --- buffer management: straight pass-through to the runner ---
    void alloc_params_buffer() { mmdit.alloc_params_buffer(); }
    void free_params_buffer() { mmdit.free_params_buffer(); }
    void free_compute_buffer() { mmdit.free_compute_buffer(); }

    // Collects the runner's parameter tensors into `tensors`.
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
        mmdit.get_param_tensors(tensors, "model.diffusion_model");
    }

    size_t get_params_buffer_size() { return mmdit.get_params_buffer_size(); }

    // Fixed value for this model type (768 + 1280 = 2048).
    int64_t get_adm_in_channels() { return 768 + 1280; }

    /**
     * Forwards to MMDiTRunner::compute.
     *
     * Only n_threads, x, timesteps, context, y, output, output_ctx and
     * skip_layers reach the runner. c_concat, guidance, num_video_frames,
     * controls and control_strength are accepted for interface compatibility
     * and not used.
     */
    void compute(int n_threads,
                 struct ggml_tensor* x,
                 struct ggml_tensor* timesteps,
                 struct ggml_tensor* context,
                 struct ggml_tensor* c_concat,
                 struct ggml_tensor* y,
                 struct ggml_tensor* guidance,
                 int num_video_frames                      = -1,
                 std::vector<struct ggml_tensor*> controls = {},
                 float control_strength                    = 0.f,
                 struct ggml_tensor** output               = NULL,
                 struct ggml_context* output_ctx           = NULL,
                 std::vector<int> skip_layers              = std::vector<int>()) {
        mmdit.compute(n_threads, x, timesteps, context, y,
                      output, output_ctx, skip_layers);
    }
};
/**
 * DiffusionModel adapter around a Flux::FluxRunner.
 *
 * Every interface call is delegated to the owned `flux` runner. Parameter
 * tensors are collected under the "model.diffusion_model" prefix.
 */
struct FluxModel : public DiffusionModel {
    Flux::FluxRunner flux;

    FluxModel(ggml_backend_t backend,
              std::map<std::string, enum ggml_type>& tensor_types,
              SDVersion version = VERSION_FLUX,
              bool flash_attn   = false)
        : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
    }

    // --- buffer management: straight pass-through to the runner ---
    void alloc_params_buffer() { flux.alloc_params_buffer(); }
    void free_params_buffer() { flux.free_params_buffer(); }
    void free_compute_buffer() { flux.free_compute_buffer(); }

    // Collects the runner's parameter tensors into `tensors`.
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
        flux.get_param_tensors(tensors, "model.diffusion_model");
    }

    size_t get_params_buffer_size() { return flux.get_params_buffer_size(); }

    // Fixed value for this model type.
    int64_t get_adm_in_channels() { return 768; }

    /**
     * Forwards to Flux::FluxRunner::compute.
     *
     * num_video_frames, controls and control_strength are accepted for
     * interface compatibility and are not passed to the runner.
     */
    void compute(int n_threads,
                 struct ggml_tensor* x,
                 struct ggml_tensor* timesteps,
                 struct ggml_tensor* context,
                 struct ggml_tensor* c_concat,
                 struct ggml_tensor* y,
                 struct ggml_tensor* guidance,
                 int num_video_frames                      = -1,
                 std::vector<struct ggml_tensor*> controls = {},
                 float control_strength                    = 0.f,
                 struct ggml_tensor** output               = NULL,
                 struct ggml_context* output_ctx           = NULL,
                 std::vector<int> skip_layers              = std::vector<int>()) {
        flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance,
                     output, output_ctx, skip_layers);
    }
};
#endif

21
docs/anima.md Normal file
View File

@ -0,0 +1,21 @@
# How to Use
## Download weights
- Download Anima
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/Bedovyy/Anima-GGUF/tree/main
- gguf Anima2: https://huggingface.co/JusteLeo/Anima2-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/vae
- Download Qwen3-0.6B-Base
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/mradermacher/Qwen3-0.6B-Base-GGUF/tree/main
## Examples
```sh
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\anima-preview.safetensors --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_06b_base.safetensors -p "a lovely cat holding a sign says 'anima.cpp'" --cfg-scale 6.0 --sampling-method euler -v --offload-to-cpu --diffusion-fa
```
<img alt="anima image example" src="../assets/anima/example.png" />

173
docs/build.md Normal file
View File

@ -0,0 +1,173 @@
# Build from scratch
## Get the Code
```
git clone --recursive https://github.com/leejet/stable-diffusion.cpp
cd stable-diffusion.cpp
```
- If you have already cloned the repository, you can use the following commands to update the repository to the latest code.
```
cd stable-diffusion.cpp
git pull origin master
git submodule init
git submodule update
```
## Build (CPU only)
If you don't have a GPU or CUDA installed, you can build a CPU-only version.
```shell
mkdir build && cd build
cmake ..
cmake --build . --config Release
```
## Build with OpenBLAS
```shell
mkdir build && cd build
cmake .. -DGGML_OPENBLAS=ON
cmake --build . --config Release
```
## Build with CUDA
This provides GPU acceleration using NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
```shell
mkdir build && cd build
cmake .. -DSD_CUDA=ON
cmake --build . --config Release
```
## Build with HipBLAS
This provides GPU acceleration using AMD GPU. Make sure to have the ROCm toolkit installed.
To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture (replace the first command). This is also necessary if your GPU is not officially supported by ROCm; for example, you have to set `$GFX_NAME` manually to `gfx1030` for consumer RDNA2 cards.
Windows users: refer to [docs/hipBLAS_on_Windows.md](./hipBLAS_on_Windows.md) for a comprehensive guide.
```shell
mkdir build && cd build
if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
cmake --build . --config Release
```
## Build with MUSA
This provides GPU acceleration using Moore Threads GPU. Make sure to have the MUSA toolkit installed.
```shell
mkdir build && cd build
cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
cmake --build . --config Release
```
## Build with Metal
Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
```shell
mkdir build && cd build
cmake .. -DSD_METAL=ON
cmake --build . --config Release
```
## Build with Vulkan
Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
```shell
mkdir build && cd build
cmake .. -DSD_VULKAN=ON
cmake --build . --config Release
```
## Build with OpenCL (for Adreno GPU)
Currently, it supports only Adreno GPUs and is primarily optimized for the Q4_0 type.
To build for Windows ARM, please refer to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64)
Building for Android:
Android NDK:
Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
Setup OpenCL Dependencies for NDK:
You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
* OpenCL Headers:
```bash
# In a temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
cd ..
```
* OpenCL ICD Loader:
```shell
# In the same temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
mkdir build_ndk && cd build_ndk
# Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=24 \
-DANDROID_STL=c++_shared
ninja
# Replace <YOUR_NDK_PATH>
# e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
cd ../..
```
Build `stable-diffusion.cpp` for Android with OpenCL:
```shell
mkdir build-android && cd build-android
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
cmake .. -G Ninja \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DGGML_OPENMP=OFF \
-DSD_OPENCL=ON
ninja
```
*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
## Build with SYCL
Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before you start. For more details and steps, refer to [llama.cpp SYCL backend](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/SYCL.md#linux).
```shell
# Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
cmake --build . --config Release
```

141
docs/caching.md Normal file
View File

@ -0,0 +1,141 @@
## Caching
Caching methods accelerate diffusion inference by reusing intermediate computations when changes between steps are small.
### Cache Modes
| Mode | Target | Description |
|------|--------|-------------|
| `ucache` | UNET models | Condition-level caching with error tracking |
| `easycache` | DiT models | Condition-level cache |
| `dbcache` | DiT models | Block-level L1 residual threshold |
| `taylorseer` | DiT models | Taylor series approximation |
| `cache-dit` | DiT models | Combined DBCache + TaylorSeer |
| `spectrum` | UNET models | Chebyshev + Taylor output forecasting |
### UCache (UNET Models)
UCache caches the residual difference (output - input) and reuses it when input changes are below threshold.
```bash
sd-cli -m model.safetensors -p "a cat" --cache-mode ucache --cache-option "threshold=1.5"
```
#### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `threshold` | Error threshold for reuse decision | 1.0 |
| `start` | Start caching at this percent of steps | 0.15 |
| `end` | Stop caching at this percent of steps | 0.95 |
| `decay` | Error decay rate (0-1) | 1.0 |
| `relative` | Scale threshold by output norm (0/1) | 1 |
| `reset` | Reset error after computing (0/1) | 1 |
#### Reset Parameter
The `reset` parameter controls error accumulation behavior:
- `reset=1` (default): Resets accumulated error after each computed step. More aggressive caching, works well with most samplers.
- `reset=0`: Keeps error accumulated. More conservative, recommended for `euler_a` sampler.
### EasyCache (DiT Models)
Condition-level caching for DiT models. Caches and reuses outputs when input changes are below threshold.
```bash
--cache-mode easycache --cache-option "threshold=0.3"
```
#### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `threshold` | Input change threshold for reuse | 0.2 |
| `start` | Start caching at this percent of steps | 0.15 |
| `end` | Stop caching at this percent of steps | 0.95 |
### Cache-DIT (DiT Models)
For DiT models like FLUX and QWEN, use block-level caching modes.
#### DBCache
Caches blocks based on L1 residual difference threshold:
```bash
--cache-mode dbcache --cache-option "threshold=0.25,warmup=4"
```
#### TaylorSeer
Uses Taylor series approximation to predict block outputs:
```bash
--cache-mode taylorseer
```
#### Cache-DIT (Combined)
Combines DBCache and TaylorSeer:
```bash
--cache-mode cache-dit
```
#### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `Fn` | Front blocks to always compute | 8 |
| `Bn` | Back blocks to always compute | 0 |
| `threshold` | L1 residual difference threshold | 0.08 |
| `warmup` | Steps before caching starts | 8 |
#### SCM Options
Steps Computation Mask controls which steps can be cached:
```bash
--scm-mask "1,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1"
```
Mask values: `1` = compute, `0` = can cache.
| Policy | Description |
|--------|-------------|
| `dynamic` | Check threshold before caching |
| `static` | Always cache on cacheable steps |
```bash
--scm-policy dynamic
```
### Spectrum (UNET Models)
Spectrum uses Chebyshev polynomial fitting blended with Taylor extrapolation to predict denoised outputs, skipping entire UNet forward passes. Based on the paper [Spectrum: Adaptive Spectral Feature Forecasting for Efficient Diffusion Sampling](https://github.com/tingyu215/Spectrum).
```bash
sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum
```
#### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `w` | Chebyshev vs Taylor blend weight (0=Taylor, 1=Chebyshev) | 0.40 |
| `m` | Chebyshev polynomial degree | 3 |
| `lam` | Ridge regression regularization | 1.0 |
| `window` | Initial window size (compute every N steps) | 2 |
| `flex` | Window growth per computed step after warmup | 0.50 |
| `warmup` | Steps to always compute before caching starts | 4 |
| `stop` | Stop caching at this fraction of total steps | 0.9 |
### Performance Tips
- Start with default thresholds and adjust based on output quality
- Lower threshold = better quality, less speedup
- Higher threshold = more speedup, potential quality loss
- More steps generally means more caching opportunities

33
docs/chroma.md Normal file
View File

@ -0,0 +1,33 @@
# How to Use
You can run Chroma using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
## Download weights
- Download Chroma
- If you don't want to do the conversion yourself, download the preconverted gguf model from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF)
- Otherwise, download chroma's safetensors from [lodestones/Chroma](https://huggingface.co/lodestones/Chroma)
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Convert Chroma weights
You can download the preconverted gguf weights from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF), this way you don't have to do the conversion yourself.
```
.\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\chroma-unlocked-v40.safetensors -o ..\models\chroma-unlocked-v40-q8_0.gguf -v --type q8_0
```
## Run
### Example
For example:
```
.\bin\Release\sd-cli.exe --diffusion-model ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask --clip-on-cpu
```
![](../assets/flux/chroma_v40.png)

21
docs/chroma_radiance.md Normal file
View File

@ -0,0 +1,21 @@
# How to Use
## Download weights
- Download Chroma1-Radiance
- safetensors: https://huggingface.co/lodestones/Chroma1-Radiance/tree/main
- gguf: https://huggingface.co/silveroxides/Chroma1-Radiance-GGUF/tree/main
- Download t5xxl
- safetensors: https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Chroma1-Radiance-v0.4-Q8_0.gguf --t5xxl ..\..\ComfyUI\models\clip\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma radiance cpp'" --cfg-scale 4.0 --sampling-method euler -v
```
<img alt="Chroma1-Radiance" src="../assets/flux/chroma1-radiance.png" />

137
docs/distilled_sd.md Normal file
View File

@ -0,0 +1,137 @@
# Running distilled models: SSD1B, Vega and SDx.x with tiny U-Nets
## Preface
These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B and Vega U-Nets contain only one middle block and fewer attention layers in their up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
Similarly, SD1.x- and SD2.x-style models with a tiny U-Net consist of only 6 U-Net blocks, leading to very small files and time savings of up to 50%. For more information, see the paper: https://arxiv.org/pdf/2305.15798.pdf.
## SSD1B
Note that not all of these models follow the standard parameter naming conventions. However, several useful SSD-1B models are available online, such as:
* https://huggingface.co/segmind/SSD-1B/resolve/main/SSD-1B-A1111.safetensors
* https://huggingface.co/hassenhamdi/SSD-1B-fp8_e4m3fn/resolve/main/SSD-1B_fp8_e4m3fn.safetensors
Useful LoRAs are also available:
* https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors
* https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors
## Vega
Segmind's Vega model is available online here:
* https://huggingface.co/segmind/Segmind-Vega/resolve/main/segmind-vega.safetensors
VegaRT is an example of an LCM-LoRA:
* https://huggingface.co/segmind/Segmind-VegaRT/resolve/main/pytorch_lora_weights.safetensors
Both files can be used out-of-the-box, unlike the models described in the next sections.
## SD1.x, SD2.x with tiny U-Nets
These models require conversion before use. You will need a Python script provided by the diffusers team, available on GitHub:
* https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py
### SD2.x
NotaAI provides the following model online:
* https://huggingface.co/nota-ai/bk-sdm-v2-tiny
Creating a .safetensors file involves two steps. First, run this short Python script to download the model from Hugging Face:
```python
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("nota-ai/bk-sdm-v2-tiny",cache_dir="./")
```
Second, create the .safetensors file by running:
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path models--nota-ai--bk-sdm-v2-tiny/snapshots/68277af553777858cd47e133f92e4db47321bc74 \
--checkpoint_path bk-sdm-v2-tiny.safetensors --half --use_safetensors
```
This will generate the file **bk-sdm-v2-tiny.safetensors**, which is now ready for use with sd.cpp.
### SD1.x
Several Tiny SD 1.x models are available online, such as:
* https://huggingface.co/segmind/tiny-sd
* https://huggingface.co/segmind/portrait-finetuned
* https://huggingface.co/nota-ai/bk-sdm-tiny
These models also require conversion, partly because some tensors are stored in a non-contiguous manner. To create a usable checkpoint file, follow these simple steps:
Download and prepare the model using Python:
##### Download the model using Python on your computer, for example this way:
```python
import torch
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("segmind/tiny-sd")
unet=pipe.unet
for param in unet.parameters():
    param.data = param.data.contiguous() # <- important here
pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
```
##### Run the conversion script:
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path ./segmindtiny-sd \
--checkpoint_path ./segmind_tiny-sd.ckpt --half
```
The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
##### Another available .ckpt file:
* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
To use this file, you must first adjust its non-contiguous tensors:
```python
import torch
ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
for key, value in ckpt['state_dict'].items():
    if isinstance(value, torch.Tensor):
        ckpt['state_dict'][key] = value.contiguous()
torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
```
### SDXS-512
Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details, read the paper: https://arxiv.org/pdf/2403.16627. Once again, the authors removed more blocks from the U-Net and, unlike other SD1 models, they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE.
##### 1. Download the diffusers model from Hugging Face using Python:
```python
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
pipe.save_pretrained(save_directory="sdxs")
```
##### 2. Create a safetensors file
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
```
##### 3. Run the model as follows:
```bash
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
--cfg-scale 1 --steps 1
```
Both options, `--cfg-scale 1` and `--steps 1`, are mandatory here.

View File

@ -1,15 +1,39 @@
## Docker
# Docker
### Building using Docker
## Run CLI
```shell
docker run --rm -v /path/to/models:/models -v /path/to/output/:/output ghcr.io/leejet/stable-diffusion.cpp:master [args...]
# For example
# docker run --rm -v ./models:/models -v ./build:/output ghcr.io/leejet/stable-diffusion.cpp:master -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
```
## Run server
```shell
docker run --rm --init -v /path/to/models:/models -v /path/to/output/:/output -p "1234:1234" --entrypoint "/sd-server" ghcr.io/leejet/stable-diffusion.cpp:master [args...]
# For example
# docker run --rm --init -v ./models:/models -v ./build:/output -p "1234:1234" --entrypoint "/sd-server" ghcr.io/leejet/stable-diffusion.cpp:master -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
```
## Building using Docker
```shell
docker build -t sd .
```
### Run
## Building variants using Docker
Vulkan:
```shell
docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
docker build -f Dockerfile.vulkan -t sd .
```
## Run locally built image's CLI
```shell
docker run --rm -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
# For example
# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
```
# docker run --rm -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
```

View File

@ -1,9 +1,9 @@
## Using ESRGAN to upscale results
You can use ESRGAN to upscale the generated images. At the moment, only the [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth) model is supported. Support for more models of this architecture will be added soon.
You can use ESRGAN—such as the model [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth)—to upscale the generated images and improve their overall resolution and clarity.
- Specify the model path using the `--upscale-model PATH` parameter. Example:
```bash
sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
```

View File

@ -15,9 +15,9 @@ You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB
You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), this way you don't have to do the conversion yourself.
Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully developed. Therefore, we need to convert flux to gguf format here, which also saves VRAM. For example:
For example:
```
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
.\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
```
## Run
@ -28,7 +28,7 @@ Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully d
For example:
```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
.\bin\Release\sd-cli.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
```
Using formats of different precisions will yield results of varying quality.
@ -44,7 +44,7 @@ Using formats of different precisions will yield results of varying quality.
```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4
.\bin\Release\sd-cli.exe --diffusion-model ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4 --clip-on-cpu
```
| q8_0 |
@ -60,7 +60,7 @@ Since many flux LoRA training libraries have used various LoRA naming formats, i
- LoRA model from https://huggingface.co/XLabs-AI/flux-lora-collection/tree/main (using comfy converted version!!!)
```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ...\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models
.\bin\Release\sd-cli.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models --clip-on-cpu
```
![output](../assets/flux/flux1-dev-q8_0%20with%20lora.png)

92
docs/flux2.md Normal file
View File

@ -0,0 +1,92 @@
# How to Use
## Flux.2-dev
### Download weights
- Download FLUX.2-dev
- gguf: https://huggingface.co/city96/FLUX.2-dev-gguf/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
- Download Mistral-Small-3.2-24B-Instruct-2506-GGUF
- gguf: https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/tree/main
### Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux2-dev-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Mistral-Small-3.2-24B-Instruct-2506-Q4_K_M.gguf -r .\kontext_input.png -p "change 'flux.cpp' to 'flux2-dev.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu
```
<img alt="flux2 example" src="../assets/flux2/example.png" />
## Flux.2 klein 4B / Flux.2 klein base 4B
### Download weights
- Download FLUX.2-klein-4B
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-4B
- gguf: https://huggingface.co/leejet/FLUX.2-klein-4B-GGUF/tree/main
- Download FLUX.2-klein-base-4B
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-base-4B
- gguf: https://huggingface.co/leejet/FLUX.2-klein-base-4B-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
- Download Qwen3 4b
- safetensors: https://huggingface.co/Comfy-Org/flux2-klein-4B/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/unsloth/Qwen3-4B-GGUF/tree/main
### Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 4 -v --offload-to-cpu --diffusion-fa
```
<img alt="flux2-klein-4b" src="../assets/flux2/flux2-klein-4b.png" />
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -r .\kontext_input.png -p "change 'flux.cpp' to 'klein.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu --steps 4
```
<img alt="flux2-klein-4b-edit" src="../assets/flux2/flux2-klein-4b-edit.png" />
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-base-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "a lovely cat" --cfg-scale 4.0 --steps 20 -v --offload-to-cpu --diffusion-fa
```
<img alt="flux2-klein-base-4b" src="../assets/flux2/flux2-klein-base-4b.png" />
## Flux.2 klein 9B / Flux.2 klein base 9B
### Download weights
- Download FLUX.2-klein-9B
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-9B
- gguf: https://huggingface.co/leejet/FLUX.2-klein-9B-GGUF/tree/main
- Download FLUX.2-klein-base-9B
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-base-9B
- gguf: https://huggingface.co/leejet/FLUX.2-klein-base-9B-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
- Download Qwen3 8B
- safetensors: https://huggingface.co/Comfy-Org/flux2-klein-9B/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/unsloth/Qwen3-8B-GGUF/tree/main
### Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 4 -v --offload-to-cpu --diffusion-fa
```
<img alt="flux2-klein-9b" src="../assets/flux2/flux2-klein-9b.png" />
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -r .\kontext_input.png -p "change 'flux.cpp' to 'klein.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu --steps 4
```
<img alt="flux2-klein-9b-edit" src="../assets/flux2/flux2-klein-9b-edit.png" />
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-base-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -p "a lovely cat" --cfg-scale 4.0 --steps 20 -v --offload-to-cpu --diffusion-fa
```
<img alt="flux2-klein-base-9b" src="../assets/flux2/flux2-klein-base-9b.png" />

View File

@ -82,4 +82,4 @@ cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_H
cmake --build . --config Release
```
If everything went OK, `build\bin\sd.exe` file should appear.
If everything went OK, `build\bin\sd-cli.exe` file should appear.

39
docs/kontext.md Normal file
View File

@ -0,0 +1,39 @@
# How to Use
You can run Kontext using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
## Download weights
- Download Kontext
- If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF)
- Otherwise, download FLUX.1-Kontext-dev from https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev/blob/main/flux1-kontext-dev.safetensors
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Convert Kontext weights
You can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF), this way you don't have to do the conversion yourself.
```
.\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-kontext-dev.safetensors -o ..\models\flux1-kontext-dev-q8_0.gguf -v --type q8_0
```
## Run
- `--cfg-scale` is recommended to be set to 1.
### Example
For example:
```
.\bin\Release\sd-cli.exe -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
```
| ref_image | prompt | output |
| ---- | ---- |---- |
| ![](../assets/flux/flux1-dev-q8_0.png) | change 'flux.cpp' to 'kontext.cpp' |![](../assets/flux/kontext1_dev_output.png) |

View File

@ -7,7 +7,7 @@
Here's a simple example:
```
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
```
| without LCM-LoRA (--cfg-scale 7) | with LCM-LoRA (--cfg-scale 1) |

View File

@ -7,7 +7,20 @@
Here's a simple example:
```
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
```
`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
# Lora Apply Mode
There are two ways to apply LoRA: **immediately** and **at_runtime**. You can specify it using the `--lora-apply-mode` parameter.
By default, the mode is selected automatically:
* If the model weights contain any quantized parameters, the **at_runtime** mode is used;
* Otherwise, the **immediately** mode is used.
The **immediately** mode may have precision and compatibility issues with quantized parameters, but it usually offers faster inference speed and, in some cases, lower memory usage.
In contrast, the **at_runtime** mode provides better compatibility and higher precision, but inference may be slower and memory usage may be higher in some cases.

19
docs/ovis_image.md Normal file
View File

@ -0,0 +1,19 @@
# How to Use
## Download weights
- Download Ovis-Image-7B
- safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/leejet/Ovis-Image-7B-GGUF
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
- Download Ovis 2.5
- safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/text_encoders
## Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ovis_image-Q4_0.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\ovis_2.5.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa
```
<img alt="ovis image example" src="../assets/ovis_image/example.png" />

26
docs/performance.md Normal file
View File

@ -0,0 +1,26 @@
## Use Flash Attention to save memory and improve speed.
Enabling flash attention for the diffusion model reduces memory usage by varying amounts of MB.
e.g.:
- flux 768x768 ~600mb
- SD2 768x768 ~1400mb
For most backends, it slows things down, but for cuda it generally speeds it up too.
At the moment, it is only supported for some models and some backends (like cpu, cuda/rocm, metal).
Run by adding `--diffusion-fa` to the arguments and watch for:
```
[INFO ] stable-diffusion.cpp:312 - Using flash attention in the diffusion model
```
and check that the compute buffer shrinks in the debug log:
```
[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
```
## Offload weights to the CPU to save VRAM without reducing generation speed.
Using `--offload-to-cpu` allows you to offload weights to the CPU, saving VRAM without reducing generation speed.
## Use quantization to reduce memory usage.
[quantization](./quantization_and_gguf.md)

View File

@ -6,16 +6,15 @@ You can use [PhotoMaker](https://github.com/TencentARC/PhotoMaker) to personaliz
Download PhotoMaker model file (in safetensor format) [here](https://huggingface.co/bssrdf/PhotoMaker). The official release of the model file (in .bin format) does not work with ```stablediffusion.cpp```.
- Specify the PhotoMaker model path using the `--stacked-id-embd-dir PATH` parameter.
- Specify the input images path using the `--input-id-images-dir PATH` parameter.
- input images **must** have the same width and height for preprocessing (to be improved)
- Specify the PhotoMaker model path using the `--photo-maker PATH` parameter.
- Specify the input images path using the `--pm-id-images-dir PATH` parameter.
In prompt, make sure you have a class word followed by the trigger word ```"img"``` (hard-coded for now). The class word could be one of ```"man, woman, girl, boy"```. If input ID images contain asian faces, add ```Asian``` before the class
word.
Another PhotoMaker specific parameter:
- ```--style-ratio (0-100)%```: default is 20 and 10-20 typically gets good results. Lower ratio means more faithfully following input ID (not necessarily better quality).
- ```--pm-style-strength (0-100)%```: default is 20 and 10-20 typically gets good results. Lower ratio means more faithfully following input ID (not necessarily better quality).
Other parameters recommended for running Photomaker:
@ -28,7 +27,7 @@ If on low memory GPUs (<= 8GB), recommend running with ```--vae-on-cpu``` option
Example:
```bash
bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v1.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
bin/sd-cli -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --photo-maker ../models/photomaker-v1.safetensors --pm-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --pm-style-strength 10 --vae-on-cpu --steps 50
```
## PhotoMaker Version 2
@ -41,7 +40,7 @@ Running PMV2 is now a two-step process:
```
python face_detect.py input_image_dir
```
An ```id_embeds.safetensors``` file will be generated in ```input_images_dir```
An ```id_embeds.bin``` file will be generated in ```input_images_dir```
**Note: this step is only needed to run once; the same ```id_embeds``` can be reused**
@ -49,6 +48,6 @@ An ```id_embeds.safetensors``` file will be generated in ```input_images_dir```
You can download ```photomaker-v2.safetensors``` from [here](https://huggingface.co/bssrdf/PhotoMakerV2)
- All the command line parameters from Version 1 remain the same for Version 2
- All the command line parameters from Version 1 remain the same for Version 2 plus one extra pointing to a valid ```id_embeds``` file: --pm-id-embed-path [path_to__id_embeds.bin]

View File

@ -23,5 +23,5 @@ You can also convert weights in the formats `ckpt/safetensors/diffusers` to gguf
For example:
```sh
./bin/sd -M convert -m ../models/v1-5-pruned-emaonly.safetensors -o ../models/v1-5-pruned-emaonly.q8_0.gguf -v --type q8_0
./bin/sd-cli -M convert -m ../models/v1-5-pruned-emaonly.safetensors -o ../models/v1-5-pruned-emaonly.q8_0.gguf -v --type q8_0
```

23
docs/qwen_image.md Normal file
View File

@ -0,0 +1,23 @@
# How to Use
## Download weights
- Download Qwen Image
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Qwen-Image-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/vae
- Download qwen_2.5_vl 7b
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/mradermacher/Qwen2.5-VL-7B-Instruct-GGUF/tree/main
## Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线 探索视觉生成基础模型的极限开创理解与生成一体化的未来。二、Qwen-Image的模型特色1、复杂文字渲染。支持中英渲染、自动布局 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
```
<img alt="qwen example" src="../assets/qwen/example.png" />

48
docs/qwen_image_edit.md Normal file
View File

@ -0,0 +1,48 @@
# How to Use
## Download weights
- Download Qwen Image
- Qwen Image Edit
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Qwen-Image-Edit-GGUF/tree/main
- Qwen Image Edit 2509
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Qwen-Image-Edit-2509-GGUF/tree/main
- Qwen Image Edit 2511
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/vae
- Download qwen_2.5_vl 7b
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/mradermacher/Qwen2.5-VL-7B-Instruct-GGUF/tree/main
## Examples
### Qwen Image Edit
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
```
<img alt="qwen_image_edit" src="../assets/qwen/qwen_image_edit.png" />
### Qwen Image Edit 2509
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
```
<img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />
### Qwen Image Edit 2511
To use the new Qwen Image Edit 2511 model, the `--qwen-image-zero-cond-t` flag must be enabled; otherwise, image editing quality will degrade significantly.
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-edit-2511-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --qwen-image-zero-cond-t
```
<img alt="qwen_image_edit_2511" src="../assets/qwen/qwen_image_edit_2511.png" />

37
docs/sd.md Normal file
View File

@ -0,0 +1,37 @@
## Download weights
- Download the original weights (.ckpt or .safetensors). For example:
- Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
- Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
- Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
### txt2img example
```sh
./bin/sd-cli -m ../models/sd-v1-4.ckpt -p "a lovely cat"
# ./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
# ./bin/sd-cli -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
# ./bin/sd-cli -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
# ./bin/sd-cli --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
# ./bin/sd-cli -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
```
Using formats of different precisions will yield results of varying quality.
| f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
| ---- |---- |---- |---- |---- |---- |---- |
| ![](../assets/f32.png) |![](../assets/f16.png) |![](../assets/q8_0.png) |![](../assets/q5_0.png) |![](../assets/q5_1.png) |![](../assets/q4_0.png) |![](../assets/q4_1.png) |
### img2img example
- `./output.png` is the image generated from the above txt2img pipeline
```
./bin/sd-cli -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
```
<p align="center">
<img src="../assets/img2img_output.png" width="256x">
</p>

View File

@ -14,7 +14,7 @@
For example:
```
.\bin\Release\sd.exe -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
.\bin\Release\sd-cli.exe -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
```
![](../assets/sd3.5_large.png)

View File

@ -7,11 +7,33 @@ You can use TAESD to accelerate the decoding of latent images by following these
Or curl
```bash
curl -L -O https://huggingface.co/madebyollin/taesd/blob/main/diffusion_pytorch_model.safetensors
curl -L -O https://huggingface.co/madebyollin/taesd/resolve/main/diffusion_pytorch_model.safetensors
```
- Specify the model path using the `--taesd PATH` parameter. example:
```bash
sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
```
sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
```
### Qwen-Image and wan (TAEHV)
sd.cpp also supports [TAEHV](https://github.com/madebyollin/taehv) (#937), which can be used for Qwen-Image and wan.
- For **Qwen-Image and wan2.1 and wan2.2-A14B**, download the wan2.1 tae [safetensors weights](https://github.com/madebyollin/taehv/blob/main/safetensors/taew2_1.safetensors)
Or curl
```bash
curl -L -O https://github.com/madebyollin/taehv/raw/refs/heads/main/safetensors/taew2_1.safetensors
```
- For **wan2.2-TI2V-5B**, use the wan2.2 tae [safetensors weights](https://github.com/madebyollin/taehv/blob/main/safetensors/taew2_2.safetensors)
Or curl
```bash
curl -L -O https://github.com/madebyollin/taehv/raw/refs/heads/main/safetensors/taew2_2.safetensors
```
Then simply replace `--vae xxx.safetensors` with `--tae xxx.safetensors` in the commands. If you still run out of VRAM, add `--vae-conv-direct` to your command, though it might be slower.

207
docs/wan.md Normal file
View File

@ -0,0 +1,207 @@
# How to Use
## Download weights
- Download Wan
- Wan2.1
- Wan2.1 T2V 1.3B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- Wan2.1 T2V 14B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/city96/Wan2.1-T2V-14B-gguf/tree/main
- Wan2.1 I2V 14B 480P
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/city96/Wan2.1-I2V-14B-480P-gguf/tree/main
- Wan2.1 I2V 14B 720P
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/city96/Wan2.1-I2V-14B-720P-gguf/tree/main
- Wan2.1 FLF2V 14B 720P
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/city96/Wan2.1-FLF2V-14B-720P-gguf/tree/main
- Wan2.1 VACE 1.3B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/calcuis/wan-1.3b-gguf/tree/main
- Wan2.1 VACE 14B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Wan2.1_14B_VACE-GGUF/tree/main
- Wan2.2
- Wan2.2 TI2V 5B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Wan2.2-TI2V-5B-GGUF/tree/main
- Wan2.2 T2V A14B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Wan2.2-T2V-A14B-GGUF/tree/main
- Wan2.2 I2V A14B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Wan2.2-I2V-A14B-GGUF/tree/main
- Download vae
- wan_2.1_vae (for all Wan models except Wan2.2 TI2V 5B)
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors
- wan_2.2_vae (for Wan2.2 TI2V 5B only)
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/blob/main/split_files/vae/wan2.2_vae.safetensors
> The Wan VAE requires a lot of VRAM! If you do not have enough VRAM, please try tae instead, though the results may be poorer. For tae usage, please refer to [taesd](taesd.md)
- Download umt5_xxl
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/text_encoders/umt5_xxl_fp16.safetensors
- gguf: https://huggingface.co/city96/umt5-xxl-encoder-gguf/tree/main
- Download clip_vision_h (for Wan2.1 I2V/FLF2V only)
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/clip_vision/clip_vision_h.safetensors
## Examples
### Wan2.1 T2V 1.3B
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1_t2v_1.3B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --flow-shift 3.0
```
<video src=../assets/wan/Wan2.1_1.3B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.1 T2V 14B
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-t2v-14b-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
```
<video src=../assets/wan/Wan2.1_14B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.1 I2V 14B
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-i2v-14b-480p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
```
<video src=../assets/wan/Wan2.1_14B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.2 T2V A14B
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_14B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.2 I2V A14B
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_14B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.2 T2V A14B T2I
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --flow-shift 3.0
```
<img width="832" height="480" alt="Wan2 2_14B_t2i" src="../assets/wan/Wan2.2_14B_t2i.png" />
### Wan2.2 T2V A14B with LoRA
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat<lora:wan2.2_t2v_lightx2v_4steps_lora_v1.1_low_noise:1><lora:|high_noise|wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise:1>" --cfg-scale 3.5 --sampling-method euler --steps 4 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 4 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --lora-model-dir ..\..\ComfyUI\models\loras --video-frames 33 --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_14B_t2v_lora.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.2 TI2V 5B
#### T2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_5B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
#### I2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_5B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.1 FLF2V 14B
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-flf2v-14b-720p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
```
<video src=../assets/wan/Wan2.1_14B_flf2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.2 FLF2V 14B
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -p "glass flower blossom" -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_14B_flf2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.1 VACE 1.3B
#### T2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 1 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_1.3B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
#### R2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_1.3B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
#### V2V
```
mkdir post+depth
ffmpeg -i ..\..\ComfyUI\input\post+depth.mp4 -qscale:v 1 -vf fps=8 post+depth\frame_%04d.jpg
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close - up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white. transparent feel and a dreamyl atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_1.3B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.1 VACE 14B
#### T2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_14B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
#### R2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_14B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
#### V2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close - up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white. transparent feel and a dreamyl atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_14B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>

41
docs/z_image.md Normal file
View File

@ -0,0 +1,41 @@
# How to Use
You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or even less.
## Download weights
- Download Z-Image-Turbo
- safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/leejet/Z-Image-Turbo-GGUF/tree/main
- Download Z-Image
- safetensors: https://huggingface.co/Comfy-Org/z_image/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/unsloth/Z-Image-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
- Download Qwen3 4b
- safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/tree/main
## Examples
### Z-Image-Turbo
```
.\bin\Release\sd-cli.exe --diffusion-model z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
```
<img width="256" alt="z-image example" src="../assets/z_image/q3_K.png" />
### Z-Image-Base
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\z_image_bf16.safetensors --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
```
<img width="256" alt="z-image example" src="../assets/z_image/base_bf16.png" />
## Comparison of Different Quantization Types
| bf16 | q8_0 | q6_K | q5_0 | q4_K | q4_0 | q3_K | q2_K |
|---|---|---|---|---|---|---|---|
| <img width="256" alt="bf16" src="../assets/z_image/bf16.png" /> | <img width="256" alt="q8_0" src="../assets/z_image/q8_0.png" /> | <img width="256" alt="q6_K" src="../assets/z_image/q6_K.png" /> | <img width="256" alt="q5_0" src="../assets/z_image/q5_0.png" /> | <img width="256" alt="q4_K" src="../assets/z_image/q4_K.png" /> | <img width="256" alt="q4_0" src="../assets/z_image/q4_0.png" /> | <img width="256" alt="q3_K" src="../assets/z_image/q3_K.png" /> | <img width="256" alt="q2_K" src="../assets/z_image/q2_K.png" /> |

View File

@ -1,197 +0,0 @@
#ifndef __ESRGAN_HPP__
#define __ESRGAN_HPP__
#include "ggml_extend.hpp"
#include "model.h"
/*
=================================== ESRGAN ===================================
References:
https://github.com/xinntao/Real-ESRGAN/blob/master/inference_realesrgan.py
https://github.com/XPixelGroup/BasicSR/blob/v1.4.2/basicsr/archs/rrdbnet_arch.py
*/
class ResidualDenseBlock : public GGMLBlock {
protected:
int num_feat;
int num_grow_ch;
public:
ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32)
: num_feat(num_feat), num_grow_ch(num_grow_ch) {
blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
blocks["conv3"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
blocks["conv4"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
}
struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
return ggml_leaky_relu(ctx, x, 0.2f, true);
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [n, num_feat, h, w]
// return: [n, num_feat, h, w]
auto conv1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv1"]);
auto conv2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv2"]);
auto conv3 = std::dynamic_pointer_cast<Conv2d>(blocks["conv3"]);
auto conv4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv4"]);
auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);
auto x1 = lrelu(ctx, conv1->forward(ctx, x));
auto x_cat = ggml_concat(ctx, x, x1, 2);
auto x2 = lrelu(ctx, conv2->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x2, 2);
auto x3 = lrelu(ctx, conv3->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x3, 2);
auto x4 = lrelu(ctx, conv4->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x4, 2);
auto x5 = conv5->forward(ctx, x_cat);
x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);
return x5;
}
};
// Residual-in-Residual Dense Block: three ResidualDenseBlocks applied in
// sequence, with the final output scaled by 0.2 and added back to the input.
class RRDB : public GGMLBlock {
public:
    RRDB(int num_feat, int num_grow_ch = 32) {
        // registers rdb1, rdb2, rdb3
        for (int i = 1; i <= 3; i++) {
            blocks["rdb" + std::to_string(i)] =
                std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
        }
    }

    // x:      [n, num_feat, h, w]
    // return: [n, num_feat, h, w]
    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* hidden = x;
        for (int i = 1; i <= 3; i++) {
            auto rdb = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb" + std::to_string(i)]);
            hidden   = rdb->forward(ctx, hidden);
        }
        // outer residual connection: 0.2 * hidden + x
        return ggml_add(ctx, ggml_scale(ctx, hidden, 0.2f), x);
    }
};
class RRDBNet : public GGMLBlock {
protected:
int scale = 4; // default RealESRGAN_x4plus_anime_6B
int num_block = 6; // default RealESRGAN_x4plus_anime_6B
int num_in_ch = 3;
int num_out_ch = 3;
int num_feat = 64; // default RealESRGAN_x4plus_anime_6B
int num_grow_ch = 32; // default RealESRGAN_x4plus_anime_6B
public:
RRDBNet() {
blocks["conv_first"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
for (int i = 0; i < num_block; i++) {
std::string name = "body." + std::to_string(i);
blocks[name] = std::shared_ptr<GGMLBlock>(new RRDB(num_feat, num_grow_ch));
}
blocks["conv_body"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
// upsample
blocks["conv_up1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
blocks["conv_up2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
blocks["conv_hr"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
blocks["conv_last"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1}));
}
struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
return ggml_leaky_relu(ctx, x, 0.2f, true);
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [n, num_in_ch, h, w]
// return: [n, num_out_ch, h*4, w*4]
auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]);
auto conv_body = std::dynamic_pointer_cast<Conv2d>(blocks["conv_body"]);
auto conv_up1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
auto conv_up2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
auto conv_hr = std::dynamic_pointer_cast<Conv2d>(blocks["conv_hr"]);
auto conv_last = std::dynamic_pointer_cast<Conv2d>(blocks["conv_last"]);
auto feat = conv_first->forward(ctx, x);
auto body_feat = feat;
for (int i = 0; i < num_block; i++) {
std::string name = "body." + std::to_string(i);
auto block = std::dynamic_pointer_cast<RRDB>(blocks[name]);
body_feat = block->forward(ctx, body_feat);
}
body_feat = conv_body->forward(ctx, body_feat);
feat = ggml_add(ctx, feat, body_feat);
// upsample
feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2)));
feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2)));
auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
return out;
}
};
struct ESRGAN : public GGMLRunner {
RRDBNet rrdb_net;
int scale = 4;
int tile_size = 128; // avoid cuda OOM for 4gb VRAM
ESRGAN(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
: GGMLRunner(backend) {
rrdb_net.init(params_ctx, tensor_types, "");
}
std::string get_desc() {
return "esrgan";
}
bool load_from_file(const std::string& file_path) {
LOG_INFO("loading esrgan from '%s'", file_path.c_str());
alloc_params_buffer();
std::map<std::string, ggml_tensor*> esrgan_tensors;
rrdb_net.get_param_tensors(esrgan_tensors);
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
return false;
}
bool success = model_loader.load_tensors(esrgan_tensors, backend);
if (!success) {
LOG_ERROR("load esrgan tensors from model loader failed");
return false;
}
LOG_INFO("esrgan model loaded");
return success;
}
struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
x = to_backend(x);
struct ggml_tensor* out = rrdb_net.forward(compute_ctx, x);
ggml_build_forward_expand(gf, out);
return gf;
}
void compute(const int n_threads,
struct ggml_tensor* x,
ggml_tensor** output,
ggml_context* output_ctx = NULL) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x);
};
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
};
#endif // __ESRGAN_HPP__

View File

@ -1,3 +1,4 @@
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_subdirectory(cli)
add_subdirectory(cli)
add_subdirectory(server)

View File

@ -1,6 +1,6 @@
set(TARGET sd)
set(TARGET sd-cli)
add_executable(${TARGET} main.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_compile_features(${TARGET} PUBLIC c_std_11 cxx_std_17)

149
examples/cli/README.md Normal file
View File

@ -0,0 +1,149 @@
# Run
```
usage: ./bin/sd-cli [options]
CLI Options:
-o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image sequences (default:
./output.png) (eg. output_%03d.png)
--preview-path <string> path to write preview image to (default: ./preview.png)
--preview-interval <int> interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
every step)
--output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
--canny apply canny preprocessor (edge detection)
--convert-name convert tensor name (for convert mode)
-v, --verbose print extra info
--color colors the logging tags according to level
--taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae)
--preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
-h, --help show this help message and exit
Context Options:
-m, --model <string> path to full model
--clip_l <string> path to the clip-l text encoder
--clip_g <string> path to the clip-g text encoder
--clip_vision <string> path to the clip-vision encoder
--t5xxl <string> path to the t5xxl text encoder
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
--llm_vision <string> path to the llm vit
--qwen2vl <string> alias of --llm. Deprecated.
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
--diffusion-model <string> path to the standalone diffusion model
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model
--vae <string> path to standalone vae model
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--tae <string> alias of --taesd
--control-net <string> path to control net model
--embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--photo-maker <string> path to PHOTOMAKER model
--upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
--mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--vae-on-cpu keep vae in cpu (for low vram)
--fa use flash attention
--diffusion-fa use flash attention in the diffusion model only
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
--vae-conv-direct use ggml_conv2d_direct in the vae model
--circular enable circular padding for convolutions
--circularx enable circular RoPE wrapping on x-axis (width) only
--circulary enable circular RoPE wrapping on y-axis (height) only
--chroma-disable-dit-mask disable dit mask for chroma
--qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
contain any quantized parameters, the at_runtime mode will be used; otherwise,
immediately will be used. The immediately mode may have precision and
compatibility issues with quantized parameters, but it usually offers faster inference
speed and, in some cases, lower memory usage. The at_runtime mode, on the
other hand, is exactly the opposite.
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
(overrides --vae-tile-size)
Generation Options:
-p, --prompt <string> the prompt to render
-n, --negative-prompt <string> the negative prompt (default: "")
-i, --init-img <string> path to the init image
--end-img <string> path to the end image, required by flf2v
--mask <string> path to the mask image
--control-image <string> path to control image, control net
--control-video <string> path to control video frames. It must be a directory path. The video frames inside should be stored as images in
lexicographical (character) order. For example, if the control video path is
`frames`, the directory contains images such as 00.png, 01.png, etc.
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
-H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20)
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count <int> batch count
--video-frames <int> video frames (default: 1)
--fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
--cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
medium
--skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> eta in DDIM, only for DDIM and TCD (default: 0)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
--high-noise-eta <float> (high noise) eta in DDIM, only for DDIM and TCD (default: 0)
--strength <float> strength for noising/unnoising (default: 0.75)
--pm-style-strength <float>
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
--vace-strength <float> wan vace strength
--increase-ref-index automatically increase the indices of reference images based on the order in which they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images
-s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
kl_optimal, lcm, bong_tangent], default: discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level),
'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=;
spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples:
"threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static'
```

217
examples/cli/avi_writer.h Normal file
View File

@ -0,0 +1,217 @@
#ifndef __AVI_WRITER_H__
#define __AVI_WRITER_H__
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "stable-diffusion.h"
#ifndef INCLUDE_STB_IMAGE_WRITE_H
#include "stb_image_write.h"
#endif
// Per-frame record filled in while frames are written to the 'movi' list;
// one entry is allocated for each input image.
// NOTE(review): presumably consumed when the 'idx1' index chunk is emitted — confirm.
typedef struct {
// File position at which the frame's '00dc' chunk header starts
// (ftell() after the 8-byte chunk header, minus 8).
uint32_t offset;
// Size in bytes of the frame's encoded JPEG data.
uint32_t size;
} avi_index_entry;
// Write a 32-bit unsigned integer to `f` in little-endian byte order.
// The four bytes are produced explicitly, least-significant first, so the
// output is the same on little- and big-endian hosts (RIFF/AVI fields are
// little-endian). The fwrite() result is not checked; the function returns
// nothing, so callers receive no error indication.
void write_u32_le(FILE* f, uint32_t val) {
    const uint8_t bytes[4] = {
        (uint8_t)(val & 0xFF),
        (uint8_t)((val >> 8) & 0xFF),
        (uint8_t)((val >> 16) & 0xFF),
        (uint8_t)((val >> 24) & 0xFF),
    };
    fwrite(bytes, sizeof(bytes), 1, f);
}
// Write a 16-bit unsigned integer to `f` in little-endian byte order.
// The two bytes are produced explicitly, least-significant first, so the
// output is the same on little- and big-endian hosts (RIFF/AVI fields are
// little-endian). The fwrite() result is not checked; the function returns
// nothing, so callers receive no error indication.
void write_u16_le(FILE* f, uint16_t val) {
    const uint8_t bytes[2] = {
        (uint8_t)(val & 0xFF),
        (uint8_t)((val >> 8) & 0xFF),
    };
    fwrite(bytes, sizeof(bytes), 1, f);
}
/**
* Create an MJPG AVI file from an array of sd_image_t images.
* Images are encoded to JPEG using stb_image_write.
*
* @param filename Output AVI file name.
* @param images Array of input images.
* @param num_images Number of images in the array.
* @param fps Frames per second for the video.
* @param quality JPEG quality (0-100).
* @return 0 on success, -1 on failure.
*/
int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality = 90) {
if (num_images == 0) {
fprintf(stderr, "Error: Image array is empty.\n");
return -1;
}
FILE* f = fopen(filename, "wb");
if (!f) {
perror("Error opening file for writing");
return -1;
}
uint32_t width = images[0].width;
uint32_t height = images[0].height;
uint32_t channels = images[0].channel;
if (channels != 3 && channels != 4) {
fprintf(stderr, "Error: Unsupported channel count: %u\n", channels);
fclose(f);
return -1;
}
// --- RIFF AVI Header ---
fwrite("RIFF", 4, 1, f);
long riff_size_pos = ftell(f);
write_u32_le(f, 0); // Placeholder for file size
fwrite("AVI ", 4, 1, f);
// 'hdrl' LIST (header list)
fwrite("LIST", 4, 1, f);
write_u32_le(f, 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40);
fwrite("hdrl", 4, 1, f);
// 'avih' chunk (AVI main header)
fwrite("avih", 4, 1, f);
write_u32_le(f, 56);
write_u32_le(f, 1000000 / fps); // Microseconds per frame
write_u32_le(f, 0); // Max bytes per second
write_u32_le(f, 0); // Padding granularity
write_u32_le(f, 0x110); // Flags (HASINDEX | ISINTERLEAVED)
write_u32_le(f, num_images); // Total frames
write_u32_le(f, 0); // Initial frames
write_u32_le(f, 1); // Number of streams
write_u32_le(f, width * height * 3); // Suggested buffer size
write_u32_le(f, width);
write_u32_le(f, height);
write_u32_le(f, 0); // Reserved
write_u32_le(f, 0); // Reserved
write_u32_le(f, 0); // Reserved
write_u32_le(f, 0); // Reserved
// 'strl' LIST (stream list)
fwrite("LIST", 4, 1, f);
write_u32_le(f, 4 + 8 + 56 + 8 + 40);
fwrite("strl", 4, 1, f);
// 'strh' chunk (stream header)
fwrite("strh", 4, 1, f);
write_u32_le(f, 56);
fwrite("vids", 4, 1, f); // Stream type: video
fwrite("MJPG", 4, 1, f); // Codec: Motion JPEG
write_u32_le(f, 0); // Flags
write_u16_le(f, 0); // Priority
write_u16_le(f, 0); // Language
write_u32_le(f, 0); // Initial frames
write_u32_le(f, 1); // Scale
write_u32_le(f, fps); // Rate
write_u32_le(f, 0); // Start
write_u32_le(f, num_images); // Length
write_u32_le(f, width * height * 3); // Suggested buffer size
write_u32_le(f, (uint32_t)-1); // Quality
write_u32_le(f, 0); // Sample size
write_u16_le(f, 0); // rcFrame.left
write_u16_le(f, 0); // rcFrame.top
write_u16_le(f, 0); // rcFrame.right
write_u16_le(f, 0); // rcFrame.bottom
// 'strf' chunk (stream format: BITMAPINFOHEADER)
fwrite("strf", 4, 1, f);
write_u32_le(f, 40);
write_u32_le(f, 40); // biSize
write_u32_le(f, width);
write_u32_le(f, height);
write_u16_le(f, 1); // biPlanes
write_u16_le(f, 24); // biBitCount
fwrite("MJPG", 4, 1, f); // biCompression (FOURCC)
write_u32_le(f, width * height * 3); // biSizeImage
write_u32_le(f, 0); // XPelsPerMeter
write_u32_le(f, 0); // YPelsPerMeter
write_u32_le(f, 0); // Colors used
write_u32_le(f, 0); // Colors important
// 'movi' LIST (video frames)
// long movi_list_pos = ftell(f);
fwrite("LIST", 4, 1, f);
long movi_size_pos = ftell(f);
write_u32_le(f, 0); // Placeholder for movi size
fwrite("movi", 4, 1, f);
avi_index_entry* index = (avi_index_entry*)malloc(sizeof(avi_index_entry) * num_images);
if (!index) {
fclose(f);
return -1;
}
// Encode and write each frame as JPEG
struct {
uint8_t* buf;
size_t size;
} jpeg_data;
for (int i = 0; i < num_images; i++) {
jpeg_data.buf = nullptr;
jpeg_data.size = 0;
// Callback function to collect JPEG data into memory
auto write_to_buf = [](void* context, void* data, int size) {
auto jd = (decltype(jpeg_data)*)context;
jd->buf = (uint8_t*)realloc(jd->buf, jd->size + size);
memcpy(jd->buf + jd->size, data, size);
jd->size += size;
};
// Encode to JPEG in memory
stbi_write_jpg_to_func(
write_to_buf,
&jpeg_data,
images[i].width,
images[i].height,
channels,
images[i].data,
quality);
// Write '00dc' chunk (video frame)
fwrite("00dc", 4, 1, f);
write_u32_le(f, (uint32_t)jpeg_data.size);
index[i].offset = ftell(f) - 8;
index[i].size = (uint32_t)jpeg_data.size;
fwrite(jpeg_data.buf, 1, jpeg_data.size, f);
// Align to even byte size
if (jpeg_data.size % 2)
fputc(0, f);
free(jpeg_data.buf);
}
// Finalize 'movi' size
long cur_pos = ftell(f);
long movi_size = cur_pos - movi_size_pos - 4;
fseek(f, movi_size_pos, SEEK_SET);
write_u32_le(f, movi_size);
fseek(f, cur_pos, SEEK_SET);
// Write 'idx1' index
fwrite("idx1", 4, 1, f);
write_u32_le(f, num_images * 16);
for (int i = 0; i < num_images; i++) {
fwrite("00dc", 4, 1, f);
write_u32_le(f, 0x10);
write_u32_le(f, index[i].offset);
write_u32_le(f, index[i].size);
}
// Finalize RIFF size
cur_pos = ftell(f);
long file_size = cur_pos - riff_size_pos - 4;
fseek(f, riff_size_pos, SEEK_SET);
write_u32_le(f, file_size);
fseek(f, cur_pos, SEEK_SET);
fclose(f);
free(index);
return 0;
}
#endif // __AVI_WRITER_H__

File diff suppressed because it is too large Load Diff

2096
examples/common/common.hpp Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,73 @@
# Build script for the sd-server example executable.
# Optionally drives a pnpm-based frontend build whose generated header
# (gen_index_html.h) is added to the server target.
set(TARGET sd-server)
# User-facing switch; note that it defaults to ON.
option(SD_SERVER_BUILD_FRONTEND "Build server frontend with pnpm" ON)
set(FRONTEND_DIR "${CMAKE_CURRENT_SOURCE_DIR}/frontend")
set(GENERATED_HTML_HEADER "${FRONTEND_DIR}/dist/gen_index_html.h")
# Turned ON below only when the option is set, the frontend directory exists and pnpm is found.
set(HAVE_FRONTEND_BUILD OFF)
if(SD_SERVER_BUILD_FRONTEND AND EXISTS "${FRONTEND_DIR}")
# On Windows pnpm may be installed as a .cmd shim, so look for that name first.
if(WIN32)
find_program(PNPM_EXECUTABLE NAMES pnpm.cmd pnpm)
else()
find_program(PNPM_EXECUTABLE NAMES pnpm)
endif()
if(PNPM_EXECUTABLE)
message(STATUS "Frontend dir found: ${FRONTEND_DIR}")
message(STATUS "pnpm found: ${PNPM_EXECUTABLE}")
set(HAVE_FRONTEND_BUILD ON)
# Three chained custom targets: install -> build -> build:header.
# NOTE(review): these are custom targets (no OUTPUT files), so they presumably re-run on
# every build of sd-server — confirm this is acceptable for incremental builds.
add_custom_target(${TARGET}_frontend_install
COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" install
WORKING_DIRECTORY "${FRONTEND_DIR}"
COMMENT "Installing frontend dependencies"
VERBATIM
)
add_custom_target(${TARGET}_frontend_build
COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" run build
WORKING_DIRECTORY "${FRONTEND_DIR}"
COMMENT "Building frontend"
VERBATIM
)
add_custom_target(${TARGET}_frontend_header
COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" run build:header
WORKING_DIRECTORY "${FRONTEND_DIR}"
COMMENT "Generating gen_index_html.h"
VERBATIM
)
# Enforce the step order: install before build, build before header generation.
add_dependencies(${TARGET}_frontend_build ${TARGET}_frontend_install)
add_dependencies(${TARGET}_frontend_header ${TARGET}_frontend_build)
# Umbrella target the executable depends on.
add_custom_target(${TARGET}_frontend
DEPENDS ${TARGET}_frontend_header
)
# The header does not exist at configure time; mark it GENERATED so CMake accepts it as a source.
set_source_files_properties("${GENERATED_HTML_HEADER}" PROPERTIES GENERATED TRUE)
else()
message(WARNING "pnpm not found, frontend build disabled")
endif()
else()
message(STATUS "Frontend disabled or directory not found: ${FRONTEND_DIR}")
endif()
add_executable(${TARGET} main.cpp)
if(HAVE_FRONTEND_BUILD)
# Build the frontend first, then expose the generated header and the HAVE_INDEX_HTML
# compile definition to the server sources.
add_dependencies(${TARGET} ${TARGET}_frontend)
target_sources(${TARGET} PRIVATE "${GENERATED_HTML_HEADER}")
target_include_directories(${TARGET} PRIVATE "${FRONTEND_DIR}/dist")
target_compile_definitions(${TARGET} PRIVATE HAVE_INDEX_HTML)
message(STATUS "HAVE_INDEX_HTML enabled")
else()
message(STATUS "HAVE_INDEX_HTML disabled")
endif()
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PUBLIC c_std_11 cxx_std_17)

227
examples/server/README.md Normal file
View File

@ -0,0 +1,227 @@
# Frontend
## Build with Frontend
The server can optionally build the web frontend and embed it into the binary as `gen_index_html.h`.
### Requirements
Install the following tools:
* **Node.js** ≥ 22.18
https://nodejs.org/
* **pnpm** ≥ 10
Install via npm:
```bash
npm install -g pnpm
```
Verify installation:
```bash
node -v
pnpm -v
```
### Install frontend dependencies
Go to the frontend directory and install dependencies:
```bash
cd examples/server/frontend
pnpm install
```
### Build the server with CMake
The frontend build is controlled by the `SD_SERVER_BUILD_FRONTEND` CMake option, which is `ON` by default; passing it explicitly, as below, is optional:
```bash
cmake -B build -DSD_SERVER_BUILD_FRONTEND=ON
cmake --build build --config Release
```
If `pnpm` is available, the build system will automatically run the following in the frontend directory:
```
pnpm install
pnpm run build
pnpm run build:header
```
and embed the generated frontend into the server binary.
## Frontend Repository
The web frontend is maintained in a **separate repository**, https://github.com/leejet/stable-ui.
If you want to modify the UI or frontend logic, please submit pull requests to the **frontend repository**.
This repository (`stable-diffusion.cpp`) only vendors the frontend periodically. Changes from the frontend repo are synchronized:
* approximately **every 12 weeks**, or
* when there are **major frontend updates**
Because of this, frontend changes will **not appear here immediately** after being merged upstream.
## Using an external frontend
By default, the server uses the **embedded frontend** generated during the build (`gen_index_html.h`).
You can also serve a custom frontend file instead of the embedded one by using:
```bash
--serve-html-path <path-to-index.html>
```
For example:
```bash
sd-server --serve-html-path ./index.html
```
In this case, the server will load and serve the specified `index.html` file instead of the embedded frontend. This is useful when:
* developing or testing frontend changes
* using a custom UI
* avoiding rebuilding the binary after frontend modifications
# Run
```
usage: ./bin/sd-server [options]
Svr Options:
-l, --listen-ip <string> server listen ip (default: 127.0.0.1)
--serve-html-path <string> path to HTML file to serve at root (optional)
--listen-port <int> server listen port (default: 1234)
-v, --verbose print extra info
--color colors the logging tags according to level
-h, --help show this help message and exit
Context Options:
-m, --model <string> path to full model
--clip_l <string> path to the clip-l text encoder
--clip_g <string> path to the clip-g text encoder
--clip_vision <string> path to the clip-vision encoder
--t5xxl <string> path to the t5xxl text encoder
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
--llm_vision <string> path to the llm vit
--qwen2vl <string> alias of --llm. Deprecated.
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
--diffusion-model <string> path to the standalone diffusion model
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model
--vae <string> path to standalone vae model
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--tae <string> alias of --taesd
--control-net <string> path to control net model
--embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--photo-maker <string> path to PHOTOMAKER model
--upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
--mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--vae-on-cpu keep vae in cpu (for low vram)
--fa use flash attention
--diffusion-fa use flash attention in the diffusion model only
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
--vae-conv-direct use ggml_conv2d_direct in the vae model
--circular enable circular padding for convolutions
--circularx enable circular RoPE wrapping on x-axis (width) only
--circulary enable circular RoPE wrapping on y-axis (height) only
--chroma-disable-dit-mask disable dit mask for chroma
--qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
contain any quantized parameters, the at_runtime mode will be used; otherwise,
immediately will be used.The immediately mode may have precision and
compatibility issues with quantized parameters, but it usually offers faster inference
speed and, in some cases, lower memory usage. The at_runtime mode, on the
other hand, is exactly the opposite.
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
(overrides --vae-tile-size)
Default Generation Options:
-p, --prompt <string> the prompt to render
-n, --negative-prompt <string> the negative prompt (default: "")
-i, --init-img <string> path to the init image
--end-img <string> path to the end image, required by flf2v
--mask <string> path to the mask image
--control-image <string> path to control image, control net
--control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in
lexicographical (character) order. For example, if the control video path is
`frames`, the directory contain images such as 00.png, 01.png, ... etc.
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
-H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20)
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count <int> batch count
--video-frames <int> video frames (default: 1)
--fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
--cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
medium
--skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> eta in DDIM, only for DDIM and TCD (default: 0)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
--high-noise-eta <float> (high noise) eta in DDIM, only for DDIM and TCD (default: 0)
--strength <float> strength for noising/unnoising (default: 0.75)
--pm-style-strength <float>
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
--vace-strength <float> wan vace strength
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images
-s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
kl_optimal, lcm, bong_tangent], default: discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
"threshold=0.25" or "threshold=1.5,reset=0"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static'
```

@ -0,0 +1 @@
Subproject commit 1a34176cd6d39ad3a226b2b69047e71f6797f6bc

1238
examples/server/main.cpp Normal file

File diff suppressed because it is too large Load Diff

1048
flux.hpp

File diff suppressed because it is too large Load Diff

View File

@ -1,2 +1,8 @@
clang-format -style=file -i *.cpp *.h *.hpp
clang-format -style=file -i examples/cli/*.cpp
# Run clang-format in place (style taken from the project's style file) on every
# source/header matched by the globs below.
for f in src/*.cpp src/*.h src/*.hpp src/vocab/*.h src/vocab/*.cpp examples/cli/*.cpp examples/common/*.hpp examples/cli/*.h examples/server/*.cpp; do
    # A glob that matches nothing is left as the literal pattern (e.g. "examples/cli/*.h");
    # skip it instead of handing a non-existent path to clang-format.
    [[ -e "$f" ]] || continue
    # NOTE(review): every path produced above starts with "src/" or "examples/", so this
    # pattern never matches and vocab files are still formatted. Confirm whether the
    # intent was to skip "src/vocab/*".
    [[ "$f" == vocab* ]] && continue
    echo "formatting '$f'"
    # if [ "$f" != "stable-diffusion.h" ]; then
    #   clang-tidy -fix -p build_linux/ "$f"
    # fi
    clang-format -style=file -i "$f"
done

2
ggml

@ -1 +1 @@
Subproject commit ff9052988b76e137bcf92bb335733933ca196ac0
Subproject commit a8db410a252c8c8f2d120c6f2e7133ebe032f35d

File diff suppressed because it is too large Load Diff

422
include/stable-diffusion.h Normal file
View File

@ -0,0 +1,422 @@
#ifndef __STABLE_DIFFUSION_H__
#define __STABLE_DIFFUSION_H__
// SD_API decorates every public function declared in this header.
// - Windows / Cygwin: empty unless SD_BUILD_SHARED_LIB is defined; with it, expands to
//   __declspec(dllexport) when SD_BUILD_DLL is also defined, and to
//   __declspec(dllimport) otherwise.
// - Other platforms: default symbol visibility when __GNUC__ >= 4, otherwise empty.
#if defined(_WIN32) || defined(__CYGWIN__)
#ifndef SD_BUILD_SHARED_LIB
#define SD_API
#else
#ifdef SD_BUILD_DLL
#define SD_API __declspec(dllexport)
#else
#define SD_API __declspec(dllimport)
#endif
#endif
#else
#if __GNUC__ >= 4
#define SD_API __attribute__((visibility("default")))
#else
#define SD_API
#endif
#endif
// Standard headers are included before the extern "C" block is opened: system headers
// manage their own linkage and must not be wrapped in a caller's extern "C" when this
// header is consumed from C++.
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
// Everything declared below has C linkage when compiled as C++.
#ifdef __cplusplus
extern "C" {
#endif
// Random number generator selection.
// RNG_TYPE_COUNT is the last enumerator, so its value equals the number of generators above it.
// See str_to_rng_type() / sd_rng_type_name() for the string conversions.
enum rng_type_t {
STD_DEFAULT_RNG,
CUDA_RNG,
CPU_RNG,
RNG_TYPE_COUNT
};
// Sampling methods.
// SAMPLE_METHOD_COUNT is the last enumerator, so its value equals the number of methods above it.
// See str_to_sample_method() / sd_sample_method_name() for the string conversions.
enum sample_method_t {
EULER_SAMPLE_METHOD,
EULER_A_SAMPLE_METHOD,
HEUN_SAMPLE_METHOD,
DPM2_SAMPLE_METHOD,
DPMPP2S_A_SAMPLE_METHOD,
DPMPP2M_SAMPLE_METHOD,
DPMPP2Mv2_SAMPLE_METHOD,
IPNDM_SAMPLE_METHOD,
IPNDM_V_SAMPLE_METHOD,
LCM_SAMPLE_METHOD,
DDIM_TRAILING_SAMPLE_METHOD,
TCD_SAMPLE_METHOD,
RES_MULTISTEP_SAMPLE_METHOD,
RES_2S_SAMPLE_METHOD,
SAMPLE_METHOD_COUNT
};
// Sigma schedulers.
// SCHEDULER_COUNT is the last enumerator, so its value equals the number of schedulers above it.
// See str_to_scheduler() / sd_scheduler_name() for the string conversions.
enum scheduler_t {
DISCRETE_SCHEDULER,
KARRAS_SCHEDULER,
EXPONENTIAL_SCHEDULER,
AYS_SCHEDULER,
GITS_SCHEDULER,
SGM_UNIFORM_SCHEDULER,
SIMPLE_SCHEDULER,
SMOOTHSTEP_SCHEDULER,
KL_OPTIMAL_SCHEDULER,
LCM_SCHEDULER,
BONG_TANGENT_SCHEDULER,
SCHEDULER_COUNT
};
// Prediction types.
// PREDICTION_COUNT is the last enumerator, so its value equals the number of types above it.
// See str_to_prediction() / sd_prediction_name() for the string conversions.
enum prediction_t {
EPS_PRED,
V_PRED,
EDM_V_PRED,
FLOW_PRED,
FLUX_FLOW_PRED,
FLUX2_FLOW_PRED,
PREDICTION_COUNT
};
// Tensor / weight data types. Values are assigned explicitly and are intended to stay
// identical to enum ggml_type ("same as enum ggml_type"); the gaps in the numbering
// are entries whose support was removed, kept as comments so the values do not shift.
// same as enum ggml_type
enum sd_type_t {
SD_TYPE_F32 = 0,
SD_TYPE_F16 = 1,
SD_TYPE_Q4_0 = 2,
SD_TYPE_Q4_1 = 3,
// SD_TYPE_Q4_2 = 4, support has been removed
// SD_TYPE_Q4_3 = 5, support has been removed
SD_TYPE_Q5_0 = 6,
SD_TYPE_Q5_1 = 7,
SD_TYPE_Q8_0 = 8,
SD_TYPE_Q8_1 = 9,
SD_TYPE_Q2_K = 10,
SD_TYPE_Q3_K = 11,
SD_TYPE_Q4_K = 12,
SD_TYPE_Q5_K = 13,
SD_TYPE_Q6_K = 14,
SD_TYPE_Q8_K = 15,
SD_TYPE_IQ2_XXS = 16,
SD_TYPE_IQ2_XS = 17,
SD_TYPE_IQ3_XXS = 18,
SD_TYPE_IQ1_S = 19,
SD_TYPE_IQ4_NL = 20,
SD_TYPE_IQ3_S = 21,
SD_TYPE_IQ2_S = 22,
SD_TYPE_IQ4_XS = 23,
SD_TYPE_I8 = 24,
SD_TYPE_I16 = 25,
SD_TYPE_I32 = 26,
SD_TYPE_I64 = 27,
SD_TYPE_F64 = 28,
SD_TYPE_IQ1_M = 29,
SD_TYPE_BF16 = 30,
// SD_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
// SD_TYPE_Q4_0_4_8 = 32,
// SD_TYPE_Q4_0_8_8 = 33,
SD_TYPE_TQ1_0 = 34,
SD_TYPE_TQ2_0 = 35,
// SD_TYPE_IQ4_NL_4_4 = 36,
// SD_TYPE_IQ4_NL_4_8 = 37,
// SD_TYPE_IQ4_NL_8_8 = 38,
SD_TYPE_MXFP4 = 39, // MXFP4 (1 block)
// One past the highest type value; not the number of usable types, because of the gaps above.
SD_TYPE_COUNT = 40,
};
// Log levels passed to the sd_log_cb_t callback, declared in increasing numeric order
// from SD_LOG_DEBUG (0) to SD_LOG_ERROR (3).
enum sd_log_level_t {
SD_LOG_DEBUG,
SD_LOG_INFO,
SD_LOG_WARN,
SD_LOG_ERROR
};
// Preview mode passed to sd_set_preview_callback().
// PREVIEW_COUNT is the last enumerator, so its value equals the number of modes above it.
// NOTE(review): PROJ/TAE/VAE presumably name the decoder used to build the preview — confirm.
enum preview_t {
PREVIEW_NONE,
PREVIEW_PROJ,
PREVIEW_TAE,
PREVIEW_VAE,
PREVIEW_COUNT
};
// How LoRA weights are applied (auto / immediately / at_runtime).
// LORA_APPLY_MODE_COUNT is the last enumerator, so its value equals the number of modes above it.
// NOTE(review): the sd-server usage text says auto picks at_runtime when the model contains
// quantized parameters and immediately otherwise — confirm against the implementation.
enum lora_apply_mode_t {
LORA_APPLY_AUTO,
LORA_APPLY_IMMEDIATELY,
LORA_APPLY_AT_RUNTIME,
LORA_APPLY_MODE_COUNT,
};
// VAE tiling settings, embedded in sd_img_gen_params_t and sd_vid_gen_params_t.
// NOTE(review): fields presumably mirror the --vae-tiling, --vae-tile-size,
// --vae-tile-overlap and --vae-relative-tile-size options; units and precedence are
// not visible in this header — confirm against the implementation.
typedef struct {
bool enabled;
int tile_size_x;
int tile_size_y;
float target_overlap;
float rel_size_x;
float rel_size_y;
} sd_tiling_params_t;
// One embedding entry: a name and the path it is loaded from.
// An array of these is referenced by sd_ctx_params_t::embeddings.
typedef struct {
const char* name;
const char* path;
} sd_embedding_t;
// Context creation parameters consumed by new_sd_ctx().
// Use sd_ctx_params_init() to fill in defaults before overriding individual fields.
// NOTE(review): most fields presumably correspond one-to-one to the "Context Options"
// of the CLI/server (e.g. --clip_l, --vae, --fa); confirm semantics in the implementation.
typedef struct {
// Model file paths.
const char* model_path;
const char* clip_l_path;
const char* clip_g_path;
const char* clip_vision_path;
const char* t5xxl_path;
const char* llm_path;
const char* llm_vision_path;
const char* diffusion_model_path;
const char* high_noise_diffusion_model_path;
const char* vae_path;
const char* taesd_path;
const char* control_net_path;
// Array of embedding_count entries.
const sd_embedding_t* embeddings;
uint32_t embedding_count;
const char* photo_maker_path;
const char* tensor_type_rules;
bool vae_decode_only;
bool free_params_immediately;
int n_threads;
enum sd_type_t wtype;
enum rng_type_t rng_type;
enum rng_type_t sampler_rng_type;
enum prediction_t prediction;
enum lora_apply_mode_t lora_apply_mode;
// Placement / backend toggles.
bool offload_params_to_cpu;
bool enable_mmap;
bool keep_clip_on_cpu;
bool keep_control_net_on_cpu;
bool keep_vae_on_cpu;
bool flash_attn;
bool diffusion_flash_attn;
bool tae_preview_only;
bool diffusion_conv_direct;
bool vae_conv_direct;
bool circular_x;
bool circular_y;
bool force_sdxl_vae_conv_scale;
// Model-specific switches.
bool chroma_use_dit_mask;
bool chroma_use_t5_mask;
int chroma_t5_mask_pad;
bool qwen_image_zero_cond_t;
} sd_ctx_params_t;
// A raw image: dimensions, channel count and a pointer to 8-bit sample data.
// NOTE(review): pixel layout (presumably interleaved, width * height * channel bytes)
// and ownership of `data` are not specified in this header — confirm before freeing.
typedef struct {
uint32_t width;
uint32_t height;
uint32_t channel;
uint8_t* data;
} sd_image_t;
// Skip layer guidance (SLG) settings, embedded in sd_guidance_params_t.
// `layers` points to `layer_count` layer indices.
// NOTE(review): layer_start / layer_end presumably map to --skip-layer-start / --skip-layer-end
// and scale to --slg-scale (0 = disabled per the usage text) — confirm.
typedef struct {
int* layers;
size_t layer_count;
float layer_start;
float layer_end;
float scale;
} sd_slg_params_t;
// Guidance scales used during sampling, plus the nested SLG settings.
// NOTE(review): txt_cfg / img_cfg / distilled_guidance presumably map to
// --cfg-scale / --img-cfg-scale / --guidance — confirm.
typedef struct {
float txt_cfg;
float img_cfg;
float distilled_guidance;
sd_slg_params_t slg;
} sd_guidance_params_t;
// Sampling configuration: guidance, scheduler, sampler, step count and related knobs.
// Use sd_sample_params_init() to fill in defaults.
typedef struct {
sd_guidance_params_t guidance;
enum scheduler_t scheduler;
enum sample_method_t sample_method;
int sample_steps;
float eta;
int shifted_timestep;
// Optional user-supplied sigma schedule: custom_sigmas_count values.
float* custom_sigmas;
int custom_sigmas_count;
float flow_shift;
} sd_sample_params_t;
// PhotoMaker inputs: id_images_count identity images, an optional id embed path and a
// style strength.
typedef struct {
sd_image_t* id_images;
int id_images_count;
const char* id_embed_path;
float style_strength;
} sd_pm_params_t; // photo maker
// Caching method selector stored in sd_cache_params_t::mode.
// SD_CACHE_DISABLED is explicitly 0, so a zero-initialized sd_cache_params_t has caching off.
// NOTE(review): per the usage text, easycache targets DiT, ucache targets UNET,
// dbcache/taylorseer/cache-dit are DiT block-level and spectrum covers UNET/DiT — confirm.
enum sd_cache_mode_t {
SD_CACHE_DISABLED = 0,
SD_CACHE_EASYCACHE,
SD_CACHE_UCACHE,
SD_CACHE_DBCACHE,
SD_CACHE_TAYLORSEER,
SD_CACHE_CACHE_DIT,
SD_CACHE_SPECTRUM,
};
// Parameters for all caching methods; `mode` selects which method is active.
// Use sd_cache_params_init() to fill in defaults.
// NOTE(review): which fields each mode actually reads is not visible in this header;
// the grouping comments below are inferred from field names and the --cache-option
// help text — confirm against the implementation.
typedef struct {
enum sd_cache_mode_t mode;
// Presumably easycache / ucache (threshold, start, end, decay, relative, reset).
float reuse_threshold;
float start_percent;
float end_percent;
float error_decay_rate;
bool use_relative_threshold;
bool reset_error_on_compute;
// Presumably dbcache / taylorseer / cache-dit (Fn, Bn, threshold, warmup).
int Fn_compute_blocks;
int Bn_compute_blocks;
float residual_diff_threshold;
int max_warmup_steps;
int max_cached_steps;
int max_continuous_cached_steps;
int taylorseer_n_derivatives;
int taylorseer_skip_interval;
// Presumably the --scm-mask / --scm-policy options.
const char* scm_mask;
bool scm_policy_dynamic;
// Presumably spectrum.
float spectrum_w;
int spectrum_m;
float spectrum_lam;
int spectrum_window_size;
float spectrum_flex_window;
int spectrum_warmup_steps;
float spectrum_stop_percent;
} sd_cache_params_t;
// One LoRA to apply: file path, strength multiplier and a flag.
// NOTE(review): is_high_noise presumably targets the high-noise diffusion model — confirm.
typedef struct {
bool is_high_noise;
float multiplier;
const char* path;
} sd_lora_t;
// Per-request parameters for generate_image().
// Use sd_img_gen_params_init() to fill in defaults before overriding individual fields.
typedef struct {
// Array of lora_count entries.
const sd_lora_t* loras;
uint32_t lora_count;
const char* prompt;
const char* negative_prompt;
int clip_skip;
sd_image_t init_image;
// Array of ref_images_count reference images.
sd_image_t* ref_images;
int ref_images_count;
bool auto_resize_ref_image;
bool increase_ref_index;
sd_image_t mask_image;
int width;
int height;
sd_sample_params_t sample_params;
float strength;
int64_t seed;
int batch_count;
sd_image_t control_image;
float control_strength;
sd_pm_params_t pm_params;
sd_tiling_params_t vae_tiling_params;
sd_cache_params_t cache;
} sd_img_gen_params_t;
// Per-request parameters for generate_video().
// Use sd_vid_gen_params_init() to fill in defaults before overriding individual fields.
typedef struct {
// Array of lora_count entries.
const sd_lora_t* loras;
uint32_t lora_count;
const char* prompt;
const char* negative_prompt;
int clip_skip;
sd_image_t init_image;
sd_image_t end_image;
// Array of control_frames_size control frames.
sd_image_t* control_frames;
int control_frames_size;
int width;
int height;
// Separate sampling settings for the regular and the high-noise pass.
sd_sample_params_t sample_params;
sd_sample_params_t high_noise_sample_params;
float moe_boundary;
float strength;
int64_t seed;
int video_frames;
float vace_strength;
sd_tiling_params_t vae_tiling_params;
sd_cache_params_t cache;
} sd_vid_gen_params_t;
// Opaque context handle; created by new_sd_ctx() and released by free_sd_ctx().
typedef struct sd_ctx_t sd_ctx_t;
// Callback signatures. The trailing `data` argument is presumably the user pointer
// registered with the matching sd_set_*_callback() call — TODO confirm.
typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, bool is_noisy, void* data);
// Callback registration.
SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy, void* data);
// System queries. Declared with (void) so they are real prototypes in C (an empty
// parameter list leaves the arguments unspecified before C23), matching
// sd_commit(void) / sd_version(void) in this header.
SD_API int32_t sd_get_num_physical_cores(void);
SD_API const char* sd_get_system_info(void);
// Enum <-> string conversions. Each sd_*_name() returns a name for an enumerator and
// each str_to_*() parses a name back into the enum.
// NOTE(review): the value returned for an unrecognized string (presumably the *_COUNT
// enumerator) is not visible in this header — confirm before relying on it.
SD_API const char* sd_type_name(enum sd_type_t type);
SD_API enum sd_type_t str_to_sd_type(const char* str);
SD_API const char* sd_rng_type_name(enum rng_type_t rng_type);
SD_API enum rng_type_t str_to_rng_type(const char* str);
SD_API const char* sd_sample_method_name(enum sample_method_t sample_method);
SD_API enum sample_method_t str_to_sample_method(const char* str);
SD_API const char* sd_scheduler_name(enum scheduler_t scheduler);
SD_API enum scheduler_t str_to_scheduler(const char* str);
SD_API const char* sd_prediction_name(enum prediction_t prediction);
SD_API enum prediction_t str_to_prediction(const char* str);
SD_API const char* sd_preview_name(enum preview_t preview);
SD_API enum preview_t str_to_preview(const char* str);
SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode);
SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str);
// Parameter initialization, context lifecycle and generation entry points.
// The *_init() functions write defaults into the caller-provided struct.
// NOTE(review): the *_to_str() functions return a non-const char*, which presumably
// means the caller owns and must free the string — confirm. Ownership of the
// sd_image_t arrays returned by generate_image() / generate_video() is likewise not
// specified here.
SD_API void sd_cache_params_init(sd_cache_params_t* cache_params);
SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
SD_API void sd_sample_params_init(sd_sample_params_t* sample_params);
SD_API char* sd_sample_params_to_str(const sd_sample_params_t* sample_params);
SD_API enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx);
SD_API enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_method_t sample_method);
SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
// num_frames_out is an output parameter (presumably the number of frames in the returned array).
SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out);
// Upscaler API. upscaler_ctx_t is an opaque handle created by new_upscaler_ctx() from an
// ESRGAN model path and released by free_upscaler_ctx().
typedef struct upscaler_ctx_t upscaler_ctx_t;
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
bool offload_params_to_cpu,
bool direct,
int n_threads,
int tile_size);
SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
// Takes the input image by value and returns the upscaled image by value.
// NOTE(review): ownership of the returned image's data buffer is not specified here — confirm.
SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
sd_image_t input_image,
uint32_t upscale_factor);
SD_API int get_upscale_factor(upscaler_ctx_t* upscaler_ctx);
// Converts the model at input_path (with an optional separate VAE) and writes it to
// output_path using output_type, optionally overridden per tensor by tensor_type_rules.
// NOTE(review): the bool return is presumably true on success — confirm.
SD_API bool convert(const char* input_path,
const char* vae_path,
const char* output_path,
enum sd_type_t output_type,
const char* tensor_type_rules,
bool convert_name);
// Canny edge preprocessing of an image passed by value.
// NOTE(review): since sd_image_t carries a data pointer, the result is presumably written
// into image.data in place, and the bool presumably reports success — confirm.
SD_API bool preprocess_canny(sd_image_t image,
float high_threshold,
float low_threshold,
float weak,
float strong,
bool inverse);
// Build identification strings (commit and version of the library).
SD_API const char* sd_commit(void);
SD_API const char* sd_version(void);
#ifdef __cplusplus
}
#endif
#endif // __STABLE_DIFFUSION_H__

Some files were not shown because too many files have changed in this diff Show More