Compare commits


176 Commits

Author SHA1 Message Date
leejet
11ab095230
fix: resolve embedding loading issue when calling generate_image multiple times (#1078) 2025-12-12 23:08:12 +08:00
Wagner Bruna
a3a88fc9b2
fix: avoid crash loading LoRAs with bf16 weights (#1077) 2025-12-12 22:36:54 +08:00
leejet
8823dc48bc
feat: align the spatial size to the corresponding multiple (#1073) 2025-12-10 23:15:08 +08:00
Pedrito
1ac5a616de
feat: support custom upscale tile size (#896) 2025-12-10 22:25:19 +08:00
leejet
d939f6e86a
refactor: optimize the handling of LoRA models (#1070) 2025-12-10 00:26:07 +08:00
Wagner Bruna
e72aea796e
feat: embed version string and git commit hash (#1008) 2025-12-09 22:38:54 +08:00
wuhei
a908436729
docs: update download link for Stable Diffusion v1.5 (#1063) 2025-12-09 22:06:16 +08:00
stduhpf
583a02e29e
feat: add Flux.2 VAE proj matrix for previews (#1017) 2025-12-09 22:00:45 +08:00
leejet
96c3e64057
refactor: optimize the handling of embedding (#1068)
* optimize the handling of embedding

* support case-insensitive embedding names
2025-12-08 23:59:04 +08:00
Weiqi Gao
0392273e10
chore: add compute kernels to Windows CUDA build (#1062)
* Fix syntax for CUDA architecture definitions

* Extend CUDA support to GTX 10 Series to RTX 50 Series

* update cuda installer step version to install cuda 12.8.1

* Remove unsupported compute capability
2025-12-07 22:12:50 +08:00
leejet
bf1a388b44 docs: update logo 2025-12-07 15:09:32 +08:00
leejet
c9005337a8 docs: update logo 2025-12-07 14:56:21 +08:00
leejet
2f0bd31a84
feat: add ovis image support (#1057) 2025-12-07 12:32:56 +08:00
leejet
bfbb929790
feat: do not convert bf16 to f32 (#1055) 2025-12-06 23:55:51 +08:00
leejet
689e44c9a8
fix: correct ggml_ext_silu_act (#1056) 2025-12-06 23:55:28 +08:00
leejet
985aedda32
refactor: optimize the handling of pred type (#1048) 2025-12-04 23:31:55 +08:00
leejet
3f3610b5cd
chore: optimize lora log (#1047) 2025-12-04 22:44:58 +08:00
Wagner Bruna
118683de8a
fix: correct preview method selection (#1038) 2025-12-04 22:43:16 +08:00
stduhpf
bcc9c0d0b3
feat: handle ggml compute failures without crashing the program (#1003)
* Feat: handle compute failures more gracefully

* fix Unreachable code after return

Co-authored-by: idostyle <idostyl3@googlemail.com>

* adjust z_image.hpp

---------

Co-authored-by: idostyle <idostyl3@googlemail.com>
Co-authored-by: leejet <leejet714@gmail.com>
2025-12-04 22:04:27 +08:00
leejet
5865b5e703
refactor: split SDParams to SDCliParams/SDContextParams/SDGenerationParams (#1032) 2025-12-03 22:31:46 +08:00
stduhpf
edf2cb3846
fix: fix CosXL not being detected (#989) 2025-12-03 22:25:02 +08:00
Wagner Bruna
99e17232a4
fix: prevent NaN issues with Z-Image on certain ROCm setups (#1034) 2025-12-03 22:19:34 +08:00
leejet
710169df5c docs: update news 2025-12-01 22:46:15 +08:00
Wagner Bruna
e4c50f1de5
chore: add sd_ prefix to a few functions (#967) 2025-12-01 22:43:52 +08:00
rmatif
0743a1b3b5
fix: fix vae tiling for flux2 (#1025) 2025-12-01 22:41:56 +08:00
leejet
34a6fd4e60
feat: add z-image support (#1020)
* add z-image support

* use flux_latent_rgb_proj for z-image

* fix qwen3 rope type

* add support for qwen3 4b gguf

* add support for diffusers format lora

* fix nan issue that occurs when using CUDA with k-quants weights

* add z-image docs
2025-12-01 22:39:43 +08:00
leejet
3c1187ce83 docs: correct the time of adding flux2 support 2025-11-30 12:40:56 +08:00
leejet
20eb674100
fix: avoid crash when the lora file is not found using immediately mode (#1022) 2025-11-30 12:19:37 +08:00
leejet
bc80225336
fix: make the immediate LoRA apply mode work better when using Vulkan (#1021) 2025-11-30 12:08:25 +08:00
leejet
ab7e8d285e docs: update news 2025-11-30 11:51:23 +08:00
Wagner Bruna
673dbdda17
fix: add missing line cleanup for s/it progress display (#891) 2025-11-30 11:45:30 +08:00
Wagner Bruna
0249509a30
refactor: add user data pointer to the image preview callback (#1001) 2025-11-30 11:34:17 +08:00
leejet
52b67c538b
feat: add flux2 support (#1016)
* add flux2 support

* rename qwenvl to llm

* add Flux2FlowDenoiser

* update docs
2025-11-30 11:32:56 +08:00
leejet
20345888a3
refactor: optimize the handling of sample method (#999) 2025-11-22 14:00:25 +08:00
akleine
490c51d963
feat: report success/failure when saving PNG/JPG output (#912) 2025-11-22 13:57:44 +08:00
Wagner Bruna
45c46779af
feat: add LCM scheduler (#983) 2025-11-22 13:53:31 +08:00
leejet
869d023416
refactor: optimize the handling of scheduler (#998) 2025-11-22 12:48:53 +08:00
akleine
e9bc3b6c06
fix: check the PhotoMaker id_embeds tensor ONLY in PhotoMaker V2 mode (#987) 2025-11-22 12:47:40 +08:00
Wagner Bruna
b542894fb9
fix: avoid crash on default video preview path (#997)
Co-authored-by: masamaru-san
2025-11-22 12:46:27 +08:00
leejet
5498cc0d67
feat: add Wan2.1-I2V-1.3B(SkyReels) support (#988) 2025-11-19 23:56:46 +08:00
stduhpf
aa2b8e0ca5
fix: patch 1x1 conv weights at runtime (#986) 2025-11-19 23:27:23 +08:00
rmatif
a14e2b321d
feat: add easycache support (#940) 2025-11-19 23:19:32 +08:00
leejet
28ffb6c13d
fix: resolve issue with concat multiple LoRA output diffs at runtime (#985) 2025-11-17 22:56:07 +08:00
leejet
b88cc32346
fix: avoid using same type but diff instances for rng and sampler_rng (#982) 2025-11-16 23:37:14 +08:00
leejet
f532972d60
fix: avoid precision issues on vulkan backend (#980) 2025-11-16 20:57:08 +08:00
leejet
d5b05f70c6
feat: support independent sampler rng (#978) 2025-11-16 17:11:02 +08:00
akleine
6d6dc1b8ed
fix: make PhotoMakerV2 more robust by image count check (#970) 2025-11-16 17:10:48 +08:00
Wagner Bruna
199e675cc7
feat: support for --tensor-type-rules on generation modes (#932) 2025-11-16 17:07:32 +08:00
leejet
742a7333c3
feat: add cpu rng (#977) 2025-11-16 14:48:15 +08:00
Wagner Bruna
e8eb3791c8
fix: typo in --lora-apply-mode help (#972) 2025-11-16 14:48:00 +08:00
Wagner Bruna
aa44e06890
fix: avoid crash with LoRAs and type override (#974) 2025-11-16 14:47:36 +08:00
Daniele
6448430dbb
feat: add break pseudo token support (#422)
---------

Co-authored-by: Urs Ganse <urs.ganse@helsinki.fi>
2025-11-16 14:45:20 +08:00
leejet
347710f68f
feat: support applying LoRA at runtime (#969) 2025-11-13 21:48:44 +08:00
lcy
59ebdf0bb5
chore: enable Windows ROCm (HIP) build release (#956)
* build: fix missing commit sha in macOS and Ubuntu build zip name

The build workflows for macOS and Ubuntu incorrectly check for the
"main" branch instead of "master" when retrieving the commit hash for
naming the build artifacts.

* build: correct Vulkan SDK installation condition in build workflow

* build: Enable Windows ROCm(HIP) build release

Refer to the build workflow of llama.cpp to add a Windows ROCm (HIP)
build release to the workflow.
Since there are many differences between the HIP build and other
builds, this commit adds a separate "windows-latest-cmake-hip" job,
instead of enabling the ROCm matrix entry in the existing Windows
build job.

Main differences include:

- Install ROCm SDK from AMD official installer.
- Add a cache step for ROCm installation and a ccache step for build
  processing, since the HIP build takes much longer than other
  builds.
- Include the ROCm/HIP artifact in the release assets.
2025-11-12 00:28:55 +08:00
Flavio Bizzarri
4ffcbcaed7
fix: specify enum modifier in sd_set_preview_callback signature (#959) 2025-11-12 00:27:23 +08:00
leejet
694f0d9235
refactor: optimize the logic for name conversion and the processing of the LoRA model (#955) 2025-11-10 00:12:20 +08:00
stduhpf
8ecdf053ac
feat: add image preview support (#522) 2025-11-10 00:12:02 +08:00
leejet
ee89afc878
fix: resolve issue with pmid (#957) 2025-11-09 22:47:53 +08:00
akleine
d2d3944f50
feat: add support for SD2.x with TINY U-Nets (#939) 2025-11-09 22:47:37 +08:00
akleine
0fa3e1a383
fix: prevent core dump in PM V2 in case of incomplete cmd line (#950) 2025-11-09 22:36:43 +08:00
leejet
c2d8ffc22c
fix: compatibility for models with modified tensor shapes (#951) 2025-11-07 23:04:41 +08:00
stduhpf
fb748bb8a4
fix: TAE encoding (#935) 2025-11-07 22:58:59 +08:00
leejet
8f6c5c217b
refactor: simplify the model loading logic (#933)
* remove String2GGMLType

* remove preprocess_tensor

* fix clip init

* simplify the logic for reading weights
2025-11-03 21:21:34 +08:00
leejet
6103d86e2c
refactor: introduce GGMLRunnerContext (#928)
* introduce GGMLRunnerContext

* add Flash Attention enable control through GGMLRunnerContext

* add conv2d_direct enable control through GGMLRunnerContext
2025-11-02 02:11:04 +08:00
stduhpf
c42826b77c
fix: resolve multiple inpainting issues (#926)
* Fix inpainting masked image being broken by side effect

* Fix unet inpainting concat not being set

* Fix Flex.2 inpaint mode crash (+ use scale factor)
2025-11-02 02:10:32 +08:00
Wagner Bruna
945d9a9ee3
docs: add Koboldcpp as an available UI (#930) 2025-11-02 02:03:01 +08:00
Wagner Bruna
353e708844
docs: update ggml and llama.cpp URLs (#931) 2025-11-02 02:02:44 +08:00
leejet
dd75fc081c
refactor: unify the naming style of ggml extension functions (#921) 2025-10-28 23:26:48 +08:00
stduhpf
77eb95f8e4
docs: fix taesd direct download link (#917) 2025-10-28 23:26:23 +08:00
Wagner Bruna
8a45d0ff7f
chore: clean up stb includes (#919) 2025-10-28 23:25:45 +08:00
leejet
9e28be6479
feat: add chroma radiance support (#910)
* add chroma radiance support

* fix ci

* simplify generate_init_latent

* workaround: avoid ggml cuda error

* format code

* add chroma radiance doc
2025-10-25 23:56:14 +08:00
akleine
062490aa7c
feat: add SSD1B and tiny-sd support (#897)
* feat: add code and doc for running SSD1B models

* Added some more lines to support SD1.x with TINY U-Nets too.

* support SSD-1B.safetensors

* fix sdv1.5 diffusers format loader

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-10-25 23:35:54 +08:00
stduhpf
faabc5ad3c
feat: allow models to run without all text encoder(s) (#645) 2025-10-25 22:00:56 +08:00
leejet
69b9511ce9 sync: update ggml 2025-10-24 00:32:45 +08:00
stduhpf
917f7bfe99
fix: support --flow-shift for flux models with default pred (#913) 2025-10-23 21:35:18 +08:00
leejet
48e0a28ddf
feat: add shift factor support (#903) 2025-10-23 01:20:29 +08:00
leejet
d05e46ca5e
chore: add .clang-tidy configuration and apply modernize checks (#902) 2025-10-18 23:23:40 +08:00
Wagner Bruna
64a7698347
chore: report number of Qwen layers as info (#901) 2025-10-18 23:22:01 +08:00
leejet
0723ee51c9
refactor: optimize option printing (#900) 2025-10-18 17:50:30 +08:00
leejet
90ef5f8246
feat: add auto-resize support for reference images (was Qwen-Image-Edit only) (#898) 2025-10-18 16:37:09 +08:00
leejet
db6f4791b4
feat: add wtype stat (#899) 2025-10-17 23:40:32 +08:00
leejet
b25785bc10 sync: update ggml 2025-10-17 21:46:39 +08:00
leejet
0585e2609d docs: split README sections (build, performance, etc.) into separate docs 2025-10-16 23:22:06 +08:00
leejet
683d6d08a8 chore: add github issue template 2025-10-16 21:04:41 +08:00
leejet
40a6a8710e
fix: resolve precision issues in SDXL VAE under fp16 (#888)
* fix: resolve precision issues in SDXL VAE under fp16

* add --force-sdxl-vae-conv-scale option

* update docs
2025-10-15 23:01:00 +08:00
Daniele
e3702585cb
feat: added prediction argument (#334) 2025-10-15 23:00:10 +08:00
cmdr2
a7d6d296c7
chore: allow building ggml as a separate shared lib (#468) 2025-10-15 22:10:26 +08:00
leejet
2e9242e37f
feat: add Qwen Image Edit support (#877)
* add ref latent support for qwen image

* optimize clip_preprocess and fix get_first_stage_encoding

* add qwen2vl vit support

* add qwen image edit support

* fix qwen image edit pipeline

* add mmproj file support

* support dynamic number of Qwen image transformer blocks

* set prompt_template_encode_start_idx every time

* to_add_out precision fix

* to_out.0 precision fix

* update docs
2025-10-13 23:17:18 +08:00
Wagner Bruna
c64994dc1d
fix: better progress display for second-order samplers (#834) 2025-10-13 22:12:48 +08:00
Wagner Bruna
5436f6b814
fix: correct canny preprocessor (#861) 2025-10-13 22:02:35 +08:00
leejet
1c32fa03bc
fix: avoid generating black images when running T5 on the GPU (#882) 2025-10-13 00:01:06 +08:00
Wagner Bruna
9727c6bb98
fix: resolve VAE tiling problem in Qwen Image (#873) 2025-10-12 23:45:53 +08:00
leejet
beb99a2de2
feat: add Qwen Image support (#851)
* add qwen tokenizer

* add qwen2.5 vl support

* mv qwen.hpp -> qwenvl.hpp

* add qwen image model

* add qwen image t2i pipeline

* fix qwen image flash attn

* add qwen image i2i pipeline

* change encoding of vocab_qwen.hpp to utf8

* fix get_first_stage_encoding

* apply jeffbolz f32 patch

https://github.com/leejet/stable-diffusion.cpp/pull/851#issuecomment-3335515302

* fix the issue that occurs when using CUDA with k-quants weights

* optimize the handling of the FeedForward precision fix

* to_add_out precision fix

* update docs
2025-10-12 23:23:19 +08:00
Wagner Bruna
aa68b875b9
refactor: deal with default img-cfg-scale at the library level (#869) 2025-10-12 23:17:52 +08:00
Wagner Bruna
5b261b9cee
feat: add a stand-alone upscale mode (#865)
* feat: add a stand-alone upscale mode

* fix prompt option check

* format code

* update README.md

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-10-12 23:10:02 +08:00
Pedrito
e70d0205ca
feat: add support for more esrgan models & x2 & x1 models (#855) 2025-10-12 22:53:31 +08:00
leejet
02af48a97f
chore: fix vulkan ci (#878) 2025-10-11 00:40:57 +08:00
leejet
e12d5e0aaf
fix: ensure directory iteration results are sorted by filename (#858) 2025-10-11 00:18:39 +08:00
Serkan Sahin
940a2018e1
chore: fix dockerfile libgomp1 dependency + improvements (#852) 2025-10-11 00:17:45 +08:00
Sharuzzaman Ahmat Raslan
b451728b2f
docs: update README.md (#866) 2025-10-11 00:11:10 +08:00
stduhpf
11f436c483
feat: add support for Flux Controls and Flex.2 (#692) 2025-10-11 00:06:57 +08:00
leejet
35843c77ea
fix: optimize the handling of embedding weight (#859) 2025-09-25 23:09:59 +08:00
leejet
6ad46bb700 sync: update ggml 2025-09-25 21:57:43 +08:00
leejet
1ba30ce005 sync: update ggml 2025-09-25 00:38:38 +08:00
leejet
2abe9451c4
fix: optimize the handling of CLIP embedding weight (#840) 2025-09-25 00:28:20 +08:00
Wagner Bruna
f3140eadbb
fix: tensor loading thread count (#854) 2025-09-25 00:26:38 +08:00
Stefan-Olt
98ba155fc6
docs: HipBLAS / ROCm build instruction fix (#843) 2025-09-25 00:03:05 +08:00
Wagner Bruna
513f36d495
docs: include Vulkan compatibility for LoRA quants (#845) 2025-09-25 00:01:10 +08:00
rmatif
1e0d2821bb
fix: correct tensor deduplication logic (#844) 2025-09-24 23:22:40 +08:00
leejet
fd693ac6a2
refactor: remove unused --normalize-input parameter (#835) 2025-09-18 00:12:53 +08:00
Wagner Bruna
171b2222a5
fix: avoid segfault for pix2pix models without reference images (#766)
* fix: avoid segfault for pix2pix models with no reference images

* fix: default to empty reference on pix2pix models to avoid segfault

* use resize instead of reserve

* format code

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-09-18 00:11:38 +08:00
leejet
567f9f14f0 fix: avoid multithreading issues in the model loader 2025-09-18 00:00:15 +08:00
leejet
1e5f207006
chore: fix workflow (#836) 2025-09-17 22:11:55 +08:00
leejet
79426d578e chore: set release tag by commit count 2025-09-16 23:24:36 +08:00
vmobilis
97ad3e7ff9
refactor: simplify DPM++ (2S) Ancestral (#667) 2025-09-16 23:05:25 +08:00
Erik Scholz
8909523e92
refactor: move tiling calc and debug print into the tiling code branch (#833) 2025-09-16 22:46:56 +08:00
rmatif
8376dfba2a
feat: add sgm_uniform scheduler, simple scheduler, and support for NitroFusion (#675)
* feat: Add timestep shift and two new schedulers

* update readme

* fix spaces

* format code

* simplify SGMUniformSchedule

* simplify shifted_timestep logic

* avoid conflict

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-09-16 22:42:09 +08:00
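As background for the scheduler additions above: an `sgm_uniform`-style schedule simply spaces timesteps evenly between the endpoints and maps each to a sigma. A minimal sketch of that idea, assuming the standard formulation (the `t_to_sigma` mapping is a hypothetical stand-in, not this repository's API):

```cpp
#include <functional>
#include <vector>

// Hedged sketch of an sgm_uniform-style schedule (assumed standard form):
// n + 1 evenly spaced timesteps from t_max down to t_min, mapped to sigmas,
// with the final entry forced to 0 so sampling terminates at clean data.
std::vector<float> sgm_uniform(int n, float t_max, float t_min,
                               const std::function<float(float)>& t_to_sigma) {
    std::vector<float> sigmas(n + 1);
    for (int i = 0; i < n; i++) {
        float t   = t_max + (t_min - t_max) * i / n;  // even spacing in t
        sigmas[i] = t_to_sigma(t);
    }
    sigmas[n] = 0.0f;
    return sigmas;
}
```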
leejet
0ebe6fe118
refactor: simplify the logic of pm id image loading (#827) 2025-09-14 22:50:21 +08:00
rmatif
55c2e05d98
feat: optimize tensor loading time (#790)
* opt tensor loading

* fix build failure

* revert the changes

* allow the use of n_threads

* fix lora loading

* optimize lora loading

* add mutex

* use atomic

* fix build

* fix potential duplicate issue

* avoid duplicate lookup of lora tensor

* fix progress bar

* remove unused remove_duplicates

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-09-14 22:48:35 +08:00
leejet
52a97b3ac1
feat: add vace support (#819)
* add wan vace t2v support

* add --vace-strength option

* add vace i2v support

* fix the processing of vace_context

* add vace v2v support

* update docs
2025-09-14 16:57:33 +08:00
stduhpf
2c9b1e2594
feat: add VAE encoding tiling support and adaptive overlap (#484)
* implement tiling vae encode support

* Tiling (vae/upscale): adaptive overlap

* Tiling: fix edge case

* Tiling: fix crash when less than 2 tiles per dim

* remove extra dot

* Tiling: fix edge cases for adaptive overlap

* tiling: fix edge case

* set vae tile size via env var

* vae tiling: refactor again, based on smaller buffer for alignment

* Use bigger tiles for encode (to match compute buffer size)

* Fix edge case when tile is bigger than latent

* non-square VAE tiling (#3)

* refactor tile number calculation

* support non-square tiles

* add env var to change tile overlap

* add safeguards and better error messages for SD_TILE_OVERLAP

* add safeguards and include overlapping factor for SD_TILE_SIZE

* avoid rounding issues when specifying SD_TILE_SIZE as a factor

* lower SD_TILE_OVERLAP limit

* zero-init empty output buffer

* Fix decode latent size

* fix encode

* tile size params instead of env

* Tiled vae parameter validation (#6)

* avoid crash with invalid tile sizes, use 0 for default

* refactor default tile size, limit overlap factor

* remove explicit parameter for relative tile size

* limit encoding tile to latent size

* unify code style and format code

* update docs

* fix get_tile_sizes in decode_first_stage

---------

Co-authored-by: Wagner Bruna <wbruna@users.noreply.github.com>
Co-authored-by: leejet <leejet714@gmail.com>
2025-09-14 16:00:29 +08:00
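The "adaptive overlap" in this change can be summarized as: derive the tile count from a minimum overlap, then stretch the overlap so the tiles exactly cover the image. A rough sketch of that calculation (illustrative assumptions only, not the actual implementation):

```cpp
#include <cmath>

// Hedged sketch: given an image extent, a tile size and a minimum overlap
// (assumed min_overlap < tile), choose the tile count, then grow the overlap
// so coverage is exact: count*tile - (count-1)*overlap == extent.
struct TileLayout {
    int   count;
    float overlap;
};

TileLayout plan_tiles(int extent, int tile, float min_overlap) {
    if (tile >= extent) {
        return {1, 0.0f};  // a single tile covers everything
    }
    float stride   = tile - min_overlap;
    int   count    = (int)std::ceil((extent - tile) / stride) + 1;
    float adjusted = (float)(count * tile - extent) / (count - 1);
    return {count, adjusted};
}
```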
leejet
288e2d63c0 docs: update docs 2025-09-14 14:24:24 +08:00
leejet
dc46993b55
feat: increase work_ctx memory buffer size (#814) 2025-09-14 13:19:20 +08:00
Richard Palethorpe
a6a8569ea0
feat: Add SYCL Dockerfile (#651) 2025-09-14 13:02:59 +08:00
Erik Scholz
9e7befa320
fix: harden for large files (#643) 2025-09-14 12:44:19 +08:00
Wagner Bruna
c607fc3ed4
feat: use Euler sampling by default for SD3 and Flux (#753)
Thank you for your contribution.
2025-09-14 12:34:41 +08:00
Wagner Bruna
b54bec3f18
fix: do not force VAE type to f32 on SDXL (#716)
This seems to be a leftover from the initial SDXL support: it's
not enough to avoid NaN issues, and it's not needed for the
fixed sdxl-vae-fp16-fix.
2025-09-14 12:19:59 +08:00
Wagner Bruna
5869987fe4
fix: make weight override more robust against ggml changes (#760) 2025-09-14 12:15:53 +08:00
Wagner Bruna
48956ffb87
feat: reduce CLIP memory usage with no embeddings (#768) 2025-09-14 12:08:00 +08:00
Wagner Bruna
ddc4a18b92
fix: make tiled VAE reuse the compute buffer (#821) 2025-09-14 11:41:50 +08:00
leejet
fce6afcc6a
feat: add sd3 flash attn support (#815) 2025-09-11 23:24:29 +08:00
Erik Scholz
49d6570c43
feat: add SmoothStep Scheduler (#813) 2025-09-11 23:17:46 +08:00
clibdev
6bbaf161ad
chore: add install() support in CMakeLists.txt (#540) 2025-09-11 22:24:16 +08:00
clibdev
87cdbd5978
feat: use log_printf to print ggml logs (#545) 2025-09-11 22:16:05 +08:00
leejet
b017918106
chore: remove sd3 flash attention warn (#812) 2025-09-10 22:21:02 +08:00
Wagner Bruna
ac5a215998
fix: use {} for params init instead of memset (#781) 2025-09-10 21:49:29 +08:00
Wagner Bruna
abb36d66b5
chore: update flash attention warnings (#805) 2025-09-10 21:38:21 +08:00
Wagner Bruna
ff4fdbb88d
fix: accept NULL in sd_img_gen_params_t::input_id_images_path (#809) 2025-09-10 21:22:55 +08:00
Markus Hartung
abb115cd02
fix: clarify lora quant support and small fixes (#792) 2025-09-08 22:39:25 +08:00
leejet
c648001030
feat: add detailed tensor loading time stat (#793) 2025-09-07 22:51:44 +08:00
stduhpf
c587a43c99
feat: support incrementing ref image index (omni-kontext) (#755)
* kontext: support ref image indices

* lora: support x_embedder

* update help message

* Support for negative indices

* support for OmniControl (offsets at index 0)

* c++11 compat

* add --increase-ref-index option

* simplify the logic and fix some issues

* update README.md

* remove unused variable

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-09-07 22:35:16 +08:00
leejet
f8fe4e7db9
fix: add flash attn support check (#803) 2025-09-07 21:29:06 +08:00
leejet
1c07fb6fb1 docs: update docs/wan.md 2025-09-07 12:07:20 +08:00
leejet
675208dcb6 chore: update to c++17 2025-09-07 12:04:17 +08:00
leejet
d7f430cd69 docs: update docs and help message 2025-09-07 02:26:44 +08:00
stduhpf
141a4b4113
feat: add flow shift parameter (for SD3 and Wan) (#780)
* Add flow shift parameter (for SD3 and Wan)

* unify code style and fix some issues

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-09-07 02:16:59 +08:00
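For reference, flow-matching models like SD3 typically implement "flow shift" as a remapping of each sigma; a minimal sketch, assuming the common formulation σ' = shift·σ / (1 + (shift − 1)·σ):

```cpp
// Assumed SD3-style sigma shift: shift > 1 biases sampling toward
// higher-noise timesteps, which these models were trained to expect.
float apply_flow_shift(float sigma, float shift) {
    return shift * sigma / (1.0f + (shift - 1.0f) * sigma);
}
```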
stduhpf
21ce9fe2cf
feat: add support for timestep boundary based automatic expert routing in Wan MoE (#779)
* Wan MoE: Automatic expert routing based on timestep boundary

* unify code style and fix some issues

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-09-07 01:44:10 +08:00
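The routing idea itself is small: Wan2.2 MoE ships a high-noise and a low-noise expert, and the pipeline switches between them when the timestep crosses a boundary. A hedged sketch of that switch (names and the boundary value are illustrative, not taken from the PR):

```cpp
struct DiffusionModel;  // stand-in for the actual model type

// Hypothetical timestep-boundary routing for a two-expert Wan MoE setup.
struct WanMoE {
    DiffusionModel* high_noise_expert;  // handles early, noisy steps
    DiffusionModel* low_noise_expert;   // handles late, detail-refining steps
    float boundary;                     // e.g. 0.875 — assumed, not from the PR
};

DiffusionModel* route_expert(const WanMoE& moe, float t /* normalized, 1 = pure noise */) {
    return (t >= moe.boundary) ? moe.high_noise_expert : moe.low_noise_expert;
}
```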
leejet
cb1d975e96
feat: add wan2.1/2.2 support (#778)
* add wan vae support

* add wan model support

* add umt5 support

* add wan2.1 t2i support

* make flash attn work with wan

* make wan a little faster

* add wan2.1 t2v support

* add wan gguf support

* add offload params to cpu support

* add wan2.1 i2v support

* crop image before resize

* set default fps to 16

* add diff lora support

* fix wan2.1 i2v

* introduce sd_sample_params_t

* add wan2.2 t2v support

* add wan2.2 14B i2v support

* add wan2.2 ti2v support

* add high noise lora support

* sync: update ggml submodule url

* avoid build failure on linux

* avoid build failure

* update ggml

* update ggml

* fix sd_version_is_wan

* update ggml, fix cpu im2col_3d

* fix ggml_nn_attention_ext mask

* add cache support to ggml runner

* fix the issue of illegal memory access

* unify image loading processing

* add wan2.1/2.2 FLF2V support

* fix end_image mask

* update to latest ggml

* add GGUFReader

* update docs
2025-09-06 18:08:03 +08:00
Wagner Bruna
2eb3845df5
fix: typo in the verbose long flag (#783) 2025-09-04 00:49:01 +08:00
stduhpf
4c6475f917
feat: show usage on unknown arg (#767) 2025-09-01 21:38:34 +08:00
SmallAndSoft
f0fa7ddc40
docs: add compile option needed by Ninja (#770) 2025-09-01 21:35:25 +08:00
SmallAndSoft
a7c7905c6d
docs: add missing dash to docs/chroma.md (#771) 2025-09-01 21:34:34 +08:00
Wagner Bruna
eea77cbad9
feat: throttle model loading progress updates (#782)
Some terminals have slow display latency, so frequent output
during model loading can actually slow down the process.

Also, since tensor loading times can vary a lot, the progress
display now shows the average across past iterations instead
of just the last one.
2025-09-01 21:32:01 +08:00
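The two ideas in this change — rate-limiting the redraws and averaging per-tensor time — are easy to picture; a minimal sketch, assuming a simple callback design (not the project's actual progress code):

```cpp
#include <chrono>
#include <cstdio>

// Hedged sketch: skip redraws that arrive within 100 ms of the previous one,
// and report elapsed/done (the average over all past iterations) rather than
// the duration of just the last tensor.
void on_tensor_loaded(int done, int total) {
    using clock = std::chrono::steady_clock;
    static clock::time_point start = clock::now();
    static clock::time_point last_draw{};

    clock::time_point now = clock::now();
    if (done < total && now - last_draw < std::chrono::milliseconds(100)) {
        return;  // throttle: terminal output itself can be the bottleneck
    }
    last_draw = now;

    double elapsed = std::chrono::duration<double>(now - start).count();
    printf("\rloading tensors: %d/%d (%.3f s/tensor avg)", done, total,
           done > 0 ? elapsed / done : 0.0);
    fflush(stdout);
}
```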
NekopenDev
0e86d90ee4
chore: add Nvidia 30 series (cuda arch 86) to build 2025-09-01 21:21:34 +08:00
leejet
5900ef6605 sync: update ggml, make cuda im2col a little faster 2025-08-03 01:29:40 +08:00
Daniele
5b8996f74a
Conv2D direct support (#744)
* Conv2DDirect for VAE stage

* Enable only for Vulkan, reduced duplicated code

* Cmake option to use conv2d direct

* conv2d direct always on for opencl

* conv direct as a flag

* fix merge typo

* Align conv2d behavior to flash attention's

* fix readme

* add conv2d direct for controlnet

* add conv2d direct for esrgan

* clean code, use enable_conv2d_direct/get_all_blocks

* format code

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-08-03 01:25:17 +08:00
Wagner Bruna
f7f05fb185
chore: avoid setting GGML_MAX_NAME when building against external ggml (#751)
An external ggml will most likely have been built with the default
GGML_MAX_NAME value (64), which would be inconsistent with the value
set by our build (128). That would be an ODR violation, and it could
easily cause memory corruption issues due to the different
sizeof(struct ggml_tensor) values.

For now, when linking against an external ggml, we demand it has been
patched with a bigger GGML_MAX_NAME, since we can't check against a
value defined only at build time.
2025-08-03 01:24:40 +08:00
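The reasoning in this commit message is worth a concrete picture: `GGML_MAX_NAME` sizes a character array embedded in `struct ggml_tensor`, so two binaries built with different values disagree on the struct's size. An illustrative sketch (the struct here is a simplified stand-in, not ggml's real definition):

```cpp
#include <cstdio>

#ifndef GGML_MAX_NAME
#define GGML_MAX_NAME 64  // external ggml default; this project builds with 128
#endif

// Simplified stand-in for struct ggml_tensor: the name buffer's length is
// baked into the struct layout, so mismatched GGML_MAX_NAME values across
// a library boundary shift every field after it — an ODR/ABI violation.
struct tensor_like {
    long long ne[4];
    char      name[GGML_MAX_NAME];
};

int main() {
    printf("sizeof(tensor_like) with GGML_MAX_NAME=%d: %zu\n",
           GGML_MAX_NAME, sizeof(tensor_like));
    return 0;
}
```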
Seas0
6167e2927a
feat: support build against system installed GGML library (#749) 2025-08-02 11:03:18 +08:00
leejet
f6b9aa1a43 refactor: optimize the usage of tensor_types 2025-07-28 23:18:29 +08:00
Wagner Bruna
7eb30d00e5
feat: add missing models and parameters to image metadata (#743)
* feat: add new scheduler types, clip skip and vae to image embedded params

- If a non default scheduler is set, include it in the 'Sampler' tag in the data
embedded into the final image.
- If a custom VAE path is set, include the vae name (without path and extension)
in embedded image params under a `VAE:` tag.
- If a custom Clip skip is set, include that Clip skip value in embedded image
params under a `Clip skip:` tag.

* feat: add separate diffusion and text models to metadata

---------

Co-authored-by: one-lithe-rune <skapusniak@lithe-runes.com>
2025-07-28 22:00:27 +08:00
stduhpf
59080d3ce1
feat: change image dimensions requirement for DiT models (#742) 2025-07-28 21:58:17 +08:00
R0CKSTAR
8c3c788f31
feat: upgrade musa sdk to rc4.2.0 (#732) 2025-07-28 21:51:11 +08:00
leejet
f54524f620 sync: update ggml 2025-07-28 21:50:12 +08:00
leejet
eed97a5e1d sync: update ggml 2025-07-24 23:04:08 +08:00
Ettore Di Giacinto
fb86bf4cb0
docs: add LocalAI to README's UIs (#741) 2025-07-24 22:39:26 +08:00
leejet
bd1eaef93e fix: convert f64 to f32 and i64 to i32 when loading weights 2025-07-24 00:59:38 +08:00
Erik Scholz
ab835f7d39
fix: correct head dim check and L_k padding of flash attention (#736) 2025-07-24 00:57:45 +08:00
Daniele
26f3f61d37
docs: add sd.cpp-webui as an available frontend (#738) 2025-07-23 23:51:57 +08:00
Oleg Skutte
1896b28ef2
fix: make --taesd work (#731) 2025-07-15 00:45:22 +08:00
leejet
0739361bfe fix: avoid macOS build failed 2025-07-13 20:18:10 +08:00
leejet
ca0bd9396e
refactor: update c api (#728) 2025-07-13 18:48:42 +08:00
stduhpf
a772dca27a
feat: add Instruct-Pix2pix/CosXL-Edit support (#679)
* Instruct-p2p support

* support 2 conditionings cfg

* Do not re-encode the exact same image twice

* fixes for 2-cfg

* Fix pix2pix latent inputs + improve inpainting a bit + fix naming

* prepare for other pix2pix-like models

* Support sdxl ip2p

* fix reference image embeddings

* Support 2-cond cfg properly in cli

* fix typo in help

* Support masks for ip2p models

* unify code style

* delete unused code

* use edit mode

* add img_cond

* format code

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-07-12 15:36:45 +08:00
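The "support 2 conditionings cfg" bullet refers to the InstructPix2Pix-style guidance combine, which blends a text scale and an image scale; a sketch using the formulation from the InstructPix2Pix paper (variable names here are illustrative):

```cpp
#include <cstddef>
#include <vector>

// InstructPix2Pix-style two-conditioning CFG (paper formulation):
// out = uncond + text_cfg * (full_cond - img_cond) + img_cfg * (img_cond - uncond)
std::vector<float> cfg_combine_2cond(const std::vector<float>& full_cond,  // eps(text, image)
                                     const std::vector<float>& img_cond,   // eps(none, image)
                                     const std::vector<float>& uncond,     // eps(none, none)
                                     float text_cfg, float img_cfg) {
    std::vector<float> out(uncond.size());
    for (size_t i = 0; i < out.size(); i++) {
        out[i] = uncond[i]
               + text_cfg * (full_cond[i] - img_cond[i])
               + img_cfg  * (img_cond[i]  - uncond[i]);
    }
    return out;
}
```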
Wagner Bruna
6d84a30c66
feat: overriding quant types for specific tensors on model conversion (#724) 2025-07-08 00:11:38 +08:00
stduhpf
dafc32d0dd
feat: add support for f64/i64 and clip_g diffusers model (#681) 2025-07-06 23:24:55 +08:00
idostyle
225162f270
fix: mark encoder.embed_tokens.weight as unused tensor (#721) 2025-07-06 23:10:10 +08:00
leejet
b9e4718fac fix: correct --chroma-enable-t5-mask argument 2025-07-06 11:11:47 +08:00
113 changed files with 1413237 additions and 7683 deletions

.clang-tidy (new file, 10 lines)

@@ -0,0 +1,10 @@
Checks: >
  modernize-make-shared,
  modernize-use-nullptr,
  modernize-use-override,
  modernize-pass-by-value,
  modernize-return-braced-init-list,
  modernize-deprecated-headers,
HeaderFilterRegex: '^$'
WarningsAsErrors: ''
FormatStyle: none

.github/ISSUE_TEMPLATE/bug_report.yml (new file, 73 lines)

@@ -0,0 +1,73 @@
name: 🐞 Bug Report
description: Report a bug or unexpected behavior
title: "[Bug] "
labels: ["bug"]
body:
  - type: markdown
    attributes:
      value: |
        Please use this template and include as many details as possible to help us reproduce and fix the issue.
  - type: textarea
    id: commit
    attributes:
      label: Git commit
      description: Which commit are you trying to compile?
      placeholder: |
        $ git rev-parse HEAD
        40a6a8710ec15b1b5db6b5a098409f6bc8f654a4
    validations:
      required: true
  - type: input
    id: os
    attributes:
      label: Operating System & Version
      placeholder: e.g. “Ubuntu 22.04”, “Windows 11 23H2”, “macOS 14.3”
    validations:
      required: true
  - type: dropdown
    id: backends
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [CPU, CUDA, HIP, Metal, Musa, SYCL, Vulkan, OpenCL]
      multiple: true
    validations:
      required: true
  - type: input
    id: cmd_arguments
    attributes:
      label: Command-line arguments used
      placeholder: The full command line you ran (with all flags)
    validations:
      required: true
  - type: textarea
    id: steps_to_reproduce
    attributes:
      label: Steps to reproduce
      placeholder: A step-by-step list of what you did
    validations:
      required: true
  - type: textarea
    id: expected_behavior
    attributes:
      label: What you expected to happen
      placeholder: Describe the expected behavior or result
    validations:
      required: true
  - type: textarea
    id: actual_behavior
    attributes:
      label: What actually happened
      placeholder: Describe what you saw instead (errors, logs, crash, etc.)
    validations:
      required: true
  - type: textarea
    id: logs_and_errors
    attributes:
      label: Logs / error messages / stack trace
      placeholder: Paste complete logs or error output
  - type: textarea
    id: additional_info
    attributes:
      label: Additional context / environment details
      placeholder: e.g. CPU model, GPU, RAM, model file versions, quantization type, etc.
.github/ISSUE_TEMPLATE/feature_request.yml (new file, 33 lines)

@@ -0,0 +1,33 @@
name: 💡 Feature Request
description: Suggest a new feature or improvement
title: "[Feature] "
labels: ["enhancement"]
body:
  - type: markdown
    attributes:
      value: |
        Thank you for suggesting an improvement! Please fill in the fields below.
  - type: input
    id: summary
    attributes:
      label: Feature Summary
      placeholder: A one-line summary of the feature you'd like
    validations:
      required: true
  - type: textarea
    id: description
    attributes:
      label: Detailed Description
      placeholder: What problem does this solve? How do you expect it to work?
    validations:
      required: true
  - type: textarea
    id: alternatives
    attributes:
      label: Alternatives you considered
      placeholder: Any alternative designs or workarounds you tried
  - type: textarea
    id: additional_context
    attributes:
      label: Additional context
      placeholder: Any extra information (use cases, related functionalities, constraints)

.github/workflows/build.yml

@@ -65,7 +65,7 @@ jobs:
    - name: Get commit hash
      id: commit
-     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
+     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
      uses: pr-mpt/actions-commit-hash@v2

    - name: Fetch system info

@@ -118,7 +118,7 @@
    - name: Get commit hash
      id: commit
-     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
+     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
      uses: pr-mpt/actions-commit-hash@v2

    - name: Fetch system info

@@ -149,7 +149,7 @@
  runs-on: windows-2025
  env:
-   VULKAN_VERSION: 1.3.261.1
+   VULKAN_VERSION: 1.4.328.1
  strategy:
    matrix:

@@ -163,9 +163,7 @@
      - build: "avx512"
        defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
      - build: "cuda12"
-       defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;80;75"
+       defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120'"
-     # - build: "rocm5.5"
-     #   defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
      - build: 'vulkan'
        defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
  steps:
@@ -178,30 +176,17 @@
    - name: Install cuda-toolkit
      id: cuda-toolkit
      if: ${{ matrix.build == 'cuda12' }}
-     uses: Jimver/cuda-toolkit@v0.2.19
+     uses: Jimver/cuda-toolkit@v0.2.22
      with:
-       cuda: "12.6.2"
+       cuda: "12.8.1"
        method: "network"
        sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'

-   - name: Install rocm-toolkit
-     id: rocm-toolkit
-     if: ${{ matrix.build == 'rocm5.5' }}
-     uses: Cyberhan123/rocm-toolkit@v0.1.0
-     with:
-       rocm: "5.5.0"
-
-   - name: Install Ninja
-     id: install-ninja
-     if: ${{ matrix.build == 'rocm5.5' }}
-     uses: urkle/action-get-ninja@v1
-     with:
-       version: 1.11.1
-
    - name: Install Vulkan SDK
      id: get_vulkan
      if: ${{ matrix.build == 'vulkan' }}
      run: |
-       curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+       curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
        & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
        Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
        Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
@@ -254,7 +239,7 @@
    - name: Copy and pack Cuda runtime
      id: pack_cuda_runtime
-     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
+     if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
      run: |
        echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
        $dst='.\build\bin\cudart\'

@@ -262,7 +247,7 @@
        7z a cudart-sd-bin-win-cu12-x64.zip $dst\*

    - name: Upload Cuda runtime
-     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
+     if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
      uses: actions/upload-artifact@v4
      with:
        name: sd-cudart-sd-bin-win-cu12-x64.zip
@@ -277,6 +262,104 @@
        path: |
          sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip

+ windows-latest-cmake-hip:
+   runs-on: windows-2022
+   env:
+     HIPSDK_INSTALLER_VERSION: "25.Q3"
+     GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
+   steps:
+     - uses: actions/checkout@v3
+       with:
+         submodules: recursive
+
+     - name: Cache ROCm Installation
+       id: cache-rocm
+       uses: actions/cache@v4
+       with:
+         path: C:\Program Files\AMD\ROCm
+         key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+
+     - name: ccache
+       uses: ggml-org/ccache-action@v1.2.16
+       with:
+         key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64
+         evict-old-files: 1d
+
+     - name: Install ROCm
+       if: steps.cache-rocm.outputs.cache-hit != 'true'
+       run: |
+         $ErrorActionPreference = "Stop"
+         write-host "Downloading AMD HIP SDK Installer"
+         Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+         write-host "Installing AMD HIP SDK"
+         $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
+         $completed = $proc.WaitForExit(600000)
+         if (-not $completed) {
+           Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
+           $proc.Kill()
+           exit 1
+         }
+         if ($proc.ExitCode -ne 0) {
+           Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
+           exit 1
+         }
+         write-host "Completed AMD HIP SDK installation"
+
+     - name: Verify ROCm
+       run: |
+         # Find and test ROCm installation
+         $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
+         if (-not $clangPath) {
+           Write-Error "ROCm installation not found"
+           exit 1
+         }
+         & $clangPath.FullName --version
+         # Set HIP_PATH environment variable for later steps
+         echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV
+
+     - name: Build
+       run: |
+         mkdir build
+         cd build
+         $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+         cmake .. `
+           -G "Unix Makefiles" `
+           -DSD_HIPBLAS=ON `
+           -DSD_BUILD_SHARED_LIBS=ON `
+           -DGGML_NATIVE=OFF `
+           -DCMAKE_C_COMPILER=clang `
+           -DCMAKE_CXX_COMPILER=clang++ `
+           -DCMAKE_BUILD_TYPE=Release `
+           -DGPU_TARGETS="${{ env.GPU_TARGETS }}"
+         cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
+
+     - name: Get commit hash
+       id: commit
+       if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+       uses: pr-mpt/actions-commit-hash@v2
+
+     - name: Pack artifacts
+       if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+       run: |
+         md "build\bin\rocblas\library\"
+         md "build\bin\hipblaslt\library"
+         cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+         cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
+         cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
+         cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
+         cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
+         7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\*
+
+     - name: Upload artifacts
+       if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+       uses: actions/upload-artifact@v4
+       with:
+         name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+         path: |
+           sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@@ -286,8 +369,14 @@
      - ubuntu-latest-cmake
      - macOS-latest-cmake
      - windows-latest-cmake
+     - windows-latest-cmake-hip

    steps:
+     - name: Clone
+       uses: actions/checkout@v3
+       with:
+         fetch-depth: 0
+
      - name: Download artifacts
        id: download-artifact
        uses: actions/download-artifact@v4

@@ -296,20 +385,27 @@
          pattern: sd-*
          merge-multiple: true

+     - name: Get commit count
+       id: commit_count
+       run: |
+         echo "count=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+
      - name: Get commit hash
        id: commit
        uses: pr-mpt/actions-commit-hash@v2

      - name: Create release
        id: create_release
+       if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
        uses: anzz1/action-create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
-         tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
+         tag_name: ${{ format('{0}-{1}-{2}', env.BRANCH_NAME, steps.commit_count.outputs.count, steps.commit.outputs.short) }}

      - name: Upload release
        id: upload_release
+       if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
        uses: actions/github-script@v3
        with:
          github-token: ${{secrets.GITHUB_TOKEN}}

.gitignore (6 lines changed)

@@ -1,13 +1,15 @@
  build*/
+ cmake-build-*/
  test/
  .vscode/
+ .idea/
  .cache/
  *.swp
- .vscode/
  *.bat
  *.bin
  *.exe
  *.gguf
  output*.png
  models*
  *.log
+ preview.png

.gitmodules (2 lines changed)

@@ -1,3 +1,3 @@
  [submodule "ggml"]
      path = ggml
-     url = https://github.com/ggerganov/ggml.git
+     url = https://github.com/ggml-org/ggml.git

CMakeLists.txt

@@ -33,6 +33,8 @@ option(SD_SYCL "sd: sycl backend" OFF)
  option(SD_MUSA "sd: musa backend" OFF)
  option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
  option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
+ option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF)
+ option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
  #option(SD_BUILD_SERVER "sd: build server example" ON)

  if(SD_CUDA)

@@ -85,18 +87,53 @@ file(GLOB SD_LIB_SOURCES
      "*.hpp"
  )

- # we can get only one share lib
+ find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
+ if(GIT_EXE)
+     execute_process(COMMAND ${GIT_EXE} describe --tags --abbrev=7 --dirty=+
+         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+         OUTPUT_VARIABLE SDCPP_BUILD_VERSION
+         OUTPUT_STRIP_TRAILING_WHITESPACE
+         ERROR_QUIET
+     )
+     execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
+         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+         OUTPUT_VARIABLE SDCPP_BUILD_COMMIT
+         OUTPUT_STRIP_TRAILING_WHITESPACE
+         ERROR_QUIET
+     )
+ endif()
+ if(NOT SDCPP_BUILD_VERSION)
+     set(SDCPP_BUILD_VERSION unknown)
+ endif()
+ message(STATUS "stable-diffusion.cpp version ${SDCPP_BUILD_VERSION}")
+ if(NOT SDCPP_BUILD_COMMIT)
+     set(SDCPP_BUILD_COMMIT unknown)
+ endif()
+ message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
+ set_property(
+     SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp
+     APPEND PROPERTY COMPILE_DEFINITIONS
+     SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
+ )
+
  if(SD_BUILD_SHARED_LIBS)
      message("-- Build shared library")
      message(${SD_LIB_SOURCES})
-     set(BUILD_SHARED_LIBS OFF)
+     if(NOT SD_BUILD_SHARED_GGML_LIB)
+         set(BUILD_SHARED_LIBS OFF)
+     endif()
      add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
      add_definitions(-DSD_BUILD_SHARED_LIB)
      target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
      set(CMAKE_POSITION_INDEPENDENT_CODE ON)
  else()
      message("-- Build static library")
-     set(BUILD_SHARED_LIBS OFF)
+     if(NOT SD_BUILD_SHARED_GGML_LIB)
+         set(BUILD_SHARED_LIBS OFF)
+     endif()
      add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
  endif()

@@ -118,23 +155,37 @@ endif()
  set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

- # see https://github.com/ggerganov/ggml/pull/682
- add_definitions(-DGGML_MAX_NAME=128)
+ if (NOT SD_USE_SYSTEM_GGML)
+     # see https://github.com/ggerganov/ggml/pull/682
+     add_definitions(-DGGML_MAX_NAME=128)
+ endif()

  # deps
  # Only add ggml if it hasn't been added yet
  if (NOT TARGET ggml)
-     add_subdirectory(ggml)
+     if (SD_USE_SYSTEM_GGML)
+         find_package(ggml REQUIRED)
+         if (NOT ggml_FOUND)
+             message(FATAL_ERROR "System-installed GGML library not found.")
+         endif()
+         add_library(ggml ALIAS ggml::ggml)
+     else()
+         add_subdirectory(ggml)
+     endif()
  endif()

  add_subdirectory(thirdparty)

  target_link_libraries(${SD_LIB} PUBLIC ggml zip)
  target_include_directories(${SD_LIB} PUBLIC . thirdparty)
- target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
+ target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)

  if (SD_BUILD_EXAMPLES)
      add_subdirectory(examples)
  endif()

+ set(SD_PUBLIC_HEADERS stable-diffusion.h)
+ set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}")
+ install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER)
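For context on the version-embedding block above, a `version.cpp` consuming those `COMPILE_DEFINITIONS` only needs to stringify the macros; a minimal sketch (assumed, since the repository's actual version.cpp is not part of this diff):

```cpp
// Hypothetical version.cpp: turn the SDCPP_BUILD_* definitions injected by
// CMake into C strings (two-level macro expansion stringifies the value).
#define SD_STR_HELPER(x) #x
#define SD_STR(x) SD_STR_HELPER(x)

extern "C" const char* sd_build_version() { return SD_STR(SDCPP_BUILD_VERSION); }
extern "C" const char* sd_build_commit()  { return SD_STR(SDCPP_BUILD_COMMIT); }
```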

Dockerfile

@@ -1,16 +1,21 @@
  ARG UBUNTU_VERSION=22.04

- FROM ubuntu:$UBUNTU_VERSION as build
+ FROM ubuntu:$UBUNTU_VERSION AS build

- RUN apt-get update && apt-get install -y build-essential git cmake
+ RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake

  WORKDIR /sd.cpp
  COPY . .
- RUN mkdir build && cd build && cmake .. && cmake --build . --config Release
+ RUN cmake . -B ./build
+ RUN cmake --build ./build --config Release --parallel

- FROM ubuntu:$UBUNTU_VERSION as runtime
+ FROM ubuntu:$UBUNTU_VERSION AS runtime
+ RUN apt-get update && \
+     apt-get install --yes --no-install-recommends libgomp1 && \
+     apt-get clean

  COPY --from=build /sd.cpp/build/bin/sd /sd
Dockerfile.musa

@@ -1,6 +1,7 @@
- ARG MUSA_VERSION=rc3.1.1
+ ARG MUSA_VERSION=rc4.2.0
+ ARG UBUNTU_VERSION=22.04

- FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu22.04 as build
+ FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 as build

  RUN apt-get update && apt-get install -y ccache cmake git

@@ -15,7 +16,7 @@ RUN mkdir build && cd build && \
      -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
      cmake --build . --config Release

- FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu22.04 as runtime
+ FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 as runtime

  COPY --from=build /sd.cpp/build/bin/sd /sd

Dockerfile.sycl (new file, 19 lines)

@@ -0,0 +1,19 @@
ARG SYCL_VERSION=2025.1.0-0
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS build
RUN apt-get update && apt-get install -y cmake
WORKDIR /sd.cpp
COPY . .
RUN mkdir build && cd build && \
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build . --config Release -j$(nproc)
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime
COPY --from=build /sd.cpp/build/bin/sd /sd
ENTRYPOINT [ "/sd" ]

README.md (449 lines changed)

@@ -1,40 +1,86 @@
  <p align="center">
-   <img src="./assets/cat_with_sd_cpp_42.png" width="360x">
+   <img src="./assets/logo.png" width="360x">
  </p>

  # stable-diffusion.cpp

- Inference of Stable Diffusion and Flux in pure C/C++
+ <div align="center">
+   <a href="https://trendshift.io/repositories/9714" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9714" alt="leejet%2Fstable-diffusion.cpp | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+ </div>
+
+ Diffusion model (SD, Flux, Wan, ...) inference in pure C/C++
+
+ ***Note that this project is under active development. \
+ API and command-line options may change frequently.***
+
+ ## 🔥Important News
+
+ * **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image**
+   👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020)
+ * **2025/11/30** 🚀 stable-diffusion.cpp now supports **FLUX.2-dev**
+   👉 Details: [PR #1016](https://github.com/leejet/stable-diffusion.cpp/pull/1016)
+ * **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**
+   👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)
+ * **2025/10/12** 🚀 stable-diffusion.cpp now supports **Qwen-Image**
+   👉 Details: [PR #851](https://github.com/leejet/stable-diffusion.cpp/pull/851)
+ * **2025/09/14** 🚀 stable-diffusion.cpp now supports **Wan2.1 Vace**
+   👉 Details: [PR #819](https://github.com/leejet/stable-diffusion.cpp/pull/819)
+ * **2025/09/06** 🚀 stable-diffusion.cpp now supports **Wan2.1 / Wan2.2**
+   👉 Details: [PR #778](https://github.com/leejet/stable-diffusion.cpp/pull/778)

  ## Features

- - Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
+ - Plain C/C++ implementation based on [ggml](https://github.com/ggml-org/ggml), working in the same way as [llama.cpp](https://github.com/ggml-org/llama.cpp)
  - Super lightweight and without external dependencies
- - SD1.x, SD2.x, SDXL and [SD3/SD3.5](./docs/sd3.md) support
-   - !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
- - [Flux-dev/Flux-schnell Support](./docs/flux.md)
- - [FLUX.1-Kontext-dev](./docs/kontext.md)
- - [Chroma](./docs/chroma.md)
- - [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo) and [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) support
- - [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
- - 16-bit, 32-bit float support
- - 2-bit, 3-bit, 4-bit, 5-bit and 8-bit integer quantization support
- - Accelerated memory-efficient CPU inference
-   - Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image, enabling Flash Attention just requires ~1.8GB.
- - AVX, AVX2 and AVX512 support for x86 architectures
- - Full CUDA, Metal, Vulkan, OpenCL and SYCL backend for GPU acceleration.
- - Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs models
-   - No need to convert to `.ggml` or `.gguf` anymore!
+ - Supported models
+   - Image Models
+     - SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
+     - SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
+     - [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
+     - [SD3/SD3.5](./docs/sd3.md)
+     - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
+     - [FLUX.2-dev](./docs/flux2.md)
+     - [Chroma](./docs/chroma.md)
+     - [Chroma1-Radiance](./docs/chroma_radiance.md)
+     - [Qwen Image](./docs/qwen_image.md)
+     - [Z-Image](./docs/z_image.md)
+     - [Ovis-Image](./docs/ovis_image.md)
+   - Image Edit Models
+     - [FLUX.1-Kontext-dev](./docs/kontext.md)
+     - [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
+   - Video Models
+     - [Wan2.1/Wan2.2](./docs/wan.md)
+ - [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
+ - Control Net support with SD 1.5
+ - LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
+ - Latent Consistency Models support (LCM/LCM-LoRA)
+ - Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
+ - Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
+ - Supported backends
+   - CPU (AVX, AVX2 and AVX512 support for x86 architectures)
+   - CUDA
+   - Vulkan
+   - Metal
+   - OpenCL
+   - SYCL
+ - Supported weight formats
+   - PyTorch checkpoint (`.ckpt` or `.pth`)
+   - Safetensors (`.safetensors`)
+   - GGUF (`.gguf`)
+ - Supported platforms
+   - Linux
+   - Mac OS
+   - Windows
+   - Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
  - Flash Attention for memory usage optimization
- - Original `txt2img` and `img2img` mode
  - Negative prompt
  - [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
- - LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
- - Latent Consistency Models support (LCM/LCM-LoRA)
- - Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
- - Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
  - VAE tiling processing to reduce memory usage
- - Control Net support with SD 1.5
  - Sampling method
    - `Euler A`
    - `Euler`
@@ -44,342 +90,52 @@ Inference of Stable Diffusion and Flux in pure C/C++
    - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
    - `DPM++ 2S a`
    - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
- - Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
+ - Cross-platform reproducibility
+   - `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
+   - `--rng cpu`, consistent with the `comfyui RNG`
  - Embeds generation parameters into png output as a webui-compatible text string
- - Supported platforms
-   - Linux
-   - Mac OS
-   - Windows
-   - Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))

- ### TODO
+ ## Quick Start

- - [ ] More sampling methods
- - [ ] Make inference faster
-   - The current implementation of ggml_conv_2d is slow and has high memory usage
- - [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
- - [ ] Implement Inpainting support
+ ### Get the sd executable

- ## Usage
+ - Download pre-built binaries from the [releases page](https://github.com/leejet/stable-diffusion.cpp/releases)
+ - Or build from source by following the [build guide](./docs/build.md)

- For most users, you can download the built executable program from the latest [release](https://github.com/leejet/stable-diffusion.cpp/releases/latest).
- If the built product does not meet your requirements, you can choose to build it manually.
+ ### Download model weights

- ### Get the Code
+ - download weights (.ckpt or .safetensors or .gguf). For example
+   - Stable Diffusion v1.5 from https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5

- ```
- git clone --recursive https://github.com/leejet/stable-diffusion.cpp
- cd stable-diffusion.cpp
- ```
+ ```sh
+ curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
+ ```

- - If you have already cloned the repository, you can use the following command to update the repository to the latest code.
-
- ```
- cd stable-diffusion.cpp
- git pull origin master
- git submodule init
- git submodule update
- ```
-
- ### Download weights
-
- - download original weights (.ckpt or .safetensors). For example
-   - Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
-   - Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
-   - Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
-   - Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
-
- ```shell
- curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
- # curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
- # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-nonema-pruned.safetensors
- # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors
- ```

- ### Build
+ ### Generate an image with just one command
#### Build from scratch
```shell
mkdir build
cd build
cmake ..
cmake --build . --config Release
```
##### Using OpenBLAS
```
cmake .. -DGGML_OPENBLAS=ON
cmake --build . --config Release
```
##### Using CUDA
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
```
cmake .. -DSD_CUDA=ON
cmake --build . --config Release
```
##### Using HipBLAS
This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
Windows users: refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
```
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
cmake --build . --config Release
```
##### Using MUSA
This provides BLAS acceleration using the MUSA cores of your Moore Threads GPU. Make sure to have the MUSA toolkit installed.
```bash
cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
cmake --build . --config Release
```
##### Using Metal
Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
```
cmake .. -DSD_METAL=ON
cmake --build . --config Release
```
##### Using Vulkan
Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
```
cmake .. -DSD_VULKAN=ON
cmake --build . --config Release
```
##### Using OpenCL (for Adreno GPU)

Currently, it supports only Adreno GPUs and is primarily optimized for the Q4_0 type.

To build for Windows ARM, please refer to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64).
Building for Android:

Android NDK:
Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).

Set up OpenCL dependencies for the NDK:
You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
* OpenCL Headers:
```bash
# In a temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
cd ..
```
* OpenCL ICD Loader:
```bash
# In the same temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
mkdir build_ndk && cd build_ndk
# Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=24 \
-DANDROID_STL=c++_shared
ninja
# Replace <YOUR_NDK_PATH>
# e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
cd ../..
```
Build `stable-diffusion.cpp` for Android with OpenCL:
```bash
mkdir build-android && cd build-android
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
cmake .. -G Ninja \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DGGML_OPENMP=OFF \
-DSD_OPENCL=ON
ninja
```
*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
##### Using SYCL
Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and the [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before you start. For more details and steps, refer to the [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux).
```
# Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
cmake --build . --config Release
```
Example of text2img using the SYCL backend:

- download the `stable-diffusion` model weights; refer to [download weights](#download-weights).
- run `./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors --cfg-scale 5 --steps 30 --sampling-method euler -H 1024 -W 1024 --seed 42 -p "fantasy medieval village world inside a glass sphere , high detail, fantasy, realistic, light effect, hyper detail, volumetric lighting, cinematic, macro, depth of field, blur, red light and clouds from the back, highly detailed epic cinematic concept art cg render made in maya, blender and photoshop, octane render, excellent composition, dynamic dramatic cinematic lighting, aesthetic, very inspirational, world inside a glass sphere by james gurney by artgerm with james jean, joe fenton and tristan eaton by ross tran, fine details, 4k resolution"`
<p align="center">
<img src="./assets/sycl_sd3_output.png" width="360x">
</p>
##### Using Flash Attention

Enabling flash attention for the diffusion model reduces memory usage by a model-dependent amount, e.g.:
- flux 768x768: ~600 MB
- SD2 768x768: ~1400 MB

For most backends it slows things down, but for CUDA it generally speeds things up too.
At the moment, it is only supported for some models and some backends (like cpu, cuda/rocm, metal).

Run by adding `--diffusion-fa` to the arguments and watch for:
```
[INFO ] stable-diffusion.cpp:312 - Using flash attention in the diffusion model
```
and the compute buffer shrink in the debug log:
```
[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
```
### Run
```
usage: ./bin/sd [arguments]
arguments:
-h, --help show this help message and exit
  -M, --mode [MODE]                  run mode (txt2img or img2img or convert, default: txt2img)
-t, --threads N number of threads to use during computation (default: -1)
If threads <= 0, then threads will be set to the number of CPU physical cores
-m, --model [MODEL] path to full model
--diffusion-model path to the standalone diffusion model
--clip_l path to the clip-l text encoder
--clip_g path to the clip-g text encoder
  --t5xxl                            path to the t5xxl text encoder
--vae [VAE] path to vae
--taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--control-net [CONTROL_PATH] path to control net model
--embd-dir [EMBEDDING_PATH] path to embeddings
--stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings
--input-id-images-dir [DIR] path to PHOTOMAKER input id images dir
--normalize-input normalize PHOTOMAKER input id images
  --upscale-model [ESRGAN_PATH]      path to esrgan model. Upscale images after generation; only RealESRGAN_x4plus_anime_6B is supported for now
--upscale-repeats Run the ESRGAN upscaler this many times (default 1)
--type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
If not specified, the default is the type of the weight file
--lora-model-dir [DIR] lora model directory
-i, --init-img [IMAGE] path to the input image, required by img2img
--mask [MASK] path to the mask image, required by img2img with mask
--control-image [IMAGE] path to image condition, control net
-r, --ref_image [PATH] reference image for Flux Kontext models (can be used multiple times)
-o, --output OUTPUT path to write result image to (default: ./output.png)
-p, --prompt [PROMPT] the prompt to render
-n, --negative-prompt PROMPT the negative prompt (default: "")
--cfg-scale SCALE unconditional guidance scale: (default: 7.0)
--guidance SCALE guidance scale for img2img (default: 3.5)
--slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)
0 means disabled, a value of 2.5 is nice for sd3.5 medium
--eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)
--skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])
--skip-layer-start START SLG enabling point: (default: 0.01)
--skip-layer-end END SLG disabling point: (default: 0.2)
SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
--strength STRENGTH strength for noising/unnoising (default: 0.75)
--style-ratio STYLE-RATIO strength for keeping input identity (default: 20%)
--control-strength STRENGTH strength to apply Control Net (default: 0.9)
1.0 corresponds to full destruction of information in init image
-H, --height H image height, in pixel space (default: 512)
-W, --width W image width, in pixel space (default: 512)
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
sampling method (default: "euler_a")
--steps STEPS number of sample steps (default: 20)
--rng {std_default, cuda} RNG (default: cuda)
-s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
-b, --batch-count COUNT number of images to generate
--schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete)
--clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
--vae-tiling process vae in tiles to reduce memory usage
--vae-on-cpu keep vae in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--diffusion-fa use flash attention in the diffusion model (for low vram)
Might lower quality, since it implies converting k and v to f16.
This might crash if it is not supported by the backend.
--control-net-cpu keep controlnet in cpu (for low vram)
--canny apply canny preprocessor (edge detection)
--color colors the logging tags according to level
--chroma-disable-dit-mask disable dit mask for chroma
--chroma-enable-t5-mask enable t5 mask for chroma
--chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma
-v, --verbose print extra info
```
#### txt2img example

```sh
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v
# ./bin/sd --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
# ./bin/sd -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
```

Using formats of different precisions will yield results of varying quality.

| f32 | f16 | q8_0 | q5_0 | q5_1 | q4_0 | q4_1 |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| ![](./assets/f32.png) | ![](./assets/f16.png) | ![](./assets/q8_0.png) | ![](./assets/q5_0.png) | ![](./assets/q5_1.png) | ![](./assets/q4_0.png) | ![](./assets/q4_1.png) |
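To try the quantized formats from the table yourself, the `convert` mode documented above can rewrite weights to a given `--type`; a sketch (file names illustrative):

```sh
./bin/sd -M convert -m ../models/sd-v1-4.ckpt -o ../models/sd-v1-4-q8_0.gguf -v --type q8_0
```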
#### img2img example

- `./output.png` is the image generated from the above txt2img pipeline

```
./bin/sd --mode img2img -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
```

<p align="center">
  <img src="./assets/img2img_output.png" width="256x">
</p>

***For detailed command-line arguments, check out [cli doc](./examples/cli/README.md).***

## Performance

If you want to improve performance or reduce VRAM/RAM usage, please refer to [performance guide](./docs/performance.md).
## More Guides
- [SD1.x/SD2.x/SDXL](./docs/sd.md)
- [SD3/SD3.5](./docs/sd3.md)
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
- [FLUX.2-dev](./docs/flux2.md)
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Chroma](./docs/chroma.md)
- [🔥Qwen Image](./docs/qwen_image.md)
- [🔥Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
- [🔥Wan2.1/Wan2.2](./docs/wan.md)
- [🔥Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
@@ -407,6 +163,10 @@ These projects use `stable-diffusion.cpp` as a backend for their image generation
- [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
- [Stable Diffusion CLI-GUI](https://github.com/piallai/stable-diffusion.cpp)
- [Local Diffusion](https://github.com/rmatif/Local-Diffusion)
- [sd.cpp-webui](https://github.com/daniandtheweb/sd.cpp-webui)
- [LocalAI](https://github.com/mudler/LocalAI)
- [Neural-Pixel](https://github.com/Luiz-Alcantara/Neural-Pixel)
- [KoboldCpp](https://github.com/LostRuins/koboldcpp)
## Contributors
@@ -420,7 +180,8 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
## References
- [ggml](https://github.com/ggml-org/ggml)
- [diffusers](https://github.com/huggingface/diffusers)
- [stable-diffusion](https://github.com/CompVis/stable-diffusion)
- [sd3-ref](https://github.com/Stability-AI/sd3-ref)
- [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
@@ -430,3 +191,5 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
- [latent-consistency-model](https://github.com/luosiallen/latent-consistency-model)
- [generative-models](https://github.com/Stability-AI/generative-models/)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker)
- [Wan2.1](https://github.com/Wan-Video/Wan2.1)
- [Wan2.2](https://github.com/Wan-Video/Wan2.2)

(Binary asset changes, not shown in this view: assets/flux2/example.png, assets/logo.png, assets/qwen/example.png, and assets/z_image/{bf16, q2_K, q3_K, q4_0, q4_K, q5_0, q6_K, q8_0}.png, plus several other updated images.)

clip.hpp (333 lines changed)
@@ -3,35 +3,11 @@
#include "ggml_extend.hpp"
#include "model.h"
#include "tokenize_util.h"

/*================================================== CLIPTokenizer ===================================================*/

std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
    std::regex re("<lora:([^:]+):([^>]+)>");
    std::smatch matches;
    std::unordered_map<std::string, float> filename2multiplier;

    while (std::regex_search(text, matches, re)) {
        std::string filename = matches[1].str();
        float multiplier     = std::stof(matches[2].str());

        text = std::regex_replace(text, re, "", std::regex_constants::format_first_only);

        if (multiplier == 0.f) {
            continue;
        }

        if (filename2multiplier.find(filename) == filename2multiplier.end()) {
            filename2multiplier[filename] = multiplier;
        } else {
            filename2multiplier[filename] += multiplier;
        }
    }

    return std::make_pair(filename2multiplier, text);
}
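// A hedged usage sketch (added for illustration; prompt and values are made up):
//
//   auto res = extract_and_remove_lora("a cat <lora:foo:0.8> <lora:foo:0.2>");
//   // res.first["foo"] == 1.0f  (multipliers for the same file are summed)
//   // res.second == "a cat  "   (the <lora:...> tags are stripped, leftover spaces remain)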
__STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
    std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
    std::set<int> byte_set;
    for (int b = static_cast<int>('!'); b <= static_cast<int>('~'); ++b) {
@@ -72,6 +48,8 @@ private:
    int encoder_len;
    int bpe_len;

    std::vector<std::string> special_tokens;

public:
    const std::string UNK_TOKEN = "<|endoftext|>";
    const std::string BOS_TOKEN = "<|startoftext|>";
@@ -117,6 +95,15 @@ private:
        return pairs;
    }

    bool is_special_token(const std::string& token) {
        for (auto& special_token : special_tokens) {
            if (special_token == token) {
                return true;
            }
        }
        return false;
    }

public:
    CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "")
        : PAD_TOKEN_ID(pad_token_id) {
@@ -125,6 +112,8 @@ public:
        } else {
            load_from_merges(ModelLoader::load_merges());
        }
        add_special_token("<|startoftext|>");
        add_special_token("<|endoftext|>");
    }

    void load_from_merges(const std::string& merges_utf8_str) {
@@ -179,9 +168,9 @@ public:
        auto it = encoder.find(utf8_to_utf32("img</w>"));
        if (it != encoder.end()) {
            LOG_DEBUG("trigger word img already in vocab");
        } else {
            LOG_DEBUG("trigger word img not in vocab yet");
        }

        int rank = 0;
@@ -201,6 +190,10 @@ public:
            }
        }

    void add_special_token(const std::string& token) {
        special_tokens.push_back(token);
    }

    std::u32string bpe(const std::u32string& token) {
        std::vector<std::u32string> word;
@@ -379,25 +372,54 @@ public:
        return trim(text);
    }

    std::vector<std::string> token_split(const std::string& text) {
        std::regex pat(R"('s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
                       std::regex::icase);

        std::sregex_iterator iter(text.begin(), text.end(), pat);
        std::sregex_iterator end;

        std::vector<std::string> result;
        for (; iter != end; ++iter) {
            result.emplace_back(iter->str());
        }

        return result;
    }
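// A hedged usage sketch (added for illustration): on lowercased input the
// pattern yields word-level pieces that bpe() then merges further.
//
//   token_split("a photo, isn't it");
//   // -> {"a", "photo", ",", "isn", "'t", "it"}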
    std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) {
        std::string original_text = text;
        std::vector<int32_t> bpe_tokens;
        text = whitespace_clean(text);
        std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });

        std::string str = text;
        std::vector<std::string> token_strs;

        auto splited_texts = split_with_special_tokens(text, special_tokens);

        for (auto& splited_text : splited_texts) {
            LOG_DEBUG("token %s", splited_text.c_str());
            if (is_special_token(splited_text)) {
                LOG_DEBUG("special %s", splited_text.c_str());
                bool skip = on_new_token_cb(splited_text, bpe_tokens);
                if (skip) {
                    token_strs.push_back(splited_text);
                    continue;
                }
                continue;
            }

            auto tokens = token_split(splited_text);
            for (auto& token : tokens) {
                if (on_new_token_cb != nullptr) {
                    bool skip = on_new_token_cb(token, bpe_tokens);
                    if (skip) {
                        token_strs.push_back(token);
                        continue;
                    }
                }
                std::string token_str = token;
                std::u32string utf32_token;
                for (int i = 0; i < token_str.length(); i++) {
                    unsigned char b = token_str[i];
@@ -417,14 +439,13 @@ public:
                bpe_tokens.push_back(encoder[bpe_str]);
                token_strs.push_back(utf32_to_utf8(bpe_str));
            }
        }

        // std::stringstream ss;
        // ss << "[";
        // for (auto token : token_strs) {
        //     ss << "\"" << token << "\", ";
        // }
        // ss << "]";
        // LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
        // printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
        return bpe_tokens;
@@ -451,16 +472,16 @@
        }
    }
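// A hedged usage sketch (added for illustration; assumes on_new_token_cb_t is
// roughly std::function<bool(std::string&, std::vector<int32_t>&)>, matching
// how it is invoked above):
//
//   CLIPTokenizer tokenizer;
//   auto ignore_embeddings = [](std::string& token, std::vector<int32_t>& out) {
//       return false;  // never intercept; let every token go through BPE
//   };
//   std::vector<int> ids = tokenizer.encode("a lovely cat <|endoftext|>", ignore_embeddings);
//   // "<|endoftext|>" is matched as a whole special token instead of being
//   // split up by the generic regex.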
    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, n_token, d_model]
        auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
        auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);

        x = fc1->forward(ctx, x);
        if (use_gelu) {
            x = ggml_gelu_inplace(ctx->ggml_ctx, x);
        } else {
            x = ggml_gelu_quick_inplace(ctx->ggml_ctx, x);
        }
        x = fc2->forward(ctx, x);
        return x;
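// Note added for clarity: in ggml, ggml_gelu_inplace() is the tanh-based GELU
// approximation, while ggml_gelu_quick_inplace() computes x * sigmoid(1.702 * x)
// (QuickGELU, as used by the original OpenAI CLIP); use_gelu selects between them.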
@@ -476,11 +497,12 @@ protected:
public:
    CLIPLayer(int64_t d_model,
              int64_t n_head,
              int64_t intermediate_size,
              bool proj_in = false)
        : d_model(d_model),
          n_head(n_head),
          intermediate_size(intermediate_size) {
        blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true, proj_in));

        blocks["layer_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
        blocks["layer_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
@@ -488,15 +510,15 @@ public:
        blocks["mlp"] = std::shared_ptr<GGMLBlock>(new CLIPMLP(d_model, intermediate_size));
    }

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, bool mask = true) {
        // x: [N, n_token, d_model]
        auto self_attn   = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]);
        auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]);
        auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm2"]);
        auto mlp         = std::dynamic_pointer_cast<CLIPMLP>(blocks["mlp"]);

        x = ggml_add(ctx->ggml_ctx, x, self_attn->forward(ctx, layer_norm1->forward(ctx, x), mask));
        x = ggml_add(ctx->ggml_ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
        return x;
    }
};
@@ -509,15 +531,19 @@ public:
    CLIPEncoder(int64_t n_layer,
                int64_t d_model,
                int64_t n_head,
                int64_t intermediate_size,
                bool proj_in = false)
        : n_layer(n_layer) {
        for (int i = 0; i < n_layer; i++) {
            std::string name = "layers." + std::to_string(i);
            blocks[name]     = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size, proj_in));
        }
    }

    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                int clip_skip = -1,
                                bool mask = true) {
        // x: [N, n_token, d_model]
        int layer_idx = n_layer - 1;
        // LOG_DEBUG("clip_skip %d", clip_skip);
@@ -544,11 +570,17 @@ protected:
    int64_t embed_dim;
    int64_t vocab_size;
    int64_t num_positions;
    bool force_clip_f32;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        enum ggml_type token_wtype = GGML_TYPE_F32;
        if (!force_clip_f32) {
            token_wtype = get_type(prefix + "token_embedding.weight", tensor_storage_map, GGML_TYPE_F32);
            if (!support_get_rows(token_wtype)) {
                token_wtype = GGML_TYPE_F32;
            }
        }
        enum ggml_type position_wtype = GGML_TYPE_F32;

        params["token_embedding.weight"]    = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
    }
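// Note added for clarity: the token embedding is read with ggml_get_rows(),
// which only supports certain tensor types; the support_get_rows() check above
// falls back to F32 (and force_clip_f32 forces it) so the lookup always works.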
@@ -556,17 +588,19 @@ protected:
public:
    CLIPEmbeddings(int64_t embed_dim,
                   int64_t vocab_size    = 49408,
                   int64_t num_positions = 77,
                   bool force_clip_f32   = false)
        : embed_dim(embed_dim),
          vocab_size(vocab_size),
          num_positions(num_positions),
          force_clip_f32(force_clip_f32) {
    }

    struct ggml_tensor* get_token_embed_weight() {
        return params["token_embedding.weight"];
    }

    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* input_ids,
                                struct ggml_tensor* custom_embed_weight) {
        // input_ids: [N, n_token]
@@ -574,12 +608,12 @@ public:
        auto position_embed_weight = params["position_embedding.weight"];

        GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]);
        input_ids            = ggml_reshape_3d(ctx->ggml_ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
        auto token_embedding = ggml_get_rows(ctx->ggml_ctx, custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids);
        token_embedding      = ggml_reshape_3d(ctx->ggml_ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);

        // token_embedding + position_embedding
        auto x = ggml_add(ctx->ggml_ctx,
                          token_embedding,
                          position_embed_weight);  // [N, n_token, embed_dim]
        return x;
@@ -594,10 +628,11 @@ protected:
    int64_t image_size;
    int64_t num_patches;
    int64_t num_positions;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        enum ggml_type patch_wtype    = GGML_TYPE_F16;
        enum ggml_type class_wtype    = GGML_TYPE_F32;
        enum ggml_type position_wtype = GGML_TYPE_F32;

        params["patch_embedding.weight"] = ggml_new_tensor_4d(ctx, patch_wtype, patch_size, patch_size, num_channels, embed_dim);
        params["class_embedding"]        = ggml_new_tensor_1d(ctx, class_wtype, embed_dim);
@@ -617,7 +652,7 @@ public:
        num_positions = num_patches + 1;
    }

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* pixel_values) {
        // pixel_values: [N, num_channels, image_size, image_size]
        // return: [N, num_positions, embed_dim]
        GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels);
@@ -629,18 +664,18 @@ public:
        // concat(patch_embedding, class_embedding) + position_embedding
        struct ggml_tensor* patch_embedding;
        int64_t N       = pixel_values->ne[3];
        patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size);  // [N, embed_dim, image_size // patch_size, image_size // patch_size]
        patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N);                          // [N, embed_dim, num_patches]
        patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3));                  // [N, num_patches, embed_dim]
        patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N);                       // [N, num_patches, embed_dim, 1]

        struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N);
        class_embedding                     = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding);      // [N, embed_dim]
        class_embedding                     = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N);  // [N, 1, embed_dim, 1]

        struct ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2);  // [N, num_positions, embed_dim, 1]
        x                     = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N);   // [N, num_positions, embed_dim]
        x                     = ggml_add(ctx->ggml_ctx, x, position_embed_weight);
        return x;  // [N, num_positions, embed_dim]
    }
};
@@ -657,9 +692,9 @@ enum CLIPVersion {
class CLIPTextModel : public GGMLBlock {
protected:
    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        if (version == OPEN_CLIP_VIT_BIGG_14) {
            enum ggml_type wtype      = GGML_TYPE_F32;
            params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
        }
    }
@@ -674,12 +709,12 @@ public:
    int32_t n_head         = 12;
    int32_t n_layer        = 12;    // num_hidden_layers
    int32_t projection_dim = 1280;  // only for OPEN_CLIP_VIT_BIGG_14
    bool with_final_ln     = true;

    CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                  bool with_final_ln  = true,
                  bool force_clip_f32 = false,
                  bool proj_in        = false)
        : version(version), with_final_ln(with_final_ln) {
        if (version == OPEN_CLIP_VIT_H_14) {
            hidden_size = 1024;
@@ -692,30 +727,23 @@ public:
            n_head  = 20;
            n_layer = 32;
        }

        blocks["embeddings"]       = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
        blocks["encoder"]          = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
        blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
    }

    struct ggml_tensor* get_token_embed_weight() {
        auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
        return embeddings->get_token_embed_weight();
    }
    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* input_ids,
                                struct ggml_tensor* tkn_embeddings,
                                size_t max_token_idx = 0,
                                bool return_pooled   = false,
                                int clip_skip        = -1) {
        // input_ids: [N, n_token]
        auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
        auto encoder    = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
@@ -729,11 +757,11 @@ public:
        if (return_pooled) {
            auto text_projection = params["text_projection"];
            ggml_tensor* pooled  = ggml_view_1d(ctx->ggml_ctx, x, hidden_size, x->nb[1] * max_token_idx);
            if (text_projection != nullptr) {
                pooled = ggml_ext_linear(ctx->ggml_ctx, pooled, text_projection, nullptr);
            } else {
                LOG_DEBUG("identity projection");
            }
            return pooled;  // [hidden_size, 1, 1]
        }
@@ -755,7 +783,7 @@ public:
    int32_t n_layer = 24;

public:
    CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool proj_in = false) {
        if (version == OPEN_CLIP_VIT_H_14) {
            hidden_size       = 1280;
            intermediate_size = 5120;
@@ -770,11 +798,14 @@ public:
        blocks["embeddings"]     = std::shared_ptr<GGMLBlock>(new CLIPVisionEmbeddings(hidden_size, num_channels, patch_size, image_size));
        blocks["pre_layernorm"]  = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
        blocks["encoder"]        = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
        blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
    }
    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* pixel_values,
                                bool return_pooled = true,
                                int clip_skip      = -1) {
        // pixel_values: [N, num_channels, image_size, image_size]
        auto embeddings    = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
        auto pre_layernorm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
@@ -783,14 +814,14 @@ public:
        auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
        x      = pre_layernorm->forward(ctx, x);
        x      = encoder->forward(ctx, x, clip_skip, false);
        // print_ggml_tensor(x, true, "ClipVisionModel x: ");
        auto last_hidden_state = x;
        x                      = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]
        GGML_ASSERT(x->ne[3] == 1);

        if (return_pooled) {
            ggml_tensor* pooled = ggml_cont(ctx->ggml_ctx, ggml_view_2d(ctx->ggml_ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
            return pooled;  // [N, hidden_size]
        } else {
            // return x;  // [N, n_token, hidden_size]
@@ -805,8 +836,8 @@ protected:
    int64_t out_features;
    bool transpose_weight;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
        if (transpose_weight) {
            params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
        } else {
@@ -822,12 +853,12 @@ public:
          out_features(out_features),
          transpose_weight(transpose_weight) {}

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        struct ggml_tensor* w = params["weight"];
        if (transpose_weight) {
            w = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, w));
        }
        return ggml_ext_linear(ctx->ggml_ctx, x, w, nullptr);
    }
};
@@ -839,7 +870,8 @@ public:
public:
    CLIPVisionModelProjection(CLIPVersion version   = OPENAI_CLIP_VIT_L_14,
                              bool transpose_proj_w = false,
                              bool proj_in          = false) {
        if (version == OPEN_CLIP_VIT_H_14) {
            hidden_size    = 1280;
            projection_dim = 1024;
@@ -847,20 +879,26 @@ public:
            hidden_size = 1664;
        }

        blocks["vision_model"]      = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version, proj_in));
        blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
    }

    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* pixel_values,
                                bool return_pooled = true,
                                int clip_skip      = -1) {
        // pixel_values: [N, num_channels, image_size, image_size]
        // return: [N, projection_dim] if return_pooled else [N, n_token, hidden_size]
        auto vision_model      = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
        auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);

        auto x = vision_model->forward(ctx, pixel_values, return_pooled, clip_skip);  // [N, hidden_size] or [N, n_token, hidden_size]

        if (return_pooled) {
            x = visual_projection->forward(ctx, x);  // [N, projection_dim]
        }

        return x;
    }
};
@@ -868,54 +906,64 @@ struct CLIPTextModelRunner : public GGMLRunner {
    CLIPTextModel model;

    CLIPTextModelRunner(ggml_backend_t backend,
                        bool offload_params_to_cpu,
                        const String2TensorStorage& tensor_storage_map,
                        const std::string prefix,
                        CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                        bool with_final_ln  = true,
                        bool force_clip_f32 = false)
        : GGMLRunner(backend, offload_params_to_cpu) {
        bool proj_in = false;
        for (const auto& [name, tensor_storage] : tensor_storage_map) {
            if (!starts_with(name, prefix)) {
                continue;
            }
            if (contains(name, "self_attn.in_proj")) {
                proj_in = true;
                break;
            }
        }
        model = CLIPTextModel(version, with_final_ln, force_clip_f32, proj_in);
        model.init(params_ctx, tensor_storage_map, prefix);
    }

    std::string get_desc() override {
        return "clip";
    }

    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        model.get_param_tensors(tensors, prefix);
    }

    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* input_ids,
                                struct ggml_tensor* embeddings,
                                size_t max_token_idx = 0,
                                bool return_pooled   = false,
                                int clip_skip        = -1) {
        size_t N       = input_ids->ne[1];
        size_t n_token = input_ids->ne[0];
        if (input_ids->ne[0] > model.n_token) {
            GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
            input_ids = ggml_reshape_2d(ctx->ggml_ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
        }

        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
    }

    struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
                                    int num_custom_embeddings    = 0,
                                    void* custom_embeddings_data = nullptr,
                                    size_t max_token_idx         = 0,
                                    bool return_pooled           = false,
                                    int clip_skip                = -1) {
        struct ggml_cgraph* gf = new_graph_custom(2048);

        input_ids = to_backend(input_ids);

        struct ggml_tensor* embeddings = nullptr;

        if (num_custom_embeddings > 0 && custom_embeddings_data != nullptr) {
            auto token_embed_weight = model.get_token_embed_weight();
            auto custom_embeddings  = ggml_new_tensor_2d(compute_ctx,
                                                         token_embed_weight->type,
@@ -927,25 +975,28 @@ struct CLIPTextModelRunner : public GGMLRunner {
            embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
        }

        auto runner_ctx = get_context();

        struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);

        ggml_build_forward_expand(gf, hidden_states);

        return gf;
    }

    bool compute(const int n_threads,
                 struct ggml_tensor* input_ids,
                 int num_custom_embeddings,
                 void* custom_embeddings_data,
                 size_t max_token_idx,
                 bool return_pooled,
                 int clip_skip,
                 ggml_tensor** output,
                 ggml_context* output_ctx = nullptr) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
        };
        return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
    }
};


@@ -23,12 +23,12 @@ public:
        }
    }

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, channels, h, w]
        if (vae_downsample) {
            auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);

            x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0);
            x = conv->forward(ctx, x);
        } else {
            auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
@@ -52,12 +52,12 @@ public:
        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
    }

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, channels, h, w]
        auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);

        x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST);  // [N, channels, h*2, w*2]
        x = conv->forward(ctx, x);                                       // [N, out_channels, h*2, w*2]
        return x;
    }
};
@@ -121,7 +121,7 @@ public:
        }
    }

    virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) {
        // For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
        // [N, c, t, h, w] => [N, c, t, h * w]
        // x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
@@ -131,38 +131,38 @@ public:
        auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
        auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);

        if (emb == nullptr) {
            GGML_ASSERT(skip_t_emb);
        }

        // in_layers
        auto h = in_layers_0->forward(ctx, x);
        h      = ggml_silu_inplace(ctx->ggml_ctx, h);
        h      = in_layers_2->forward(ctx, h);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]

        // emb_layers
        if (!skip_t_emb) {
            auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);

            auto emb_out = ggml_silu(ctx->ggml_ctx, emb);
            emb_out      = emb_layer_1->forward(ctx, emb_out);  // [N, out_channels] if dims == 2 else [N, t, out_channels]

            if (dims == 2) {
                emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]);  // [N, out_channels, 1, 1]
            } else {
                emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]);  // [N, t, out_channels, 1]
                if (exchange_temb_dims) {
                    // emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
                    emb_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, emb_out, 0, 2, 1, 3));  // [N, out_channels, t, 1]
                }
            }

            h = ggml_add(ctx->ggml_ctx, h, emb_out);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
        }

        // out_layers
        h = out_layers_0->forward(ctx, h);
        h = ggml_silu_inplace(ctx->ggml_ctx, h);

        // dropout, skip for inference
        h = out_layers_3->forward(ctx, h);
@@ -172,67 +172,95 @@ public:
            x = skip_connection->forward(ctx, x);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
        }

        h = ggml_add(ctx->ggml_ctx, h, x);
        return h;  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
    }
};
-class GEGLU : public GGMLBlock {
+class GEGLU : public UnaryBlock {
protected:
    int64_t dim_in;
    int64_t dim_out;
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
-        enum ggml_type wtype      = (tensor_types.find(prefix + "proj.weight") != tensor_types.end()) ? tensor_types[prefix + "proj.weight"] : GGML_TYPE_F32;
-        enum ggml_type bias_wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "proj.bias") != tensor_types.end()) ? tensor_types[prefix + "proj.bias"] : GGML_TYPE_F32;
-        params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
-        params["proj.bias"]   = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
-    }
public:
    GEGLU(int64_t dim_in, int64_t dim_out)
-        : dim_in(dim_in), dim_out(dim_out) {}
+        : dim_in(dim_in), dim_out(dim_out) {
+        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out * 2));
+    }
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        // x: [ne3, ne2, ne1, dim_in]
        // return: [ne3, ne2, ne1, dim_out]
-        struct ggml_tensor* w = params["proj.weight"];
-        struct ggml_tensor* b = params["proj.bias"];
-        auto x_w    = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0);                        // [dim_out, dim_in]
-        auto x_b    = ggml_view_1d(ctx, b, b->ne[0] / 2, 0);                                            // [dim_out, dim_in]
-        auto gate_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2);  // [dim_out, ]
-        auto gate_b = ggml_view_1d(ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2);                      // [dim_out, ]
-        auto x_in = x;
-        x         = ggml_nn_linear(ctx, x_in, x_w, x_b);        // [ne3, ne2, ne1, dim_out]
-        auto gate = ggml_nn_linear(ctx, x_in, gate_w, gate_b);  // [ne3, ne2, ne1, dim_out]
-        gate = ggml_gelu_inplace(ctx, gate);
-        x = ggml_mul(ctx, x, gate);  // [ne3, ne2, ne1, dim_out]
+        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+        x          = proj->forward(ctx, x);  // [ne3, ne2, ne1, dim_out*2]
+        auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0);
+        x          = x_vec[0];  // [ne3, ne2, ne1, dim_out]
+        auto gate  = x_vec[1];  // [ne3, ne2, ne1, dim_out]
+        gate = ggml_gelu_inplace(ctx->ggml_ctx, gate);
+        x = ggml_mul(ctx->ggml_ctx, x, gate);  // [ne3, ne2, ne1, dim_out]
+        return x;
+    }
+};
+
+class GELU : public UnaryBlock {
+public:
+    GELU(int64_t dim_in, int64_t dim_out, bool bias = true) {
+        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
+    }
+
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
+        // x: [ne3, ne2, ne1, dim_in]
+        // return: [ne3, ne2, ne1, dim_out]
+        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+        x = proj->forward(ctx, x);
+        x = ggml_gelu_inplace(ctx->ggml_ctx, x);
        return x;
    }
};
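The GEGLU rewrite above replaces manual weight/bias views with a single fused projection followed by ggml_ext_chunk. For reference, a minimal standalone sketch of the same gating math on plain floats (plain C++ rather than ggml, using the tanh approximation of GELU; buffer layout is hypothetical):

#include <cmath>
#include <vector>

// GEGLU: project to 2*dim_out, split into value and gate halves,
// then out = value * gelu(gate).
std::vector<float> geglu_from_proj(const std::vector<float>& proj_out, int dim_out) {
    std::vector<float> out(dim_out);
    for (int i = 0; i < dim_out; i++) {
        float value = proj_out[i];            // first chunk
        float gate  = proj_out[dim_out + i];  // second chunk
        float g3    = gate * gate * gate;
        float gelu  = 0.5f * gate * (1.0f + std::tanh(0.7978845608f * (gate + 0.044715f * g3)));
        out[i]      = value * gelu;
    }
    return out;
}

Chunking one fused [dim_in, 2*dim_out] projection is numerically identical to the two separate views the old code took, but it routes the weights through the ordinary Linear path.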
class FeedForward : public GGMLBlock {
public:
+    enum class Activation {
+        GEGLU,
+        GELU
+    };
    FeedForward(int64_t dim,
                int64_t dim_out,
-                int64_t mult = 4) {
+                int64_t mult          = 4,
+                Activation activation = Activation::GEGLU,
+                bool precision_fix    = false) {
        int64_t inner_dim = dim * mult;
-        blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
+        if (activation == Activation::GELU) {
+            blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
+        } else {
+            blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
+        }
        // net_1 is nn.Dropout(), skip for inference
-        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
+        bool force_prec_f32 = false;
+        float scale         = 1.f;
+        if (precision_fix) {
+            scale = 1.f / 128.f;
+#ifdef SD_USE_VULKAN
+            force_prec_f32 = true;
+#endif
+        }
+        // The purpose of the scale here is to prevent NaN issues in certain situations.
+        // For example, when using Vulkan without enabling force_prec_f32,
+        // or when using CUDA but the weights are k-quants.
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale));
    }
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [ne3, ne2, ne1, dim]
        // return: [ne3, ne2, ne1, dim_out]
-        auto net_0 = std::dynamic_pointer_cast<GEGLU>(blocks["net.0"]);
+        auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
        auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
        x = net_0->forward(ctx, x);  // [ne3, ne2, ne1, inner_dim]
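The precision_fix path in the constructor above shrinks activations by 1/128 before the final projection. The idea (a sketch of the behavior assumed behind Linear's scale argument, not the actual ggml implementation) is that y = W x = (1/s)(W (s x)), so pre-scaling keeps low-precision accumulators away from fp16 overflow while leaving the result mathematically unchanged:

#include <vector>

// Compensated-scale matmul sketch: pre-scale the input by s, undo it after
// accumulation. With s = 1/128 the intermediate products are 128x smaller,
// which can avoid inf -> NaN on backends that accumulate in fp16.
std::vector<float> scaled_linear(const std::vector<std::vector<float>>& W,
                                 std::vector<float> x,
                                 float s = 1.f / 128.f) {
    for (auto& v : x) v *= s;  // pre-scale
    std::vector<float> y(W.size(), 0.f);
    for (size_t i = 0; i < W.size(); i++) {
        for (size_t j = 0; j < x.size(); j++) y[i] += W[i][j] * x[j];
        y[i] /= s;  // compensate after accumulation
    }
    return y;
}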
@@ -247,19 +275,16 @@ protected:
    int64_t context_dim;
    int64_t n_head;
    int64_t d_head;
-    bool flash_attn;
public:
    CrossAttention(int64_t query_dim,
                   int64_t context_dim,
                   int64_t n_head,
-                   int64_t d_head,
-                   bool flash_attn = false)
+                   int64_t d_head)
        : n_head(n_head),
          d_head(d_head),
          query_dim(query_dim),
-          context_dim(context_dim),
-          flash_attn(flash_attn) {
+          context_dim(context_dim) {
        int64_t inner_dim = d_head * n_head;
        blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
@@ -270,7 +295,9 @@ public:
        // to_out_1 is nn.Dropout(), skip for inference
    }
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* x,
+                                struct ggml_tensor* context) {
        // x: [N, n_token, query_dim]
        // context: [N, n_context, context_dim]
        // return: [N, n_token, query_dim]
@@ -288,7 +315,7 @@ public:
        auto k = to_k->forward(ctx, context);  // [N, n_context, inner_dim]
        auto v = to_v->forward(ctx, context);  // [N, n_context, inner_dim]
-        x = ggml_nn_attention_ext(ctx, q, k, v, n_head, NULL, false, false, flash_attn);  // [N, n_token, inner_dim]
+        x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, false, ctx->flash_attn_enabled);  // [N, n_token, inner_dim]
        x = to_out_0->forward(ctx, x);  // [N, n_token, query_dim]
        return x;
@@ -306,16 +333,15 @@ public:
                          int64_t n_head,
                          int64_t d_head,
                          int64_t context_dim,
-                          bool ff_in = false,
-                          bool flash_attn = false)
+                          bool ff_in = false)
        : n_head(n_head), d_head(d_head), ff_in(ff_in) {
        // disable_self_attn is always False
        // disable_temporal_crossattention is always False
        // switch_temporal_ca_to_sa is always False
        // inner_dim is always None or equal to dim
        // gated_ff is always True
-        blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head, flash_attn));
-        blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head, flash_attn));
+        blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head));
+        blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head));
        blocks["ff"]    = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
        blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
        blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
@@ -327,7 +353,9 @@ public:
        }
    }
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* x,
+                                struct ggml_tensor* context) {
        // x: [N, n_token, query_dim]
        // context: [N, n_context, context_dim]
        // return: [N, n_token, query_dim]
@@ -347,21 +375,21 @@ public:
            x = norm_in->forward(ctx, x);
            x = ff_in->forward(ctx, x);
            // self.is_res is always True
-            x = ggml_add(ctx, x, x_skip);
+            x = ggml_add(ctx->ggml_ctx, x, x_skip);
        }
        auto r = x;
        x = norm1->forward(ctx, x);
        x = attn1->forward(ctx, x, x);  // self-attention
-        x = ggml_add(ctx, x, r);
+        x = ggml_add(ctx->ggml_ctx, x, r);
        r = x;
        x = norm2->forward(ctx, x);
        x = attn2->forward(ctx, x, context);  // cross-attention
-        x = ggml_add(ctx, x, r);
+        x = ggml_add(ctx->ggml_ctx, x, r);
        r = x;
        x = norm3->forward(ctx, x);
        x = ff->forward(ctx, x);
-        x = ggml_add(ctx, x, r);
+        x = ggml_add(ctx->ggml_ctx, x, r);
        return x;
    }
@@ -374,6 +402,23 @@ protected:
    int64_t d_head;
    int64_t depth       = 1;    // 1
    int64_t context_dim = 768;  // hidden_size, 1024 for VERSION_SD2
+    bool use_linear     = false;
+
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
+        auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
+        if (iter != tensor_storage_map.end()) {
+            int64_t inner_dim = n_head * d_head;
+            if (iter->second.n_dims == 4 && use_linear) {
+                use_linear         = false;
+                blocks["proj_in"]  = std::make_shared<Conv2d>(in_channels, inner_dim, std::pair{1, 1});
+                blocks["proj_out"] = std::make_shared<Conv2d>(inner_dim, in_channels, std::pair{1, 1});
+            } else if (iter->second.n_dims == 2 && !use_linear) {
+                use_linear         = true;
+                blocks["proj_in"]  = std::make_shared<Linear>(in_channels, inner_dim);
+                blocks["proj_out"] = std::make_shared<Linear>(inner_dim, in_channels);
+            }
+        }
+    }
public:
    SpatialTransformer(int64_t in_channels,
@@ -381,32 +426,42 @@ public:
                       int64_t d_head,
                       int64_t depth,
                       int64_t context_dim,
-                       bool flash_attn = false)
+                       bool use_linear)
        : in_channels(in_channels),
          n_head(n_head),
          d_head(d_head),
          depth(depth),
-          context_dim(context_dim) {
-        // We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
+          context_dim(context_dim),
+          use_linear(use_linear) {
        // disable_self_attn is always False
        int64_t inner_dim = n_head * d_head;  // in_channels
        blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
-        blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
+        if (use_linear) {
+            blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, inner_dim));
+        } else {
+            blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
+        }
        for (int i = 0; i < depth; i++) {
            std::string name = "transformer_blocks." + std::to_string(i);
-            blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn));
+            blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false));
        }
-        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
+        if (use_linear) {
+            blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, in_channels));
+        } else {
+            blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
+        }
    }
-    virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
+    virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                        struct ggml_tensor* x,
+                                        struct ggml_tensor* context) {
        // x: [N, in_channels, h, w]
        // context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
        auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
-        auto proj_in  = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
-        auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
+        auto proj_in  = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_in"]);
+        auto proj_out = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_out"]);
        auto x_in = x;
        int64_t n = x->ne[3];
@@ -415,10 +470,15 @@ public:
        int64_t inner_dim = n_head * d_head;
        x = norm->forward(ctx, x);
-        x = proj_in->forward(ctx, x);  // [N, inner_dim, h, w]
-        x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3));  // [N, h, w, inner_dim]
-        x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n);      // [N, h * w, inner_dim]
+        if (use_linear) {
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3));  // [N, h, w, inner_dim]
+            x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n);                // [N, h * w, inner_dim]
+            x = proj_in->forward(ctx, x);  // [N, inner_dim, h, w]
+        } else {
+            x = proj_in->forward(ctx, x);  // [N, inner_dim, h, w]
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3));  // [N, h, w, inner_dim]
+            x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n);                // [N, h * w, inner_dim]
+        }
        for (int i = 0; i < depth; i++) {
            std::string name = "transformer_blocks." + std::to_string(i);
@@ -427,29 +487,37 @@ public:
            x = transformer_block->forward(ctx, x, context);
        }
-        x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));  // [N, inner_dim, h * w]
-        x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n);       // [N, inner_dim, h, w]
-        // proj_out
-        x = proj_out->forward(ctx, x);  // [N, in_channels, h, w]
+        if (use_linear) {
+            // proj_out
+            x = proj_out->forward(ctx, x);  // [N, in_channels, h, w]
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));  // [N, inner_dim, h * w]
+            x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n);                 // [N, inner_dim, h, w]
+        } else {
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));  // [N, inner_dim, h * w]
+            x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n);                 // [N, inner_dim, h, w]
+            // proj_out
+            x = proj_out->forward(ctx, x);  // [N, in_channels, h, w]
+        }
-        x = ggml_add(ctx, x, x_in);
+        x = ggml_add(ctx->ggml_ctx, x, x_in);
        return x;
    }
};
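The use_linear machinery above exists because checkpoints store proj_in/proj_out either as 1x1 convolutions (4-D weights) or as linear layers (2-D weights), and init_params picks whichever matches the tensor on disk. On a per-pixel token layout the two are the same affine map; a sketch of the equivalence on plain row-major buffers (sizes are hypothetical):

#include <cassert>
#include <vector>

// A 1x1 conv over [C_in, H*W] with weight [C_out, C_in] computes, per spatial
// position, y = W x + b -- i.e. a Linear applied to each (h, w) token.
std::vector<float> conv1x1_as_linear(const std::vector<float>& x,  // c_in * hw
                                     const std::vector<float>& w,  // c_out * c_in
                                     const std::vector<float>& b,  // c_out
                                     int c_in, int c_out, int hw) {
    assert((int)x.size() == c_in * hw);
    std::vector<float> y(c_out * hw, 0.f);
    for (int p = 0; p < hw; p++) {          // each spatial position is a token
        for (int o = 0; o < c_out; o++) {
            float acc = b[o];
            for (int i = 0; i < c_in; i++) {
                acc += w[o * c_in + i] * x[i * hw + p];
            }
            y[o * hw + p] = acc;
        }
    }
    return y;
}

The only real difference in the graph is whether the permute/reshape to token layout happens before or after the projection, which is exactly what the two branches of forward encode.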
class AlphaBlender : public GGMLBlock {
protected:
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
        // Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
-        enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.ypes.find(prefix + "mix_factor") != tensor_types.end()) ? tensor_types[prefix + "mix_factor"] : GGML_TYPE_F32;
+        enum ggml_type wtype = GGML_TYPE_F32;
        params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
    }
    float get_alpha() {
        // image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
        // so learned_with_images is same as learned
-        float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]);
+        float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
        return sigmoid(alpha);
    }
@@ -460,14 +528,14 @@ public:
        // since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
    }
-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x_spatial,
                                struct ggml_tensor* x_temporal) {
        // image_only_indicator is always tensor([0.])
        float alpha = get_alpha();
-        auto x = ggml_add(ctx,
-                          ggml_scale(ctx, x_spatial, alpha),
-                          ggml_scale(ctx, x_temporal, 1.0f - alpha));
+        auto x = ggml_add(ctx->ggml_ctx,
+                          ggml_scale(ctx->ggml_ctx, x_spatial, alpha),
+                          ggml_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
        return x;
    }
};
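AlphaBlender mixes the spatial and temporal streams with one learned scalar squashed through a sigmoid; a scalar sketch of get_alpha plus the blend:

#include <cmath>

// blend = sigmoid(mix_factor) * x_spatial + (1 - sigmoid(mix_factor)) * x_temporal
float alpha_blend(float mix_factor, float x_spatial, float x_temporal) {
    float alpha = 1.0f / (1.0f + std::exp(-mix_factor));  // sigmoid
    return alpha * x_spatial + (1.0f - alpha) * x_temporal;
}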
@@ -485,7 +553,7 @@ public:
        blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
    }
-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* emb,
                                int num_video_frames) {
@@ -503,18 +571,18 @@ public:
        int64_t H = x->ne[1];
        int64_t W = x->ne[0];
-        x = ggml_reshape_4d(ctx, x, W * H, C, T, B);           // (b t) c h w -> b t c (h w)
-        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
+        x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B);                     // (b t) c h w -> b t c (h w)
+        x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
        auto x_mix = x;
-        emb = ggml_reshape_4d(ctx, emb, emb->ne[0], T, B, emb->ne[3]);  // (b t) ... -> b t ...
+        emb = ggml_reshape_4d(ctx->ggml_ctx, emb, emb->ne[0], T, B, emb->ne[3]);  // (b t) ... -> b t ...
        x = time_stack->forward(ctx, x, emb);    // b t c (h w)
        x = time_mixer->forward(ctx, x_mix, x);  // b t c (h w)
-        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
-        x = ggml_reshape_4d(ctx, x, W, H, C, T * B);           // b t c (h w) -> (b t) c h w
+        x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
+        x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B);                     // b t c (h w) -> (b t) c h w
        return x;
    }

File diff suppressed because it is too large.


@@ -27,6 +27,7 @@ protected:
    int num_heads         = 8;
    int num_head_channels = -1;   // channels // num_heads
    int context_dim       = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
+    bool use_linear_projection = false;
public:
    int model_channels = 320;
@@ -82,7 +83,7 @@ public:
                                   int64_t d_head,
                                   int64_t depth,
                                   int64_t context_dim) -> SpatialTransformer* {
-        return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
+        return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
    };
    auto make_zero_conv = [&](int64_t channels) {
@@ -165,7 +166,7 @@ public:
    }
    struct ggml_tensor* resblock_forward(std::string name,
-                                         struct ggml_context* ctx,
+                                         GGMLRunnerContext* ctx,
                                         struct ggml_tensor* x,
                                         struct ggml_tensor* emb) {
        auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
@@ -173,14 +174,14 @@ public:
    }
    struct ggml_tensor* attention_layer_forward(std::string name,
-                                                struct ggml_context* ctx,
+                                                GGMLRunnerContext* ctx,
                                                struct ggml_tensor* x,
                                                struct ggml_tensor* context) {
        auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
        return block->forward(ctx, x, context);
    }
-    struct ggml_tensor* input_hint_block_forward(struct ggml_context* ctx,
+    struct ggml_tensor* input_hint_block_forward(GGMLRunnerContext* ctx,
                                                 struct ggml_tensor* hint,
                                                 struct ggml_tensor* emb,
                                                 struct ggml_tensor* context) {
@@ -192,32 +193,32 @@ public:
                h = block->forward(ctx, h);
            } else {
-                h = ggml_silu_inplace(ctx, h);
+                h = ggml_silu_inplace(ctx->ggml_ctx, h);
            }
        }
        return h;
    }
-    std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx,
+    std::vector<struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                             struct ggml_tensor* x,
                                             struct ggml_tensor* hint,
                                             struct ggml_tensor* guided_hint,
                                             struct ggml_tensor* timesteps,
                                             struct ggml_tensor* context,
-                                             struct ggml_tensor* y = NULL) {
+                                             struct ggml_tensor* y = nullptr) {
        // x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
        // timesteps: [N,]
        // context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
        // y: [N, adm_in_channels] or [1, adm_in_channels]
-        if (context != NULL) {
+        if (context != nullptr) {
            if (context->ne[2] != x->ne[3]) {
-                context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
+                context = ggml_repeat(ctx->ggml_ctx, context, ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
            }
        }
-        if (y != NULL) {
+        if (y != nullptr) {
            if (y->ne[1] != x->ne[3]) {
-                y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
+                y = ggml_repeat(ctx->ggml_ctx, y, ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
            }
        }
@@ -228,27 +229,27 @@ public:
        auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);
-        auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels);  // [N, model_channels]
+        auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, model_channels);  // [N, model_channels]
        auto emb = time_embed_0->forward(ctx, t_emb);
-        emb = ggml_silu_inplace(ctx, emb);
+        emb = ggml_silu_inplace(ctx->ggml_ctx, emb);
        emb = time_embed_2->forward(ctx, emb);  // [N, time_embed_dim]
        // SDXL/SVD
-        if (y != NULL) {
+        if (y != nullptr) {
            auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
            auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);
            auto label_emb = label_embed_0->forward(ctx, y);
-            label_emb = ggml_silu_inplace(ctx, label_emb);
+            label_emb = ggml_silu_inplace(ctx->ggml_ctx, label_emb);
            label_emb = label_embed_2->forward(ctx, label_emb);  // [N, time_embed_dim]
-            emb = ggml_add(ctx, emb, label_emb);  // [N, time_embed_dim]
+            emb = ggml_add(ctx->ggml_ctx, emb, label_emb);  // [N, time_embed_dim]
        }
        std::vector<struct ggml_tensor*> outs;
-        if (guided_hint == NULL) {
+        if (guided_hint == nullptr) {
            guided_hint = input_hint_block_forward(ctx, hint, emb, context);
        }
        outs.push_back(guided_hint);
@@ -257,7 +258,7 @@ public:
        // input block 0
        auto h = input_blocks_0_0->forward(ctx, x);
-        h = ggml_add(ctx, h, guided_hint);
+        h = ggml_add(ctx->ggml_ctx, h, guided_hint);
        outs.push_back(zero_convs_0->forward(ctx, h));
        // input block 1-11
@@ -310,27 +311,28 @@ struct ControlNet : public GGMLRunner {
    SDVersion version = VERSION_SD1;
    ControlNetBlock control_net;
-    ggml_backend_buffer_t control_buffer = NULL;  // keep control output tensors in backend memory
-    ggml_context* control_ctx            = NULL;
+    ggml_backend_buffer_t control_buffer = nullptr;  // keep control output tensors in backend memory
+    ggml_context* control_ctx            = nullptr;
    std::vector<struct ggml_tensor*> controls;  // (12 input block outputs, 1 middle block output) SD 1.5
-    struct ggml_tensor* guided_hint = NULL;  // guided_hint cache, for faster inference
+    struct ggml_tensor* guided_hint = nullptr;  // guided_hint cache, for faster inference
    bool guided_hint_cached = false;
    ControlNet(ggml_backend_t backend,
-               std::map<std::string, enum ggml_type>& tensor_types,
-               SDVersion version = VERSION_SD1)
-        : GGMLRunner(backend), control_net(version) {
-        control_net.init(params_ctx, tensor_types, "");
+               bool offload_params_to_cpu,
+               const String2TensorStorage& tensor_storage_map = {},
+               SDVersion version = VERSION_SD1)
+        : GGMLRunner(backend, offload_params_to_cpu), control_net(version) {
+        control_net.init(params_ctx, tensor_storage_map, "");
    }
-    ~ControlNet() {
+    ~ControlNet() override {
        free_control_ctx();
    }
    void alloc_control_ctx(std::vector<struct ggml_tensor*> outs) {
        struct ggml_init_params params;
        params.mem_size   = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
-        params.mem_buffer = NULL;
+        params.mem_buffer = nullptr;
        params.no_alloc   = true;
        control_ctx       = ggml_init(params);
@@ -346,26 +348,26 @@ struct ControlNet : public GGMLRunner {
            control_buffer_size += ggml_nbytes(controls[i]);
        }
-        control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, backend);
+        control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend);
        LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
    }
    void free_control_ctx() {
-        if (control_buffer != NULL) {
+        if (control_buffer != nullptr) {
            ggml_backend_buffer_free(control_buffer);
-            control_buffer = NULL;
+            control_buffer = nullptr;
        }
-        if (control_ctx != NULL) {
+        if (control_ctx != nullptr) {
            ggml_free(control_ctx);
-            control_ctx = NULL;
+            control_ctx = nullptr;
        }
-        guided_hint        = NULL;
+        guided_hint        = nullptr;
        guided_hint_cached = false;
        controls.clear();
    }
-    std::string get_desc() {
+    std::string get_desc() override {
        return "control_net";
    }
@@ -377,12 +379,12 @@ struct ControlNet : public GGMLRunner {
                                    struct ggml_tensor* hint,
                                    struct ggml_tensor* timesteps,
                                    struct ggml_tensor* context,
-                                    struct ggml_tensor* y = NULL) {
-        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);
+                                    struct ggml_tensor* y = nullptr) {
+        struct ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE);
        x = to_backend(x);
        if (guided_hint_cached) {
-            hint = NULL;
+            hint = nullptr;
        } else {
            hint = to_backend(hint);
        }
@@ -390,15 +392,17 @@ struct ControlNet : public GGMLRunner {
        y         = to_backend(y);
        timesteps = to_backend(timesteps);
-        auto outs = control_net.forward(compute_ctx,
+        auto runner_ctx = get_context();
+        auto outs = control_net.forward(&runner_ctx,
                                        x,
                                        hint,
-                                        guided_hint_cached ? guided_hint : NULL,
+                                        guided_hint_cached ? guided_hint : nullptr,
                                        timesteps,
                                        context,
                                        y);
-        if (control_ctx == NULL) {
+        if (control_ctx == nullptr) {
            alloc_control_ctx(outs);
        }
@@ -410,14 +414,14 @@ struct ControlNet : public GGMLRunner {
        return gf;
    }
-    void compute(int n_threads,
+    bool compute(int n_threads,
                 struct ggml_tensor* x,
                 struct ggml_tensor* hint,
                 struct ggml_tensor* timesteps,
                 struct ggml_tensor* context,
                 struct ggml_tensor* y,
-                 struct ggml_tensor** output = NULL,
-                 struct ggml_context* output_ctx = NULL) {
+                 struct ggml_tensor** output = nullptr,
+                 struct ggml_context* output_ctx = nullptr) {
        // x: [N, in_channels, h, w]
        // timesteps: [N, ]
        // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
@@ -426,11 +430,15 @@ struct ControlNet : public GGMLRunner {
            return build_graph(x, hint, timesteps, context, y);
        };
-        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
-        guided_hint_cached = true;
+        bool res = GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        if (res) {
+            // cache guided_hint
+            guided_hint_cached = true;
+        }
+        return res;
    }
-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
        LOG_INFO("loading control net from '%s'", file_path.c_str());
        alloc_params_buffer();
        std::map<std::string, ggml_tensor*> tensors;
@@ -438,12 +446,12 @@ struct ControlNet : public GGMLRunner {
        std::set<std::string> ignore_tensors;
        ModelLoader model_loader;
-        if (!model_loader.init_from_file(file_path)) {
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
            return false;
        }
-        bool success = model_loader.load_tensors(tensors, backend, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
        if (!success) {
            LOG_ERROR("load control net tensors from model loader failed");


@@ -11,15 +11,14 @@
#define TIMESTEPS 1000
#define FLUX_TIMESTEPS 1000
-struct SigmaSchedule {
-    int version = 0;
+struct SigmaScheduler {
    typedef std::function<float(float)> t_to_sigma_t;
    virtual std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) = 0;
};
-struct DiscreteSchedule : SigmaSchedule {
-    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
+struct DiscreteScheduler : SigmaScheduler {
+    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
        std::vector<float> result;
        int t_max = TIMESTEPS - 1;
@@ -42,8 +41,8 @@ struct DiscreteSchedule : SigmaSchedule {
    }
};
-struct ExponentialSchedule : SigmaSchedule {
-    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
+struct ExponentialScheduler : SigmaScheduler {
+    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
        std::vector<float> sigmas;
        // Calculate step size
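The exponential scheduler body is truncated in this diff; the standard formulation it is expected to follow spaces sigmas uniformly in log space between sigma_max and sigma_min. A sketch under that assumption, matching the get_sigmas signature above (not necessarily this file's exact code):

#include <cmath>
#include <cstdint>
#include <vector>

// sigmas uniformly spaced in log space, plus the trailing 0 the samplers expect
std::vector<float> exponential_sigmas(uint32_t n, float sigma_min, float sigma_max) {
    std::vector<float> sigmas;
    sigmas.reserve(n + 1);
    float log_max = std::log(sigma_max);
    float log_min = std::log(sigma_min);
    for (uint32_t i = 0; i < n; i++) {
        float t = (n > 1) ? (float)i / (float)(n - 1) : 0.f;
        sigmas.push_back(std::exp(log_max + t * (log_min - log_max)));
    }
    sigmas.push_back(0.0f);
    return sigmas;
}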
@@ -149,8 +148,11 @@ std::vector<float> log_linear_interpolation(std::vector<float> sigma_in,
/*
https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/howto.html
*/
-struct AYSSchedule : SigmaSchedule {
-    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
+struct AYSScheduler : SigmaScheduler {
+    SDVersion version;
+    explicit AYSScheduler(SDVersion version)
+        : version(version) {}
+    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
        const std::vector<float> noise_levels[] = {
            /* SD1.5 */
            {14.6146412293f, 6.4745760956f, 3.8636745985f, 2.6946151520f,
@@ -169,19 +171,19 @@ struct AYSSchedule : SigmaSchedule {
        std::vector<float> results(n + 1);
        if (sd_version_is_sd2((SDVersion)version)) {
-            LOG_WARN("AYS not designed for SD2.X models");
+            LOG_WARN("AYS_SCHEDULER not designed for SD2.X models");
        } /* fallthrough */
        else if (sd_version_is_sd1((SDVersion)version)) {
-            LOG_INFO("AYS using SD1.5 noise levels");
+            LOG_INFO("AYS_SCHEDULER using SD1.5 noise levels");
            inputs = noise_levels[0];
        } else if (sd_version_is_sdxl((SDVersion)version)) {
-            LOG_INFO("AYS using SDXL noise levels");
+            LOG_INFO("AYS_SCHEDULER using SDXL noise levels");
            inputs = noise_levels[1];
        } else if (version == VERSION_SVD) {
-            LOG_INFO("AYS using SVD noise levels");
+            LOG_INFO("AYS_SCHEDULER using SVD noise levels");
            inputs = noise_levels[2];
        } else {
-            LOG_ERROR("Version not compatible with AYS scheduler");
+            LOG_ERROR("Version not compatible with AYS_SCHEDULER scheduler");
            return results;
        }
@@ -203,8 +205,8 @@ struct AYSSchedule : SigmaSchedule {
/*
 * GITS Scheduler: https://github.com/zju-pi/diff-sampler/tree/main/gits-main
 */
-struct GITSSchedule : SigmaSchedule {
-    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
+struct GITSScheduler : SigmaScheduler {
+    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
        if (sigma_max <= 0.0f) {
            return std::vector<float>{};
        }
@@ -232,8 +234,44 @@ struct GITSSchedule : SigmaSchedule {
    }
};
+struct SGMUniformScheduler : SigmaScheduler {
+    std::vector<float> get_sigmas(uint32_t n, float sigma_min_in, float sigma_max_in, t_to_sigma_t t_to_sigma_func) override {
+        std::vector<float> result;
+        if (n == 0) {
+            result.push_back(0.0f);
+            return result;
+        }
+        result.reserve(n + 1);
+        int t_max = TIMESTEPS - 1;
+        int t_min = 0;
+        std::vector<float> timesteps = linear_space(static_cast<float>(t_max), static_cast<float>(t_min), n + 1);
+        for (int i = 0; i < n; i++) {
+            result.push_back(t_to_sigma_func(timesteps[i]));
+        }
+        result.push_back(0.0f);
+        return result;
+    }
+};
+struct LCMScheduler : SigmaScheduler {
+    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
+        std::vector<float> result;
+        result.reserve(n + 1);
+        const int original_steps = 50;
+        const int k              = TIMESTEPS / original_steps;
+        for (int i = 0; i < n; i++) {
+            // the rounding ensures we match the training schedule of the LCM model
+            int index    = (i * original_steps) / n;
+            int timestep = (original_steps - index) * k - 1;
+            result.push_back(t_to_sigma(timestep));
+        }
+        result.push_back(0.0f);
+        return result;
+    }
+};
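Worked through for n = 4: k = 1000 / 50 = 20, index = (i * 50) / 4 = 0, 12, 25, 37, and timestep = (50 - index) * 20 - 1 = 999, 759, 499, 259 — i.e. the 50-timestep training grid LCM models are distilled on, subsampled to the requested step count. A tiny self-contained check:

#include <cstdio>

int main() {
    const int TIMESTEPS = 1000, original_steps = 50, n = 4;
    const int k = TIMESTEPS / original_steps;  // 20
    for (int i = 0; i < n; i++) {
        int index    = (i * original_steps) / n;          // 0, 12, 25, 37
        int timestep = (original_steps - index) * k - 1;  // 999, 759, 499, 259
        std::printf("step %d -> timestep %d\n", i, timestep);
    }
    return 0;
}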
-struct KarrasSchedule : SigmaSchedule {
-    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
+struct KarrasScheduler : SigmaScheduler {
+    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
        // These *COULD* be function arguments here,
        // but does anybody ever bother to touch them?
        float rho = 7.f;
@@ -251,8 +289,65 @@ struct KarrasSchedule : SigmaSchedule {
    }
};
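The Karras body is elided in this hunk; for reference, the schedule it implements (Karras et al. 2022) interpolates sigma^(1/rho) linearly between the extremes. A sketch of the standard formulation, not necessarily this file's exact code:

#include <cmath>
#include <cstdint>
#include <vector>

// sigma_i = (smax^(1/rho) + i/(n-1) * (smin^(1/rho) - smax^(1/rho)))^rho
std::vector<float> karras_sigmas(uint32_t n, float sigma_min, float sigma_max, float rho = 7.f) {
    std::vector<float> sigmas;
    sigmas.reserve(n + 1);
    float min_r = std::pow(sigma_min, 1.f / rho);
    float max_r = std::pow(sigma_max, 1.f / rho);
    for (uint32_t i = 0; i < n; i++) {
        float t = (n > 1) ? (float)i / (float)(n - 1) : 0.f;
        sigmas.push_back(std::pow(max_r + t * (min_r - max_r), rho));
    }
    sigmas.push_back(0.0f);
    return sigmas;
}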
+struct SimpleScheduler : SigmaScheduler {
+    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
+        std::vector<float> result_sigmas;
+        if (n == 0) {
+            return result_sigmas;
+        }
+        result_sigmas.reserve(n + 1);
+        int model_sigmas_len = TIMESTEPS;
+        float step_factor    = static_cast<float>(model_sigmas_len) / static_cast<float>(n);
+        for (uint32_t i = 0; i < n; ++i) {
+            int offset_from_start_of_py_array = static_cast<int>(static_cast<float>(i) * step_factor);
+            int timestep_index                = model_sigmas_len - 1 - offset_from_start_of_py_array;
+            if (timestep_index < 0) {
+                timestep_index = 0;
+            }
+            result_sigmas.push_back(t_to_sigma(static_cast<float>(timestep_index)));
+        }
+        result_sigmas.push_back(0.0f);
+        return result_sigmas;
+    }
+};
+// Close to the Beta scheduler, but incredibly simple in code.
+struct SmoothStepScheduler : SigmaScheduler {
+    static constexpr float smoothstep(float x) {
+        return x * x * (3.0f - 2.0f * x);
+    }
+
+    std::vector<float> get_sigmas(uint32_t n, float /*sigma_min*/, float /*sigma_max*/, t_to_sigma_t t_to_sigma) override {
+        std::vector<float> result;
+        result.reserve(n + 1);
+        const int t_max = TIMESTEPS - 1;
+        if (n == 0) {
+            return result;
+        } else if (n == 1) {
+            result.push_back(t_to_sigma((float)t_max));
+            result.push_back(0.f);
+            return result;
+        }
+        for (uint32_t i = 0; i < n; i++) {
+            float u = 1.f - float(i) / float(n);
+            result.push_back(t_to_sigma(std::round(smoothstep(u) * t_max)));
+        }
+        result.push_back(0.f);
+        return result;
+    }
+};
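Worked through for n = 4: u = 1.0, 0.75, 0.5, 0.25 gives smoothstep(u) = 1.0, 0.84375, 0.5, 0.15625, so with t_max = 999 the timesteps round to 999, 843, 500, 156 — denser near both ends of the noise range than a uniform grid, which is what makes this behave like a cheap Beta schedule.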
struct Denoiser {
-    std::shared_ptr<SigmaSchedule> schedule = std::make_shared<DiscreteSchedule>();
    virtual float sigma_min()             = 0;
    virtual float sigma_max()             = 0;
    virtual float sigma_to_t(float sigma) = 0;
@@ -261,9 +356,52 @@ struct Denoiser {
    virtual ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) = 0;
    virtual ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent)             = 0;
-    virtual std::vector<float> get_sigmas(uint32_t n) {
+    virtual std::vector<float> get_sigmas(uint32_t n, int /*image_seq_len*/, scheduler_t scheduler_type, SDVersion version) {
        auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1);
-        return schedule->get_sigmas(n, sigma_min(), sigma_max(), bound_t_to_sigma);
+        std::shared_ptr<SigmaScheduler> scheduler;
+        switch (scheduler_type) {
+            case DISCRETE_SCHEDULER:
+                LOG_INFO("get_sigmas with discrete scheduler");
+                scheduler = std::make_shared<DiscreteScheduler>();
+                break;
+            case KARRAS_SCHEDULER:
+                LOG_INFO("get_sigmas with Karras scheduler");
+                scheduler = std::make_shared<KarrasScheduler>();
+                break;
+            case EXPONENTIAL_SCHEDULER:
+                LOG_INFO("get_sigmas exponential scheduler");
+                scheduler = std::make_shared<ExponentialScheduler>();
+                break;
+            case AYS_SCHEDULER:
+                LOG_INFO("get_sigmas with Align-Your-Steps scheduler");
+                scheduler = std::make_shared<AYSScheduler>(version);
+                break;
+            case GITS_SCHEDULER:
+                LOG_INFO("get_sigmas with GITS scheduler");
+                scheduler = std::make_shared<GITSScheduler>();
+                break;
+            case SGM_UNIFORM_SCHEDULER:
+                LOG_INFO("get_sigmas with SGM Uniform scheduler");
+                scheduler = std::make_shared<SGMUniformScheduler>();
+                break;
+            case SIMPLE_SCHEDULER:
+                LOG_INFO("get_sigmas with Simple scheduler");
+                scheduler = std::make_shared<SimpleScheduler>();
+                break;
+            case SMOOTHSTEP_SCHEDULER:
+                LOG_INFO("get_sigmas with SmoothStep scheduler");
+                scheduler = std::make_shared<SmoothStepScheduler>();
+                break;
+            case LCM_SCHEDULER:
+                LOG_INFO("get_sigmas with LCM scheduler");
+                scheduler = std::make_shared<LCMScheduler>();
+                break;
+            default:
+                LOG_INFO("get_sigmas with discrete scheduler (default)");
+                scheduler = std::make_shared<DiscreteScheduler>();
+                break;
+        }
+        return scheduler->get_sigmas(n, sigma_min(), sigma_max(), bound_t_to_sigma);
    }
};
@@ -273,15 +411,15 @@ struct CompVisDenoiser : public Denoiser {
    float sigma_data = 1.0f;
-    float sigma_min() {
+    float sigma_min() override {
        return sigmas[0];
    }
-    float sigma_max() {
+    float sigma_max() override {
        return sigmas[TIMESTEPS - 1];
    }
-    float sigma_to_t(float sigma) {
+    float sigma_to_t(float sigma) override {
        float log_sigma = std::log(sigma);
        std::vector<float> dists;
        dists.reserve(TIMESTEPS);
@@ -307,7 +445,7 @@ struct CompVisDenoiser : public Denoiser {
        return t;
    }
-    float t_to_sigma(float t) {
+    float t_to_sigma(float t) override {
        int low_idx  = static_cast<int>(std::floor(t));
        int high_idx = static_cast<int>(std::ceil(t));
        float w      = t - static_cast<float>(low_idx);
@@ -315,7 +453,7 @@ struct CompVisDenoiser : public Denoiser {
        return std::exp(log_sigma);
    }
-    std::vector<float> get_scalings(float sigma) {
+    std::vector<float> get_scalings(float sigma) override {
        float c_skip = 1.0f;
        float c_out  = -sigma;
        float c_in   = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
@@ -323,19 +461,19 @@ struct CompVisDenoiser : public Denoiser {
    }
    // this function will modify noise/latent
-    ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) {
-        ggml_tensor_scale(noise, sigma);
-        ggml_tensor_add(latent, noise);
+    ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override {
+        ggml_ext_tensor_scale_inplace(noise, sigma);
+        ggml_ext_tensor_add_inplace(latent, noise);
        return latent;
    }
-    ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) {
+    ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override {
        return latent;
    }
};
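These scalings are the EPS parameterization from k-diffusion: the model input is conditioned by c_in = 1 / sqrt(sigma^2 + sigma_data^2) and the prediction is reassembled as D(x) = c_skip * x + c_out * model_out, which with c_skip = 1 and c_out = -sigma is just x - sigma * eps. A scalar sketch:

// EPS parameterization: the input is scaled by c_in before the model call,
// and the denoised estimate is c_skip * x + c_out * model_out.
float eps_denoised(float x, float model_out, float sigma) {
    float c_skip = 1.0f;
    float c_out  = -sigma;
    return c_skip * x + c_out * model_out;  // == x - sigma * eps
}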
struct CompVisVDenoiser : public CompVisDenoiser {
-    std::vector<float> get_scalings(float sigma) {
+    std::vector<float> get_scalings(float sigma) override {
        float c_skip = sigma_data * sigma_data / (sigma * sigma + sigma_data * sigma_data);
        float c_out  = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data);
        float c_in   = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
@@ -349,22 +487,21 @@ struct EDMVDenoiser : public CompVisVDenoiser {
    EDMVDenoiser(float min_sigma = 0.002, float max_sigma = 120.0)
        : min_sigma(min_sigma), max_sigma(max_sigma) {
-        schedule = std::make_shared<ExponentialSchedule>();
    }
-    float t_to_sigma(float t) {
+    float t_to_sigma(float t) override {
        return std::exp(t * 4 / (float)TIMESTEPS);
    }
-    float sigma_to_t(float s) {
+    float sigma_to_t(float s) override {
        return 0.25 * std::log(s);
    }
-    float sigma_min() {
+    float sigma_min() override {
        return min_sigma;
    }
-    float sigma_max() {
+    float sigma_max() override {
        return max_sigma;
    }
};
@@ -382,7 +519,8 @@ struct DiscreteFlowDenoiser : public Denoiser {
    float sigma_data = 1.0f;
-    DiscreteFlowDenoiser() {
+    DiscreteFlowDenoiser(float shift = 3.0f)
+        : shift(shift) {
        set_parameters();
    }
@@ -392,24 +530,24 @@ struct DiscreteFlowDenoiser : public Denoiser {
        }
    }
-    float sigma_min() {
+    float sigma_min() override {
        return sigmas[0];
    }
-    float sigma_max() {
+    float sigma_max() override {
        return sigmas[TIMESTEPS - 1];
    }
-    float sigma_to_t(float sigma) {
+    float sigma_to_t(float sigma) override {
        return sigma * 1000.f;
    }
-    float t_to_sigma(float t) {
+    float t_to_sigma(float t) override {
        t = t + 1;
        return time_snr_shift(shift, t / 1000.f);
    }
-    std::vector<float> get_scalings(float sigma) {
+    std::vector<float> get_scalings(float sigma) override {
        float c_skip = 1.0f;
        float c_out  = -sigma;
        float c_in   = 1.0f;
@@ -417,15 +555,15 @@ struct DiscreteFlowDenoiser : public Denoiser {
    }
    // this function will modify noise/latent
-    ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) {
-        ggml_tensor_scale(noise, sigma);
-        ggml_tensor_scale(latent, 1.0f - sigma);
-        ggml_tensor_add(latent, noise);
+    ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override {
+        ggml_ext_tensor_scale_inplace(noise, sigma);
+        ggml_ext_tensor_scale_inplace(latent, 1.0f - sigma);
+        ggml_ext_tensor_add_inplace(latent, noise);
        return latent;
    }
-    ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) {
-        ggml_tensor_scale(latent, 1.0f / (1.0f - sigma));
+    ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override {
+        ggml_ext_tensor_scale_inplace(latent, 1.0f / (1.0f - sigma));
        return latent;
    }
};
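noise_scaling / inverse_noise_scaling above implement the rectified-flow interpolation x_t = (1 - sigma) * x0 + sigma * noise and its inversion back to x0 scale; a scalar sketch:

// forward: mix clean latent and noise along the straight flow path
float flow_noise_scaling(float sigma, float noise, float latent) {
    return (1.0f - sigma) * latent + sigma * noise;
}

// inverse: undo the (1 - sigma) factor on the latent
float flow_inverse_noise_scaling(float sigma, float latent) {
    return latent / (1.0f - sigma);
}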
@@ -444,31 +582,35 @@ struct FluxFlowDenoiser : public Denoiser {
        set_parameters(shift);
    }
-    void set_parameters(float shift = 1.15f) {
+    void set_shift(float shift) {
        this->shift = shift;
-        for (int i = 1; i < TIMESTEPS + 1; i++) {
-            sigmas[i - 1] = t_to_sigma(i / TIMESTEPS * TIMESTEPS);
+    }
+
+    void set_parameters(float shift) {
+        set_shift(shift);
+        for (int i = 0; i < TIMESTEPS; i++) {
+            sigmas[i] = t_to_sigma(i);
        }
    }
-    float sigma_min() {
+    float sigma_min() override {
        return sigmas[0];
    }
-    float sigma_max() {
+    float sigma_max() override {
        return sigmas[TIMESTEPS - 1];
    }
-    float sigma_to_t(float sigma) {
+    float sigma_to_t(float sigma) override {
        return sigma;
    }
-    float t_to_sigma(float t) {
+    float t_to_sigma(float t) override {
        t = t + 1;
        return flux_time_shift(shift, 1.0f, t / TIMESTEPS);
    }
-    std::vector<float> get_scalings(float sigma) {
+    std::vector<float> get_scalings(float sigma) override {
        float c_skip = 1.0f;
        float c_out  = -sigma;
        float c_in   = 1.0f;
@@ -476,23 +618,55 @@ struct FluxFlowDenoiser : public Denoiser {
    }
    // this function will modify noise/latent
-    ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) {
-        ggml_tensor_scale(noise, sigma);
-        ggml_tensor_scale(latent, 1.0f - sigma);
-        ggml_tensor_add(latent, noise);
+    ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override {
+        ggml_ext_tensor_scale_inplace(noise, sigma);
+        ggml_ext_tensor_scale_inplace(latent, 1.0f - sigma);
+        ggml_ext_tensor_add_inplace(latent, noise);
        return latent;
    }
-    ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) {
-        ggml_tensor_scale(latent, 1.0f / (1.0f - sigma));
+    ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override {
+        ggml_ext_tensor_scale_inplace(latent, 1.0f / (1.0f - sigma));
        return latent;
    }
};
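flux_time_shift itself is not shown in this diff. In the Flux reference code the mu-based shift is t' = e^mu / (e^mu + (1/t - 1)^sigma), which for sigma = 1 reduces to s*t / (1 + (s - 1)*t) with s = e^mu; a sketch under the assumption that this file follows the same formulation:

#include <cmath>

// assumed shape of flux_time_shift(mu, sigma, t); not verified against this file
float flux_time_shift_sketch(float mu, float sigma, float t) {
    return std::exp(mu) / (std::exp(mu) + std::pow(1.0f / t - 1.0f, sigma));
}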
+struct Flux2FlowDenoiser : public FluxFlowDenoiser {
+    Flux2FlowDenoiser() = default;
+
+    float compute_empirical_mu(uint32_t n, int image_seq_len) {
+        const float a1 = 8.73809524e-05f;
+        const float b1 = 1.89833333f;
+        const float a2 = 0.00016927f;
+        const float b2 = 0.45666666f;
+
+        if (image_seq_len > 4300) {
+            float mu = a2 * image_seq_len + b2;
+            return mu;
+        }
+
+        float m_200 = a2 * image_seq_len + b2;
+        float m_10  = a1 * image_seq_len + b1;
+
+        float a = (m_200 - m_10) / 190.0f;
+        float b = m_200 - 200.0f * a;
+
+        float mu = a * n + b;
+        return mu;
+    }
+
+    std::vector<float> get_sigmas(uint32_t n, int image_seq_len, scheduler_t scheduler_type, SDVersion version) override {
+        float mu = compute_empirical_mu(n, image_seq_len);
+        LOG_DEBUG("Flux2FlowDenoiser: set shift to %.3f", mu);
+        set_shift(mu);
+        return Denoiser::get_sigmas(n, image_seq_len, scheduler_type, version);
+    }
+};
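Plugging numbers into compute_empirical_mu: with image_seq_len = 4096 (e.g. a 1024x1024 Flux.2 latent, below the 4300 cutoff) and n = 20 steps, m_200 ≈ 0.00016927 * 4096 + 0.45667 ≈ 1.150 and m_10 ≈ 8.738e-05 * 4096 + 1.89833 ≈ 2.256, giving a ≈ -0.00582, b ≈ 2.314, and mu ≈ 2.198. The fit linearly interpolates between a 10-step and a 200-step operating point (hence the 190 in the denominator and the 200 * a term), so fewer steps get a larger shift toward high noise.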
typedef std::function<ggml_tensor*(ggml_tensor*, float, int)> denoise_cb_t;
// k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t
-static void sample_k_diffusion(sample_method_t method,
+static bool sample_k_diffusion(sample_method_t method,
                               denoise_cb_t model,
                               ggml_context* work_ctx,
                               ggml_tensor* x,
@@ -502,7 +676,7 @@ static void sample_k_diffusion(sample_method_t method,
    size_t steps = sigmas.size() - 1;
    // sample_euler_ancestral
    switch (method) {
-        case EULER_A: {
+        case EULER_A_SAMPLE_METHOD: {
            struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
            struct ggml_tensor* d     = ggml_dup_tensor(work_ctx, x);
@@ -511,6 +685,9 @@ static void sample_k_diffusion(sample_method_t method,
                // denoise
                ggml_tensor* denoised = model(x, sigma, i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }
                // d = (x - denoised) / sigma
                {
@@ -542,7 +719,7 @@ static void sample_k_diffusion(sample_method_t method,
                if (sigmas[i + 1] > 0) {
                    // x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
-                    ggml_tensor_set_f32_randn(noise, rng);
+                    ggml_ext_im_set_randn_f32(noise, rng);
                    // noise = load_tensor_from_file(work_ctx, "./rand" + std::to_string(i+1) + ".bin");
                    {
                        float* vec_x = (float*)x->data;
@@ -555,7 +732,7 @@ static void sample_k_diffusion(sample_method_t method,
                    }
                }
        } break;
-        case EULER:  // Implemented without any sigma churn
+        case EULER_SAMPLE_METHOD:  // Implemented without any sigma churn
        {
            struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
@@ -564,6 +741,9 @@ static void sample_k_diffusion(sample_method_t method,
                // denoise
                ggml_tensor* denoised = model(x, sigma, i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }
                // d = (x - denoised) / sigma
                {
@@ -588,13 +768,16 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
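The Euler branch integrates the reverse ODE dx = (x - D(x; sigma)) / sigma dt with sigma(t) = t, one step per sigma pair; a scalar sketch of a single update:

// one explicit Euler step of the k-diffusion ODE
float euler_step(float x, float denoised, float sigma, float sigma_next) {
    float d  = (x - denoised) / sigma;  // dx/dt at the current sigma
    float dt = sigma_next - sigma;      // negative, since sigma decreases
    return x + d * dt;
}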
case HEUN: { case HEUN_SAMPLE_METHOD: {
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
// denoise // denoise
ggml_tensor* denoised = model(x, sigmas[i], -(i + 1)); ggml_tensor* denoised = model(x, sigmas[i], -(i + 1));
if (denoised == nullptr) {
return false;
}
                // d = (x - denoised) / sigma
                {
@@ -629,7 +812,10 @@ static void sample_k_diffusion(sample_method_t method,
                }
                ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }
                float* vec_denoised = (float*)denoised->data;
                for (int j = 0; j < ggml_nelements(x); j++) {
                    float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1];
                    vec_d[j] = (vec_d[j] + d2) / 2;
@@ -638,13 +824,16 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
-        case DPM2: {
+        case DPM2_SAMPLE_METHOD: {
            struct ggml_tensor* d  = ggml_dup_tensor(work_ctx, x);
            struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
            for (int i = 0; i < steps; i++) {
                // denoise
                ggml_tensor* denoised = model(x, sigmas[i], i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }
                // d = (x - denoised) / sigma
                {
@@ -681,7 +870,10 @@ static void sample_k_diffusion(sample_method_t method,
                }
                ggml_tensor* denoised = model(x2, sigma_mid, i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }
                float* vec_denoised = (float*)denoised->data;
                for (int j = 0; j < ggml_nelements(x); j++) {
                    float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid;
                    vec_x[j] = vec_x[j] + d2 * dt_2;
@@ -690,14 +882,16 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
-        case DPMPP2S_A: {
+        case DPMPP2S_A_SAMPLE_METHOD: {
            struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
-            struct ggml_tensor* d     = ggml_dup_tensor(work_ctx, x);
            struct ggml_tensor* x2    = ggml_dup_tensor(work_ctx, x);
            for (int i = 0; i < steps; i++) {
                // denoise
                ggml_tensor* denoised = model(x, sigmas[i], i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }
                // get_ancestral_step
                float sigma_up = std::min(sigmas[i + 1],
@@ -707,22 +901,15 @@ static void sample_k_diffusion(sample_method_t method,
                auto sigma_fn = [](float t) -> float { return exp(-t); };
                if (sigma_down == 0) {
-                    // Euler step
-                    float* vec_d        = (float*)d->data;
-                    float* vec_x        = (float*)x->data;
-                    float* vec_denoised = (float*)denoised->data;
-                    for (int j = 0; j < ggml_nelements(d); j++) {
-                        vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i];
-                    }
-                    // TODO: If sigma_down == 0, isn't this wrong?
-                    // But
-                    // https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py#L525
-                    // has this exactly the same way.
-                    float dt = sigma_down - sigmas[i];
-                    for (int j = 0; j < ggml_nelements(d); j++) {
-                        vec_x[j] = vec_x[j] + vec_d[j] * dt;
-                    }
+                    // d = (x - denoised) / sigmas[i];
+                    // dt = sigma_down - sigmas[i];
+                    // x += d * dt;
+                    // => x = denoised
+                    float* vec_x        = (float*)x->data;
+                    float* vec_denoised = (float*)denoised->data;
+                    for (int j = 0; j < ggml_nelements(x); j++) {
+                        vec_x[j] = vec_denoised[j];
+                    }
                } else {
                    // DPM-Solver++(2S)
@@ -731,7 +918,6 @@ static void sample_k_diffusion(sample_method_t method,
                    float h = t_next - t;
                    float s = t + 0.5f * h;
-                    float* vec_d        = (float*)d->data;
                    float* vec_x        = (float*)x->data;
                    float* vec_x2       = (float*)x2->data;
                    float* vec_denoised = (float*)denoised->data;
@@ -742,6 +928,9 @@ static void sample_k_diffusion(sample_method_t method,
                    }
                    ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1);
+                    if (denoised == nullptr) {
+                        return false;
+                    }
                    // Second half-step
                    for (int j = 0; j < ggml_nelements(x); j++) {
@@ -751,7 +940,7 @@ static void sample_k_diffusion(sample_method_t method,
                // Noise addition
                if (sigmas[i + 1] > 0) {
-                    ggml_tensor_set_f32_randn(noise, rng);
+                    ggml_ext_im_set_randn_f32(noise, rng);
                    {
                        float* vec_x     = (float*)x->data;
                        float* vec_noise = (float*)noise->data;
@@ -763,7 +952,7 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
-        case DPMPP2M:  // DPM++ (2M) from Karras et al (2022)
+        case DPMPP2M_SAMPLE_METHOD:  // DPM++ (2M) from Karras et al (2022)
        {
            struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);
@@ -772,6 +961,9 @@ static void sample_k_diffusion(sample_method_t method,
            for (int i = 0; i < steps; i++) {
                // denoise
                ggml_tensor* denoised = model(x, sigmas[i], i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }
                float t      = t_fn(sigmas[i]);
                float t_next = t_fn(sigmas[i + 1]);
@@ -802,7 +994,7 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
-        case DPMPP2Mv2:  // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457
+        case DPMPP2Mv2_SAMPLE_METHOD:  // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457
        {
            struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);
@@ -811,6 +1003,9 @@ static void sample_k_diffusion(sample_method_t method,
            for (int i = 0; i < steps; i++) {
                // denoise
                ggml_tensor* denoised = model(x, sigmas[i], i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }
                float t      = t_fn(sigmas[i]);
                float t_next = t_fn(sigmas[i + 1]);
@@ -845,7 +1040,7 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
-        case IPNDM:  // iPNDM sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
+        case IPNDM_SAMPLE_METHOD:  // iPNDM sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
        {
            int max_order       = 4;
            ggml_tensor* x_next = x;
@@ -861,7 +1056,10 @@ static void sample_k_diffusion(sample_method_t method,
                // Denoising step
                ggml_tensor* denoised = model(x_cur, sigma, i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }
                float* vec_denoised = (float*)denoised->data;
                // d_cur = (x_cur - denoised) / sigma
                struct ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x_cur);
                float* vec_d_cur          = (float*)d_cur->data;
@@ -920,7 +1118,7 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
-        case IPNDM_V:  // iPNDM_v sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
+        case IPNDM_V_SAMPLE_METHOD:  // iPNDM_v sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
        {
            int max_order = 4;
            std::vector<ggml_tensor*> buffer_model;
@@ -994,7 +1192,7 @@ static void sample_k_diffusion(sample_method_t method,
                d_cur = ggml_dup_tensor(work_ctx, x_next);
            }
        } break;
-        case LCM:  // Latent Consistency Models
+        case LCM_SAMPLE_METHOD:  // Latent Consistency Models
        {
            struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
            struct ggml_tensor* d     = ggml_dup_tensor(work_ctx, x);
@@ -1004,6 +1202,9 @@ static void sample_k_diffusion(sample_method_t method,
                // denoise
                ggml_tensor* denoised = model(x, sigma, i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }
                // x = denoised
                {
@@ -1016,7 +1217,7 @@ static void sample_k_diffusion(sample_method_t method,
                if (sigmas[i + 1] > 0) {
                    // x += sigmas[i + 1] * noise_sampler(sigmas[i], sigmas[i + 1])
-                    ggml_tensor_set_f32_randn(noise, rng);
+                    ggml_ext_im_set_randn_f32(noise, rng);
                    // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin");
                    {
                        float* vec_x = (float*)x->data;
@@ -1029,8 +1230,8 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
-        case DDIM_TRAILING:  // Denoising Diffusion Implicit Models
+        case DDIM_TRAILING_SAMPLE_METHOD:  // Denoising Diffusion Implicit Models
                              // with the "trailing" timestep spacing
        {
            // See J. Song et al., "Denoising Diffusion Implicit
            // Models", arXiv:2010.02502 [cs.LG]
@@ -1040,7 +1241,7 @@ static void sample_k_diffusion(sample_method_t method,
            // end beta) (which unfortunately k-diffusion's data
            // structure hides from the denoiser), and the sigmas are
            // also needed to invert the behavior of CompVisDenoiser
            // (k-diffusion's LMSDiscreteScheduler)
            float beta_start = 0.00085f;
            float beta_end   = 0.0120f;
            std::vector<double> alphas_cumprod;
@@ -1068,7 +1269,7 @@ static void sample_k_diffusion(sample_method_t method,
            for (int i = 0; i < steps; i++) {
                // The "trailing" DDIM timestep, see S. Lin et al.,
                // "Common Diffusion Noise Schedules and Sample Steps
                // are Flawed", arXiv:2305.08891 [cs], p. 4, Table
                // 2. Most variables below follow Diffusers naming
                //
@@ -1207,7 +1408,7 @@ static void sample_k_diffusion(sample_method_t method,
                    }
                }
                if (eta > 0) {
-                    ggml_tensor_set_f32_randn(variance_noise, rng);
+                    ggml_ext_im_set_randn_f32(variance_noise, rng);
                    float* vec_variance_noise =
                        (float*)variance_noise->data;
                    float* vec_x = (float*)x->data;
@@ -1223,8 +1424,8 @@ static void sample_k_diffusion(sample_method_t method,
                // factor c_in.
            }
        } break;
-        case TCD:  // Strategic Stochastic Sampling (Algorithm 4) in
+        case TCD_SAMPLE_METHOD:  // Strategic Stochastic Sampling (Algorithm 4) in
                   // Trajectory Consistency Distillation
        {
            // See J. Zheng et al., "Trajectory Consistency
            // Distillation: Improved Latent Consistency Distillation
@@ -1375,7 +1576,7 @@ static void sample_k_diffusion(sample_method_t method,
                if (eta > 0 && i != steps - 1) {
                    // In this case, x is still pred_noised_sample,
                    // continue in-place
-                    ggml_tensor_set_f32_randn(noise, rng);
+                    ggml_ext_im_set_randn_f32(noise, rng);
                    float* vec_x     = (float*)x->data;
                    float* vec_noise = (float*)noise->data;
                    for (int j = 0; j < ggml_nelements(x); j++) {
@@ -1396,8 +1597,9 @@ static void sample_k_diffusion(sample_method_t method,
        default:
            LOG_ERROR("Attempting to sample with nonexisting sample method %i", method);
-            abort();
+            return false;
    }
+    return true;
}
#endif  // __DENOISER_HPP__
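Since `sample_k_diffusion` now reports failure instead of calling `abort()`, callers have to check the result and unwind. A minimal sketch of the calling pattern, assuming the remaining parameters follow the names used in the function body (`rng`, `sigmas`, `eta`); the wrapper itself is invented for illustration:

```cpp
// Illustrative caller: stop the generation cleanly when a compute step fails.
// The denoise callback returns nullptr on a ggml compute failure, which
// sample_k_diffusion now propagates as `false` instead of aborting the process.
bool run_sampler(sample_method_t method,
                 denoise_cb_t model,
                 ggml_context* work_ctx,
                 ggml_tensor* x,
                 std::shared_ptr<RNG> rng,
                 const std::vector<float>& sigmas,
                 float eta) {
    if (!sample_k_diffusion(method, model, work_ctx, x, rng, sigmas, eta)) {
        LOG_ERROR("sampling failed");  // x may be partially denoised; discard it
        return false;
    }
    return true;  // x now holds the fully denoised latent
}
```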

@@ -3,81 +3,103 @@
#include "flux.hpp" #include "flux.hpp"
#include "mmdit.hpp" #include "mmdit.hpp"
#include "qwen_image.hpp"
#include "unet.hpp" #include "unet.hpp"
#include "wan.hpp"
#include "z_image.hpp"
struct DiffusionParams {
struct ggml_tensor* x = nullptr;
struct ggml_tensor* timesteps = nullptr;
struct ggml_tensor* context = nullptr;
struct ggml_tensor* c_concat = nullptr;
struct ggml_tensor* y = nullptr;
struct ggml_tensor* guidance = nullptr;
std::vector<ggml_tensor*> ref_latents = {};
bool increase_ref_index = false;
int num_video_frames = -1;
std::vector<struct ggml_tensor*> controls = {};
float control_strength = 0.f;
struct ggml_tensor* vace_context = nullptr;
float vace_strength = 1.f;
std::vector<int> skip_layers = {};
};
struct DiffusionModel {
-    virtual void compute(int n_threads,
-                         struct ggml_tensor* x,
-                         struct ggml_tensor* timesteps,
-                         struct ggml_tensor* context,
-                         struct ggml_tensor* c_concat,
-                         struct ggml_tensor* y,
-                         struct ggml_tensor* guidance,
-                         std::vector<ggml_tensor*> ref_latents = {},
-                         int num_video_frames = -1,
-                         std::vector<struct ggml_tensor*> controls = {},
-                         float control_strength = 0.f,
-                         struct ggml_tensor** output = NULL,
-                         struct ggml_context* output_ctx = NULL,
-                         std::vector<int> skip_layers = std::vector<int>()) = 0;
+    virtual std::string get_desc() = 0;
+    virtual bool compute(int n_threads,
+                         DiffusionParams diffusion_params,
+                         struct ggml_tensor** output = nullptr,
+                         struct ggml_context* output_ctx = nullptr) = 0;
    virtual void alloc_params_buffer() = 0;
    virtual void free_params_buffer() = 0;
    virtual void free_compute_buffer() = 0;
    virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
    virtual size_t get_params_buffer_size() = 0;
+    virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter){};
    virtual int64_t get_adm_in_channels() = 0;
+    virtual void set_flash_attn_enabled(bool enabled) = 0;
};
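Bundling the inputs into `DiffusionParams` lets each backbone pick out only the fields it understands, and new conditioning inputs can be added without touching every override. A minimal usage sketch, assuming `model` points at any of the `DiffusionModel` implementations below (tensor creation elided):

```cpp
// Illustrative only: one forward pass through the shared interface.
DiffusionParams params;
params.x         = x;          // noisy latent
params.timesteps = timesteps;  // current timestep(s)
params.context   = context;    // text-encoder conditioning
params.y         = y;          // pooled conditioning, if the model uses it

ggml_tensor* out = nullptr;
if (!model->compute(n_threads, params, &out, output_ctx)) {
    // compute() returns false on ggml compute failure; abort the generation
}
```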
struct UNetModel : public DiffusionModel {
    UNetModelRunner unet;

    UNetModel(ggml_backend_t backend,
-              std::map<std::string, enum ggml_type>& tensor_types,
-              SDVersion version = VERSION_SD1,
-              bool flash_attn = false)
-        : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
+              bool offload_params_to_cpu,
+              const String2TensorStorage& tensor_storage_map = {},
+              SDVersion version = VERSION_SD1)
+        : unet(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version) {
    }
    std::string get_desc() override {
        return unet.get_desc();
    }

    void alloc_params_buffer() override {
        unet.alloc_params_buffer();
    }

    void free_params_buffer() override {
        unet.free_params_buffer();
    }

    void free_compute_buffer() override {
        unet.free_compute_buffer();
    }

    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
        unet.get_param_tensors(tensors, "model.diffusion_model");
    }

    size_t get_params_buffer_size() override {
        return unet.get_params_buffer_size();
    }

    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
        unet.set_weight_adapter(adapter);
    }

    int64_t get_adm_in_channels() override {
        return unet.unet.adm_in_channels;
    }
    void set_flash_attn_enabled(bool enabled) {
        unet.set_flash_attention_enabled(enabled);
    }

-    void compute(int n_threads,
-                 struct ggml_tensor* x,
-                 struct ggml_tensor* timesteps,
-                 struct ggml_tensor* context,
-                 struct ggml_tensor* c_concat,
-                 struct ggml_tensor* y,
-                 struct ggml_tensor* guidance,
-                 std::vector<ggml_tensor*> ref_latents = {},
-                 int num_video_frames = -1,
-                 std::vector<struct ggml_tensor*> controls = {},
-                 float control_strength = 0.f,
-                 struct ggml_tensor** output = NULL,
-                 struct ggml_context* output_ctx = NULL,
-                 std::vector<int> skip_layers = std::vector<int>()) {
-        (void)skip_layers;  // SLG doesn't work with UNet models
-        return unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
+    bool compute(int n_threads,
+                 DiffusionParams diffusion_params,
+                 struct ggml_tensor** output = nullptr,
+                 struct ggml_context* output_ctx = nullptr) override {
+        return unet.compute(n_threads,
+                            diffusion_params.x,
+                            diffusion_params.timesteps,
+                            diffusion_params.context,
+                            diffusion_params.c_concat,
+                            diffusion_params.y,
+                            diffusion_params.num_video_frames,
+                            diffusion_params.controls,
+                            diffusion_params.control_strength, output, output_ctx);
    }
};
@@ -85,49 +107,59 @@ struct MMDiTModel : public DiffusionModel {
    MMDiTRunner mmdit;

    MMDiTModel(ggml_backend_t backend,
-               std::map<std::string, enum ggml_type>& tensor_types)
-        : mmdit(backend, tensor_types, "model.diffusion_model") {
+               bool offload_params_to_cpu,
+               const String2TensorStorage& tensor_storage_map = {})
+        : mmdit(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model") {
    }
    std::string get_desc() override {
        return mmdit.get_desc();
    }

    void alloc_params_buffer() override {
        mmdit.alloc_params_buffer();
    }

    void free_params_buffer() override {
        mmdit.free_params_buffer();
    }

    void free_compute_buffer() override {
        mmdit.free_compute_buffer();
    }

    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
        mmdit.get_param_tensors(tensors, "model.diffusion_model");
    }

    size_t get_params_buffer_size() override {
        return mmdit.get_params_buffer_size();
    }

    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
        mmdit.set_weight_adapter(adapter);
    }

    int64_t get_adm_in_channels() override {
        return 768 + 1280;
    }
    void set_flash_attn_enabled(bool enabled) {
        mmdit.set_flash_attention_enabled(enabled);
    }

-    void compute(int n_threads,
-                 struct ggml_tensor* x,
-                 struct ggml_tensor* timesteps,
-                 struct ggml_tensor* context,
-                 struct ggml_tensor* c_concat,
-                 struct ggml_tensor* y,
-                 struct ggml_tensor* guidance,
-                 std::vector<ggml_tensor*> ref_latents = {},
-                 int num_video_frames = -1,
-                 std::vector<struct ggml_tensor*> controls = {},
-                 float control_strength = 0.f,
-                 struct ggml_tensor** output = NULL,
-                 struct ggml_context* output_ctx = NULL,
-                 std::vector<int> skip_layers = std::vector<int>()) {
-        return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx, skip_layers);
+    bool compute(int n_threads,
+                 DiffusionParams diffusion_params,
+                 struct ggml_tensor** output = nullptr,
+                 struct ggml_context* output_ctx = nullptr) override {
+        return mmdit.compute(n_threads,
+                             diffusion_params.x,
+                             diffusion_params.timesteps,
+                             diffusion_params.context,
+                             diffusion_params.y,
+                             output,
+                             output_ctx,
+                             diffusion_params.skip_layers);
    }
};
@@ -135,52 +167,257 @@ struct FluxModel : public DiffusionModel {
    Flux::FluxRunner flux;

    FluxModel(ggml_backend_t backend,
-              std::map<std::string, enum ggml_type>& tensor_types,
-              SDVersion version = VERSION_FLUX,
-              bool flash_attn = false,
+              bool offload_params_to_cpu,
+              const String2TensorStorage& tensor_storage_map = {},
+              SDVersion version = VERSION_FLUX,
              bool use_mask = false)
-        : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
+        : flux(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version, use_mask) {
    }
    std::string get_desc() override {
        return flux.get_desc();
    }

    void alloc_params_buffer() override {
        flux.alloc_params_buffer();
    }

    void free_params_buffer() override {
        flux.free_params_buffer();
    }

    void free_compute_buffer() override {
        flux.free_compute_buffer();
    }

    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
        flux.get_param_tensors(tensors, "model.diffusion_model");
    }

    size_t get_params_buffer_size() override {
        return flux.get_params_buffer_size();
    }

    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
        flux.set_weight_adapter(adapter);
    }

    int64_t get_adm_in_channels() override {
        return 768;
    }
    void set_flash_attn_enabled(bool enabled) {
        flux.set_flash_attention_enabled(enabled);
    }

-    void compute(int n_threads,
-                 struct ggml_tensor* x,
-                 struct ggml_tensor* timesteps,
-                 struct ggml_tensor* context,
-                 struct ggml_tensor* c_concat,
-                 struct ggml_tensor* y,
-                 struct ggml_tensor* guidance,
-                 std::vector<ggml_tensor*> ref_latents = {},
-                 int num_video_frames = -1,
-                 std::vector<struct ggml_tensor*> controls = {},
-                 float control_strength = 0.f,
-                 struct ggml_tensor** output = NULL,
-                 struct ggml_context* output_ctx = NULL,
-                 std::vector<int> skip_layers = std::vector<int>()) {
-        return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, output, output_ctx, skip_layers);
+    bool compute(int n_threads,
+                 DiffusionParams diffusion_params,
+                 struct ggml_tensor** output = nullptr,
+                 struct ggml_context* output_ctx = nullptr) override {
+        return flux.compute(n_threads,
+                            diffusion_params.x,
+                            diffusion_params.timesteps,
+                            diffusion_params.context,
+                            diffusion_params.c_concat,
+                            diffusion_params.y,
+                            diffusion_params.guidance,
+                            diffusion_params.ref_latents,
+                            diffusion_params.increase_ref_index,
+                            output,
+                            output_ctx,
+                            diffusion_params.skip_layers);
    }
};
struct WanModel : public DiffusionModel {
std::string prefix;
WAN::WanRunner wan;
WanModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_WAN2)
: prefix(prefix), wan(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
}
std::string get_desc() override {
return wan.get_desc();
}
void alloc_params_buffer() override {
wan.alloc_params_buffer();
}
void free_params_buffer() override {
wan.free_params_buffer();
}
void free_compute_buffer() override {
wan.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
wan.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return wan.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
wan.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attn_enabled(bool enabled) {
wan.set_flash_attention_enabled(enabled);
}
bool compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override {
return wan.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.y,
diffusion_params.c_concat,
nullptr,
diffusion_params.vace_context,
diffusion_params.vace_strength,
output,
output_ctx);
}
};
struct QwenImageModel : public DiffusionModel {
std::string prefix;
Qwen::QwenImageRunner qwen_image;
QwenImageModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_QWEN_IMAGE)
: prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
}
std::string get_desc() override {
return qwen_image.get_desc();
}
void alloc_params_buffer() override {
qwen_image.alloc_params_buffer();
}
void free_params_buffer() override {
qwen_image.free_params_buffer();
}
void free_compute_buffer() override {
qwen_image.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
qwen_image.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return qwen_image.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
qwen_image.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attn_enabled(bool enabled) {
qwen_image.set_flash_attention_enabled(enabled);
}
bool compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override {
return qwen_image.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.ref_latents,
true, // increase_ref_index
output,
output_ctx);
}
};
struct ZImageModel : public DiffusionModel {
std::string prefix;
ZImage::ZImageRunner z_image;
ZImageModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_Z_IMAGE)
: prefix(prefix), z_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
}
std::string get_desc() override {
return z_image.get_desc();
}
void alloc_params_buffer() override {
z_image.alloc_params_buffer();
}
void free_params_buffer() override {
z_image.free_params_buffer();
}
void free_compute_buffer() override {
z_image.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
z_image.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return z_image.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
z_image.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attn_enabled(bool enabled) {
z_image.set_flash_attention_enabled(enabled);
}
bool compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override {
return z_image.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.ref_latents,
true, // increase_ref_index
output,
output_ctx);
    }
};

docs/build.md (new file)

@@ -0,0 +1,173 @@
# Build from scratch
## Get the Code
```
git clone --recursive https://github.com/leejet/stable-diffusion.cpp
cd stable-diffusion.cpp
```
- If you have already cloned the repository, use the following commands to update it to the latest code.
```
cd stable-diffusion.cpp
git pull origin master
git submodule init
git submodule update
```
## Build (CPU only)
If you don't have a GPU or CUDA installed, you can build a CPU-only version.
```shell
mkdir build && cd build
cmake ..
cmake --build . --config Release
```
## Build with OpenBLAS
```shell
mkdir build && cd build
cmake .. -DGGML_OPENBLAS=ON
cmake --build . --config Release
```
## Build with CUDA
This provides GPU acceleration using an NVIDIA GPU. Make sure the CUDA toolkit is installed; you can get it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) page. At least 4 GB of VRAM is recommended.
```shell
mkdir build && cd build
cmake .. -DSD_CUDA=ON
cmake --build . --config Release
```
## Build with HipBLAS
This provides GPU acceleration using an AMD GPU. Make sure to have the ROCm toolkit installed.

To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture (replacing the first command below). This is also necessary if your GPU is not officially supported by ROCm; for example, you have to set `$GFX_NAME` to `gfx1030` for consumer RDNA2 cards.
Windows users: refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
```shell
mkdir build && cd build
if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
cmake --build . --config Release
```
## Build with MUSA
This provides GPU acceleration using a Moore Threads GPU. Make sure to have the MUSA toolkit installed.
```shell
mkdir build && cd build
cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
cmake --build . --config Release
```
## Build with Metal
Using Metal makes the computation run on the GPU. There are currently some issues with Metal when performing operations on very large matrices, which makes it highly inefficient in those cases. Performance improvements are expected in the near future.
```shell
mkdir build && cd build
cmake .. -DSD_METAL=ON
cmake --build . --config Release
```
## Build with Vulkan
Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
```shell
mkdir build && cd build
cmake .. -DSD_VULKAN=ON
cmake --build . --config Release
```
## Build with OpenCL (for Adreno GPU)
Currently, it supports only Adreno GPUs and is primarily optimized for the Q4_0 type.

To build for Windows ARM, please refer to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64).
Building for Android:
Android NDK:
Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
Setup OpenCL Dependencies for NDK:
You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
* OpenCL Headers:
```bash
# In a temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
cd ..
```
* OpenCL ICD Loader:
```shell
# In the same temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
mkdir build_ndk && cd build_ndk
# Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=24 \
-DANDROID_STL=c++_shared
ninja
# Replace <YOUR_NDK_PATH>
# e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
cd ../..
```
Build `stable-diffusion.cpp` for Android with OpenCL:
```shell
mkdir build-android && cd build-android
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
cmake .. -G Ninja \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DGGML_OPENMP=OFF \
-DSD_OPENCL=ON
ninja
```
*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
## Build with SYCL
Using SYCL makes the computation run on an Intel GPU. Please make sure you have installed the related driver and the [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before starting. For more details and steps, refer to the [llama.cpp SYCL backend](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/SYCL.md#linux) documentation.
```shell
# Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh

# Create and enter the build directory, as in the other build sections
mkdir build && cd build
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
cmake --build . --config Release
```

@@ -24,7 +24,7 @@ You can download the preconverted gguf weights from [silveroxides/Chroma-GGUF](h
For example:

```
-.\bin\Release\sd.exe -diffusion-model ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask
+.\bin\Release\sd.exe --diffusion-model ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask --clip-on-cpu
```

![](../assets/flux/chroma_v40.png)

docs/chroma_radiance.md (new file)

@@ -0,0 +1,21 @@
# How to Use
## Download weights
- Download Chroma1-Radiance
- safetensors: https://huggingface.co/lodestones/Chroma1-Radiance/tree/main
- gguf: https://huggingface.co/silveroxides/Chroma1-Radiance-GGUF/tree/main
- Download t5xxl
- safetensors: https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Examples
```
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Chroma1-Radiance-v0.4-Q8_0.gguf --t5xxl ..\..\ComfyUI\models\clip\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma radiance cpp'" --cfg-scale 4.0 --sampling-method euler -v
```
<img alt="Chroma1-Radiance" src="../assets/flux/chroma1-radiance.png" />

docs/distilled_sd.md (new file)

@@ -0,0 +1,99 @@
# Running distilled models: SSD1B and SDx.x with tiny U-Nets
## Preface
These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B U-Net contains only one middle block and fewer attention layers in its up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
Similarly, SD1.x- and SD2.x-style models with a tiny U-Net consist of only 6 U-Net blocks, leading to very small files and time savings of up to 50%. For more information, see the paper: https://arxiv.org/pdf/2305.15798.pdf.
## SSD1B
Note that not all of these models follow the standard parameter naming conventions. However, several useful SSD-1B models are available online, such as:
* https://huggingface.co/segmind/SSD-1B/resolve/main/SSD-1B-A1111.safetensors
* https://huggingface.co/hassenhamdi/SSD-1B-fp8_e4m3fn/resolve/main/SSD-1B_fp8_e4m3fn.safetensors
Useful LoRAs are also available:
* https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors
* https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors
These files can be used out-of-the-box, unlike the models described in the next section.
## SD1.x, SD2.x with tiny U-Nets
These models require conversion before use. You will need a Python script provided by the diffusers team, available on GitHub:
* https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py
### SD2.x
NotaAI provides the following model online:
* https://huggingface.co/nota-ai/bk-sdm-v2-tiny
Creating a .safetensors file involves two steps. First, run this short Python script to download the model from Hugging Face:
```python
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("nota-ai/bk-sdm-v2-tiny",cache_dir="./")
```
Second, create the .safetensors file by running:
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path models--nota-ai--bk-sdm-v2-tiny/snapshots/68277af553777858cd47e133f92e4db47321bc74 \
--checkpoint_path bk-sdm-v2-tiny.safetensors --half --use_safetensors
```
This will generate the file **bk-sdm-v2-tiny.safetensors**, which is now ready for use with sd.cpp.
### SD1.x
Several Tiny SD 1.x models are available online, such as:
* https://huggingface.co/segmind/tiny-sd
* https://huggingface.co/segmind/portrait-finetuned
* https://huggingface.co/nota-ai/bk-sdm-tiny
These models also require conversion, partly because some tensors are stored in a non-contiguous manner. To create a usable checkpoint file, follow these simple steps:
##### Download and prepare the model using Python on your computer, for example this way:
```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("segmind/tiny-sd")
unet = pipe.unet
for param in unet.parameters():
    param.data = param.data.contiguous()  # <- important here
pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
```
##### Run the conversion script:
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path ./segmindtiny-sd \
--checkpoint_path ./segmind_tiny-sd.ckpt --half
```
The file `segmind_tiny-sd.ckpt` will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
### Another available .ckpt file:
* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
To use this file, you must first adjust its non-contiguous tensors:
```python
import torch

ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
for key, value in ckpt['state_dict'].items():
    if isinstance(value, torch.Tensor):
        ckpt['state_dict'][key] = value.contiguous()
torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
```
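The fixed checkpoint should then load like any other SD1.x model, e.g. (paths and prompt are just placeholders):

```bash
./bin/sd -m ./tinySDdistilled_fixed.ckpt -p "a lovely cat"
```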

@@ -15,7 +15,7 @@ You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB
You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), this way you don't have to do the conversion yourself.

-Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully developed. Therefore, we need to convert flux to gguf format here, which also saves VRAM.
For example:
```
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
```
@@ -28,7 +28,7 @@ Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully d
For example:
```
-.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
+.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
```
Using formats of different precisions will yield results of varying quality.
@@ -44,7 +44,7 @@ Using formats of different precisions will yield results of varying quality.
```
-.\bin\Release\sd.exe --diffusion-model ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4
+.\bin\Release\sd.exe --diffusion-model ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4 --clip-on-cpu
```
| q8_0 |
@@ -60,7 +60,7 @@ Since many flux LoRA training libraries have used various LoRA naming formats, i
- LoRA model from https://huggingface.co/XLabs-AI/flux-lora-collection/tree/main (using comfy converted version!!!)
```
-.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ...\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models
+.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ...\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models --clip-on-cpu
```
![output](../assets/flux/flux1-dev-q8_0%20with%20lora.png)

docs/flux2.md (new file)

@@ -0,0 +1,21 @@
# How to Use
## Download weights
- Download FLUX.2-dev
- gguf: https://huggingface.co/city96/FLUX.2-dev-gguf/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
- Download Mistral-Small-3.2-24B-Instruct-2506-GGUF
- gguf: https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/tree/main
## Examples
```
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux2-dev-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Mistral-Small-3.2-24B-Instruct-2506-Q4_K_M.gguf -r .\kontext_input.png -p "change 'flux.cpp' to 'flux2-dev.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu
```
<img alt="flux2 example" src="../assets/flux2/example.png" />

@@ -27,7 +27,7 @@ You can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](ht
For example:
```
-.\bin\Release\sd.exe -M edit -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v
+.\bin\Release\sd.exe -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
```

@@ -10,4 +10,17 @@ Here's a simple example:
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
```
`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
# Lora Apply Mode
There are two ways to apply LoRA: **immediately** and **at_runtime**. You can specify it using the `--lora-apply-mode` parameter.
By default, the mode is selected automatically:
* If the model weights contain any quantized parameters, the **at_runtime** mode is used;
* Otherwise, the **immediately** mode is used.
The **immediately** mode may have precision and compatibility issues with quantized parameters, but it usually offers faster inference speed and, in some cases, lower memory usage.
In contrast, the **at_runtime** mode provides better compatibility and higher precision, but inference may be slower and memory usage may be higher in some cases.
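For example, to force runtime application, a sketch based on the mode names listed above (model and LoRA are the ones from the earlier example):

```
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models --lora-apply-mode at_runtime
```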

docs/ovis_image.md (new file)

@@ -0,0 +1,19 @@
# How to Use
## Download weights
- Download Ovis-Image-7B
- safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/leejet/Ovis-Image-7B-GGUF
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
- Download Ovis 2.5
- safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/text_encoders
## Examples
```
.\bin\Release\sd.exe --diffusion-model ovis_image-Q4_0.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\ovis_2.5.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa
```
<img alt="ovis image example" src="../assets/ovis_image/example.png" />

docs/performance.md (new file)

@@ -0,0 +1,26 @@
## Use Flash Attention to save memory and improve speed.
Enabling flash attention for the diffusion model reduces memory usage; the savings vary by model and resolution, e.g.:

- flux 768x768: ~600 MB
- SD2 768x768: ~1400 MB
On most backends it slows generation down, but on CUDA it generally speeds it up as well.

At the moment, it is only supported for some models and some backends (such as CPU, CUDA/ROCm, and Metal).
Run by adding `--diffusion-fa` to the arguments and watch for:
```
[INFO ] stable-diffusion.cpp:312 - Using flash attention in the diffusion model
```
and watch the compute buffer shrink in the debug log:
```
[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
```
## Offload weights to the CPU to save VRAM without reducing generation speed.
Using `--offload-to-cpu` allows you to offload weights to the CPU, saving VRAM without reducing generation speed.
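For example, combining CPU offload with flash attention (the model path here is a placeholder):

```
./bin/sd -m ../models/sd_xl_base_1.0.safetensors -H 1024 -W 1024 -p "a lovely cat" --diffusion-fa --offload-to-cpu
```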
## Use quantization to reduce memory usage.
[quantization](./quantization_and_gguf.md)

@@ -6,16 +6,15 @@ You can use [PhotoMaker](https://github.com/TencentARC/PhotoMaker) to personaliz
Download the PhotoMaker model file (in safetensor format) [here](https://huggingface.co/bssrdf/PhotoMaker). The official release of the model file (in .bin format) does not work with ```stablediffusion.cpp```.

-- Specify the PhotoMaker model path using the `--stacked-id-embd-dir PATH` parameter.
-- Specify the input images path using the `--input-id-images-dir PATH` parameter.
-  - input images **must** have the same width and height for preprocessing (to be improved)
+- Specify the PhotoMaker model path using the `--photo-maker PATH` parameter.
+- Specify the input images path using the `--pm-id-images-dir PATH` parameter.

In prompt, make sure you have a class word followed by the trigger word ```"img"``` (hard-coded for now). The class word could be one of ```"man, woman, girl, boy"```. If input ID images contain asian faces, add ```Asian``` before the class word.

Another PhotoMaker specific parameter:

-- ```--style-ratio (0-100)%```: default is 20 and 10-20 typically gets good results. Lower ratio means more faithfully following input ID (not necessarily better quality).
+- ```--pm-style-strength (0-100)%```: default is 20 and 10-20 typically gets good results. Lower ratio means more faithfully following input ID (not necessarily better quality).

Other parameters recommended for running Photomaker:
@@ -28,7 +27,7 @@ If on low memory GPUs (<= 8GB), recommend running with ```--vae-on-cpu``` option
Example:
```bash
-bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v1.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
+bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --photo-maker ../models/photomaker-v1.safetensors --pm-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --pm-style-strength 10 --vae-on-cpu --steps 50
```

## PhotoMaker Version 2
@@ -41,7 +40,7 @@ Running PMV2 is now a two-step process:
```
python face_detect.py input_image_dir
```
-An ```id_embeds.safetensors``` file will be generated in ```input_images_dir```
+An ```id_embeds.bin``` file will be generated in ```input_images_dir```

**Note: this step is only needed to run once; the same ```id_embeds``` can be reused**
@@ -49,6 +48,6 @@ An ```id_embeds.safetensors``` file will be generated in ```input_images_dir```
You can download ```photomaker-v2.safetensors``` from [here](https://huggingface.co/bssrdf/PhotoMakerV2)

-- All the command line parameters from Version 1 remain the same for Version 2
+- All the command line parameters from Version 1 remain the same for Version 2, plus one extra pointing to a valid ```id_embeds``` file: `--pm-id-embed-path [path_to_id_embeds.bin]`
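A hypothetical PMV2 invocation therefore mirrors the Version 1 example, swapping in the v2 weights and adding the embed path (all paths and the prompt are placeholders):

```bash
bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --photo-maker ../models/photomaker-v2.safetensors --pm-id-images-dir ../assets/photomaker_examples/scarletthead_woman --pm-id-embed-path ../assets/photomaker_examples/scarletthead_woman/id_embeds.bin -p "a girl img, masterpiece, best quality" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --vae-on-cpu
```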

docs/qwen_image.md (new file)

@@ -0,0 +1,23 @@
# How to Use
## Download weights
- Download Qwen Image
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Qwen-Image-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/vae
- Download qwen_2.5_vl 7b
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/mradermacher/Qwen2.5-VL-7B-Instruct-GGUF/tree/main
## Examples
```
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线 探索视觉生成基础模型的极限开创理解与生成一体化的未来。二、Qwen-Image的模型特色1、复杂文字渲染。支持中英渲染、自动布局 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
```
<img alt="qwen example" src="../assets/qwen/example.png" />

docs/qwen_image_edit.md (new file)

@@ -0,0 +1,35 @@
# How to Use
## Download weights
- Download Qwen Image
- Qwen Image Edit
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Qwen-Image-Edit-GGUF/tree/main
- Qwen Image Edit 2509
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Qwen-Image-Edit-2509-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/vae
- Download qwen_2.5_vl 7b
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/mradermacher/Qwen2.5-VL-7B-Instruct-GGUF/tree/main
## Examples
### Qwen Image Edit
```
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
```
<img alt="qwen_image_edit" src="../assets/qwen/qwen_image_edit.png" />
### Qwen Image Edit 2509
```
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
```
<img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />

docs/sd.md (new file)

@@ -0,0 +1,37 @@
## Download weights
- Download the original weights (`.ckpt` or `.safetensors`). For example:
- Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
- Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
- Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
### txt2img example
```sh
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
# ./bin/sd --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
# ./bin/sd -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
```
Using formats of different precisions will yield results of varying quality.
| f32 | f16 | q8_0 | q5_0 | q5_1 | q4_0 | q4_1 |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| ![](../assets/f32.png) | ![](../assets/f16.png) | ![](../assets/q8_0.png) | ![](../assets/q5_0.png) | ![](../assets/q5_1.png) | ![](../assets/q4_0.png) | ![](../assets/q4_1.png) |
### img2img example
- `./output.png` is the image generated from the above txt2img pipeline
```
./bin/sd -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
```
<p align="center">
<img src="../assets/img2img_output.png" width="256x">
</p>


@@ -14,7 +14,7 @@
 For example:
 ```
-.\bin\Release\sd.exe -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
+.\bin\Release\sd.exe -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
 ```
 ![](../assets/sd3.5_large.png)


@@ -7,7 +7,7 @@ You can use TAESD to accelerate the decoding of latent images by following these
 Or curl
 ```bash
-curl -L -O https://huggingface.co/madebyollin/taesd/blob/main/diffusion_pytorch_model.safetensors
+curl -L -O https://huggingface.co/madebyollin/taesd/resolve/main/diffusion_pytorch_model.safetensors
 ```
 - Specify the model path using the `--taesd PATH` parameter. example:

docs/wan.md Normal file

@@ -0,0 +1,204 @@
# How to Use
## Download weights
- Download Wan
- Wan2.1
- Wan2.1 T2V 1.3B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- Wan2.1 T2V 14B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/city96/Wan2.1-T2V-14B-gguf/tree/main
- Wan2.1 I2V 14B 480P
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/city96/Wan2.1-I2V-14B-480P-gguf/tree/main
- Wan2.1 I2V 14B 720P
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/city96/Wan2.1-I2V-14B-720P-gguf/tree/main
- Wan2.1 FLF2V 14B 720P
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/city96/Wan2.1-FLF2V-14B-720P-gguf/tree/main
- Wan2.1 VACE 1.3B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/calcuis/wan-1.3b-gguf/tree/main
- Wan2.1 VACE 14B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Wan2.1_14B_VACE-GGUF/tree/main
- Wan2.2
- Wan2.2 TI2V 5B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Wan2.2-TI2V-5B-GGUF/tree/main
- Wan2.2 T2V A14B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Wan2.2-T2V-A14B-GGUF/tree/main
- Wan2.2 I2V A14B
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Wan2.2-I2V-A14B-GGUF/tree/main
- Download vae
- wan_2.1_vae (for all Wan models except Wan2.2 TI2V 5B)
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors
- wan_2.2_vae (for Wan2.2 TI2V 5B only)
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/blob/main/split_files/vae/wan2.2_vae.safetensors
- Download umt5_xxl
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/text_encoders/umt5_xxl_fp16.safetensors
- gguf: https://huggingface.co/city96/umt5-xxl-encoder-gguf/tree/main
- Download clip_vision_h (for Wan2.1 I2V/FLF2V only)
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/clip_vision/clip_vision_h.safetensors
## Examples
### Wan2.1 T2V 1.3B
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1_t2v_1.3B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --flow-shift 3.0
```
<video src=../assets/wan/Wan2.1_1.3B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.1 T2V 14B
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-t2v-14b-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
```
<video src=../assets/wan/Wan2.1_14B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.1 I2V 14B
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-i2v-14b-480p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
```
<video src=../assets/wan/Wan2.1_14B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.2 T2V A14B
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_14B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.2 I2V A14B
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_14B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.2 T2V A14B T2I
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --flow-shift 3.0
```
<img width="832" height="480" alt="Wan2 2_14B_t2i" src="../assets/wan/Wan2.2_14B_t2i.png" />
### Wan2.2 T2V 14B with LoRA
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat<lora:wan2.2_t2v_lightx2v_4steps_lora_v1.1_low_noise:1><lora:|high_noise|wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise:1>" --cfg-scale 3.5 --sampling-method euler --steps 4 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 4 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --lora-model-dir ..\..\ComfyUI\models\loras --video-frames 33 --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_14B_t2v_lora.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.2 TI2V 5B
#### T2V
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_5B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
#### I2V
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_5B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.1 FLF2V 14B
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-flf2v-14b-720p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
```
<video src=../assets/wan/Wan2.1_14B_flf2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.2 FLF2V 14B
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -p "glass flower blossom" -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_14B_flf2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.1 VACE 1.3B
#### T2V
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 1 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_1.3B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
#### R2V
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_1.3B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
#### V2V
```
mkdir post+depth
ffmpeg -i ..\..\ComfyUI\input\post+depth.mp4 -qscale:v 1 -vf fps=8 post+depth\frame_%04d.jpg
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close-up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white, transparent feel and a dreamy atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_1.3B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
### Wan2.1 VACE 14B
#### T2V
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_14B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
#### R2V
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_14B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
#### V2V
```
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close-up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white, transparent feel and a dreamy atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_14B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>

docs/z_image.md Normal file

@@ -0,0 +1,28 @@
# How to Use
You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or even less.
## Download weights
- Download Z-Image-Turbo
- safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/leejet/Z-Image-Turbo-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
- Download Qwen3 4b
- safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/tree/main
## Examples
```
.\bin\Release\sd.exe --diffusion-model z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
```
<img width="256" alt="z-image example" src="../assets/z_image/q3_K.png" />
## Comparison of Different Quantization Types
| bf16 | q8_0 | q6_K | q5_0 | q4_K | q4_0 | q3_K | q2_K |
| --- | --- | --- | --- | --- | --- | --- | --- |
| <img width="256" alt="bf16" src="../assets/z_image/bf16.png" /> | <img width="256" alt="q8_0" src="../assets/z_image/q8_0.png" /> | <img width="256" alt="q6_K" src="../assets/z_image/q6_K.png" /> | <img width="256" alt="q5_0" src="../assets/z_image/q5_0.png" /> | <img width="256" alt="q4_K" src="../assets/z_image/q4_K.png" /> | <img width="256" alt="q4_0" src="../assets/z_image/q4_0.png" /> | <img width="256" alt="q3_K" src="../assets/z_image/q3_K.png" /> | <img width="256" alt="q2_K" src="../assets/z_image/q2_K.png" /> |

easycache.hpp Normal file

@@ -0,0 +1,265 @@
#include <cmath>
#include <limits>
#include <unordered_map>
#include <vector>
#include "denoiser.hpp"
#include "ggml_extend.hpp"
struct EasyCacheConfig {
bool enabled = false;
float reuse_threshold = 0.2f;
float start_percent = 0.15f;
float end_percent = 0.95f;
};
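// The fields above map onto the CLI's --easycache
// "threshold,start_percent,end_percent" argument (default: 0.2,0.15,0.95).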
struct EasyCacheCacheEntry {
std::vector<float> diff;
};
struct EasyCacheState {
EasyCacheConfig config;
Denoiser* denoiser = nullptr;
float start_sigma = std::numeric_limits<float>::max();
float end_sigma = 0.0f;
bool initialized = false;
bool initial_step = true;
bool skip_current_step = false;
bool step_active = false;
const SDCondition* anchor_condition = nullptr;
std::unordered_map<const SDCondition*, EasyCacheCacheEntry> cache_diffs;
std::vector<float> prev_input;
std::vector<float> prev_output;
float output_prev_norm = 0.0f;
bool has_prev_input = false;
bool has_prev_output = false;
bool has_output_prev_norm = false;
bool has_relative_transformation_rate = false;
float relative_transformation_rate = 0.0f;
float cumulative_change_rate = 0.0f;
float last_input_change = 0.0f;
bool has_last_input_change = false;
int total_steps_skipped = 0;
int current_step_index = -1;
void reset_runtime() {
initial_step = true;
skip_current_step = false;
step_active = false;
anchor_condition = nullptr;
cache_diffs.clear();
prev_input.clear();
prev_output.clear();
output_prev_norm = 0.0f;
has_prev_input = false;
has_prev_output = false;
has_output_prev_norm = false;
has_relative_transformation_rate = false;
relative_transformation_rate = 0.0f;
cumulative_change_rate = 0.0f;
last_input_change = 0.0f;
has_last_input_change = false;
total_steps_skipped = 0;
current_step_index = -1;
}
void init(const EasyCacheConfig& cfg, Denoiser* d) {
config = cfg;
denoiser = d;
initialized = cfg.enabled && d != nullptr;
reset_runtime();
if (initialized) {
start_sigma = percent_to_sigma(config.start_percent);
end_sigma = percent_to_sigma(config.end_percent);
}
}
bool enabled() const {
return initialized && config.enabled;
}
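// Maps a denoising-progress fraction in [0, 1] to a sigma value through the
// denoiser's t -> sigma curve, so start_percent/end_percent can be compared
// directly against the current step's sigma.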
float percent_to_sigma(float percent) const {
if (!denoiser) {
return 0.0f;
}
if (percent <= 0.0f) {
return std::numeric_limits<float>::max();
}
if (percent >= 1.0f) {
return 0.0f;
}
float t = (1.0f - percent) * (TIMESTEPS - 1);
return denoiser->t_to_sigma(t);
}
void begin_step(int step_index, float sigma) {
if (!enabled()) {
return;
}
if (step_index == current_step_index) {
return;
}
current_step_index = step_index;
skip_current_step = false;
has_last_input_change = false;
step_active = false;
if (sigma > start_sigma) {
return;
}
if (!(sigma > end_sigma)) {
return;
}
step_active = true;
}
bool step_is_active() const {
return enabled() && step_active;
}
bool is_step_skipped() const {
return enabled() && step_active && skip_current_step;
}
bool has_cache(const SDCondition* cond) const {
auto it = cache_diffs.find(cond);
return it != cache_diffs.end() && !it->second.diff.empty();
}
void update_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
EasyCacheCacheEntry& entry = cache_diffs[cond];
size_t ne = static_cast<size_t>(ggml_nelements(output));
entry.diff.resize(ne);
float* out_data = (float*)output->data;
float* in_data = (float*)input->data;
for (size_t i = 0; i < ne; ++i) {
entry.diff[i] = out_data[i] - in_data[i];
}
}
void apply_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
auto it = cache_diffs.find(cond);
if (it == cache_diffs.end() || it->second.diff.empty()) {
return;
}
copy_ggml_tensor(output, input);
float* out_data = (float*)output->data;
const std::vector<float>& diff = it->second.diff;
for (size_t i = 0; i < diff.size(); ++i) {
out_data[i] += diff[i];
}
}
bool before_condition(const SDCondition* cond,
ggml_tensor* input,
ggml_tensor* output,
float sigma,
int step_index) {
if (!enabled() || step_index < 0) {
return false;
}
if (step_index != current_step_index) {
begin_step(step_index, sigma);
}
if (!step_active) {
return false;
}
if (initial_step) {
anchor_condition = cond;
initial_step = false;
}
bool is_anchor = (cond == anchor_condition);
if (skip_current_step) {
if (has_cache(cond)) {
apply_cache(cond, input, output);
return true;
}
return false;
}
if (!is_anchor) {
return false;
}
if (!has_prev_input || !has_prev_output || !has_cache(cond)) {
return false;
}
size_t ne = static_cast<size_t>(ggml_nelements(input));
if (prev_input.size() != ne) {
return false;
}
float* input_data = (float*)input->data;
last_input_change = 0.0f;
for (size_t i = 0; i < ne; ++i) {
last_input_change += std::fabs(input_data[i] - prev_input[i]);
}
if (ne > 0) {
last_input_change /= static_cast<float>(ne);
}
has_last_input_change = true;
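// EasyCache heuristic: estimate how much the output would change from the
// measured input change and the last observed input->output transformation
// rate. While the accumulated estimate stays below reuse_threshold, the
// cached residual is reused and the full forward pass is skipped.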
if (has_output_prev_norm && has_relative_transformation_rate && last_input_change > 0.0f && output_prev_norm > 0.0f) {
float approx_output_change_rate = (relative_transformation_rate * last_input_change) / output_prev_norm;
cumulative_change_rate += approx_output_change_rate;
if (cumulative_change_rate < config.reuse_threshold) {
skip_current_step = true;
total_steps_skipped++;
apply_cache(cond, input, output);
return true;
} else {
cumulative_change_rate = 0.0f;
}
}
return false;
}
void after_condition(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
if (!step_is_active()) {
return;
}
update_cache(cond, input, output);
if (cond != anchor_condition) {
return;
}
size_t ne = static_cast<size_t>(ggml_nelements(input));
float* in_data = (float*)input->data;
prev_input.resize(ne);
for (size_t i = 0; i < ne; ++i) {
prev_input[i] = in_data[i];
}
has_prev_input = true;
float* out_data = (float*)output->data;
float output_change = 0.0f;
if (has_prev_output && prev_output.size() == ne) {
for (size_t i = 0; i < ne; ++i) {
output_change += std::fabs(out_data[i] - prev_output[i]);
}
if (ne > 0) {
output_change /= static_cast<float>(ne);
}
}
prev_output.resize(ne);
for (size_t i = 0; i < ne; ++i) {
prev_output[i] = out_data[i];
}
has_prev_output = true;
float mean_abs = 0.0f;
for (size_t i = 0; i < ne; ++i) {
mean_abs += std::fabs(out_data[i]);
}
output_prev_norm = (ne > 0) ? (mean_abs / static_cast<float>(ne)) : 0.0f;
has_output_prev_norm = output_prev_norm > 0.0f;
if (has_last_input_change && last_input_change > 0.0f && output_change > 0.0f) {
float rate = output_change / last_input_change;
if (std::isfinite(rate)) {
relative_transformation_rate = rate;
has_relative_transformation_rate = true;
}
}
cumulative_change_rate = 0.0f;
has_last_input_change = false;
}
};
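Since the struct above only exposes hooks, here is a minimal sketch of how a sampler loop might drive them. This is an illustration only: `run_diffusion_model` is a hypothetical placeholder for the real forward pass, and the actual integration point in the diffusion runner may differ.

```cpp
#include <vector>
#include "easycache.hpp"

// Hypothetical placeholder for the real diffusion model call.
void run_diffusion_model(ggml_tensor* x, ggml_tensor* out);

void denoise_with_easycache(Denoiser* denoiser,
                            const SDCondition* cond,
                            ggml_tensor* x,    // model input for this step
                            ggml_tensor* out,  // model output for this step
                            const std::vector<float>& sigmas) {
    EasyCacheConfig cfg;
    cfg.enabled = true;  // corresponds to the --easycache CLI flag
    EasyCacheState ec;
    ec.init(cfg, denoiser);
    for (int step = 0; step < (int)sigmas.size(); step++) {
        ec.begin_step(step, sigmas[step]);
        // before_condition() returns true when the cached residual was applied,
        // i.e. the expensive forward pass can be skipped for this step.
        if (!ec.before_condition(cond, x, out, sigmas[step], step)) {
            run_diffusion_model(x, out);       // placeholder for the real call
            ec.after_condition(cond, x, out);  // record stats for future reuse
        }
    }
}
```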


@@ -27,11 +27,11 @@ public:
         blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
     }

-    struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
-        return ggml_leaky_relu(ctx, x, 0.2f, true);
+    struct ggml_tensor* lrelu(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+        return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true);
     }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
         // x: [n, num_feat, h, w]
         // return: [n, num_feat, h, w]
@@ -42,16 +42,16 @@ public:
         auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);

         auto x1    = lrelu(ctx, conv1->forward(ctx, x));
-        auto x_cat = ggml_concat(ctx, x, x1, 2);
+        auto x_cat = ggml_concat(ctx->ggml_ctx, x, x1, 2);
         auto x2    = lrelu(ctx, conv2->forward(ctx, x_cat));
-        x_cat      = ggml_concat(ctx, x_cat, x2, 2);
+        x_cat      = ggml_concat(ctx->ggml_ctx, x_cat, x2, 2);
         auto x3    = lrelu(ctx, conv3->forward(ctx, x_cat));
-        x_cat      = ggml_concat(ctx, x_cat, x3, 2);
+        x_cat      = ggml_concat(ctx->ggml_ctx, x_cat, x3, 2);
         auto x4    = lrelu(ctx, conv4->forward(ctx, x_cat));
-        x_cat      = ggml_concat(ctx, x_cat, x4, 2);
+        x_cat      = ggml_concat(ctx->ggml_ctx, x_cat, x4, 2);
         auto x5    = conv5->forward(ctx, x_cat);
-        x5         = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);
+        x5         = ggml_add(ctx->ggml_ctx, ggml_scale(ctx->ggml_ctx, x5, 0.2f), x);
         return x5;
     }
 };
@@ -64,7 +64,7 @@ public:
         blocks["rdb3"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
     }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
         // x: [n, num_feat, h, w]
         // return: [n, num_feat, h, w]
@@ -76,46 +76,51 @@ public:
         out = rdb2->forward(ctx, out);
         out = rdb3->forward(ctx, out);
-        out = ggml_add(ctx, ggml_scale(ctx, out, 0.2f), x);
+        out = ggml_add(ctx->ggml_ctx, ggml_scale(ctx->ggml_ctx, out, 0.2f), x);
         return out;
     }
 };

 class RRDBNet : public GGMLBlock {
 protected:
-    int scale       = 4;  // default RealESRGAN_x4plus_anime_6B
-    int num_block   = 6;  // default RealESRGAN_x4plus_anime_6B
+    int scale       = 4;
+    int num_block   = 23;
     int num_in_ch   = 3;
     int num_out_ch  = 3;
-    int num_feat    = 64;  // default RealESRGAN_x4plus_anime_6B
-    int num_grow_ch = 32;  // default RealESRGAN_x4plus_anime_6B
+    int num_feat    = 64;
+    int num_grow_ch = 32;

 public:
-    RRDBNet() {
+    RRDBNet(int scale, int num_block, int num_in_ch, int num_out_ch, int num_feat, int num_grow_ch)
+        : scale(scale), num_block(num_block), num_in_ch(num_in_ch), num_out_ch(num_out_ch), num_feat(num_feat), num_grow_ch(num_grow_ch) {
         blocks["conv_first"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
         for (int i = 0; i < num_block; i++) {
             std::string name = "body." + std::to_string(i);
             blocks[name]     = std::shared_ptr<GGMLBlock>(new RRDB(num_feat, num_grow_ch));
         }
         blocks["conv_body"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
-        // upsample
-        blocks["conv_up1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
-        blocks["conv_up2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
+        if (scale >= 2) {
+            blocks["conv_up1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
+        }
+        if (scale == 4) {
+            blocks["conv_up2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
+        }
         blocks["conv_hr"]   = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
         blocks["conv_last"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1}));
     }

-    struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
-        return ggml_leaky_relu(ctx, x, 0.2f, true);
+    int get_scale() { return scale; }
+    int get_num_block() { return num_block; }
+
+    struct ggml_tensor* lrelu(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+        return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true);
     }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
         // x: [n, num_in_ch, h, w]
-        // return: [n, num_out_ch, h*4, w*4]
+        // return: [n, num_out_ch, h*scale, w*scale]
         auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]);
         auto conv_body  = std::dynamic_pointer_cast<Conv2d>(blocks["conv_body"]);
-        auto conv_up1   = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
-        auto conv_up2   = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
         auto conv_hr    = std::dynamic_pointer_cast<Conv2d>(blocks["conv_hr"]);
         auto conv_last  = std::dynamic_pointer_cast<Conv2d>(blocks["conv_last"]);
@@ -128,69 +133,235 @@ public:
             body_feat = block->forward(ctx, body_feat);
         }
         body_feat = conv_body->forward(ctx, body_feat);
-        feat      = ggml_add(ctx, feat, body_feat);
+        feat      = ggml_add(ctx->ggml_ctx, feat, body_feat);

         // upsample
-        feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
-        feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
+        if (scale >= 2) {
+            auto conv_up1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
+            feat          = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
+            if (scale == 4) {
+                auto conv_up2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
+                feat          = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
+            }
+        }
+        // for all scales
         auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
         return out;
     }
 };

 struct ESRGAN : public GGMLRunner {
-    RRDBNet rrdb_net;
+    std::unique_ptr<RRDBNet> rrdb_net;
     int scale     = 4;
     int tile_size = 128;  // avoid cuda OOM for 4gb VRAM

-    ESRGAN(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
-        : GGMLRunner(backend) {
-        rrdb_net.init(params_ctx, tensor_types, "");
+    ESRGAN(ggml_backend_t backend,
+           bool offload_params_to_cpu,
+           int tile_size                                  = 128,
+           const String2TensorStorage& tensor_storage_map = {})
+        : GGMLRunner(backend, offload_params_to_cpu) {
+        this->tile_size = tile_size;
     }

-    std::string get_desc() {
+    std::string get_desc() override {
         return "esrgan";
     }

-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading esrgan from '%s'", file_path.c_str());

-        alloc_params_buffer();
-        std::map<std::string, ggml_tensor*> esrgan_tensors;
-        rrdb_net.get_param_tensors(esrgan_tensors);
-
         ModelLoader model_loader;
-        if (!model_loader.init_from_file(file_path)) {
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
             LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
             return false;
         }

-        bool success = model_loader.load_tensors(esrgan_tensors, backend);
+        // Get tensor names
+        auto tensor_names = model_loader.get_tensor_names();
+
+        // Detect if it's ESRGAN format
+        bool is_ESRGAN = std::find(tensor_names.begin(), tensor_names.end(), "model.0.weight") != tensor_names.end();
+
+        // Detect parameters from tensor names
+        int detected_num_block = 0;
+        if (is_ESRGAN) {
+            for (const auto& name : tensor_names) {
+                if (name.find("model.1.sub.") == 0) {
+                    size_t first_dot = name.find('.', 12);
+                    if (first_dot != std::string::npos) {
+                        size_t second_dot = name.find('.', first_dot + 1);
+                        if (second_dot != std::string::npos && name.substr(first_dot + 1, 3) == "RDB") {
+                            try {
+                                int idx            = std::stoi(name.substr(12, first_dot - 12));
+                                detected_num_block = std::max(detected_num_block, idx + 1);
+                            } catch (...) {
+                            }
+                        }
+                    }
+                }
+            }
+        } else {
+            // Original format
+            for (const auto& name : tensor_names) {
+                if (name.find("body.") == 0) {
+                    size_t pos = name.find('.', 5);
+                    if (pos != std::string::npos) {
+                        try {
+                            int idx            = std::stoi(name.substr(5, pos - 5));
+                            detected_num_block = std::max(detected_num_block, idx + 1);
+                        } catch (...) {
+                        }
+                    }
+                }
+            }
+        }
+
+        int detected_scale = 4;  // default
+        if (is_ESRGAN) {
+            // For ESRGAN format, detect scale by highest model number
+            int max_model_num = 0;
+            for (const auto& name : tensor_names) {
+                if (name.find("model.") == 0) {
+                    size_t dot_pos = name.find('.', 6);
+                    if (dot_pos != std::string::npos) {
+                        try {
+                            int num       = std::stoi(name.substr(6, dot_pos - 6));
+                            max_model_num = std::max(max_model_num, num);
+                        } catch (...) {
+                        }
+                    }
+                }
+            }
+            if (max_model_num <= 4) {
+                detected_scale = 1;
+            } else if (max_model_num <= 7) {
+                detected_scale = 2;
+            } else {
+                detected_scale = 4;
+            }
+        } else {
+            // Original format
+            bool has_conv_up2 = std::any_of(tensor_names.begin(), tensor_names.end(), [](const std::string& name) {
+                return name == "conv_up2.weight";
+            });
+            bool has_conv_up1 = std::any_of(tensor_names.begin(), tensor_names.end(), [](const std::string& name) {
+                return name == "conv_up1.weight";
+            });
+            if (has_conv_up2) {
+                detected_scale = 4;
+            } else if (has_conv_up1) {
+                detected_scale = 2;
+            } else {
+                detected_scale = 1;
+            }
+        }
+
+        int detected_num_in_ch   = 3;
+        int detected_num_out_ch  = 3;
+        int detected_num_feat    = 64;
+        int detected_num_grow_ch = 32;
+
+        // Create RRDBNet with detected parameters
+        rrdb_net = std::make_unique<RRDBNet>(detected_scale, detected_num_block, detected_num_in_ch, detected_num_out_ch, detected_num_feat, detected_num_grow_ch);
+        rrdb_net->init(params_ctx, {}, "");
+
+        alloc_params_buffer();
+        std::map<std::string, ggml_tensor*> esrgan_tensors;
+        rrdb_net->get_param_tensors(esrgan_tensors);
+
+        bool success;
+        if (is_ESRGAN) {
+            // Build name mapping for ESRGAN format
+            std::map<std::string, std::string> expected_to_model;
+            expected_to_model["conv_first.weight"] = "model.0.weight";
+            expected_to_model["conv_first.bias"]   = "model.0.bias";
+            for (int i = 0; i < detected_num_block; i++) {
+                for (int j = 1; j <= 3; j++) {
+                    for (int k = 1; k <= 5; k++) {
+                        std::string expected_weight        = "body." + std::to_string(i) + ".rdb" + std::to_string(j) + ".conv" + std::to_string(k) + ".weight";
+                        std::string model_weight           = "model.1.sub." + std::to_string(i) + ".RDB" + std::to_string(j) + ".conv" + std::to_string(k) + ".0.weight";
+                        expected_to_model[expected_weight] = model_weight;
+                        std::string expected_bias          = "body." + std::to_string(i) + ".rdb" + std::to_string(j) + ".conv" + std::to_string(k) + ".bias";
+                        std::string model_bias             = "model.1.sub." + std::to_string(i) + ".RDB" + std::to_string(j) + ".conv" + std::to_string(k) + ".0.bias";
+                        expected_to_model[expected_bias]   = model_bias;
+                    }
+                }
+            }
+            if (detected_scale == 1) {
+                expected_to_model["conv_body.weight"] = "model.1.sub." + std::to_string(detected_num_block) + ".weight";
+                expected_to_model["conv_body.bias"]   = "model.1.sub." + std::to_string(detected_num_block) + ".bias";
+                expected_to_model["conv_hr.weight"]   = "model.2.weight";
+                expected_to_model["conv_hr.bias"]     = "model.2.bias";
+                expected_to_model["conv_last.weight"] = "model.4.weight";
+                expected_to_model["conv_last.bias"]   = "model.4.bias";
+            } else {
+                expected_to_model["conv_body.weight"] = "model.1.sub." + std::to_string(detected_num_block) + ".weight";
+                expected_to_model["conv_body.bias"]   = "model.1.sub." + std::to_string(detected_num_block) + ".bias";
+                if (detected_scale >= 2) {
+                    expected_to_model["conv_up1.weight"] = "model.3.weight";
+                    expected_to_model["conv_up1.bias"]   = "model.3.bias";
+                }
+                if (detected_scale == 4) {
+                    expected_to_model["conv_up2.weight"]  = "model.6.weight";
+                    expected_to_model["conv_up2.bias"]    = "model.6.bias";
+                    expected_to_model["conv_hr.weight"]   = "model.8.weight";
+                    expected_to_model["conv_hr.bias"]     = "model.8.bias";
+                    expected_to_model["conv_last.weight"] = "model.10.weight";
+                    expected_to_model["conv_last.bias"]   = "model.10.bias";
+                } else if (detected_scale == 2) {
+                    expected_to_model["conv_hr.weight"]   = "model.5.weight";
+                    expected_to_model["conv_hr.bias"]     = "model.5.bias";
+                    expected_to_model["conv_last.weight"] = "model.7.weight";
+                    expected_to_model["conv_last.bias"]   = "model.7.bias";
+                }
+            }
+            std::map<std::string, ggml_tensor*> model_tensors;
+            for (auto& p : esrgan_tensors) {
+                auto it = expected_to_model.find(p.first);
+                if (it != expected_to_model.end()) {
+                    model_tensors[it->second] = p.second;
+                }
+            }
+            success = model_loader.load_tensors(model_tensors, {}, n_threads);
+        } else {
+            success = model_loader.load_tensors(esrgan_tensors, {}, n_threads);
+        }
         if (!success) {
             LOG_ERROR("load esrgan tensors from model loader failed");
             return false;
         }

-        LOG_INFO("esrgan model loaded");
+        scale = rrdb_net->get_scale();
+        LOG_INFO("esrgan model loaded with scale=%d, num_block=%d", scale, detected_num_block);
         return success;
     }

     struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
-        struct ggml_cgraph* gf  = ggml_new_graph(compute_ctx);
-        x                       = to_backend(x);
-        struct ggml_tensor* out = rrdb_net.forward(compute_ctx, x);
+        if (!rrdb_net)
+            return nullptr;
+        constexpr int kGraphNodes = 1 << 16;  // 65k
+        struct ggml_cgraph* gf    = new_graph_custom(kGraphNodes);
+        x                         = to_backend(x);
+        auto runner_ctx           = get_context();
+        struct ggml_tensor* out   = rrdb_net->forward(&runner_ctx, x);
         ggml_build_forward_expand(gf, out);
         return gf;
     }

-    void compute(const int n_threads,
+    bool compute(const int n_threads,
                  struct ggml_tensor* x,
                  ggml_tensor** output,
-                 ggml_context* output_ctx = NULL) {
+                 ggml_context* output_ctx = nullptr) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
             return build_graph(x);
         };
-        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
     }
 };
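Per the new constructor and loader above, a hedged sketch of the calling convention. Assumptions: a ggml backend is available, `input_tensor` is an f32 image tensor prepared by the caller, and the model path is a placeholder.

```cpp
// Sketch only, not the project's actual upscaler wiring.
bool upscale_once(ggml_tensor* input_tensor, ggml_tensor** upscaled) {
    ggml_backend_t backend = ggml_backend_cpu_init();
    ESRGAN esrgan(backend, /*offload_params_to_cpu=*/false, /*tile_size=*/128);
    if (!esrgan.load_from_file("esrgan.safetensors", /*n_threads=*/4)) {
        return false;
    }
    // scale and num_block were auto-detected from the checkpoint's tensor names
    return esrgan.compute(/*n_threads=*/4, input_tensor, upscaled);
}
```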


@@ -3,4 +3,4 @@ set(TARGET sd)
 add_executable(${TARGET} main.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PUBLIC cxx_std_11)
+target_compile_features(${TARGET} PUBLIC c_std_11 cxx_std_17)

examples/cli/README.md Normal file

@@ -0,0 +1,128 @@
# Run
```
usage: ./bin/sd [options]
CLI Options:
-o, --output <string> path to write result image to (default: ./output.png)
--preview-path <string> path to write preview image to (default: ./preview.png)
--preview-interval <int> interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
every step)
--canny apply canny preprocessor (edge detection)
-v, --verbose print extra info
--color colors the logging tags according to level
--taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae)
--preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
-h, --help show this help message and exit
Context Options:
-m, --model <string> path to full model
--clip_l <string> path to the clip-l text encoder
--clip_g <string> path to the clip-g text encoder
--clip_vision <string> path to the clip-vision encoder
--t5xxl <string> path to the t5xxl text encoder
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
--llm_vision <string> path to the llm vit
--qwen2vl <string> alias of --llm. Deprecated.
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
--diffusion-model <string> path to the standalone diffusion model
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model
--vae <string> path to standalone vae model
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--control-net <string> path to control net model
--embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--photo-maker <string> path to PHOTOMAKER model
--upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
--control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--vae-on-cpu keep vae in cpu (for low vram)
--diffusion-fa use flash attention in the diffusion model
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
--vae-conv-direct use ggml_conv2d_direct in the vae model
--chroma-disable-dit-mask disable dit mask for chroma
--chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
contain any quantized parameters, the at_runtime mode will be used; otherwise,
immediately will be used. The immediately mode may have precision and
compatibility issues with quantized parameters, but it usually offers faster inference
speed and, in some cases, lower memory usage. The at_runtime mode, on the
other hand, is exactly the opposite.
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
(overrides --vae-tile-size)
Generation Options:
-p, --prompt <string> the prompt to render
-n, --negative-prompt <string> the negative prompt (default: "")
-i, --init-img <string> path to the init image
--end-img <string> path to the end image, required by flf2v
--mask <string> path to the mask image
--control-image <string> path to control image, control net
--control-video <string> path to control video frames. It must be a directory path. The video frames inside should be stored as images in
lexicographical (character) order. For example, if the control video path is
`frames`, the directory contains images such as 00.png, 01.png, ... etc.
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
-H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20)
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count <int> batch count
--video-frames <int> video frames (default: 1)
--fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
medium
--skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> eta in DDIM, only for DDIM and TCD (default: 0)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
--high-noise-eta <float> (high noise) eta in DDIM, only for DDIM and TCD (default: 0)
--strength <float> strength for noising/unnoising (default: 0.75)
--pm-style-strength <float>
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
--vace-strength <float> wan vace strength
--increase-ref-index automatically increase the indices of reference images based on the order they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images
-s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm],
default: discrete
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--easycache enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95)
```
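As a minimal illustration of the options above, a plain txt2img invocation might look like this (model path and output name are placeholders):
```
./bin/sd -m models/sd-v1-4.ckpt -p "a lovely cat" -o ./cat.png -W 512 -H 512 --steps 20
```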

examples/cli/avi_writer.h Normal file

@@ -0,0 +1,217 @@
#ifndef __AVI_WRITER_H__
#define __AVI_WRITER_H__
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "stable-diffusion.h"
#ifndef INCLUDE_STB_IMAGE_WRITE_H
#include "stb_image_write.h"
#endif
typedef struct {
uint32_t offset;
uint32_t size;
} avi_index_entry;
// Write 32-bit little-endian integer
void write_u32_le(FILE* f, uint32_t val) {
fwrite(&val, 4, 1, f);
}
// Write 16-bit little-endian integer
void write_u16_le(FILE* f, uint16_t val) {
fwrite(&val, 2, 1, f);
}
/**
* Create an MJPG AVI file from an array of sd_image_t images.
* Images are encoded to JPEG using stb_image_write.
*
* @param filename Output AVI file name.
* @param images Array of input images.
* @param num_images Number of images in the array.
* @param fps Frames per second for the video.
* @param quality JPEG quality (0-100).
* @return 0 on success, -1 on failure.
*/
int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality = 90) {
if (num_images == 0) {
fprintf(stderr, "Error: Image array is empty.\n");
return -1;
}
FILE* f = fopen(filename, "wb");
if (!f) {
perror("Error opening file for writing");
return -1;
}
uint32_t width = images[0].width;
uint32_t height = images[0].height;
uint32_t channels = images[0].channel;
if (channels != 3 && channels != 4) {
fprintf(stderr, "Error: Unsupported channel count: %u\n", channels);
fclose(f);
return -1;
}
// --- RIFF AVI Header ---
fwrite("RIFF", 4, 1, f);
long riff_size_pos = ftell(f);
write_u32_le(f, 0); // Placeholder for file size
fwrite("AVI ", 4, 1, f);
// 'hdrl' LIST (header list)
fwrite("LIST", 4, 1, f);
write_u32_le(f, 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40);
fwrite("hdrl", 4, 1, f);
// 'avih' chunk (AVI main header)
fwrite("avih", 4, 1, f);
write_u32_le(f, 56);
write_u32_le(f, 1000000 / fps); // Microseconds per frame
write_u32_le(f, 0); // Max bytes per second
write_u32_le(f, 0); // Padding granularity
write_u32_le(f, 0x110); // Flags (HASINDEX | ISINTERLEAVED)
write_u32_le(f, num_images); // Total frames
write_u32_le(f, 0); // Initial frames
write_u32_le(f, 1); // Number of streams
write_u32_le(f, width * height * 3); // Suggested buffer size
write_u32_le(f, width);
write_u32_le(f, height);
write_u32_le(f, 0); // Reserved
write_u32_le(f, 0); // Reserved
write_u32_le(f, 0); // Reserved
write_u32_le(f, 0); // Reserved
// 'strl' LIST (stream list)
fwrite("LIST", 4, 1, f);
write_u32_le(f, 4 + 8 + 56 + 8 + 40);
fwrite("strl", 4, 1, f);
// 'strh' chunk (stream header)
fwrite("strh", 4, 1, f);
write_u32_le(f, 56);
fwrite("vids", 4, 1, f); // Stream type: video
fwrite("MJPG", 4, 1, f); // Codec: Motion JPEG
write_u32_le(f, 0); // Flags
write_u16_le(f, 0); // Priority
write_u16_le(f, 0); // Language
write_u32_le(f, 0); // Initial frames
write_u32_le(f, 1); // Scale
write_u32_le(f, fps); // Rate
write_u32_le(f, 0); // Start
write_u32_le(f, num_images); // Length
write_u32_le(f, width * height * 3); // Suggested buffer size
write_u32_le(f, (uint32_t)-1); // Quality
write_u32_le(f, 0); // Sample size
write_u16_le(f, 0); // rcFrame.left
write_u16_le(f, 0); // rcFrame.top
write_u16_le(f, 0); // rcFrame.right
write_u16_le(f, 0); // rcFrame.bottom
// 'strf' chunk (stream format: BITMAPINFOHEADER)
fwrite("strf", 4, 1, f);
write_u32_le(f, 40);
write_u32_le(f, 40); // biSize
write_u32_le(f, width);
write_u32_le(f, height);
write_u16_le(f, 1); // biPlanes
write_u16_le(f, 24); // biBitCount
fwrite("MJPG", 4, 1, f); // biCompression (FOURCC)
write_u32_le(f, width * height * 3); // biSizeImage
write_u32_le(f, 0); // XPelsPerMeter
write_u32_le(f, 0); // YPelsPerMeter
write_u32_le(f, 0); // Colors used
write_u32_le(f, 0); // Colors important
// 'movi' LIST (video frames)
// long movi_list_pos = ftell(f);
fwrite("LIST", 4, 1, f);
long movi_size_pos = ftell(f);
write_u32_le(f, 0); // Placeholder for movi size
fwrite("movi", 4, 1, f);
avi_index_entry* index = (avi_index_entry*)malloc(sizeof(avi_index_entry) * num_images);
if (!index) {
fclose(f);
return -1;
}
// Encode and write each frame as JPEG
struct {
uint8_t* buf;
size_t size;
} jpeg_data;
for (int i = 0; i < num_images; i++) {
jpeg_data.buf = nullptr;
jpeg_data.size = 0;
// Callback function to collect JPEG data into memory
auto write_to_buf = [](void* context, void* data, int size) {
auto jd = (decltype(jpeg_data)*)context;
jd->buf = (uint8_t*)realloc(jd->buf, jd->size + size);
memcpy(jd->buf + jd->size, data, size);
jd->size += size;
};
// Encode to JPEG in memory
stbi_write_jpg_to_func(
write_to_buf,
&jpeg_data,
images[i].width,
images[i].height,
channels,
images[i].data,
quality);
// Write '00dc' chunk (video frame)
fwrite("00dc", 4, 1, f);
write_u32_le(f, jpeg_data.size);
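// Record the chunk position for the idx1 index. Offsets are stored here as
// absolute file positions; the AVI spec nominally wants them relative to the
// 'movi' list, but common players accept either form.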
index[i].offset = ftell(f) - 8;
index[i].size = jpeg_data.size;
fwrite(jpeg_data.buf, 1, jpeg_data.size, f);
// Align to even byte size
if (jpeg_data.size % 2)
fputc(0, f);
free(jpeg_data.buf);
}
// Finalize 'movi' size
long cur_pos = ftell(f);
long movi_size = cur_pos - movi_size_pos - 4;
fseek(f, movi_size_pos, SEEK_SET);
write_u32_le(f, movi_size);
fseek(f, cur_pos, SEEK_SET);
// Write 'idx1' index
fwrite("idx1", 4, 1, f);
write_u32_le(f, num_images * 16);
for (int i = 0; i < num_images; i++) {
fwrite("00dc", 4, 1, f);
write_u32_le(f, 0x10);
write_u32_le(f, index[i].offset);
write_u32_le(f, index[i].size);
}
// Finalize RIFF size
cur_pos = ftell(f);
long file_size = cur_pos - riff_size_pos - 4;
fseek(f, riff_size_pos, SEEK_SET);
write_u32_le(f, file_size);
fseek(f, cur_pos, SEEK_SET);
fclose(f);
free(index);
return 0;
}
#endif // __AVI_WRITER_H__
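
For orientation, here is roughly how a caller might drive the writer above. The entry-point name write_avi_mjpg and the frame_t struct are assumptions for illustration; only the field names (width, height, channel, data) and the fps/quality parameters are taken from the code itself.

// Hypothetical caller for the MJPG-in-AVI writer above; names are illustrative.
#include <cstdint>
#include <vector>

struct frame_t {
    uint32_t width;
    uint32_t height;
    uint32_t channel;  // must be 3 (RGB) or 4 (RGBA), as validated by the writer
    uint8_t* data;     // interleaved 8-bit pixels
};

// assumed signature: returns 0 on success, -1 on failure
int write_avi_mjpg(const char* path, const frame_t* images, int num_images, int fps, int quality);

int save_video(const std::vector<frame_t>& frames) {
    // 16 fps and JPEG quality 90 feed straight into the avih/strh headers above
    return write_avi_mjpg("output.avi", frames.data(), (int)frames.size(), 16, 90);
}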

File diff suppressed because it is too large

flux.hpp (1409 changes)

File diff suppressed because it is too large


@@ -1,2 +1,8 @@
-clang-format -style=file -i *.cpp *.h *.hpp
-clang-format -style=file -i examples/cli/*.cpp
+for f in *.cpp *.h *.hpp examples/cli/*.cpp examples/cli/*.h; do
+    [[ "$f" == vocab* ]] && continue
+    echo "formatting '$f'"
+    # if [ "$f" != "stable-diffusion.h" ]; then
+    #     clang-tidy -fix -p build_linux/ "$f"
+    # fi
+    clang-format -style=file -i "$f"
+done

ggml (submodule, 2 changes)

@@ -1 +1 @@
-Subproject commit 9e4bee1c5afc2d677a5b32ecb90cbdb483e81fff
+Subproject commit 2d3876d554551d35c06dccc5852be50d5fd2a275

File diff suppressed because it is too large

gguf_reader.hpp (new file, 231 lines)

@@ -0,0 +1,231 @@
#ifndef __GGUF_READER_HPP__
#define __GGUF_READER_HPP__
#include <cstdint>
#include <cstring>
#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>
#include "ggml.h"
#include "util.h"
struct GGUFTensorInfo {
std::string name;
ggml_type type;
std::vector<int64_t> shape;
size_t offset;
};
enum class GGUFMetadataType : uint32_t {
UINT8 = 0,
INT8 = 1,
UINT16 = 2,
INT16 = 3,
UINT32 = 4,
INT32 = 5,
FLOAT32 = 6,
BOOL = 7,
STRING = 8,
ARRAY = 9,
UINT64 = 10,
INT64 = 11,
FLOAT64 = 12,
};
class GGUFReader {
private:
std::vector<GGUFTensorInfo> tensors_;
size_t data_offset_;
size_t alignment_ = 32; // default alignment is 32
template <typename T>
bool safe_read(std::ifstream& fin, T& value) {
fin.read(reinterpret_cast<char*>(&value), sizeof(T));
return fin.good();
}
bool safe_read(std::ifstream& fin, char* buffer, size_t size) {
fin.read(buffer, size);
return fin.good();
}
bool safe_seek(std::ifstream& fin, std::streamoff offset, std::ios::seekdir dir) {
fin.seekg(offset, dir);
return fin.good();
}
bool read_metadata(std::ifstream& fin) {
uint64_t key_len = 0;
if (!safe_read(fin, key_len))
return false;
std::string key(key_len, '\0');
if (!safe_read(fin, (char*)key.data(), key_len))
return false;
uint32_t type = 0;
if (!safe_read(fin, type))
return false;
if (key == "general.alignment") {
uint32_t align_val = 0;
if (!safe_read(fin, align_val))
return false;
if (align_val != 0 && (align_val & (align_val - 1)) == 0) {
alignment_ = align_val;
LOG_DEBUG("Found alignment: %zu", alignment_);
} else {
LOG_ERROR("Invalid alignment value %u, fallback to default %zu", align_val, alignment_);
}
return true;
}
switch (static_cast<GGUFMetadataType>(type)) {
case GGUFMetadataType::UINT8:
case GGUFMetadataType::INT8:
case GGUFMetadataType::BOOL:
return safe_seek(fin, 1, std::ios::cur);
case GGUFMetadataType::UINT16:
case GGUFMetadataType::INT16:
return safe_seek(fin, 2, std::ios::cur);
case GGUFMetadataType::UINT32:
case GGUFMetadataType::INT32:
case GGUFMetadataType::FLOAT32:
return safe_seek(fin, 4, std::ios::cur);
case GGUFMetadataType::UINT64:
case GGUFMetadataType::INT64:
case GGUFMetadataType::FLOAT64:
return safe_seek(fin, 8, std::ios::cur);
case GGUFMetadataType::STRING: {
uint64_t len = 0;
if (!safe_read(fin, len))
return false;
return safe_seek(fin, len, std::ios::cur);
}
case GGUFMetadataType::ARRAY: {
uint32_t elem_type = 0;
uint64_t len = 0;
if (!safe_read(fin, elem_type))
return false;
if (!safe_read(fin, len))
return false;
for (uint64_t i = 0; i < len; i++) {
if (!read_metadata(fin))
return false;
}
return true;
}
default:
LOG_ERROR("Unknown metadata type=%u", type);
return false;
}
}
GGUFTensorInfo read_tensor_info(std::ifstream& fin) {
GGUFTensorInfo info;
uint64_t name_len;
if (!safe_read(fin, name_len))
throw std::runtime_error("read tensor name length failed");
info.name.resize(name_len);
if (!safe_read(fin, (char*)info.name.data(), name_len))
throw std::runtime_error("read tensor name failed");
uint32_t n_dims;
if (!safe_read(fin, n_dims))
throw std::runtime_error("read tensor dims failed");
info.shape.resize(n_dims);
for (uint32_t i = 0; i < n_dims; i++) {
if (!safe_read(fin, info.shape[i]))
throw std::runtime_error("read tensor shape failed");
}
if (n_dims > GGML_MAX_DIMS) {
for (uint32_t i = GGML_MAX_DIMS; i < n_dims; i++) {
info.shape[GGML_MAX_DIMS - 1] *= info.shape[i]; // fold any extra dims into the last dim
}
info.shape.resize(GGML_MAX_DIMS);
n_dims = GGML_MAX_DIMS;
}
uint32_t type;
if (!safe_read(fin, type))
throw std::runtime_error("read tensor type failed");
info.type = static_cast<ggml_type>(type);
if (!safe_read(fin, info.offset))
throw std::runtime_error("read tensor offset failed");
return info;
}
public:
bool load(const std::string& file_path) {
std::ifstream fin(file_path, std::ios::binary);
if (!fin) {
LOG_ERROR("failed to open '%s'", file_path.c_str());
return false;
}
// --- Header ---
char magic[4];
if (!safe_read(fin, magic, 4) || strncmp(magic, "GGUF", 4) != 0) {
LOG_ERROR("not a valid GGUF file");
return false;
}
uint32_t version;
if (!safe_read(fin, version))
return false;
uint64_t tensor_count, metadata_kv_count;
if (!safe_read(fin, tensor_count))
return false;
if (!safe_read(fin, metadata_kv_count))
return false;
LOG_DEBUG("GGUF v%u, tensor_count=%llu, metadata_kv_count=%llu",
version, (unsigned long long)tensor_count, (unsigned long long)metadata_kv_count);
// --- Read Metadata ---
for (uint64_t i = 0; i < metadata_kv_count; i++) {
if (!read_metadata(fin)) {
LOG_ERROR("read meta data failed");
return false;
}
}
// --- Tensor Infos ---
tensors_.clear();
try {
for (uint64_t i = 0; i < tensor_count; i++) {
tensors_.push_back(read_tensor_info(fin));
}
} catch (const std::runtime_error& e) {
LOG_ERROR("%s", e.what());
return false;
}
data_offset_ = static_cast<size_t>(fin.tellg());
if ((data_offset_ % alignment_) != 0) {
data_offset_ = ((data_offset_ + alignment_ - 1) / alignment_) * alignment_;
}
fin.close();
return true;
}
const std::vector<GGUFTensorInfo>& tensors() const { return tensors_; }
size_t data_offset() const { return data_offset_; }
};
#endif // __GGUF_READER_HPP__
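
GGUFReader above only indexes metadata: no tensor data is read, and each tensor's payload begins at data_offset() + info.offset within the file. A minimal usage sketch, assuming this header and ggml are on the include path:

// List the tensors of a .gguf file using the reader above.
#include <cstdio>
#include "gguf_reader.hpp"

int main(int argc, char** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }
    GGUFReader reader;
    if (!reader.load(argv[1])) {
        return 1;  // load() already logs why it failed
    }
    for (const GGUFTensorInfo& info : reader.tensors()) {
        // absolute file position of this tensor's data
        size_t pos = reader.data_offset() + info.offset;
        printf("%-48s %-8s @ %zu\n", info.name.c_str(), ggml_type_name(info.type), pos);
    }
    return 0;
}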

latent-preview.h (new file, 234 lines)

@@ -0,0 +1,234 @@
#ifndef __LATENT_PREVIEW_H__
#define __LATENT_PREVIEW_H__
#include <cstddef>
#include <cstdint>
#include "ggml.h"
const float wan_21_latent_rgb_proj[16][3] = {
{0.015123f, -0.148418f, 0.479828f},
{0.003652f, -0.010680f, -0.037142f},
{0.212264f, 0.063033f, 0.016779f},
{0.232999f, 0.406476f, 0.220125f},
{-0.051864f, -0.082384f, -0.069396f},
{0.085005f, -0.161492f, 0.010689f},
{-0.245369f, -0.506846f, -0.117010f},
{-0.151145f, 0.017721f, 0.007207f},
{-0.293239f, -0.207936f, -0.421135f},
{-0.187721f, 0.050783f, 0.177649f},
{-0.013067f, 0.265964f, 0.166578f},
{0.028327f, 0.109329f, 0.108642f},
{-0.205343f, 0.043991f, 0.148914f},
{0.014307f, -0.048647f, -0.007219f},
{0.217150f, 0.053074f, 0.319923f},
{0.155357f, 0.083156f, 0.064780f}};
float wan_21_latent_rgb_bias[3] = {-0.270270f, -0.234976f, -0.456853f};
const float wan_22_latent_rgb_proj[48][3] = {
{0.017126f, -0.027230f, -0.019257f},
{-0.113739f, -0.028715f, -0.022885f},
{-0.000106f, 0.021494f, 0.004629f},
{-0.013273f, -0.107137f, -0.033638f},
{-0.000381f, 0.000279f, 0.025877f},
{-0.014216f, -0.003975f, 0.040528f},
{0.001638f, -0.000748f, 0.011022f},
{0.029238f, -0.006697f, 0.035933f},
{0.021641f, -0.015874f, 0.040531f},
{-0.101984f, -0.070160f, -0.028855f},
{0.033207f, -0.021068f, 0.002663f},
{-0.104711f, 0.121673f, 0.102981f},
{0.082647f, -0.004991f, 0.057237f},
{-0.027375f, 0.031581f, 0.006868f},
{-0.045434f, 0.029444f, 0.019287f},
{-0.046572f, -0.012537f, 0.006675f},
{0.074709f, 0.033690f, 0.025289f},
{-0.008251f, -0.002745f, -0.006999f},
{0.012685f, -0.061856f, -0.048658f},
{0.042304f, -0.007039f, 0.000295f},
{-0.007644f, -0.060843f, -0.033142f},
{0.159909f, 0.045628f, 0.367541f},
{0.095171f, 0.086438f, 0.010271f},
{0.006812f, 0.019643f, 0.029637f},
{0.003467f, -0.010705f, 0.014252f},
{-0.099681f, -0.066272f, -0.006243f},
{0.047357f, 0.037040f, 0.000185f},
{-0.041797f, -0.089225f, -0.032257f},
{0.008928f, 0.017028f, 0.018684f},
{-0.042255f, 0.016045f, 0.006849f},
{0.011268f, 0.036462f, 0.037387f},
{0.011553f, -0.016375f, -0.048589f},
{0.046266f, -0.027189f, 0.056979f},
{0.009640f, -0.017576f, 0.030324f},
{-0.045794f, -0.036083f, -0.010616f},
{0.022418f, 0.039783f, -0.032939f},
{-0.052714f, -0.015525f, 0.007438f},
{0.193004f, 0.223541f, 0.264175f},
{-0.059406f, -0.008188f, 0.022867f},
{-0.156742f, -0.263791f, -0.007385f},
{-0.015717f, 0.016570f, 0.033969f},
{0.037969f, 0.109835f, 0.200449f},
{-0.000782f, -0.009566f, -0.008058f},
{0.010709f, 0.052960f, -0.044195f},
{0.017271f, 0.045839f, 0.034569f},
{0.009424f, 0.013088f, -0.001714f},
{-0.024805f, -0.059378f, -0.033756f},
{-0.078293f, 0.029070f, 0.026129f}};
float wan_22_latent_rgb_bias[3] = {0.013160f, -0.096492f, -0.071323f};
const float flux_latent_rgb_proj[16][3] = {
{-0.041168f, 0.019917f, 0.097253f},
{0.028096f, 0.026730f, 0.129576f},
{0.065618f, -0.067950f, -0.014651f},
{-0.012998f, -0.014762f, 0.081251f},
{0.078567f, 0.059296f, -0.024687f},
{-0.015987f, -0.003697f, 0.005012f},
{0.033605f, 0.138999f, 0.068517f},
{-0.024450f, -0.063567f, -0.030101f},
{-0.040194f, -0.016710f, 0.127185f},
{0.112681f, 0.088764f, -0.041940f},
{-0.023498f, 0.093664f, 0.025543f},
{0.082899f, 0.048320f, 0.007491f},
{0.075712f, 0.074139f, 0.081965f},
{-0.143501f, 0.018263f, -0.136138f},
{-0.025767f, -0.082035f, -0.040023f},
{-0.111849f, -0.055589f, -0.032361f}};
float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};
const float flux2_latent_rgb_proj[32][3] = {
{0.000736f, -0.008385f, -0.019710f},
{-0.001352f, -0.016392f, 0.020693f},
{-0.006376f, 0.002428f, 0.036736f},
{0.039384f, 0.074167f, 0.119789f},
{0.007464f, -0.005705f, -0.004734f},
{-0.004086f, 0.005287f, -0.000409f},
{-0.032835f, 0.050802f, -0.028120f},
{-0.003158f, -0.000835f, 0.000406f},
{-0.112840f, -0.084337f, -0.023083f},
{0.001462f, -0.006656f, 0.000549f},
{-0.009980f, -0.007480f, 0.009702f},
{0.032540f, 0.000214f, -0.061388f},
{0.011023f, 0.000694f, 0.007143f},
{-0.001468f, -0.006723f, -0.001678f},
{-0.005921f, -0.010320f, -0.003907f},
{-0.028434f, 0.027584f, 0.018457f},
{0.014349f, 0.011523f, 0.000441f},
{0.009874f, 0.003081f, 0.001507f},
{0.002218f, 0.005712f, 0.001563f},
{0.053010f, -0.019844f, 0.008683f},
{-0.002507f, 0.005384f, 0.000938f},
{-0.002177f, -0.011366f, 0.003559f},
{-0.000261f, 0.015121f, -0.003240f},
{-0.003944f, -0.002083f, 0.005043f},
{-0.009138f, 0.011336f, 0.003781f},
{0.011429f, 0.003985f, -0.003855f},
{0.010518f, -0.005586f, 0.010131f},
{0.007883f, 0.002912f, -0.001473f},
{-0.003318f, -0.003160f, 0.003684f},
{-0.034560f, -0.008740f, 0.012996f},
{0.000166f, 0.001079f, -0.012153f},
{0.017772f, 0.000937f, -0.011953f}};
float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f};
// This one was taken straight from
// https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
// (MiT Licence)
const float sd3_latent_rgb_proj[16][3] = {
{-0.0645f, 0.0177f, 0.1052f},
{0.0028f, 0.0312f, 0.0650f},
{0.1848f, 0.0762f, 0.0360f},
{0.0944f, 0.0360f, 0.0889f},
{0.0897f, 0.0506f, -0.0364f},
{-0.0020f, 0.1203f, 0.0284f},
{0.0855f, 0.0118f, 0.0283f},
{-0.0539f, 0.0658f, 0.1047f},
{-0.0057f, 0.0116f, 0.0700f},
{-0.0412f, 0.0281f, -0.0039f},
{0.1106f, 0.1171f, 0.1220f},
{-0.0248f, 0.0682f, -0.0481f},
{0.0815f, 0.0846f, 0.1207f},
{-0.0120f, -0.0055f, -0.0867f},
{-0.0749f, -0.0634f, -0.0456f},
{-0.1418f, -0.1457f, -0.1259f},
};
float sd3_latent_rgb_bias[3] = {0, 0, 0};
const float sdxl_latent_rgb_proj[4][3] = {
{0.258303f, 0.277640f, 0.329699f},
{-0.299701f, 0.105446f, 0.014194f},
{0.050522f, 0.186163f, -0.143257f},
{-0.211938f, -0.149892f, -0.080036f}};
float sdxl_latent_rgb_bias[3] = {0.144381f, -0.033313f, 0.007061f};
const float sd_latent_rgb_proj[4][3] = {
{0.337366f, 0.216344f, 0.257386f},
{0.165636f, 0.386828f, 0.046994f},
{-0.267803f, 0.237036f, 0.223517f},
{-0.178022f, -0.200862f, -0.678514f}};
float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};
void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) {
size_t buffer_head = 0;
uint32_t latent_width = latents->ne[0];
uint32_t latent_height = latents->ne[1];
uint32_t dim = latents->ne[ggml_n_dims(latents) - 1];
uint32_t frames = 1;
if (ggml_n_dims(latents) == 4) {
frames = latents->ne[2];
}
uint32_t rgb_width = latent_width * patch_size;
uint32_t rgb_height = latent_height * patch_size;
uint32_t unpatched_dim = dim / (patch_size * patch_size);
for (int k = 0; k < frames; k++) {
for (int rgb_x = 0; rgb_x < rgb_width; rgb_x++) {
for (int rgb_y = 0; rgb_y < rgb_height; rgb_y++) {
int latent_x = rgb_x / patch_size;
int latent_y = rgb_y / patch_size;
int channel_offset = 0;
if (patch_size > 1) {
channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size));
}
size_t latent_id = (latent_x * latents->nb[0] + latent_y * latents->nb[1] + k * latents->nb[2]);
// linear index of the destination RGB pixel; advances by one per pixel within a frame
size_t pixel_id = k * rgb_width * rgb_height + rgb_y * rgb_width + rgb_x;
float r = 0, g = 0, b = 0;
if (latent_rgb_proj != nullptr) {
for (int d = 0; d < unpatched_dim; d++) {
float value = *(float*)((char*)latents->data + latent_id + (d * patch_size * patch_size + channel_offset) * latents->nb[ggml_n_dims(latents) - 1]);
r += value * latent_rgb_proj[d][0];
g += value * latent_rgb_proj[d][1];
b += value * latent_rgb_proj[d][2];
}
} else {
// interpret first 3 channels as RGB
r = *(float*)((char*)latents->data + latent_id + 0 * latents->nb[ggml_n_dims(latents) - 1]);
g = *(float*)((char*)latents->data + latent_id + 1 * latents->nb[ggml_n_dims(latents) - 1]);
b = *(float*)((char*)latents->data + latent_id + 2 * latents->nb[ggml_n_dims(latents) - 1]);
}
if (latent_rgb_bias != nullptr) {
// bias
r += latent_rgb_bias[0];
g += latent_rgb_bias[1];
b += latent_rgb_bias[2];
}
// change range
r = r * .5f + .5f;
g = g * .5f + .5f;
b = b * .5f + .5f;
// clamp rgb values to [0,1] range
r = r >= 0 ? r <= 1 ? r : 1 : 0;
g = g >= 0 ? g <= 1 ? g : 1 : 0;
b = b >= 0 ? b <= 1 ? b : 1 : 0;
buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255);
buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255);
buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255);
}
}
}
}
#endif // __LATENT_PREVIEW_H__
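
In the function above each output pixel is an affine map over the latent channels, rgb = proj^T * z + bias, then rescaled from [-1, 1] to [0, 255]. The caller must size buffer as frames * (latent_width * patch_size) * (latent_height * patch_size) * 3 bytes. A hedged sketch for an SD1-style 4-channel latent (patch_size 1), assuming this header is included:

// Sketch: render an SD1-style f32 latent ([W, H, C] or [W, H, F, C]) to RGB bytes.
#include <cstdint>
#include <vector>

std::vector<uint8_t> preview_sd_latent(struct ggml_tensor* latents) {
    const int patch_size = 1;  // SD1 latents are not patched
    uint32_t frames = (ggml_n_dims(latents) == 4) ? (uint32_t)latents->ne[2] : 1;
    uint32_t w = (uint32_t)latents->ne[0] * patch_size;
    uint32_t h = (uint32_t)latents->ne[1] * patch_size;
    std::vector<uint8_t> rgb((size_t)frames * w * h * 3);  // 3 bytes per pixel
    preview_latent_video(rgb.data(), latents, sd_latent_rgb_proj, sd_latent_rgb_bias, patch_size);
    return rgb;
}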

llm.hpp (new file, 1669 lines)

File diff suppressed because it is too large

lora.hpp (1487 changes)

File diff suppressed because it is too large

ltxv.hpp (new file, 74 lines)

@@ -0,0 +1,74 @@
#ifndef __LTXV_HPP__
#define __LTXV_HPP__
#include "common.hpp"
#include "ggml_extend.hpp"
namespace LTXV {
class CausalConv3d : public GGMLBlock {
protected:
int time_kernel_size;
public:
CausalConv3d(int64_t in_channels,
int64_t out_channels,
int kernel_size = 3,
std::tuple<int, int, int> stride = {1, 1, 1},
int dilation = 1,
bool bias = true) {
time_kernel_size = kernel_size / 2;
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels,
out_channels,
{kernel_size, kernel_size, kernel_size},
stride,
{0, kernel_size / 2, kernel_size / 2},
{dilation, 1, 1},
bias));
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
bool causal = true) {
// x: [N*IC, ID, IH, IW]
// result: [N*OC, OD, OH, OW]
auto conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv"]);
if (causal) {
auto h = ggml_cont(ctx, ggml_permute(ctx, x, 0, 1, 3, 2)); // [ID, N*IC, IH, IW]
auto first_frame = ggml_view_3d(ctx, h, h->ne[0], h->ne[1], h->ne[2], h->nb[1], h->nb[2], 0); // [N*IC, IH, IW]
first_frame = ggml_reshape_4d(ctx, first_frame, first_frame->ne[0], first_frame->ne[1], 1, first_frame->ne[2]); // [N*IC, 1, IH, IW]
auto first_frame_pad = first_frame;
for (int i = 1; i < time_kernel_size - 1; i++) {
first_frame_pad = ggml_concat(ctx, first_frame_pad, first_frame, 2);
}
x = ggml_concat(ctx, first_frame_pad, x, 2);
} else {
auto h = ggml_cont(ctx, ggml_permute(ctx, x, 0, 1, 3, 2)); // [ID, N*IC, IH, IW]
int64_t offset = h->nb[2] * h->ne[2];
auto first_frame = ggml_view_3d(ctx, h, h->ne[0], h->ne[1], h->ne[2], h->nb[1], h->nb[2], 0); // [N*IC, IH, IW]
first_frame = ggml_reshape_4d(ctx, first_frame, first_frame->ne[0], first_frame->ne[1], 1, first_frame->ne[2]); // [N*IC, 1, IH, IW]
auto first_frame_pad = first_frame;
for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
first_frame_pad = ggml_concat(ctx, first_frame_pad, first_frame, 2);
}
auto last_frame = ggml_view_3d(ctx, h, h->ne[0], h->ne[1], h->ne[2], h->nb[1], h->nb[2], offset * (h->ne[3] - 1)); // [N*IC, IH, IW]
last_frame = ggml_reshape_4d(ctx, last_frame, last_frame->ne[0], last_frame->ne[1], 1, last_frame->ne[2]); // [N*IC, 1, IH, IW]
auto last_frame_pad = last_frame;
for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
last_frame_pad = ggml_concat(ctx, last_frame_pad, last_frame, 2);
}
x = ggml_concat(ctx, first_frame_pad, x, 2);
x = ggml_concat(ctx, x, last_frame_pad, 2);
}
x = conv->forward(ctx, x);
return x;
}
};
} // namespace LTXV
#endif // __LTXV_HPP__
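
The causal branch of CausalConv3d pads the time axis by replicating the first frame in front of the real frames, so the temporal convolution at frame t never reads frames later than t; the non-causal branch splits the replicated padding between the start and the end. As a standalone illustration of the causal idea on a plain sequence (the exact pad count in the block above is derived from time_kernel_size, so treat this as a sketch, not a line-for-line translation):

// Illustration only: causal temporal padding on a 1-D frame sequence.
#include <vector>

std::vector<int> causal_pad(const std::vector<int>& frames, int kernel_size) {
    if (frames.empty() || kernel_size <= 1) {
        return frames;
    }
    std::vector<int> padded;
    for (int i = 0; i < kernel_size - 1; i++) {
        padded.push_back(frames.front());  // repeat the first frame
    }
    padded.insert(padded.end(), frames.begin(), frames.end());
    return padded;  // e.g. {f0, f1, f2} with kernel 3 -> {f0, f0, f0, f1, f2}
}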

mmdit.hpp (288 changes)

@@ -1,6 +1,8 @@
 #ifndef __MMDIT_HPP__
 #define __MMDIT_HPP__
 
+#include <memory>
+
 #include "ggml_extend.hpp"
 #include "model.h"
@@ -25,13 +27,13 @@ public:
 blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias));
 }
-struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
 // x: [N, n_token, in_features]
 auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
 auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
 x = fc1->forward(ctx, x);
-x = ggml_gelu_inplace(ctx, x);
+x = ggml_gelu_inplace(ctx->ggml_ctx, x);
 x = fc2->forward(ctx, x);
 return x;
 }
@@ -70,7 +72,7 @@ public:
 bias));
 }
-struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
 // x: [N, C, H, W]
 // return: [N, H*W, embed_dim]
 auto proj = std::dynamic_pointer_cast<Conv2d>(blocks["proj"]);
@@ -80,13 +82,13 @@ public:
 int64_t H = x->ne[1];
 int pad_h = (patch_size - H % patch_size) % patch_size;
 int pad_w = (patch_size - W % patch_size) % patch_size;
-x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // TODO: reflect pad mode
+x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); // TODO: reflect pad mode
 }
 x = proj->forward(ctx, x);
 if (flatten) {
-x = ggml_reshape_3d(ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]);
-x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));
+x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]);
+x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));
 }
 return x;
 }
@@ -99,22 +101,26 @@
 public:
 TimestepEmbedder(int64_t hidden_size,
-int64_t frequency_embedding_size = 256)
+int64_t frequency_embedding_size = 256,
+int64_t out_channels = 0)
 : frequency_embedding_size(frequency_embedding_size) {
+if (out_channels <= 0) {
+out_channels = hidden_size;
+}
 blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size, true, true));
-blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
+blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, out_channels, true, true));
 }
-struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* t) {
+struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* t) {
 // t: [N, ]
 // return: [N, hidden_size]
 auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
 auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
-auto t_freq = ggml_nn_timestep_embedding(ctx, t, frequency_embedding_size); // [N, frequency_embedding_size]
+auto t_freq = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size); // [N, frequency_embedding_size]
 auto t_emb = mlp_0->forward(ctx, t_freq);
-t_emb = ggml_silu_inplace(ctx, t_emb);
+t_emb = ggml_silu_inplace(ctx->ggml_ctx, t_emb);
 t_emb = mlp_2->forward(ctx, t_emb);
 return t_emb;
 }
@@ -129,43 +135,19 @@ public:
 blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
 }
-struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
 // x: [N, input_dim]
 // return: [N, hidden_size]
 auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
 auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
 x = mlp_0->forward(ctx, x);
-x = ggml_silu_inplace(ctx, x);
+x = ggml_silu_inplace(ctx->ggml_ctx, x);
 x = mlp_2->forward(ctx, x);
 return x;
 }
 };
-class RMSNorm : public UnaryBlock {
-protected:
-int64_t hidden_size;
-float eps;
-void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
-enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
-params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size);
-}
-public:
-RMSNorm(int64_t hidden_size,
-float eps = 1e-06f)
-: hidden_size(hidden_size),
-eps(eps) {}
-struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
-struct ggml_tensor* w = params["weight"];
-x = ggml_rms_norm(ctx, x, eps);
-x = ggml_mul(ctx, x, w);
-return x;
-}
-};
 class SelfAttention : public GGMLBlock {
 public:
 int64_t num_heads;
@@ -193,15 +175,15 @@ public:
 }
 }
-std::vector<struct ggml_tensor*> pre_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
+std::vector<struct ggml_tensor*> pre_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
 auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
 auto qkv = qkv_proj->forward(ctx, x);
-auto qkv_vec = split_qkv(ctx, qkv);
+auto qkv_vec = split_qkv(ctx->ggml_ctx, qkv);
 int64_t head_dim = qkv_vec[0]->ne[0] / num_heads;
-auto q = ggml_reshape_4d(ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]); // [N, n_token, n_head, d_head]
-auto k = ggml_reshape_4d(ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]); // [N, n_token, n_head, d_head]
+auto q = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]); // [N, n_token, n_head, d_head]
+auto k = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]); // [N, n_token, n_head, d_head]
 auto v = qkv_vec[2]; // [N, n_token, n_head*d_head]
 if (qk_norm == "rms" || qk_norm == "ln") {
 auto ln_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["ln_q"]);
@@ -210,13 +192,13 @@ public:
 k = ln_k->forward(ctx, k);
 }
-q = ggml_reshape_3d(ctx, q, q->ne[0] * q->ne[1], q->ne[2], q->ne[3]); // [N, n_token, n_head*d_head]
-k = ggml_reshape_3d(ctx, k, k->ne[0] * k->ne[1], k->ne[2], k->ne[3]); // [N, n_token, n_head*d_head]
+q = ggml_reshape_3d(ctx->ggml_ctx, q, q->ne[0] * q->ne[1], q->ne[2], q->ne[3]); // [N, n_token, n_head*d_head]
+k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0] * k->ne[1], k->ne[2], k->ne[3]); // [N, n_token, n_head*d_head]
 return {q, k, v};
 }
-struct ggml_tensor* post_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
+struct ggml_tensor* post_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
 GGML_ASSERT(!pre_only);
 auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
@@ -226,10 +208,11 @@ public:
 }
 // x: [N, n_token, dim]
-struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+struct ggml_tensor* x) {
 auto qkv = pre_attention(ctx, x);
-x = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], num_heads); // [N, n_token, dim]
+x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, dim]
 x = post_attention(ctx, x); // [N, n_token, dim]
 return x;
 }
 };
@@ -290,9 +273,9 @@ public:
 blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, n_mods * hidden_size));
 }
-std::tuple<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention_x(struct ggml_context* ctx,
+std::tuple<std::vector<ggml_tensor*>, std::vector<ggml_tensor*>, std::vector<ggml_tensor*>> pre_attention_x(GGMLRunnerContext* ctx,
 struct ggml_tensor* x,
 struct ggml_tensor* c) {
 GGML_ASSERT(self_attn);
 // x: [N, n_token, hidden_size]
 // c: [N, hidden_size]
@@ -302,35 +285,35 @@ public:
 auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
 int64_t n_mods = 9;
-auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c)); // [N, n_mods * hidden_size]
-m = ggml_reshape_3d(ctx, m, c->ne[0], n_mods, c->ne[1]); // [N, n_mods, hidden_size]
-m = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3)); // [n_mods, N, hidden_size]
+auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, n_mods * hidden_size]
+m = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], n_mods, c->ne[1]); // [N, n_mods, hidden_size]
+m = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3)); // [n_mods, N, hidden_size]
 int64_t offset = m->nb[1] * m->ne[1];
-auto shift_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
-auto scale_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
-auto gate_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2); // [N, hidden_size]
-auto shift_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3); // [N, hidden_size]
-auto scale_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4); // [N, hidden_size]
-auto gate_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5); // [N, hidden_size]
-auto shift_msa2 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 6); // [N, hidden_size]
-auto scale_msa2 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 7); // [N, hidden_size]
-auto gate_msa2 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 8); // [N, hidden_size]
+auto shift_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
+auto scale_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
+auto gate_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2); // [N, hidden_size]
+auto shift_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3); // [N, hidden_size]
+auto scale_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4); // [N, hidden_size]
+auto gate_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5); // [N, hidden_size]
+auto shift_msa2 = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 6); // [N, hidden_size]
+auto scale_msa2 = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 7); // [N, hidden_size]
+auto gate_msa2 = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 8); // [N, hidden_size]
 auto x_norm = norm1->forward(ctx, x);
-auto attn_in = modulate(ctx, x_norm, shift_msa, scale_msa);
+auto attn_in = modulate(ctx->ggml_ctx, x_norm, shift_msa, scale_msa);
 auto qkv = attn->pre_attention(ctx, attn_in);
-auto attn2_in = modulate(ctx, x_norm, shift_msa2, scale_msa2);
+auto attn2_in = modulate(ctx->ggml_ctx, x_norm, shift_msa2, scale_msa2);
 auto qkv2 = attn2->pre_attention(ctx, attn2_in);
 return {qkv, qkv2, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2}};
 }
-std::pair<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention(struct ggml_context* ctx,
+std::pair<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention(GGMLRunnerContext* ctx,
 struct ggml_tensor* x,
 struct ggml_tensor* c) {
 // x: [N, n_token, hidden_size]
@@ -343,33 +326,33 @@ public:
 if (pre_only) {
 n_mods = 2;
 }
-auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c)); // [N, n_mods * hidden_size]
-m = ggml_reshape_3d(ctx, m, c->ne[0], n_mods, c->ne[1]); // [N, n_mods, hidden_size]
-m = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3)); // [n_mods, N, hidden_size]
+auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, n_mods * hidden_size]
+m = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], n_mods, c->ne[1]); // [N, n_mods, hidden_size]
+m = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3)); // [n_mods, N, hidden_size]
 int64_t offset = m->nb[1] * m->ne[1];
-auto shift_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
-auto scale_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
+auto shift_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
+auto scale_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
 if (!pre_only) {
-auto gate_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2); // [N, hidden_size]
-auto shift_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3); // [N, hidden_size]
-auto scale_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4); // [N, hidden_size]
-auto gate_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5); // [N, hidden_size]
-auto attn_in = modulate(ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
+auto gate_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2); // [N, hidden_size]
+auto shift_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3); // [N, hidden_size]
+auto scale_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4); // [N, hidden_size]
+auto gate_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5); // [N, hidden_size]
+auto attn_in = modulate(ctx->ggml_ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
 auto qkv = attn->pre_attention(ctx, attn_in);
 return {qkv, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp}};
 } else {
-auto attn_in = modulate(ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
+auto attn_in = modulate(ctx->ggml_ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
 auto qkv = attn->pre_attention(ctx, attn_in);
-return {qkv, {NULL, NULL, NULL, NULL, NULL}};
+return {qkv, {nullptr, nullptr, nullptr, nullptr, nullptr}};
 }
 }
-struct ggml_tensor* post_attention_x(struct ggml_context* ctx,
+struct ggml_tensor* post_attention_x(GGMLRunnerContext* ctx,
 struct ggml_tensor* attn_out,
 struct ggml_tensor* attn2_out,
 struct ggml_tensor* x,
@@ -392,22 +375,22 @@ public:
 auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
 auto mlp = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]);
-gate_msa = ggml_reshape_3d(ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]); // [N, 1, hidden_size]
-gate_mlp = ggml_reshape_3d(ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]); // [N, 1, hidden_size]
-gate_msa2 = ggml_reshape_3d(ctx, gate_msa2, gate_msa2->ne[0], 1, gate_msa2->ne[1]); // [N, 1, hidden_size]
+gate_msa = ggml_reshape_3d(ctx->ggml_ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]); // [N, 1, hidden_size]
+gate_mlp = ggml_reshape_3d(ctx->ggml_ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]); // [N, 1, hidden_size]
+gate_msa2 = ggml_reshape_3d(ctx->ggml_ctx, gate_msa2, gate_msa2->ne[0], 1, gate_msa2->ne[1]); // [N, 1, hidden_size]
 attn_out = attn->post_attention(ctx, attn_out);
 attn2_out = attn2->post_attention(ctx, attn2_out);
-x = ggml_add(ctx, x, ggml_mul(ctx, attn_out, gate_msa));
-x = ggml_add(ctx, x, ggml_mul(ctx, attn2_out, gate_msa2));
-auto mlp_out = mlp->forward(ctx, modulate(ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
-x = ggml_add(ctx, x, ggml_mul(ctx, mlp_out, gate_mlp));
+x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out, gate_msa));
+x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn2_out, gate_msa2));
+auto mlp_out = mlp->forward(ctx, modulate(ctx->ggml_ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
+x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, mlp_out, gate_mlp));
 return x;
 }
-struct ggml_tensor* post_attention(struct ggml_context* ctx,
+struct ggml_tensor* post_attention(GGMLRunnerContext* ctx,
 struct ggml_tensor* attn_out,
 struct ggml_tensor* x,
 struct ggml_tensor* gate_msa,
@@ -427,19 +410,21 @@ public:
 auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
 auto mlp = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]);
-gate_msa = ggml_reshape_3d(ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]); // [N, 1, hidden_size]
-gate_mlp = ggml_reshape_3d(ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]); // [N, 1, hidden_size]
+gate_msa = ggml_reshape_3d(ctx->ggml_ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]); // [N, 1, hidden_size]
+gate_mlp = ggml_reshape_3d(ctx->ggml_ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]); // [N, 1, hidden_size]
 attn_out = attn->post_attention(ctx, attn_out);
-x = ggml_add(ctx, x, ggml_mul(ctx, attn_out, gate_msa));
-auto mlp_out = mlp->forward(ctx, modulate(ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
-x = ggml_add(ctx, x, ggml_mul(ctx, mlp_out, gate_mlp));
+x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out, gate_msa));
+auto mlp_out = mlp->forward(ctx, modulate(ctx->ggml_ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
+x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, mlp_out, gate_mlp));
 return x;
 }
-struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* c) {
+struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+struct ggml_tensor* x,
+struct ggml_tensor* c) {
 // x: [N, n_token, hidden_size]
 // c: [N, hidden_size]
 // return: [N, n_token, hidden_size]
@@ -454,8 +439,8 @@ public:
 auto qkv2 = std::get<1>(qkv_intermediates);
 auto intermediates = std::get<2>(qkv_intermediates);
-auto attn_out = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], num_heads); // [N, n_token, dim]
-auto attn2_out = ggml_nn_attention_ext(ctx, qkv2[0], qkv2[1], qkv2[2], num_heads); // [N, n_token, dim]
+auto attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, dim]
+auto attn2_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv2[0], qkv2[1], qkv2[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, dim]
 x = post_attention_x(ctx,
 attn_out,
 attn2_out,
@@ -471,7 +456,7 @@ public:
 auto qkv = qkv_intermediates.first;
 auto intermediates = qkv_intermediates.second;
-auto attn_out = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], num_heads); // [N, n_token, dim]
+auto attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, dim]
 x = post_attention(ctx,
 attn_out,
 intermediates[0],
@@ -485,7 +470,7 @@ public:
 };
 __STATIC_INLINE__ std::pair<struct ggml_tensor*, struct ggml_tensor*>
-block_mixing(struct ggml_context* ctx,
+block_mixing(GGMLRunnerContext* ctx,
 struct ggml_tensor* context,
 struct ggml_tensor* x,
 struct ggml_tensor* c,
@@ -512,29 +497,29 @@ block_mixing(struct ggml_context* ctx,
 }
 std::vector<struct ggml_tensor*> qkv;
 for (int i = 0; i < 3; i++) {
-qkv.push_back(ggml_concat(ctx, context_qkv[i], x_qkv[i], 1));
+qkv.push_back(ggml_concat(ctx->ggml_ctx, context_qkv[i], x_qkv[i], 1));
 }
-auto attn = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], x_block->num_heads); // [N, n_context + n_token, hidden_size]
-attn = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3)); // [n_context + n_token, N, hidden_size]
-auto context_attn = ggml_view_3d(ctx,
+auto attn = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_context + n_token, hidden_size]
+attn = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, attn, 0, 2, 1, 3)); // [n_context + n_token, N, hidden_size]
+auto context_attn = ggml_view_3d(ctx->ggml_ctx,
 attn,
 attn->ne[0],
 attn->ne[1],
 context->ne[1],
 attn->nb[1],
 attn->nb[2],
 0); // [n_context, N, hidden_size]
-context_attn = ggml_cont(ctx, ggml_permute(ctx, context_attn, 0, 2, 1, 3)); // [N, n_context, hidden_size]
-auto x_attn = ggml_view_3d(ctx,
+context_attn = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, context_attn, 0, 2, 1, 3)); // [N, n_context, hidden_size]
+auto x_attn = ggml_view_3d(ctx->ggml_ctx,
 attn,
 attn->ne[0],
 attn->ne[1],
 x->ne[1],
 attn->nb[1],
 attn->nb[2],
 attn->nb[2] * context->ne[1]); // [n_token, N, hidden_size]
-x_attn = ggml_cont(ctx, ggml_permute(ctx, x_attn, 0, 2, 1, 3)); // [N, n_token, hidden_size]
+x_attn = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x_attn, 0, 2, 1, 3)); // [N, n_token, hidden_size]
 if (!context_block->pre_only) {
 context = context_block->post_attention(ctx,
@@ -545,11 +530,11 @@ block_mixing(struct ggml_context* ctx,
 context_intermediates[3],
 context_intermediates[4]);
 } else {
-context = NULL;
+context = nullptr;
 }
 if (x_block->self_attn) {
-auto attn2 = ggml_nn_attention_ext(ctx, x_qkv2[0], x_qkv2[1], x_qkv2[2], x_block->num_heads); // [N, n_token, hidden_size]
+auto attn2 = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, x_qkv2[0], x_qkv2[1], x_qkv2[2], x_block->num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, hidden_size]
 x = x_block->post_attention_x(ctx,
 x_attn,
@@ -582,11 +567,11 @@ public:
 bool qkv_bias = false,
 bool pre_only = false,
 bool self_attn_x = false) {
-blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, pre_only));
+blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, pre_only, false));
 blocks["x_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x));
 }
-std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
+std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
 struct ggml_tensor* context,
 struct ggml_tensor* x,
 struct ggml_tensor* c) {
@@ -609,7 +594,7 @@ public:
 blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
 }
-struct ggml_tensor* forward(struct ggml_context* ctx,
+struct ggml_tensor* forward(GGMLRunnerContext* ctx,
 struct ggml_tensor* x,
 struct ggml_tensor* c) {
 // x: [N, n_token, hidden_size]
@@ -619,15 +604,15 @@ public:
 auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
 auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
-auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c)); // [N, 2 * hidden_size]
-m = ggml_reshape_3d(ctx, m, c->ne[0], 2, c->ne[1]); // [N, 2, hidden_size]
-m = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3)); // [2, N, hidden_size]
+auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, 2 * hidden_size]
+m = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], 2, c->ne[1]); // [N, 2, hidden_size]
+m = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3)); // [2, N, hidden_size]
 int64_t offset = m->nb[1] * m->ne[1];
-auto shift = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
-auto scale = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
-x = modulate(ctx, norm_final->forward(ctx, x), shift, scale);
+auto shift = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
+auto scale = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
+x = modulate(ctx->ggml_ctx, norm_final->forward(ctx, x), shift, scale);
 x = linear->forward(ctx, x);
 return x;
@@ -652,13 +637,13 @@ protected:
 int64_t hidden_size;
 std::string qk_norm;
-void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
-enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "pos_embed") != tensor_types.end()) ? tensor_types[prefix + "pos_embed"] : GGML_TYPE_F32;
+void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
+enum ggml_type wtype = GGML_TYPE_F32;
 params["pos_embed"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1);
 }
 public:
-MMDiT(std::map<std::string, enum ggml_type>& tensor_types) {
+MMDiT(const String2TensorStorage& tensor_storage_map = {}) {
 // input_size is always None
 // learn_sigma is always False
 // register_length is alwalys 0
@@ -671,8 +656,7 @@ public:
 // pos_embed_offset is not used
 // context_embedder_config is always {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}}
-// read tensors from tensor_types
-for (auto pair : tensor_types) {
+for (auto pair : tensor_storage_map) {
 std::string tensor_name = pair.first;
 if (tensor_name.find("model.diffusion_model.") == std::string::npos)
 continue;
@@ -794,7 +778,7 @@ public:
 return x;
 }
-struct ggml_tensor* forward_core_with_concat(struct ggml_context* ctx,
+struct ggml_tensor* forward_core_with_concat(GGMLRunnerContext* ctx,
 struct ggml_tensor* x,
 struct ggml_tensor* c_mod,
 struct ggml_tensor* context,
@@ -823,11 +807,11 @@ public:
 return x;
 }
-struct ggml_tensor* forward(struct ggml_context* ctx,
+struct ggml_tensor* forward(GGMLRunnerContext* ctx,
 struct ggml_tensor* x,
 struct ggml_tensor* t,
-struct ggml_tensor* y = NULL,
-struct ggml_tensor* context = NULL,
+struct ggml_tensor* y = nullptr,
+struct ggml_tensor* context = nullptr,
 std::vector<int> skip_layers = std::vector<int>()) {
 // Forward pass of DiT.
 // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
@@ -841,19 +825,19 @@ public:
 int64_t w = x->ne[0];
 int64_t h = x->ne[1];
 auto patch_embed = x_embedder->forward(ctx, x); // [N, H*W, hidden_size]
-auto pos_embed = cropped_pos_embed(ctx, h, w); // [1, H*W, hidden_size]
-x = ggml_add(ctx, patch_embed, pos_embed); // [N, H*W, hidden_size]
+auto pos_embed = cropped_pos_embed(ctx->ggml_ctx, h, w); // [1, H*W, hidden_size]
+x = ggml_add(ctx->ggml_ctx, patch_embed, pos_embed); // [N, H*W, hidden_size]
 auto c = t_embedder->forward(ctx, t); // [N, hidden_size]
-if (y != NULL && adm_in_channels != -1) {
+if (y != nullptr && adm_in_channels != -1) {
 auto y_embedder = std::dynamic_pointer_cast<VectorEmbedder>(blocks["y_embedder"]);
 y = y_embedder->forward(ctx, y); // [N, hidden_size]
-c = ggml_add(ctx, c, y);
+c = ggml_add(ctx->ggml_ctx, c, y);
 }
-if (context != NULL) {
+if (context != nullptr) {
 auto context_embedder = std::dynamic_pointer_cast<Linear>(blocks["context_embedder"]);
 context = context_embedder->forward(ctx, context); // [N, L, D] aka [N, L, 1536]
@@ -861,7 +845,7 @@ public:
 x = forward_core_with_concat(ctx, x, c, context, skip_layers); // (N, H*W, patch_size ** 2 * out_channels)
-x = unpatchify(ctx, x, h, w); // [N, C, H, W]
+x = unpatchify(ctx->ggml_ctx, x, h, w); // [N, C, H, W]
 return x;
 }
@@ -869,16 +853,15 @@ public:
 struct MMDiTRunner : public GGMLRunner {
 MMDiT mmdit;
-static std::map<std::string, enum ggml_type> empty_tensor_types;
 MMDiTRunner(ggml_backend_t backend,
-std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
+bool offload_params_to_cpu,
+const String2TensorStorage& tensor_storage_map = {},
 const std::string prefix = "")
-: GGMLRunner(backend), mmdit(tensor_types) {
-mmdit.init(params_ctx, tensor_types, prefix);
+: GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_storage_map) {
+mmdit.init(params_ctx, tensor_storage_map, prefix);
 }
-std::string get_desc() {
+std::string get_desc() override {
 return "mmdit";
 }
@@ -891,14 +874,15 @@ struct MMDiTRunner : public GGMLRunner {
 struct ggml_tensor* context,
 struct ggml_tensor* y,
 std::vector<int> skip_layers = std::vector<int>()) {
-struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, MMDIT_GRAPH_SIZE, false);
+struct ggml_cgraph* gf = new_graph_custom(MMDIT_GRAPH_SIZE);
 x = to_backend(x);
 context = to_backend(context);
 y = to_backend(y);
 timesteps = to_backend(timesteps);
-struct ggml_tensor* out = mmdit.forward(compute_ctx,
+auto runner_ctx = get_context();
+struct ggml_tensor* out = mmdit.forward(&runner_ctx,
 x,
 timesteps,
 y,
@@ -910,13 +894,13 @@ struct MMDiTRunner : public GGMLRunner {
 return gf;
 }
-void compute(int n_threads,
+bool compute(int n_threads,
 struct ggml_tensor* x,
 struct ggml_tensor* timesteps,
 struct ggml_tensor* context,
 struct ggml_tensor* y,
-struct ggml_tensor** output = NULL,
-struct ggml_context* output_ctx = NULL,
+struct ggml_tensor** output = nullptr,
+struct ggml_context* output_ctx = nullptr,
 std::vector<int> skip_layers = std::vector<int>()) {
 // x: [N, in_channels, h, w]
 // timesteps: [N, ]
@@ -926,17 +910,17 @@ struct MMDiTRunner : public GGMLRunner {
 return build_graph(x, timesteps, context, y, skip_layers);
 };
-GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
 }
 void test() {
 struct ggml_init_params params;
 params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
-params.mem_buffer = NULL;
+params.mem_buffer = nullptr;
 params.no_alloc = false;
 struct ggml_context* work_ctx = ggml_init(params);
-GGML_ASSERT(work_ctx != NULL);
+GGML_ASSERT(work_ctx != nullptr);
 {
 // cpu f16: pass
@@ -957,7 +941,7 @@ struct MMDiTRunner : public GGMLRunner {
 ggml_set_f32(y, 0.01f);
 // print_ggml_tensor(y);
-struct ggml_tensor* out = NULL;
+struct ggml_tensor* out = nullptr;
 int t0 = ggml_time_ms();
 compute(8, x, timesteps, context, y, &out, work_ctx);
@@ -972,7 +956,7 @@ struct MMDiTRunner : public GGMLRunner {
 // ggml_backend_t backend = ggml_backend_cuda_init(0);
 ggml_backend_t backend = ggml_backend_cpu_init();
 ggml_type model_data_type = GGML_TYPE_F16;
-std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend));
+std::shared_ptr<MMDiTRunner> mmdit = std::make_shared<MMDiTRunner>(backend, false);
 {
 LOG_INFO("loading from '%s'", file_path.c_str());
@@ -981,12 +965,12 @@ struct MMDiTRunner : public GGMLRunner {
 mmdit->get_param_tensors(tensors, "model.diffusion_model");
 ModelLoader model_loader;
-if (!model_loader.init_from_file(file_path)) {
+if (!model_loader.init_from_file_and_convert_name(file_path)) {
 LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
 return;
 }
-bool success = model_loader.load_tensors(tensors, backend);
+bool success = model_loader.load_tensors(tensors);
 if (!success) {
 LOG_ERROR("load tensors from model loader failed");

model.cpp (1545 changes)

File diff suppressed because it is too large

model.h (171 changes)

@@ -8,12 +8,14 @@
 #include <sstream>
 #include <string>
 #include <tuple>
+#include <utility>
 #include <vector>
 #include "ggml-backend.h"
 #include "ggml.h"
 #include "gguf.h"
 #include "json.hpp"
+#include "ordered_map.hpp"
 #include "zip.h"
 #define SD_MAX_DIMS 5
@@ -21,19 +23,57 @@
 enum SDVersion {
     VERSION_SD1,
     VERSION_SD1_INPAINT,
+    VERSION_SD1_PIX2PIX,
+    VERSION_SD1_TINY_UNET,
     VERSION_SD2,
     VERSION_SD2_INPAINT,
+    VERSION_SD2_TINY_UNET,
     VERSION_SDXL,
     VERSION_SDXL_INPAINT,
+    VERSION_SDXL_PIX2PIX,
+    VERSION_SDXL_SSD1B,
     VERSION_SVD,
     VERSION_SD3,
     VERSION_FLUX,
     VERSION_FLUX_FILL,
+    VERSION_FLUX_CONTROLS,
+    VERSION_FLEX_2,
+    VERSION_CHROMA_RADIANCE,
+    VERSION_WAN2,
+    VERSION_WAN2_2_I2V,
+    VERSION_WAN2_2_TI2V,
+    VERSION_QWEN_IMAGE,
+    VERSION_FLUX2,
+    VERSION_Z_IMAGE,
+    VERSION_OVIS_IMAGE,
     VERSION_COUNT,
 };
-static inline bool sd_version_is_flux(SDVersion version) {
-    if (version == VERSION_FLUX || version == VERSION_FLUX_FILL) {
+static inline bool sd_version_is_sd1(SDVersion version) {
+    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET) {
+        return true;
+    }
+    return false;
+}
+static inline bool sd_version_is_sd2(SDVersion version) {
+    if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
+        return true;
+    }
+    return false;
+}
+static inline bool sd_version_is_sdxl(SDVersion version) {
+    if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX || version == VERSION_SDXL_SSD1B) {
+        return true;
+    }
+    return false;
+}
+static inline bool sd_version_is_unet(SDVersion version) {
+    if (sd_version_is_sd1(version) ||
+        sd_version_is_sd2(version) ||
+        sd_version_is_sdxl(version)) {
         return true;
     }
     return false;
@@ -46,41 +86,81 @@ static inline bool sd_version_is_sd3(SDVersion version) {
     return false;
 }
-static inline bool sd_version_is_sd1(SDVersion version) {
-    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) {
+static inline bool sd_version_is_flux(SDVersion version) {
+    if (version == VERSION_FLUX ||
+        version == VERSION_FLUX_FILL ||
+        version == VERSION_FLUX_CONTROLS ||
+        version == VERSION_FLEX_2 ||
+        version == VERSION_OVIS_IMAGE ||
+        version == VERSION_CHROMA_RADIANCE) {
         return true;
     }
     return false;
 }
-static inline bool sd_version_is_sd2(SDVersion version) {
-    if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT) {
+static inline bool sd_version_is_flux2(SDVersion version) {
+    if (version == VERSION_FLUX2) {
         return true;
     }
     return false;
 }
-static inline bool sd_version_is_sdxl(SDVersion version) {
-    if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT) {
+static inline bool sd_version_is_wan(SDVersion version) {
+    if (version == VERSION_WAN2 || version == VERSION_WAN2_2_I2V || version == VERSION_WAN2_2_TI2V) {
+        return true;
+    }
+    return false;
+}
+static inline bool sd_version_is_qwen_image(SDVersion version) {
+    if (version == VERSION_QWEN_IMAGE) {
+        return true;
+    }
+    return false;
+}
+static inline bool sd_version_is_z_image(SDVersion version) {
+    if (version == VERSION_Z_IMAGE) {
         return true;
     }
     return false;
 }
 static inline bool sd_version_is_inpaint(SDVersion version) {
-    if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_FILL) {
+    if (version == VERSION_SD1_INPAINT ||
+        version == VERSION_SD2_INPAINT ||
+        version == VERSION_SDXL_INPAINT ||
+        version == VERSION_FLUX_FILL ||
+        version == VERSION_FLEX_2) {
         return true;
     }
     return false;
 }
 static inline bool sd_version_is_dit(SDVersion version) {
-    if (sd_version_is_flux(version) || sd_version_is_sd3(version)) {
+    if (sd_version_is_flux(version) ||
+        sd_version_is_flux2(version) ||
+        sd_version_is_sd3(version) ||
+        sd_version_is_wan(version) ||
+        sd_version_is_qwen_image(version) ||
+        sd_version_is_z_image(version)) {
         return true;
     }
     return false;
 }
+static inline bool sd_version_is_unet_edit(SDVersion version) {
+    return version == VERSION_SD1_PIX2PIX || version == VERSION_SDXL_PIX2PIX;
+}
+static inline bool sd_version_is_control(SDVersion version) {
+    return version == VERSION_FLUX_CONTROLS || version == VERSION_FLEX_2;
+}
+static bool sd_version_is_inpaint_or_unet_edit(SDVersion version) {
+    return sd_version_is_unet_edit(version) || sd_version_is_inpaint(version) || sd_version_is_control(version);
+}
 enum PMVersion {
     PM_VERSION_1,
     PM_VERSION_2,
@@ -89,20 +169,22 @@ enum PMVersion {
 struct TensorStorage {
     std::string name;
     ggml_type type = GGML_TYPE_F32;
-    bool is_bf16 = false;
+    ggml_type expected_type = GGML_TYPE_COUNT;
     bool is_f8_e4m3 = false;
     bool is_f8_e5m2 = false;
+    bool is_f64 = false;
+    bool is_i64 = false;
     int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
     int n_dims = 0;
     size_t file_index = 0;
     int index_in_zip = -1;  // >= means stored in a zip file
-    size_t offset = 0;  // offset in file
+    uint64_t offset = 0;  // offset in file
     TensorStorage() = default;
-    TensorStorage(const std::string& name, ggml_type type, int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
-        : name(name), type(type), n_dims(n_dims), file_index(file_index), offset(offset) {
+    TensorStorage(std::string name, ggml_type type, const int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
+        : name(std::move(name)), type(type), n_dims(n_dims), file_index(file_index), offset(offset) {
         for (int i = 0; i < n_dims; i++) {
             this->ne[i] = ne[i];
         }
@@ -121,8 +203,10 @@ struct TensorStorage {
     }
     int64_t nbytes_to_read() const {
-        if (is_bf16 || is_f8_e4m3 || is_f8_e5m2) {
+        if (is_f8_e4m3 || is_f8_e5m2) {
             return nbytes() / 2;
+        } else if (is_f64 || is_i64) {
+            return nbytes() * 2;
         } else {
             return nbytes();
         }
@@ -140,10 +224,10 @@ struct TensorStorage {
     std::vector<TensorStorage> chunk(size_t n) {
         std::vector<TensorStorage> chunks;
-        size_t chunk_size = nbytes_to_read() / n;
+        uint64_t chunk_size = nbytes_to_read() / n;
         // printf("%d/%d\n", chunk_size, nbytes_to_read());
         reverse_ne();
-        for (int i = 0; i < n; i++) {
+        for (size_t i = 0; i < n; i++) {
             TensorStorage chunk_i = *this;
             chunk_i.ne[0] = ne[0] / n;
             chunk_i.offset = offset + i * chunk_size;
@@ -167,12 +251,14 @@ struct TensorStorage {
     std::string to_string() const {
         std::stringstream ss;
         const char* type_name = ggml_type_name(type);
-        if (is_bf16) {
-            type_name = "bf16";
-        } else if (is_f8_e4m3) {
+        if (is_f8_e4m3) {
             type_name = "f8_e4m3";
         } else if (is_f8_e5m2) {
             type_name = "f8_e5m2";
+        } else if (is_f64) {
+            type_name = "f64";
+        } else if (is_i64) {
+            type_name = "i64";
         }
         ss << name << " | " << type_name << " | ";
         ss << n_dims << " [";
@@ -189,10 +275,15 @@ struct TensorStorage {
 typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
+typedef OrderedMap<std::string, TensorStorage> String2TensorStorage;
 class ModelLoader {
 protected:
+    SDVersion version_ = VERSION_COUNT;
     std::vector<std::string> file_paths_;
-    std::vector<TensorStorage> tensor_storages;
+    String2TensorStorage tensor_storage_map;
+    void add_tensor_storage(const TensorStorage& tensor_storage);
     bool parse_data_pkl(uint8_t* buffer,
                         size_t buffer_size,
@@ -207,28 +298,42 @@ protected:
     bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = "");
 public:
-    std::map<std::string, enum ggml_type> tensor_storages_types;
     bool init_from_file(const std::string& file_path, const std::string& prefix = "");
-    bool model_is_unet();
+    void convert_tensors_name();
+    bool init_from_file_and_convert_name(const std::string& file_path,
+                                         const std::string& prefix = "",
+                                         SDVersion version = VERSION_COUNT);
     SDVersion get_sd_version();
-    ggml_type get_sd_wtype();
-    ggml_type get_conditioner_wtype();
-    ggml_type get_diffusion_model_wtype();
-    ggml_type get_vae_wtype();
-    void set_wtype_override(ggml_type wtype, std::string prefix = "");
-    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend);
+    std::map<ggml_type, uint32_t> get_wtype_stat();
+    std::map<ggml_type, uint32_t> get_conditioner_wtype_stat();
+    std::map<ggml_type, uint32_t> get_diffusion_model_wtype_stat();
+    std::map<ggml_type, uint32_t> get_vae_wtype_stat();
+    String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
+    void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
+    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0);
     bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
-                      ggml_backend_t backend,
-                      std::set<std::string> ignore_tensors = {});
-    bool save_to_gguf_file(const std::string& file_path, ggml_type type);
+                      std::set<std::string> ignore_tensors = {},
+                      int n_threads = 0);
+    std::vector<std::string> get_tensor_names() const {
+        std::vector<std::string> names;
+        for (const auto& [name, tensor_storage] : tensor_storage_map) {
+            names.push_back(name);
+        }
+        return names;
+    }
+    bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
     bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
     int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
     ~ModelLoader() = default;
     static std::string load_merges();
+    static std::string load_qwen2_merges();
+    static std::string load_mistral_merges();
+    static std::string load_mistral_vocab_json();
     static std::string load_t5_tokenizer_json();
+    static std::string load_umt5_tokenizer_json();
 };
 #endif  // __MODEL_H__
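Note how the single-type getters were replaced by `get_*_wtype_stat()` methods returning a histogram of tensor types, which suits mixed-precision checkpoints. A minimal sketch of how a caller might print such a map; the helper name is ours, while ggml_type_name() is the standard ggml API:

// Hypothetical helper, not part of this diff: summarize a type histogram
// returned by e.g. ModelLoader::get_diffusion_model_wtype_stat().
#include <cstdint>
#include <cstdio>
#include <map>
#include "ggml.h"

static void print_wtype_stat(const std::map<ggml_type, uint32_t>& stat) {
    for (const auto& [type, count] : stat) {
        printf("%8s: %u tensors\n", ggml_type_name(type), count);
    }
}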

name_conversion.cpp: new file, 1105 lines
File diff suppressed because it is too large.

name_conversion.h: new file, 14 lines
@@ -0,0 +1,14 @@
#ifndef __NAME_CONVERSTION_H__
#define __NAME_CONVERSTION_H__
#include <string>
#include "model.h"
bool is_cond_stage_model_name(const std::string& name);
bool is_diffusion_model_name(const std::string& name);
bool is_first_stage_model_name(const std::string& name);
std::string convert_tensor_name(std::string name, SDVersion version);
#endif // __NAME_CONVERSTION_H__
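convert_tensor_name() centralizes the mapping from on-disk checkpoint tensor names to the internal naming scheme; the concrete rewrite rules live in the (suppressed) 1105-line name_conversion.cpp and depend on the detected SDVersion. A hedged usage sketch, with an illustrative wrapper of our own:

// Illustrative only: the exact rewrites are implementation details of
// name_conversion.cpp. The "model.diffusion_model.*" prefix convention is
// the one used elsewhere in this diff (see the MMDiTRunner test above).
#include <string>
#include "name_conversion.h"

std::string to_internal_name(const std::string& raw, SDVersion version) {
    std::string name = convert_tensor_name(raw, version);
    // Classification helpers let callers route tensors to the right module:
    if (is_diffusion_model_name(name)) {
        // e.g. hand this tensor to the diffusion model loader
    }
    return name;
}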

ordered_map.hpp: new file, 177 lines
@@ -0,0 +1,177 @@
#ifndef __ORDERED_MAP_HPP__
#define __ORDERED_MAP_HPP__
#include <iostream>
#include <list>
#include <string>
#include <unordered_map>
#include <initializer_list>
#include <iterator>
#include <stdexcept>
#include <utility>
template <typename Key, typename T>
class OrderedMap {
public:
using key_type = Key;
using mapped_type = T;
using value_type = std::pair<const Key, T>;
using list_type = std::list<value_type>;
using size_type = typename list_type::size_type;
using difference_type = typename list_type::difference_type;
using iterator = typename list_type::iterator;
using const_iterator = typename list_type::const_iterator;
private:
list_type data_;
std::unordered_map<Key, iterator> index_;
public:
// --- constructors ---
OrderedMap() = default;
OrderedMap(std::initializer_list<value_type> init) {
for (const auto& kv : init)
insert(kv);
}
OrderedMap(const OrderedMap&) = default;
OrderedMap(OrderedMap&&) noexcept = default;
OrderedMap& operator=(const OrderedMap&) = default;
OrderedMap& operator=(OrderedMap&&) noexcept = default;
// --- element access ---
T& at(const Key& key) {
auto it = index_.find(key);
if (it == index_.end())
throw std::out_of_range("OrderedMap::at: key not found");
return it->second->second;
}
const T& at(const Key& key) const {
auto it = index_.find(key);
if (it == index_.end())
throw std::out_of_range("OrderedMap::at: key not found");
return it->second->second;
}
T& operator[](const Key& key) {
auto it = index_.find(key);
if (it == index_.end()) {
data_.emplace_back(key, T{});
auto iter = std::prev(data_.end());
index_[key] = iter;
return iter->second;
}
return it->second->second;
}
// --- iterators ---
iterator begin() noexcept { return data_.begin(); }
const_iterator begin() const noexcept { return data_.begin(); }
const_iterator cbegin() const noexcept { return data_.cbegin(); }
iterator end() noexcept { return data_.end(); }
const_iterator end() const noexcept { return data_.end(); }
const_iterator cend() const noexcept { return data_.cend(); }
// --- capacity ---
bool empty() const noexcept { return data_.empty(); }
size_type size() const noexcept { return data_.size(); }
// --- modifiers ---
void clear() noexcept {
data_.clear();
index_.clear();
}
std::pair<iterator, bool> insert(const value_type& value) {
auto it = index_.find(value.first);
if (it != index_.end()) {
return {it->second, false};
}
data_.push_back(value);
auto iter = std::prev(data_.end());
index_[value.first] = iter;
return {iter, true};
}
std::pair<iterator, bool> insert(value_type&& value) {
auto it = index_.find(value.first);
if (it != index_.end()) {
return {it->second, false};
}
data_.push_back(std::move(value));
auto iter = std::prev(data_.end());
index_[iter->first] = iter;
return {iter, true};
}
void erase(const Key& key) {
auto it = index_.find(key);
if (it != index_.end()) {
data_.erase(it->second);
index_.erase(it);
}
}
iterator erase(iterator pos) {
index_.erase(pos->first);
return data_.erase(pos);
}
// --- lookup ---
size_type count(const Key& key) const {
return index_.count(key);
}
iterator find(const Key& key) {
auto it = index_.find(key);
if (it == index_.end())
return data_.end();
return it->second;
}
const_iterator find(const Key& key) const {
auto it = index_.find(key);
if (it == index_.end())
return data_.end();
return it->second;
}
bool contains(const Key& key) const {
return index_.find(key) != index_.end();
}
// --- comparison ---
bool operator==(const OrderedMap& other) const {
return data_ == other.data_;
}
bool operator!=(const OrderedMap& other) const {
return !(*this == other);
}
template <typename... Args>
std::pair<iterator, bool> emplace(Args&&... args) {
value_type value(std::forward<Args>(args)...);
auto it = index_.find(value.first);
if (it != index_.end()) {
return {it->second, false};
}
data_.push_back(std::move(value));
auto iter = std::prev(data_.end());
index_[iter->first] = iter;
return {iter, true};
}
void swap(OrderedMap& other) noexcept {
data_.swap(other.data_);
index_.swap(other.index_);
}
};
#endif // __ORDERED_MAP_HPP__
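OrderedMap pairs a std::list of key/value entries with an unordered_map index, so ModelLoader gets O(1) lookup by tensor name while iteration preserves insertion (file) order, which is what the String2TensorStorage typedef in model.h relies on. A small usage sketch follows; note, as an observation about the code as shown rather than a documented guarantee, that the defaulted copy operations copy index_ iterators that still point into the source map's list, so a copied OrderedMap should not be relied on for lookups:

#include <cstdio>
#include <string>
#include "ordered_map.hpp"

int main() {
    OrderedMap<std::string, int> m;
    m["first.weight"]  = 1;
    m["second.weight"] = 2;
    m.erase("first.weight");
    m["third.weight"] = 3;
    for (const auto& [name, v] : m) {  // insertion order: second, third
        printf("%s = %d\n", name.c_str(), v);
    }
    return m.contains("second.weight") ? 0 : 1;
}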

pmid.hpp: 376 changed lines
@@ -21,7 +21,7 @@ public:
         blocks["layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(in_dim));
     }
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
         // x: [N, channels, h, w]
         auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
@@ -29,54 +29,19 @@ public:
         auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layernorm"]);
         struct ggml_tensor* r = x;
-        // x = ggml_nn_layer_norm(ctx, x, ln_w, ln_b);
+        // x = ggml_ext_layer_norm(ctx, x, ln_w, ln_b);
         x = layer_norm->forward(ctx, x);
         // x = ggml_add(ctx, ggml_mul_mat(ctx, fc1_w, x), fc1_b);
         x = fc1->forward(ctx, x);
-        x = ggml_gelu_inplace(ctx, x);
+        x = ggml_gelu_inplace(ctx->ggml_ctx, x);
         x = fc2->forward(ctx, x);
         // x = ggml_add(ctx, ggml_mul_mat(ctx, fc2_w, x), fc2_b);
         if (use_residue)
-            x = ggml_add(ctx, x, r);
+            x = ggml_add(ctx->ggml_ctx, x, r);
         return x;
     }
 };
-/*
-class QFormerPerceiver(nn.Module):
-    def __init__(self, id_embeddings_dim, cross_attention_dim, num_tokens, embedding_dim=1024, use_residual=True, ratio=4):
-        super().__init__()
-        self.num_tokens = num_tokens
-        self.cross_attention_dim = cross_attention_dim
-        self.use_residual = use_residual
-        print(cross_attention_dim*num_tokens)
-        self.token_proj = nn.Sequential(
-            nn.Linear(id_embeddings_dim, id_embeddings_dim*ratio),
-            nn.GELU(),
-            nn.Linear(id_embeddings_dim*ratio, cross_attention_dim*num_tokens),
-        )
-        self.token_norm = nn.LayerNorm(cross_attention_dim)
-        self.perceiver_resampler = FacePerceiverResampler(
-            dim=cross_attention_dim,
-            depth=4,
-            dim_head=128,
-            heads=cross_attention_dim // 128,
-            embedding_dim=embedding_dim,
-            output_dim=cross_attention_dim,
-            ff_mult=4,
-        )
-    def forward(self, x, last_hidden_state):
-        x = self.token_proj(x)
-        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
-        x = self.token_norm(x) # cls token
-        out = self.perceiver_resampler(x, last_hidden_state) # retrieve from patch tokens
-        if self.use_residual: # TODO: if use_residual is not true
-            out = x + 1.0 * out
-        return out
-*/
 struct PMFeedForward : public GGMLBlock {
     // network hparams
     int dim;
@@ -89,7 +54,7 @@ public:
         blocks["1"] = std::shared_ptr<GGMLBlock>(new Mlp(dim, inner_dim, dim, false));
     }
-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* x) {
         auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["0"]);
         auto ff   = std::dynamic_pointer_cast<Mlp>(blocks["1"]);
@@ -122,17 +87,8 @@ public:
         int64_t ne[4];
         for (int i = 0; i < 4; ++i)
             ne[i] = x->ne[i];
-        // print_ggml_tensor(x, true, "PerceiverAttention reshape x 0: ");
-        // printf("heads = %d \n", heads);
-        // x = ggml_view_4d(ctx, x, x->ne[0], x->ne[1], heads, x->ne[2]/heads,
-        //                  x->nb[1], x->nb[2], x->nb[3], 0);
         x = ggml_reshape_4d(ctx, x, x->ne[0] / heads, heads, x->ne[1], x->ne[2]);
-        // x = ggml_view_4d(ctx, x, x->ne[0]/heads, heads, x->ne[1], x->ne[2],
-        //                  x->nb[1], x->nb[2], x->nb[3], 0);
-        // x = ggml_cont(ctx, x);
         x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));
-        // print_ggml_tensor(x, true, "PerceiverAttention reshape x 1: ");
-        // x = ggml_reshape_4d(ctx, x, ne[0], heads, ne[1], ne[2]/heads);
         return x;
     }
@@ -144,7 +100,7 @@ public:
                 ggml_cont(ctx, tli)};
     }
-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* x,
                                 struct ggml_tensor* latents) {
         // x (torch.Tensor): image features
@@ -162,33 +118,33 @@ public:
         auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
         auto q    = to_q->forward(ctx, latents);
-        auto kv_input = ggml_concat(ctx, x, latents, 1);
+        auto kv_input = ggml_concat(ctx->ggml_ctx, x, latents, 1);
         auto to_kv    = std::dynamic_pointer_cast<Linear>(blocks["to_kv"]);
         auto kv       = to_kv->forward(ctx, kv_input);
-        auto k = ggml_view_4d(ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, 0);
-        auto v = ggml_view_4d(ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, kv->nb[0] * (kv->ne[0] / 2));
-        k = ggml_cont(ctx, k);
-        v = ggml_cont(ctx, v);
-        q = reshape_tensor(ctx, q, heads);
-        k = reshape_tensor(ctx, k, heads);
-        v = reshape_tensor(ctx, v, heads);
+        auto k = ggml_view_4d(ctx->ggml_ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, 0);
+        auto v = ggml_view_4d(ctx->ggml_ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, kv->nb[0] * (kv->ne[0] / 2));
+        k = ggml_cont(ctx->ggml_ctx, k);
+        v = ggml_cont(ctx->ggml_ctx, v);
+        q = reshape_tensor(ctx->ggml_ctx, q, heads);
+        k = reshape_tensor(ctx->ggml_ctx, k, heads);
+        v = reshape_tensor(ctx->ggml_ctx, v, heads);
         scale = 1.f / sqrt(sqrt((float)dim_head));
-        k     = ggml_scale_inplace(ctx, k, scale);
-        q     = ggml_scale_inplace(ctx, q, scale);
+        k     = ggml_scale_inplace(ctx->ggml_ctx, k, scale);
+        q     = ggml_scale_inplace(ctx->ggml_ctx, q, scale);
         // auto weight = ggml_mul_mat(ctx, q, k);
-        auto weight = ggml_mul_mat(ctx, k, q);  // NOTE order of mul is opposite to pytorch
+        auto weight = ggml_mul_mat(ctx->ggml_ctx, k, q);  // NOTE order of mul is opposite to pytorch
         // GGML's softmax() is equivalent to pytorch's softmax(x, dim=-1)
         // in this case, dimension along which Softmax will be computed is the last dim
         // in torch and the first dim in GGML, consistent with the convention that pytorch's
         // last dimension (varying most rapidly) corresponds to GGML's first (varying most rapidly).
         // weight = ggml_soft_max(ctx, weight);
-        weight = ggml_soft_max_inplace(ctx, weight);
-        v      = ggml_cont(ctx, ggml_transpose(ctx, v));
+        weight = ggml_soft_max_inplace(ctx->ggml_ctx, weight);
+        v      = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, v));
         // auto out = ggml_mul_mat(ctx, weight, v);
-        auto out = ggml_mul_mat(ctx, v, weight);  // NOTE order of mul is opposite to pytorch
-        out      = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3));
-        out      = ggml_reshape_3d(ctx, out, ne[0], ne[1], ggml_nelements(out) / (ne[0] * ne[1]));
+        auto out = ggml_mul_mat(ctx->ggml_ctx, v, weight);  // NOTE order of mul is opposite to pytorch
+        out      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3));
+        out      = ggml_reshape_3d(ctx->ggml_ctx, out, ne[0], ne[1], ggml_nelements(out) / (ne[0] * ne[1]));
         auto to_out = std::dynamic_pointer_cast<Linear>(blocks["to_out"]);
         out         = to_out->forward(ctx, out);
         return out;
@@ -220,7 +176,7 @@ public:
         }
     }
-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* latents,
                                 struct ggml_tensor* x) {
         // x: [N, channels, h, w]
@@ -235,9 +191,9 @@ public:
             name    = "layers." + std::to_string(i) + ".1";
             auto ff = std::dynamic_pointer_cast<PMFeedForward>(blocks[name]);
             auto t  = attn->forward(ctx, x, latents);
-            latents = ggml_add(ctx, t, latents);
+            latents = ggml_add(ctx->ggml_ctx, t, latents);
             t       = ff->forward(ctx, latents);
-            latents = ggml_add(ctx, t, latents);
+            latents = ggml_add(ctx->ggml_ctx, t, latents);
         }
         latents = proj_out->forward(ctx, latents);
         latents = norm_out->forward(ctx, latents);
@@ -269,18 +225,7 @@ public:
                                                            4));
     }
-    /*
-    def forward(self, x, last_hidden_state):
-        x = self.token_proj(x)
-        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
-        x = self.token_norm(x) # cls token
-        out = self.perceiver_resampler(x, last_hidden_state) # retrieve from patch tokens
-        if self.use_residual: # TODO: if use_residual is not true
-            out = x + 1.0 * out
-        return out
-    */
-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* x,
                                 struct ggml_tensor* last_hidden_state) {
         // x: [N, channels, h, w]
@@ -290,122 +235,15 @@ public:
         x           = token_proj->forward(ctx, x);
         int64_t nel = ggml_nelements(x);
-        x = ggml_reshape_3d(ctx, x, cross_attention_dim, num_tokens, nel / (cross_attention_dim * num_tokens));
+        x = ggml_reshape_3d(ctx->ggml_ctx, x, cross_attention_dim, num_tokens, nel / (cross_attention_dim * num_tokens));
         x = token_norm->forward(ctx, x);
         struct ggml_tensor* out = perceiver_resampler->forward(ctx, x, last_hidden_state);
         if (use_residul)
-            out = ggml_add(ctx, x, out);
+            out = ggml_add(ctx->ggml_ctx, x, out);
         return out;
     }
 };
-/*
-class FacePerceiverResampler(torch.nn.Module):
-    def __init__(
-        self,
-        *,
-        dim=768,
-        depth=4,
-        dim_head=64,
-        heads=16,
-        embedding_dim=1280,
-        output_dim=768,
-        ff_mult=4,
-    ):
-        super().__init__()
-        self.proj_in = torch.nn.Linear(embedding_dim, dim)
-        self.proj_out = torch.nn.Linear(dim, output_dim)
-        self.norm_out = torch.nn.LayerNorm(output_dim)
-        self.layers = torch.nn.ModuleList([])
-        for _ in range(depth):
-            self.layers.append(
-                torch.nn.ModuleList(
-                    [
-                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
-                        FeedForward(dim=dim, mult=ff_mult),
-                    ]
-                )
-            )
-    def forward(self, latents, x):
-        x = self.proj_in(x)
-        for attn, ff in self.layers:
-            latents = attn(x, latents) + latents
-            latents = ff(latents) + latents
-        latents = self.proj_out(latents)
-        return self.norm_out(latents)
-*/
-/*
-def FeedForward(dim, mult=4):
-    inner_dim = int(dim * mult)
-    return nn.Sequential(
-        nn.LayerNorm(dim),
-        nn.Linear(dim, inner_dim, bias=False),
-        nn.GELU(),
-        nn.Linear(inner_dim, dim, bias=False),
-    )
-def reshape_tensor(x, heads):
-    bs, length, width = x.shape
-    # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
-    x = x.view(bs, length, heads, -1)
-    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
-    x = x.transpose(1, 2)
-    # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
-    x = x.reshape(bs, heads, length, -1)
-    return x
-class PerceiverAttention(nn.Module):
-    def __init__(self, *, dim, dim_head=64, heads=8):
-        super().__init__()
-        self.scale = dim_head**-0.5
-        self.dim_head = dim_head
-        self.heads = heads
-        inner_dim = dim_head * heads
-        self.norm1 = nn.LayerNorm(dim)
-        self.norm2 = nn.LayerNorm(dim)
-        self.to_q = nn.Linear(dim, inner_dim, bias=False)
-        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
-        self.to_out = nn.Linear(inner_dim, dim, bias=False)
-    def forward(self, x, latents):
-        """
-        Args:
-            x (torch.Tensor): image features
-                shape (b, n1, D)
-            latent (torch.Tensor): latent features
-                shape (b, n2, D)
-        """
-        x = self.norm1(x)
-        latents = self.norm2(latents)
-        b, l, _ = latents.shape
-        q = self.to_q(latents)
-        kv_input = torch.cat((x, latents), dim=-2)
-        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
-        q = reshape_tensor(q, self.heads)
-        k = reshape_tensor(k, self.heads)
-        v = reshape_tensor(v, self.heads)
-        # attention
-        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
-        weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
-        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
-        out = weight @ v
-        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
-        return self.to_out(out)
-*/
 struct FuseModule : public GGMLBlock {
     // network hparams
     int embed_dim;
@@ -418,42 +256,24 @@ public:
         blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(embed_dim));
     }
-    struct ggml_tensor* fuse_fn(struct ggml_context* ctx,
+    struct ggml_tensor* fuse_fn(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* prompt_embeds,
                                 struct ggml_tensor* id_embeds) {
         auto mlp1       = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp1"]);
         auto mlp2       = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp2"]);
         auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm"]);
-        // print_ggml_tensor(id_embeds, true, "Fuseblock id_embeds: ");
-        // print_ggml_tensor(prompt_embeds, true, "Fuseblock prompt_embeds: ");
-        // auto prompt_embeds0 = ggml_cont(ctx, ggml_permute(ctx, prompt_embeds, 2, 0, 1, 3));
-        // auto id_embeds0 = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
-        // print_ggml_tensor(id_embeds0, true, "Fuseblock id_embeds0: ");
-        // print_ggml_tensor(prompt_embeds0, true, "Fuseblock prompt_embeds0: ");
-        // concat is along dim 2
-        // auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0, 2);
-        auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds, id_embeds, 0);
-        // print_ggml_tensor(stacked_id_embeds, true, "Fuseblock stacked_id_embeds 0: ");
-        // stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 1, 2, 0, 3));
-        // print_ggml_tensor(stacked_id_embeds, true, "Fuseblock stacked_id_embeds 1: ");
-        // stacked_id_embeds = mlp1.forward(ctx, stacked_id_embeds);
-        // stacked_id_embeds = ggml_add(ctx, stacked_id_embeds, prompt_embeds);
-        // stacked_id_embeds = mlp2.forward(ctx, stacked_id_embeds);
-        // stacked_id_embeds = ggml_nn_layer_norm(ctx, stacked_id_embeds, ln_w, ln_b);
+        auto stacked_id_embeds = ggml_concat(ctx->ggml_ctx, prompt_embeds, id_embeds, 0);
         stacked_id_embeds = mlp1->forward(ctx, stacked_id_embeds);
-        stacked_id_embeds = ggml_add(ctx, stacked_id_embeds, prompt_embeds);
+        stacked_id_embeds = ggml_add(ctx->ggml_ctx, stacked_id_embeds, prompt_embeds);
         stacked_id_embeds = mlp2->forward(ctx, stacked_id_embeds);
         stacked_id_embeds = layer_norm->forward(ctx, stacked_id_embeds);
-        // print_ggml_tensor(stacked_id_embeds, true, "Fuseblock stacked_id_embeds 1: ");
         return stacked_id_embeds;
     }
-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* prompt_embeds,
                                 struct ggml_tensor* id_embeds,
                                 struct ggml_tensor* class_tokens_mask,
@@ -464,38 +284,28 @@ public:
         struct ggml_tensor* valid_id_embeds = id_embeds;
         // # slice out the image token embeddings
-        // print_ggml_tensor(class_tokens_mask_pos, false);
         ggml_set_name(class_tokens_mask_pos, "class_tokens_mask_pos");
         ggml_set_name(prompt_embeds, "prompt_embeds");
-        // print_ggml_tensor(valid_id_embeds, true, "valid_id_embeds");
-        // print_ggml_tensor(class_tokens_mask_pos, true, "class_tokens_mask_pos");
-        struct ggml_tensor* image_token_embeds = ggml_get_rows(ctx, prompt_embeds, class_tokens_mask_pos);
+        struct ggml_tensor* image_token_embeds = ggml_get_rows(ctx->ggml_ctx, prompt_embeds, class_tokens_mask_pos);
         ggml_set_name(image_token_embeds, "image_token_embeds");
-        valid_id_embeds = ggml_reshape_2d(ctx, valid_id_embeds, valid_id_embeds->ne[0],
+        valid_id_embeds = ggml_reshape_2d(ctx->ggml_ctx, valid_id_embeds, valid_id_embeds->ne[0],
                                           ggml_nelements(valid_id_embeds) / valid_id_embeds->ne[0]);
         struct ggml_tensor* stacked_id_embeds = fuse_fn(ctx, image_token_embeds, valid_id_embeds);
-        // stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
-        // print_ggml_tensor(stacked_id_embeds, true, "AA stacked_id_embeds");
-        // print_ggml_tensor(left, true, "AA left");
-        // print_ggml_tensor(right, true, "AA right");
         if (left && right) {
-            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 1);
-            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 1);
+            stacked_id_embeds = ggml_concat(ctx->ggml_ctx, left, stacked_id_embeds, 1);
+            stacked_id_embeds = ggml_concat(ctx->ggml_ctx, stacked_id_embeds, right, 1);
         } else if (left) {
-            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 1);
+            stacked_id_embeds = ggml_concat(ctx->ggml_ctx, left, stacked_id_embeds, 1);
         } else if (right) {
-            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 1);
+            stacked_id_embeds = ggml_concat(ctx->ggml_ctx, stacked_id_embeds, right, 1);
         }
-        // print_ggml_tensor(stacked_id_embeds, true, "BB stacked_id_embeds");
-        // stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
-        // print_ggml_tensor(stacked_id_embeds, true, "CC stacked_id_embeds");
-        class_tokens_mask = ggml_cont(ctx, ggml_transpose(ctx, class_tokens_mask));
-        class_tokens_mask = ggml_repeat(ctx, class_tokens_mask, prompt_embeds);
-        prompt_embeds     = ggml_mul(ctx, prompt_embeds, class_tokens_mask);
-        struct ggml_tensor* updated_prompt_embeds = ggml_add(ctx, prompt_embeds, stacked_id_embeds);
+        class_tokens_mask = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, class_tokens_mask));
+        class_tokens_mask = ggml_repeat(ctx->ggml_ctx, class_tokens_mask, prompt_embeds);
+        prompt_embeds     = ggml_mul(ctx->ggml_ctx, prompt_embeds, class_tokens_mask);
+        struct ggml_tensor* updated_prompt_embeds = ggml_add(ctx->ggml_ctx, prompt_embeds, stacked_id_embeds);
         ggml_set_name(updated_prompt_embeds, "updated_prompt_embeds");
-        // print_ggml_tensor(updated_prompt_embeds, true, "updated_prompt_embeds: ");
         return updated_prompt_embeds;
     }
 };
@@ -507,7 +317,7 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
         blocks["fuse_module"] = std::shared_ptr<GGMLBlock>(new FuseModule(2048));
     }
-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* id_pixel_values,
                                 struct ggml_tensor* prompt_embeds,
                                 struct ggml_tensor* class_tokens_mask,
@@ -524,11 +334,11 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
         struct ggml_tensor* id_embeds   = visual_projection->forward(ctx, shared_id_embeds);    // [N, proj_dim(768)]
         struct ggml_tensor* id_embeds_2 = visual_projection_2->forward(ctx, shared_id_embeds);  // [N, 1280]
-        id_embeds   = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
-        id_embeds_2 = ggml_cont(ctx, ggml_permute(ctx, id_embeds_2, 2, 0, 1, 3));
-        id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2, 2);  // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
-        id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 1, 2, 0, 3));
+        id_embeds   = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 2, 0, 1, 3));
+        id_embeds_2 = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds_2, 2, 0, 1, 3));
+        id_embeds = ggml_concat(ctx->ggml_ctx, id_embeds, id_embeds_2, 2);  // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
+        id_embeds = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 1, 2, 0, 3));
         struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
                                                                          prompt_embeds,
@@ -550,35 +360,12 @@ struct PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock : public CLIPVisionMo
           num_tokens(2) {
         blocks["visual_projection_2"] = std::shared_ptr<GGMLBlock>(new Linear(1024, 1280, false));
         blocks["fuse_module"]         = std::shared_ptr<GGMLBlock>(new FuseModule(2048));
-        /*
-        cross_attention_dim = 2048
-        # projection
-        self.num_tokens = 2
-        self.cross_attention_dim = cross_attention_dim
-        self.qformer_perceiver = QFormerPerceiver(
-            id_embeddings_dim,
-            cross_attention_dim,
-            self.num_tokens,
-        )*/
         blocks["qformer_perceiver"] = std::shared_ptr<GGMLBlock>(new QFormerPerceiver(id_embeddings_dim,
                                                                                       cross_attention_dim,
                                                                                       num_tokens));
     }
-    /*
-    def forward(self, id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds):
-        b, num_inputs, c, h, w = id_pixel_values.shape
-        id_pixel_values = id_pixel_values.view(b * num_inputs, c, h, w)
-        last_hidden_state = self.vision_model(id_pixel_values)[0]
-        id_embeds = id_embeds.view(b * num_inputs, -1)
-        id_embeds = self.qformer_perceiver(id_embeds, last_hidden_state)
-        id_embeds = id_embeds.view(b, num_inputs, self.num_tokens, -1)
-        updated_prompt_embeds = self.fuse_module(prompt_embeds, id_embeds, class_tokens_mask)
-    */
-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* id_pixel_values,
                                 struct ggml_tensor* prompt_embeds,
                                 struct ggml_tensor* class_tokens_mask,
@@ -623,15 +410,21 @@ public:
     std::vector<float> zeros_right;
 public:
-    PhotoMakerIDEncoder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix, SDVersion version = VERSION_SDXL, PMVersion pm_v = PM_VERSION_1, float sty = 20.f)
-        : GGMLRunner(backend),
+    PhotoMakerIDEncoder(ggml_backend_t backend,
+                        bool offload_params_to_cpu,
+                        const String2TensorStorage& tensor_storage_map,
+                        const std::string prefix,
+                        SDVersion version = VERSION_SDXL,
+                        PMVersion pm_v    = PM_VERSION_1,
+                        float sty         = 20.f)
+        : GGMLRunner(backend, offload_params_to_cpu),
           version(version),
           pm_version(pm_v),
           style_strength(sty) {
         if (pm_version == PM_VERSION_1) {
-            id_encoder.init(params_ctx, tensor_types, prefix);
+            id_encoder.init(params_ctx, tensor_storage_map, prefix);
         } else if (pm_version == PM_VERSION_2) {
-            id_encoder2.init(params_ctx, tensor_types, prefix);
+            id_encoder2.init(params_ctx, tensor_storage_map, prefix);
         }
     }
@@ -663,7 +456,7 @@ public:
         zeros_right.clear();
         zeros_right_16.clear();
-        ggml_context* ctx0 = compute_ctx;
+        auto runner_ctx = get_context();
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
@@ -671,14 +464,14 @@ public:
         int64_t seq_length = prompt_embeds->ne[1];
         ggml_type type     = GGML_TYPE_F32;
-        struct ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(ctx0, type, class_tokens_mask.size());
+        struct ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(runner_ctx.ggml_ctx, type, class_tokens_mask.size());
         struct ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values);
         struct ggml_tensor* prompt_embeds_d   = to_backend(prompt_embeds);
         struct ggml_tensor* id_embeds_d       = to_backend(id_embeds);
-        struct ggml_tensor* left  = NULL;
-        struct ggml_tensor* right = NULL;
+        struct ggml_tensor* left  = nullptr;
+        struct ggml_tensor* right = nullptr;
         for (int i = 0; i < class_tokens_mask.size(); i++) {
             if (class_tokens_mask[i]) {
                 // printf(" 1,");
@@ -693,16 +486,16 @@ public:
         }
         // printf("\n");
         if (ctmpos[0] > 0) {
-            // left = ggml_new_tensor_3d(ctx0, type, hidden_size, 1, ctmpos[0]);
-            left = ggml_new_tensor_3d(ctx0, type, hidden_size, ctmpos[0], 1);
+            // left = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type, hidden_size, 1, ctmpos[0]);
+            left = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type, hidden_size, ctmpos[0], 1);
         }
         if (ctmpos[ctmpos.size() - 1] < seq_length - 1) {
-            // right = ggml_new_tensor_3d(ctx0, type,
+            // right = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type,
             //                            hidden_size, 1, seq_length - ctmpos[ctmpos.size() - 1] - 1);
-            right = ggml_new_tensor_3d(ctx0, type,
+            right = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type,
                                        hidden_size, seq_length - ctmpos[ctmpos.size() - 1] - 1, 1);
         }
-        struct ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ctmpos.size());
+        struct ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(runner_ctx.ggml_ctx, GGML_TYPE_I32, ctmpos.size());
         {
             if (type == GGML_TYPE_F16)
@@ -733,16 +526,16 @@ public:
                 }
             }
         }
-        struct ggml_tensor* updated_prompt_embeds = NULL;
+        struct ggml_tensor* updated_prompt_embeds = nullptr;
         if (pm_version == PM_VERSION_1)
-            updated_prompt_embeds = id_encoder.forward(ctx0,
+            updated_prompt_embeds = id_encoder.forward(&runner_ctx,
                                                        id_pixel_values_d,
                                                        prompt_embeds_d,
                                                        class_tokens_mask_d,
                                                        class_tokens_mask_pos,
                                                        left, right);
        else if (pm_version == PM_VERSION_2)
-            updated_prompt_embeds = id_encoder2.forward(ctx0,
+            updated_prompt_embeds = id_encoder2.forward(&runner_ctx,
                                                         id_pixel_values_d,
                                                         prompt_embeds_d,
                                                         class_tokens_mask_d,
@@ -755,7 +548,7 @@ public:
         return gf;
     }
-    void compute(const int n_threads,
+    bool compute(const int n_threads,
                  struct ggml_tensor* id_pixel_values,
                  struct ggml_tensor* prompt_embeds,
                  struct ggml_tensor* id_embeds,
@@ -768,7 +561,7 @@ public:
         };
         // GGMLRunner::compute(get_graph, n_threads, updated_prompt_embeds);
-        GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx);
     }
 };
@@ -780,11 +573,12 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
     bool applied = false;
     PhotoMakerIDEmbed(ggml_backend_t backend,
+                      bool offload_params_to_cpu,
                       ModelLoader* ml,
                       const std::string& file_path = "",
                       const std::string& prefix    = "")
-        : file_path(file_path), GGMLRunner(backend), model_loader(ml) {
-        if (!model_loader->init_from_file(file_path, prefix)) {
+        : file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) {
+        if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) {
            load_failed = true;
        }
    }
@@ -793,7 +587,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
         return "id_embeds";
     }
-    bool load_from_file(bool filter_tensor = false) {
+    bool load_from_file(bool filter_tensor, int n_threads) {
         LOG_INFO("loading PhotoMaker ID Embeds from '%s'", file_path.c_str());
         if (load_failed) {
@@ -801,7 +595,8 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
             return false;
         }
         bool dry_run = true;
+        std::mutex tensor_mutex;
         auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
             const std::string& name = tensor_storage.name;
@@ -810,6 +605,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
                 return true;
             }
             if (dry_run) {
+                std::lock_guard<std::mutex> lock(tensor_mutex);
                 struct ggml_tensor* real = ggml_new_tensor(params_ctx,
                                                            tensor_storage.type,
                                                            tensor_storage.n_dims,
@@ -823,11 +619,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
             return true;
         };
-        model_loader->load_tensors(on_new_tensor_cb, backend);
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
         alloc_params_buffer();
         dry_run = false;
-        model_loader->load_tensors(on_new_tensor_cb, backend);
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
         LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
         return true;
@@ -838,7 +634,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
         pos = tensors.find("pmid.id_embeds");
         if (pos != tensors.end())
             return pos->second;
-        return NULL;
+        return nullptr;
     }
 };
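The load_from_file() changes above follow the loader's usual two-pass scheme, now multi-threaded: a dry run only registers tensors (guarded by tensor_mutex, since load_tensors() may invoke the callback from several threads), the parameter buffer is allocated, and a second pass copies the data. A standalone sketch of that pattern, with illustrative names throughout:

// Sketch only; stands in for PhotoMakerIDEmbed::load_from_file above.
#include <cstdio>
#include <map>
#include <mutex>
#include <string>
#include <utility>
#include <vector>

int main() {
    std::map<std::string, size_t> registry;  // stands in for params_ctx
    std::mutex tensor_mutex;
    bool dry_run = true;

    auto on_new_tensor = [&](const std::string& name, size_t nbytes) {
        if (dry_run) {
            std::lock_guard<std::mutex> lock(tensor_mutex);
            registry[name] = nbytes;  // pass 1: record metadata only
        } else {
            printf("copy %zu bytes into '%s'\n", registry[name], name.c_str());
        }
    };

    std::vector<std::pair<std::string, size_t>> file = {{"pmid.id_embeds", 4096}};
    for (auto& [n, s] : file) on_new_tensor(n, s);  // pass 1 (dry run)
    // ... size and allocate one buffer for everything in `registry` ...
    dry_run = false;
    for (auto& [n, s] : file) on_new_tensor(n, s);  // pass 2 (real copy)
}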

preprocessing.hpp

@@ -6,8 +6,8 @@
 void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml_tensor* kernel, int padding) {
     struct ggml_init_params params;
-    params.mem_size   = 20 * 1024 * 1024;  // 10
-    params.mem_buffer = NULL;
+    params.mem_size   = 80 * input->ne[0] * input->ne[1];  // 20M for 512x512
+    params.mem_buffer = nullptr;
     params.no_alloc   = false;
     struct ggml_context* ctx0 = ggml_init(params);
     struct ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1);
@@ -28,7 +28,7 @@ void gaussian_kernel(struct ggml_tensor* kernel) {
         for (int x = 0; x < kernel->ne[1]; x++) {
             float gy = -ks_mid + x;
             float k_ = expf(-((gx * gx + gy * gy) / (2.0f * powf(sigma, 2.0f)))) * normal;
-            ggml_tensor_set_f32(kernel, k_, x, y);
+            ggml_ext_tensor_set_f32(kernel, k_, x, y);
         }
     }
 }
@@ -36,11 +36,11 @@ void gaussian_kernel(struct ggml_tensor* kernel) {
 void grayscale(struct ggml_tensor* rgb_img, struct ggml_tensor* grayscale) {
     for (int iy = 0; iy < rgb_img->ne[1]; iy++) {
         for (int ix = 0; ix < rgb_img->ne[0]; ix++) {
-            float r    = ggml_tensor_get_f32(rgb_img, ix, iy);
-            float g    = ggml_tensor_get_f32(rgb_img, ix, iy, 1);
-            float b    = ggml_tensor_get_f32(rgb_img, ix, iy, 2);
+            float r    = ggml_ext_tensor_get_f32(rgb_img, ix, iy);
+            float g    = ggml_ext_tensor_get_f32(rgb_img, ix, iy, 1);
+            float b    = ggml_ext_tensor_get_f32(rgb_img, ix, iy, 2);
             float gray = 0.2989f * r + 0.5870f * g + 0.1140f * b;
-            ggml_tensor_set_f32(grayscale, gray, ix, iy);
+            ggml_ext_tensor_set_f32(grayscale, gray, ix, iy);
         }
     }
 }
@@ -81,37 +81,37 @@ void normalize_tensor(struct ggml_tensor* g) {
 void non_max_supression(struct ggml_tensor* result, struct ggml_tensor* G, struct ggml_tensor* D) {
     for (int iy = 1; iy < result->ne[1] - 1; iy++) {
         for (int ix = 1; ix < result->ne[0] - 1; ix++) {
-            float angle = ggml_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_;
+            float angle = ggml_ext_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_;
             angle       = angle < 0.0f ? angle += 180.0f : angle;
             float q     = 1.0f;
             float r     = 1.0f;
             // angle 0
             if ((0 >= angle && angle < 22.5f) || (157.5f >= angle && angle <= 180)) {
-                q = ggml_tensor_get_f32(G, ix, iy + 1);
-                r = ggml_tensor_get_f32(G, ix, iy - 1);
+                q = ggml_ext_tensor_get_f32(G, ix, iy + 1);
+                r = ggml_ext_tensor_get_f32(G, ix, iy - 1);
             }
             // angle 45
             else if (22.5f >= angle && angle < 67.5f) {
-                q = ggml_tensor_get_f32(G, ix + 1, iy - 1);
-                r = ggml_tensor_get_f32(G, ix - 1, iy + 1);
+                q = ggml_ext_tensor_get_f32(G, ix + 1, iy - 1);
+                r = ggml_ext_tensor_get_f32(G, ix - 1, iy + 1);
             }
             // angle 90
             else if (67.5f >= angle && angle < 112.5) {
-                q = ggml_tensor_get_f32(G, ix + 1, iy);
-                r = ggml_tensor_get_f32(G, ix - 1, iy);
+                q = ggml_ext_tensor_get_f32(G, ix + 1, iy);
+                r = ggml_ext_tensor_get_f32(G, ix - 1, iy);
             }
             // angle 135
             else if (112.5 >= angle && angle < 157.5f) {
-                q = ggml_tensor_get_f32(G, ix - 1, iy - 1);
-                r = ggml_tensor_get_f32(G, ix + 1, iy + 1);
+                q = ggml_ext_tensor_get_f32(G, ix - 1, iy - 1);
+                r = ggml_ext_tensor_get_f32(G, ix + 1, iy + 1);
             }
-            float cur = ggml_tensor_get_f32(G, ix, iy);
+            float cur = ggml_ext_tensor_get_f32(G, ix, iy);
             if ((cur >= q) && (cur >= r)) {
-                ggml_tensor_set_f32(result, cur, ix, iy);
+                ggml_ext_tensor_set_f32(result, cur, ix, iy);
             } else {
-                ggml_tensor_set_f32(result, 0.0f, ix, iy);
+                ggml_ext_tensor_set_f32(result, 0.0f, ix, iy);
             }
         }
     }
@@ -138,9 +138,9 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo
     for (int iy = 0; iy < img->ne[1]; iy++) {
         for (int ix = 0; ix < img->ne[0]; ix++) {
             if (ix >= 3 && ix <= img->ne[0] - 3 && iy >= 3 && iy <= img->ne[1] - 3) {
-                ggml_tensor_set_f32(img, ggml_tensor_get_f32(img, ix, iy), ix, iy);
+                ggml_ext_tensor_set_f32(img, ggml_ext_tensor_get_f32(img, ix, iy), ix, iy);
             } else {
-                ggml_tensor_set_f32(img, 0.0f, ix, iy);
+                ggml_ext_tensor_set_f32(img, 0.0f, ix, iy);
             }
         }
     }
@@ -148,30 +148,30 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo
     // hysteresis
     for (int iy = 1; iy < img->ne[1] - 1; iy++) {
         for (int ix = 1; ix < img->ne[0] - 1; ix++) {
-            float imd_v = ggml_tensor_get_f32(img, ix, iy);
+            float imd_v = ggml_ext_tensor_get_f32(img, ix, iy);
             if (imd_v == weak) {
-                if (ggml_tensor_get_f32(img, ix + 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix + 1, iy) == strong ||
-                    ggml_tensor_get_f32(img, ix, iy - 1) == strong || ggml_tensor_get_f32(img, ix, iy + 1) == strong ||
-                    ggml_tensor_get_f32(img, ix - 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix - 1, iy) == strong) {
-                    ggml_tensor_set_f32(img, strong, ix, iy);
+                if (ggml_ext_tensor_get_f32(img, ix + 1, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix + 1, iy) == strong ||
+                    ggml_ext_tensor_get_f32(img, ix, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix, iy + 1) == strong ||
+                    ggml_ext_tensor_get_f32(img, ix - 1, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix - 1, iy) == strong) {
+                    ggml_ext_tensor_set_f32(img, strong, ix, iy);
                 } else {
-                    ggml_tensor_set_f32(img, 0.0f, ix, iy);
+                    ggml_ext_tensor_set_f32(img, 0.0f, ix, iy);
                 }
             }
         }
     }
 }
-uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
+bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
     struct ggml_init_params params;
-    params.mem_size   = static_cast<size_t>(10 * 1024 * 1024);  // 10
-    params.mem_buffer = NULL;
+    params.mem_size   = static_cast<size_t>(40 * img.width * img.height);  // 10MB for 512x512
+    params.mem_buffer = nullptr;
     params.no_alloc   = false;
     struct ggml_context* work_ctx = ggml_init(params);
     if (!work_ctx) {
         LOG_ERROR("ggml_init() failed");
-        return NULL;
+        return false;
     }
     float kX[9] = {
@@ -192,13 +192,13 @@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh
     struct ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
     memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky));
     gaussian_kernel(gkernel);
-    struct ggml_tensor* image      = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-    struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
+    struct ggml_tensor* image      = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 3, 1);
+    struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 1, 1);
     struct ggml_tensor* iX    = ggml_dup_tensor(work_ctx, image_gray);
     struct ggml_tensor* iY    = ggml_dup_tensor(work_ctx, image_gray);
     struct ggml_tensor* G     = ggml_dup_tensor(work_ctx, image_gray);
     struct ggml_tensor* tetha = ggml_dup_tensor(work_ctx, image_gray);
-    sd_image_to_tensor(img, image);
+    sd_image_to_ggml_tensor(img, image);
     grayscale(image, image_gray);
     convolve(image_gray, image_gray, gkernel, 2);
     convolve(image_gray, iX, sf_kx, 1);
@@ -209,19 +209,18 @@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh
     non_max_supression(image_gray, G, tetha);
     threshold_hystersis(image_gray, high_threshold, low_threshold, weak, strong);
     // to RGB channels
-    for (int iy = 0; iy < height; iy++) {
-        for (int ix = 0; ix < width; ix++) {
-            float gray = ggml_tensor_get_f32(image_gray, ix, iy);
+    for (int iy = 0; iy < img.height; iy++) {
+        for (int ix = 0; ix < img.width; ix++) {
+            float gray = ggml_ext_tensor_get_f32(image_gray, ix, iy);
             gray       = inverse ? 1.0f - gray : gray;
-            ggml_tensor_set_f32(image, gray, ix, iy);
-            ggml_tensor_set_f32(image, gray, ix, iy, 1);
-            ggml_tensor_set_f32(image, gray, ix, iy, 2);
+            ggml_ext_tensor_set_f32(image, gray, ix, iy);
+            ggml_ext_tensor_set_f32(image, gray, ix, iy, 1);
+            ggml_ext_tensor_set_f32(image, gray, ix, iy, 2);
         }
     }
-    free(img);
-    uint8_t* output = sd_tensor_to_image(image);
+    ggml_tensor_to_sd_image(image, img.data);
     ggml_free(work_ctx);
-    return output;
+    return true;
 }
 #endif  // __PREPROCESSING_HPP__
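A hedged usage sketch for the new in-place preprocess_canny() API, assuming the sd_image_t layout from stable-diffusion.h (width/height/channel/data). The caller now owns the buffer and the edge map is written back into img.data rather than returned; the threshold values below are illustrative:

#include <cstdint>
#include "preprocessing.hpp"  // brings in sd_image_t and preprocess_canny

bool run_canny(uint8_t* rgb, uint32_t w, uint32_t h) {
    sd_image_t img;
    img.width   = w;
    img.height  = h;
    img.channel = 3;
    img.data    = rgb;  // modified in place on success
    // Unlike the old uint8_t* API, the input buffer is neither freed nor replaced.
    return preprocess_canny(img, 0.08f, 0.08f, 0.8f, 1.0f, false);
}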

qwen_image.hpp: new file, 687 lines
@@ -0,0 +1,687 @@
#ifndef __QWEN_IMAGE_HPP__
#define __QWEN_IMAGE_HPP__
#include <memory>
#include "common.hpp"
#include "flux.hpp"
#include "ggml_extend.hpp"
namespace Qwen {
constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480;
struct TimestepEmbedding : public GGMLBlock {
public:
TimestepEmbedding(int64_t in_channels,
int64_t time_embed_dim,
int64_t out_dim = 0,
int64_t cond_proj_dim = 0,
bool sample_proj_bias = true) {
blocks["linear_1"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, time_embed_dim, sample_proj_bias));
if (cond_proj_dim > 0) {
blocks["cond_proj"] = std::shared_ptr<GGMLBlock>(new Linear(cond_proj_dim, in_channels, false));
}
if (out_dim <= 0) {
out_dim = time_embed_dim;
}
blocks["linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, out_dim, sample_proj_bias));
}
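// forward: an optional condition is projected to in_channels and added to
// sample, then sample goes linear_1 -> SiLU -> linear_2, mapping
// [N, in_channels] to [N, out_dim] (out_dim defaults to time_embed_dim).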
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* sample,
struct ggml_tensor* condition = nullptr) {
if (condition != nullptr) {
auto cond_proj = std::dynamic_pointer_cast<Linear>(blocks["cond_proj"]);
sample = ggml_add(ctx->ggml_ctx, sample, cond_proj->forward(ctx, condition));
}
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
sample = linear_1->forward(ctx, sample);
sample = ggml_silu_inplace(ctx->ggml_ctx, sample);
sample = linear_2->forward(ctx, sample);
return sample;
}
};
struct QwenTimestepProjEmbeddings : public GGMLBlock {
public:
QwenTimestepProjEmbeddings(int64_t embedding_dim) {
blocks["timestep_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedding(256, embedding_dim));
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* timesteps) {
// timesteps: [N,]
// return: [N, embedding_dim]
auto timestep_embedder = std::dynamic_pointer_cast<TimestepEmbedding>(blocks["timestep_embedder"]);
auto timesteps_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1.f);
auto timesteps_emb = timestep_embedder->forward(ctx, timesteps_proj);
return timesteps_emb;
}
};
struct QwenImageAttention : public GGMLBlock {
protected:
int64_t dim_head;
public:
QwenImageAttention(int64_t query_dim,
int64_t dim_head,
int64_t num_heads,
int64_t out_dim = 0,
int64_t out_context_dim = 0,
bool bias = true,
bool out_bias = true,
float eps = 1e-6)
: dim_head(dim_head) {
int64_t inner_dim = out_dim > 0 ? out_dim : dim_head * num_heads;
out_dim = out_dim > 0 ? out_dim : query_dim;
out_context_dim = out_context_dim > 0 ? out_context_dim : query_dim;
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
blocks["norm_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
blocks["norm_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
blocks["add_q_proj"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
blocks["add_k_proj"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
blocks["add_v_proj"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
blocks["norm_added_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
blocks["norm_added_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
float scale = 1.f / 32.f;
bool force_prec_f32 = false;
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
// The purpose of the scale here is to prevent NaN issues in certain situations.
// For example when using CUDA but the weights are k-quants (not all prompts).
blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale));
// to_out.1 is nn.Dropout
blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias, false, false, scale));
}
std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
struct ggml_tensor* img,
struct ggml_tensor* txt,
struct ggml_tensor* pe,
struct ggml_tensor* mask = nullptr) {
// img: [N, n_img_token, hidden_size]
// txt: [N, n_txt_token, hidden_size]
// pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
// return: ([N, n_img_token, hidden_size], [N, n_txt_token, hidden_size])
auto norm_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_q"]);
auto norm_k = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_k"]);
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
auto to_k = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
auto to_v = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
auto norm_added_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_added_q"]);
auto norm_added_k = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_added_k"]);
auto add_q_proj = std::dynamic_pointer_cast<Linear>(blocks["add_q_proj"]);
auto add_k_proj = std::dynamic_pointer_cast<Linear>(blocks["add_k_proj"]);
auto add_v_proj = std::dynamic_pointer_cast<Linear>(blocks["add_v_proj"]);
auto to_add_out = std::dynamic_pointer_cast<Linear>(blocks["to_add_out"]);
int64_t N = img->ne[2];
int64_t n_img_token = img->ne[1];
int64_t n_txt_token = txt->ne[1];
auto img_q = to_q->forward(ctx, img);
int64_t num_heads = img_q->ne[0] / dim_head;
img_q = ggml_reshape_4d(ctx->ggml_ctx, img_q, dim_head, num_heads, n_img_token, N); // [N, n_img_token, n_head, d_head]
auto img_k = to_k->forward(ctx, img);
img_k = ggml_reshape_4d(ctx->ggml_ctx, img_k, dim_head, num_heads, n_img_token, N); // [N, n_img_token, n_head, d_head]
auto img_v = to_v->forward(ctx, img);
img_v = ggml_reshape_4d(ctx->ggml_ctx, img_v, dim_head, num_heads, n_img_token, N); // [N, n_img_token, n_head, d_head]
img_q = norm_q->forward(ctx, img_q);
img_k = norm_k->forward(ctx, img_k);
auto txt_q = add_q_proj->forward(ctx, txt);
txt_q = ggml_reshape_4d(ctx->ggml_ctx, txt_q, dim_head, num_heads, n_txt_token, N); // [N, n_txt_token, n_head, d_head]
auto txt_k = add_k_proj->forward(ctx, txt);
txt_k = ggml_reshape_4d(ctx->ggml_ctx, txt_k, dim_head, num_heads, n_txt_token, N); // [N, n_txt_token, n_head, d_head]
auto txt_v = add_v_proj->forward(ctx, txt);
txt_v = ggml_reshape_4d(ctx->ggml_ctx, txt_v, dim_head, num_heads, n_txt_token, N); // [N, n_txt_token, n_head, d_head]
txt_q = norm_added_q->forward(ctx, txt_q);
txt_k = norm_added_k->forward(ctx, txt_k);
auto q = ggml_concat(ctx->ggml_ctx, txt_q, img_q, 2); // [N, n_txt_token + n_img_token, n_head, d_head]
auto k = ggml_concat(ctx->ggml_ctx, txt_k, img_k, 2); // [N, n_txt_token + n_img_token, n_head, d_head]
auto v = ggml_concat(ctx->ggml_ctx, txt_v, img_v, 2); // [N, n_txt_token + n_img_token, n_head, d_head]
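// Joint attention: text tokens are prepended to image tokens so a single
// attention pass mixes both streams; the output is split back into txt/img
// segments by the views below.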
auto attn = Rope::attention(ctx, q, k, v, pe, mask, (1.0f / 128.f)); // [N, n_txt_token + n_img_token, n_head*d_head]
attn = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, attn, 0, 2, 1, 3)); // [n_txt_token + n_img_token, N, hidden_size]
auto txt_attn_out = ggml_view_3d(ctx->ggml_ctx,
attn,
attn->ne[0],
attn->ne[1],
txt->ne[1],
attn->nb[1],
attn->nb[2],
0); // [n_txt_token, N, hidden_size]
txt_attn_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, txt_attn_out, 0, 2, 1, 3)); // [N, n_txt_token, hidden_size]
auto img_attn_out = ggml_view_3d(ctx->ggml_ctx,
attn,
attn->ne[0],
attn->ne[1],
img->ne[1],
attn->nb[1],
attn->nb[2],
attn->nb[2] * txt->ne[1]); // [n_img_token, N, hidden_size]
img_attn_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, img_attn_out, 0, 2, 1, 3)); // [N, n_img_token, hidden_size]
img_attn_out = to_out_0->forward(ctx, img_attn_out);
txt_attn_out = to_add_out->forward(ctx, txt_attn_out);
return {img_attn_out, txt_attn_out};
}
};
class QwenImageTransformerBlock : public GGMLBlock {
public:
QwenImageTransformerBlock(int64_t dim,
int64_t num_attention_heads,
int64_t attention_head_dim,
float eps = 1e-6) {
// img_mod.0 is nn.SiLU()
blocks["img_mod.1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, 6 * dim, true));
blocks["img_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
blocks["img_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
blocks["img_mlp"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim, 4, FeedForward::Activation::GELU, true));
// txt_mod.0 is nn.SiLU()
blocks["txt_mod.1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, 6 * dim, true));
blocks["txt_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
blocks["txt_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
blocks["txt_mlp"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim, 4, FeedForward::Activation::GELU));
blocks["attn"] = std::shared_ptr<GGMLBlock>(new QwenImageAttention(dim,
attention_head_dim,
num_attention_heads,
0, // out_dim
0, // out_context_dim
true, // bias
true, // out_bias
eps));
}
virtual std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
struct ggml_tensor* img,
struct ggml_tensor* txt,
struct ggml_tensor* t_emb,
struct ggml_tensor* pe) {
// img: [N, n_img_token, hidden_size]
// txt: [N, n_txt_token, hidden_size]
// pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
// return: ([N, n_img_token, hidden_size], [N, n_txt_token, hidden_size])
auto img_mod_1 = std::dynamic_pointer_cast<Linear>(blocks["img_mod.1"]);
auto img_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["img_norm1"]);
auto img_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["img_norm2"]);
auto img_mlp = std::dynamic_pointer_cast<FeedForward>(blocks["img_mlp"]);
auto txt_mod_1 = std::dynamic_pointer_cast<Linear>(blocks["txt_mod.1"]);
auto txt_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm1"]);
auto txt_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm2"]);
auto txt_mlp = std::dynamic_pointer_cast<FeedForward>(blocks["txt_mlp"]);
auto attn = std::dynamic_pointer_cast<QwenImageAttention>(blocks["attn"]);
auto img_mod_params = ggml_silu(ctx->ggml_ctx, t_emb);
img_mod_params = img_mod_1->forward(ctx, img_mod_params);
auto img_mod_param_vec = ggml_ext_chunk(ctx->ggml_ctx, img_mod_params, 6, 0);
auto txt_mod_params = ggml_silu(ctx->ggml_ctx, t_emb);
txt_mod_params = txt_mod_1->forward(ctx, txt_mod_params);
auto txt_mod_param_vec = ggml_ext_chunk(ctx->ggml_ctx, txt_mod_params, 6, 0);
auto img_normed = img_norm1->forward(ctx, img);
auto img_modulated = Flux::modulate(ctx->ggml_ctx, img_normed, img_mod_param_vec[0], img_mod_param_vec[1]);
auto img_gate1 = img_mod_param_vec[2];
auto txt_normed = txt_norm1->forward(ctx, txt);
auto txt_modulated = Flux::modulate(ctx->ggml_ctx, txt_normed, txt_mod_param_vec[0], txt_mod_param_vec[1]);
auto txt_gate1 = txt_mod_param_vec[2];
auto [img_attn_output, txt_attn_output] = attn->forward(ctx, img_modulated, txt_modulated, pe);
img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_attn_output, img_gate1));
txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn_output, txt_gate1));
auto img_normed2 = img_norm2->forward(ctx, img);
auto img_modulated2 = Flux::modulate(ctx->ggml_ctx, img_normed2, img_mod_param_vec[3], img_mod_param_vec[4]);
auto img_gate2 = img_mod_param_vec[5];
auto txt_normed2 = txt_norm2->forward(ctx, txt);
auto txt_modulated2 = Flux::modulate(ctx->ggml_ctx, txt_normed2, txt_mod_param_vec[3], txt_mod_param_vec[4]);
auto txt_gate2 = txt_mod_param_vec[5];
auto img_mlp_out = img_mlp->forward(ctx, img_modulated2);
auto txt_mlp_out = txt_mlp->forward(ctx, txt_modulated2);
img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_mlp_out, img_gate2));
txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_gate2));
return {img, txt};
}
};
struct AdaLayerNormContinuous : public GGMLBlock {
public:
AdaLayerNormContinuous(int64_t embedding_dim,
int64_t conditioning_embedding_dim,
bool elementwise_affine = true,
float eps = 1e-5f,
bool bias = true) {
blocks["norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(conditioning_embedding_dim, eps, elementwise_affine, bias));
blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(conditioning_embedding_dim, embedding_dim * 2, bias));
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* c) {
// x: [N, n_token, hidden_size]
// c: [N, hidden_size]
// return: [N, n_token, patch_size * patch_size * out_channels]
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
auto emb = linear->forward(ctx, ggml_silu(ctx->ggml_ctx, c));
auto mods = ggml_ext_chunk(ctx->ggml_ctx, emb, 2, 0);
auto scale = mods[0];
auto shift = mods[1];
x = norm->forward(ctx, x);
x = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
return x;
}
};
struct QwenImageParams {
int64_t patch_size = 2;
int64_t in_channels = 64;
int64_t out_channels = 16;
int64_t num_layers = 60;
int64_t attention_head_dim = 128;
int64_t num_attention_heads = 24;
int64_t joint_attention_dim = 3584;
float theta = 10000;
std::vector<int> axes_dim = {16, 56, 56};
int64_t axes_dim_sum = 128;
};
class QwenImageModel : public GGMLBlock {
protected:
QwenImageParams params;
public:
QwenImageModel() {}
QwenImageModel(QwenImageParams params)
: params(params) {
int64_t inner_dim = params.num_attention_heads * params.attention_head_dim;
blocks["time_text_embed"] = std::shared_ptr<GGMLBlock>(new QwenTimestepProjEmbeddings(inner_dim));
blocks["txt_norm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(params.joint_attention_dim, 1e-6f));
blocks["img_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, inner_dim));
blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.joint_attention_dim, inner_dim));
// blocks
for (int i = 0; i < params.num_layers; i++) {
auto block = std::shared_ptr<GGMLBlock>(new QwenImageTransformerBlock(inner_dim,
params.num_attention_heads,
params.attention_head_dim,
1e-6f));
blocks["transformer_blocks." + std::to_string(i)] = block;
}
blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new AdaLayerNormContinuous(inner_dim, inner_dim, false, 1e-6f));
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels));
}
struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx,
struct ggml_tensor* x) {
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w]
return x;
}
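// patchify turns [N, C, H, W] into patch tokens. Worked example with p = 2
// and a 4x4 single-channel image: the 16 pixels become h*w = 2*2 = 4 tokens,
// each flattening one 2x2 patch into C*p*p = 4 values.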
struct ggml_tensor* patchify(struct ggml_context* ctx,
struct ggml_tensor* x) {
// x: [N, C, H, W]
// return: [N, h*w, C * patch_size * patch_size]
int64_t N = x->ne[3];
int64_t C = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
int64_t p = params.patch_size;
int64_t h = H / params.patch_size;
int64_t w = W / params.patch_size;
GGML_ASSERT(h * p == H && w * p == W);
x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N); // [N*C*h, p, w, p]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, w, p, p]
x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N); // [N, C, h*w, p*p]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, h*w, C, p*p]
x = ggml_reshape_3d(ctx, x, p * p * C, w * h, N); // [N, h*w, C*p*p]
return x;
}
struct ggml_tensor* process_img(struct ggml_context* ctx,
struct ggml_tensor* x) {
x = pad_to_patch_size(ctx, x);
x = patchify(ctx, x);
return x;
}
struct ggml_tensor* unpatchify(struct ggml_context* ctx,
struct ggml_tensor* x,
int64_t h,
int64_t w) {
// x: [N, h*w, C*patch_size*patch_size]
// return: [N, C, H, W]
int64_t N = x->ne[2];
int64_t C = x->ne[0] / params.patch_size / params.patch_size;
int64_t H = h * params.patch_size;
int64_t W = w * params.patch_size;
int64_t p = params.patch_size;
GGML_ASSERT(C * p * p == x->ne[0]);
x = ggml_reshape_4d(ctx, x, p * p, C, w * h, N); // [N, h*w, C, p*p]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, C, h*w, p*p]
x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N); // [N*C*h, w, p, p]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, p, w, p]
x = ggml_reshape_4d(ctx, x, W, H, C, N); // [N, C, h*p, w*p]
return x;
}
struct ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* timestep,
struct ggml_tensor* context,
struct ggml_tensor* pe) {
auto time_text_embed = std::dynamic_pointer_cast<QwenTimestepProjEmbeddings>(blocks["time_text_embed"]);
auto txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm"]);
auto img_in = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
auto txt_in = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
auto norm_out = std::dynamic_pointer_cast<AdaLayerNormContinuous>(blocks["norm_out"]);
auto proj_out = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
auto t_emb = time_text_embed->forward(ctx, timestep);
auto img = img_in->forward(ctx, x);
auto txt = txt_norm->forward(ctx, context);
txt = txt_in->forward(ctx, txt);
for (int i = 0; i < params.num_layers; i++) {
auto block = std::dynamic_pointer_cast<QwenImageTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
auto result = block->forward(ctx, img, txt, t_emb, pe);
img = result.first;
txt = result.second;
}
img = norm_out->forward(ctx, img, t_emb);
img = proj_out->forward(ctx, img);
return img;
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* timestep,
struct ggml_tensor* context,
struct ggml_tensor* pe,
std::vector<ggml_tensor*> ref_latents = {}) {
// Forward pass of DiT.
// x: [N, C, H, W]
// timestep: [N,]
// context: [N, L, D]
// pe: [L, d_head/2, 2, 2]
// return: [N, C, H, W]
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int64_t C = x->ne[2];
int64_t N = x->ne[3];
auto img = process_img(ctx->ggml_ctx, x);
uint64_t img_tokens = img->ne[1];
if (ref_latents.size() > 0) {
for (ggml_tensor* ref : ref_latents) {
ref = process_img(ctx->ggml_ctx, ref);
img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
}
}
int64_t h_len = ((H + (params.patch_size / 2)) / params.patch_size);
int64_t w_len = ((W + (params.patch_size / 2)) / params.patch_size);
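// h_len/w_len round H and W up to the next multiple of patch_size:
// for p = 2, (H + 1) / 2 == ceil(H / 2), matching pad_to_patch_size above.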
auto out = forward_orig(ctx, img, timestep, context, pe); // [N, h_len*w_len, ph*pw*C]
if (out->ne[1] > img_tokens) {
out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size]
out = ggml_view_3d(ctx->ggml_ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size]
}
out = unpatchify(ctx->ggml_ctx, out, h_len, w_len); // [N, C, H + pad_h, W + pad_w]
// slice
out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H); // [N, C, H, W + pad_w]
out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W); // [N, C, H, W]
return out;
}
};
struct QwenImageRunner : public GGMLRunner {
public:
QwenImageParams qwen_image_params;
QwenImageModel qwen_image;
std::vector<float> pe_vec;
SDVersion version;
QwenImageRunner(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "",
SDVersion version = VERSION_QWEN_IMAGE)
: GGMLRunner(backend, offload_params_to_cpu) {
qwen_image_params.num_layers = 0;
for (auto pair : tensor_storage_map) {
std::string tensor_name = pair.first;
if (tensor_name.find(prefix) == std::string::npos)
continue;
size_t pos = tensor_name.find("transformer_blocks.");
if (pos != std::string::npos) {
tensor_name = tensor_name.substr(pos); // remove prefix
auto items = split_string(tensor_name, '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
if (block_index + 1 > qwen_image_params.num_layers) {
qwen_image_params.num_layers = block_index + 1;
}
}
continue;
}
}
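// num_layers is inferred from the checkpoint itself: the highest
// "transformer_blocks.<i>." index found in the tensor names, plus one.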
LOG_INFO("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
qwen_image = QwenImageModel(qwen_image_params);
qwen_image.init(params_ctx, tensor_storage_map, prefix);
}
std::string get_desc() override {
return "qwen_image";
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
qwen_image.get_param_tensors(tensors, prefix);
}
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false) {
GGML_ASSERT(x->ne[3] == 1);
struct ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE);
x = to_backend(x);
context = to_backend(context);
timesteps = to_backend(timesteps);
for (int i = 0; i < ref_latents.size(); i++) {
ref_latents[i] = to_backend(ref_latents[i]);
}
pe_vec = Rope::gen_qwen_image_pe(x->ne[1],
x->ne[0],
qwen_image_params.patch_size,
x->ne[3],
context->ne[1],
ref_latents,
increase_ref_index,
qwen_image_params.theta,
qwen_image_params.axes_dim);
int pos_len = pe_vec.size() / qwen_image_params.axes_dim_sum / 2;
// LOG_DEBUG("pos_len %d", pos_len);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, qwen_image_params.axes_dim_sum / 2, pos_len);
// pe->data = pe_vec.data();
// print_ggml_tensor(pe, true, "pe");
// pe->data = nullptr;
set_backend_tensor_data(pe, pe_vec.data());
auto runner_ctx = get_context();
struct ggml_tensor* out = qwen_image.forward(&runner_ctx,
x,
timesteps,
context,
pe,
ref_latents);
ggml_build_forward_expand(gf, out);
return gf;
}
bool compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) {
// x: [N, in_channels, h, w]
// timesteps: [N, ]
// context: [N, max_position, hidden_size]
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, timesteps, context, ref_latents, increase_ref_index);
};
return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
void test() {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB
params.mem_buffer = nullptr;
params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != nullptr);
{
// auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1);
// ggml_set_f32(x, 0.01f);
auto x = load_tensor_from_file(work_ctx, "./qwen_image_x.bin");
print_ggml_tensor(x);
std::vector<float> timesteps_vec(1, 1000.f);
auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
// auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 3584, 256, 1);
// ggml_set_f32(context, 0.01f);
auto context = load_tensor_from_file(work_ctx, "./qwen_image_context.bin");
print_ggml_tensor(context);
struct ggml_tensor* out = nullptr;
int t0 = ggml_time_ms();
compute(8, x, timesteps, context, {}, false, &out, work_ctx);
int t1 = ggml_time_ms();
print_ggml_tensor(out);
LOG_DEBUG("qwen_image test done in %dms", t1 - t0);
}
}
static void load_from_file_and_test(const std::string& file_path) {
// cuda q8: pass
// cuda q8 fa: pass
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_Q8_0;
ModelLoader model_loader;
if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
return;
}
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
for (auto& [name, tensor_storage] : tensor_storage_map) {
if (ends_with(name, "weight")) {
tensor_storage.expected_type = model_data_type;
}
}
std::shared_ptr<QwenImageRunner> qwen_image = std::make_shared<QwenImageRunner>(backend,
false,
tensor_storage_map,
"model.diffusion_model",
VERSION_QWEN_IMAGE);
qwen_image->alloc_params_buffer();
std::map<std::string, ggml_tensor*> tensors;
qwen_image->get_param_tensors(tensors, "model.diffusion_model");
bool success = model_loader.load_tensors(tensors);
if (!success) {
LOG_ERROR("load tensors from model loader failed");
return;
}
LOG_INFO("qwen_image model loaded");
qwen_image->test();
}
};
} // namespace Qwen
#endif // __QWEN_IMAGE_HPP__

rng.hpp

@ -15,11 +15,11 @@ private:
std::default_random_engine generator; std::default_random_engine generator;
public: public:
void manual_seed(uint64_t seed) { void manual_seed(uint64_t seed) override {
generator.seed((unsigned int)seed); generator.seed((unsigned int)seed);
} }
std::vector<float> randn(uint32_t n) { std::vector<float> randn(uint32_t n) override {
std::vector<float> result; std::vector<float> result;
float mean = 0.0; float mean = 0.0;
float stddev = 1.0; float stddev = 1.0;

147
rng_mt19937.hpp Normal file

@ -0,0 +1,147 @@
#ifndef __RNG_MT19937_HPP__
#define __RNG_MT19937_HPP__
#include <cmath>
#include <vector>
#include "rng.hpp"
// RNG imitating torch CPU randn.
// Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/TransformationHelper.h, for uniform_real
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/native/cpu/DistributionTemplates.h, for normal_kernel/normal_fill/normal_fill_16
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/MT19937RNGEngine.h, for mt19937_engine
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/DistributionsHelper.h, for uniform_real_distribution/normal_distribution
class MT19937RNG : public RNG {
static const int N = 624;
static const int M = 397;
static const uint32_t MATRIX_A = 0x9908b0dfU;
static const uint32_t UMASK = 0x80000000U;
static const uint32_t LMASK = 0x7fffffffU;
struct State {
uint64_t seed_;
int left_;
bool seeded_;
uint32_t next_;
std::array<uint32_t, N> state_;
bool has_next_gauss = false;
double next_gauss = 0.0;
};
State s;
uint32_t mix_bits(uint32_t u, uint32_t v) { return (u & UMASK) | (v & LMASK); }
uint32_t twist(uint32_t u, uint32_t v) { return (mix_bits(u, v) >> 1) ^ ((v & 1) ? MATRIX_A : 0); }
void next_state() {
uint32_t* p = s.state_.data();
s.left_ = N;
s.next_ = 0;
for (int j = N - M + 1; --j; p++)
p[0] = p[M] ^ twist(p[0], p[1]);
for (int j = M; --j; p++)
p[0] = p[M - N] ^ twist(p[0], p[1]);
p[0] = p[M - N] ^ twist(p[0], s.state_[0]);
}
uint32_t rand_uint32() {
if (--s.left_ == 0)
next_state();
uint32_t y = s.state_[s.next_++];
y ^= (y >> 11);
y ^= (y << 7) & 0x9d2c5680U;
y ^= (y << 15) & 0xefc60000U;
y ^= (y >> 18);
return y;
}
uint64_t rand_uint64() {
uint64_t high = (uint64_t)rand_uint32();
uint64_t low = (uint64_t)rand_uint32();
return (high << 32) | low;
}
template <typename T, typename V>
T uniform_real(V val, T from, T to) {
constexpr auto MASK = static_cast<V>((static_cast<uint64_t>(1) << std::numeric_limits<T>::digits) - 1);
constexpr auto DIVISOR = static_cast<T>(1) / (static_cast<uint64_t>(1) << std::numeric_limits<T>::digits);
T x = (val & MASK) * DIVISOR;
return (x * (to - from) + from);
}
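// uniform_real masks the raw integer down to std::numeric_limits<T>::digits
// low bits and scales by 2^-digits, giving a value in [0, 1) that is then
// mapped affinely onto [from, to).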
double normal_double_value(double mean, double std) {
if (s.has_next_gauss) {
s.has_next_gauss = false;
return s.next_gauss;
}
double u1 = uniform_real(rand_uint64(), 0., 1.); // double
double u2 = uniform_real(rand_uint64(), 0., 1.); // double
double r = std::sqrt(-2.0 * std::log1p(-u2));
double theta = 2.0 * 3.14159265358979323846 * u1;
double value = r * std::cos(theta) * std + mean;
s.next_gauss = r * std::sin(theta) * std + mean;
s.has_next_gauss = true;
return value;
}
void normal_fill_16(float* data, float mean, float std) {
for (int j = 0; j < 8; ++j) {
float u1 = 1.0f - data[j];
float u2 = data[j + 8];
float r = std::sqrt(-2.0f * std::log(u1));
float theta = 2.0f * 3.14159265358979323846 * u2;
data[j] = r * std::cos(theta) * std + mean;
data[j + 8] = r * std::sin(theta) * std + mean;
}
}
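// normal_fill_16 is a strided Box-Muller transform: u1 = 1 - data[j] and
// u2 = data[j + 8] give radius r = sqrt(-2 ln u1) and angle 2*pi*u2, and
// (r*cos, r*sin) overwrite slots j and j+8 with two normal samples.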
void randn(float* data, int64_t size, float mean = 0.0f, float std = 1.0f) {
if (size >= 16) {
for (int64_t i = 0; i < size; i++) {
data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
}
for (int64_t i = 0; i < size - 15; i += 16) {
normal_fill_16(data + i, mean, std);
}
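// Tail handling: when size isn't a multiple of 16, the last 16 slots are
// refilled with fresh uniforms and transformed again, so every output comes
// from a full 16-wide pass (mirroring PyTorch's normal_fill).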
if (size % 16 != 0) {
// Recompute the last 16 values.
data = data + size - 16;
for (int64_t i = 0; i < 16; i++) {
data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
}
normal_fill_16(data, mean, std);
}
} else {
// Strange handling, hard to understand, but keeping it consistent with PyTorch.
for (int64_t i = 0; i < size; i++) {
data[i] = (float)normal_double_value(mean, std);
}
}
}
public:
MT19937RNG(uint64_t seed = 0) { manual_seed(seed); }
void manual_seed(uint64_t seed) override {
s.seed_ = seed;
s.seeded_ = true;
s.state_[0] = (uint32_t)(seed & 0xffffffffU);
for (int j = 1; j < N; j++) {
uint32_t prev = s.state_[j - 1];
s.state_[j] = 1812433253U * (prev ^ (prev >> 30)) + j;
}
s.left_ = 1;
s.next_ = 0;
s.has_next_gauss = false;
}
std::vector<float> randn(uint32_t n) override {
std::vector<float> out;
out.resize(n);
randn((float*)out.data(), out.size());
return out;
}
};
#endif // __RNG_MT19937_HPP__

rng_philox.hpp

@ -93,12 +93,12 @@ public:
this->offset = 0; this->offset = 0;
} }
void manual_seed(uint64_t seed) { void manual_seed(uint64_t seed) override {
this->seed = seed; this->seed = seed;
this->offset = 0; this->offset = 0;
} }
std::vector<float> randn(uint32_t n) { std::vector<float> randn(uint32_t n) override {
std::vector<std::vector<uint32_t>> counter(4, std::vector<uint32_t>(n, 0)); std::vector<std::vector<uint32_t>> counter(4, std::vector<uint32_t>(n, 0));
for (uint32_t i = 0; i < n; i++) { for (uint32_t i = 0; i < n; i++) {
counter[0][i] = this->offset; counter[0][i] = this->offset;

496
rope.hpp Normal file

@ -0,0 +1,496 @@
#ifndef __ROPE_HPP__
#define __ROPE_HPP__
#include <vector>
#include "ggml_extend.hpp"
namespace Rope {
template <class T>
__STATIC_INLINE__ std::vector<T> linspace(T start, T end, int num) {
std::vector<T> result(num);
if (num == 1) {
result[0] = start;
return result;
}
T step = (end - start) / (num - 1);
for (int i = 0; i < num; ++i) {
result[i] = start + i * step;
}
return result;
}
__STATIC_INLINE__ std::vector<std::vector<float>> transpose(const std::vector<std::vector<float>>& mat) {
int rows = mat.size();
int cols = mat[0].size();
std::vector<std::vector<float>> transposed(cols, std::vector<float>(rows));
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
transposed[j][i] = mat[i][j];
}
}
return transposed;
}
__STATIC_INLINE__ std::vector<float> flatten(const std::vector<std::vector<float>>& vec) {
std::vector<float> flat_vec;
for (const auto& sub_vec : vec) {
flat_vec.insert(flat_vec.end(), sub_vec.begin(), sub_vec.end());
}
return flat_vec;
}
__STATIC_INLINE__ std::vector<std::vector<float>> rope(const std::vector<float>& pos, int dim, int theta) {
assert(dim % 2 == 0);
int half_dim = dim / 2;
std::vector<float> scale = linspace(0.f, (dim * 1.f - 2) / dim, half_dim);
std::vector<float> omega(half_dim);
for (int i = 0; i < half_dim; ++i) {
omega[i] = 1.0 / std::pow(theta, scale[i]);
}
int pos_size = pos.size();
std::vector<std::vector<float>> out(pos_size, std::vector<float>(half_dim));
for (int i = 0; i < pos_size; ++i) {
for (int j = 0; j < half_dim; ++j) {
out[i][j] = pos[i] * omega[j];
}
}
std::vector<std::vector<float>> result(pos_size, std::vector<float>(half_dim * 4));
for (int i = 0; i < pos_size; ++i) {
for (int j = 0; j < half_dim; ++j) {
result[i][4 * j] = std::cos(out[i][j]);
result[i][4 * j + 1] = -std::sin(out[i][j]);
result[i][4 * j + 2] = std::sin(out[i][j]);
result[i][4 * j + 3] = std::cos(out[i][j]);
}
}
return result;
}
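// Each (position, frequency) pair is emitted as a row-major 2x2 rotation
// matrix [[cos, -sin], [sin, cos]]; apply_rope below consumes this layout.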
// Generate IDs for image patches and text
__STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_txt_ids(int bs, int context_len, int axes_dim_num, std::set<int> arange_dims) {
auto txt_ids = std::vector<std::vector<float>>(bs * context_len, std::vector<float>(axes_dim_num, 0.0f));
for (int dim = 0; dim < axes_dim_num; dim++) {
if (arange_dims.find(dim) != arange_dims.end()) {
for (int i = 0; i < bs * context_len; i++) {
txt_ids[i][dim] = (i % context_len);
}
}
}
return txt_ids;
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_img_ids(int h,
int w,
int patch_size,
int bs,
int axes_dim_num,
int index = 0,
int h_offset = 0,
int w_offset = 0) {
int h_len = (h + (patch_size / 2)) / patch_size;
int w_len = (w + (patch_size / 2)) / patch_size;
std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(axes_dim_num, 0.0));
std::vector<float> row_ids = linspace<float>(h_offset, h_len - 1 + h_offset, h_len);
std::vector<float> col_ids = linspace<float>(w_offset, w_len - 1 + w_offset, w_len);
for (int i = 0; i < h_len; ++i) {
for (int j = 0; j < w_len; ++j) {
img_ids[i * w_len + j][0] = index;
img_ids[i * w_len + j][1] = row_ids[i];
img_ids[i * w_len + j][2] = col_ids[j];
}
}
std::vector<std::vector<float>> img_ids_repeated(bs * img_ids.size(), std::vector<float>(3));
for (int i = 0; i < bs; ++i) {
for (int j = 0; j < img_ids.size(); ++j) {
img_ids_repeated[i * img_ids.size() + j] = img_ids[j];
}
}
return img_ids_repeated;
}
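// Each image id is a 3-axis position: axis 0 holds the (reference) image
// index, axis 1 the patch row, axis 2 the patch column; text ids fill only
// the dims listed in arange_dims and leave the rest at zero.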
__STATIC_INLINE__ std::vector<std::vector<float>> concat_ids(const std::vector<std::vector<float>>& a,
const std::vector<std::vector<float>>& b,
int bs) {
size_t a_len = a.size() / bs;
size_t b_len = b.size() / bs;
std::vector<std::vector<float>> ids(a.size() + b.size(), std::vector<float>(3));
for (int i = 0; i < bs; ++i) {
for (int j = 0; j < a_len; ++j) {
ids[i * (a_len + b_len) + j] = a[i * a_len + j];
}
for (int j = 0; j < b_len; ++j) {
ids[i * (a_len + b_len) + a_len + j] = b[i * b_len + j];
}
}
return ids;
}
__STATIC_INLINE__ std::vector<float> embed_nd(const std::vector<std::vector<float>>& ids,
int bs,
int theta,
const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> trans_ids = transpose(ids);
size_t pos_len = ids.size() / bs;
int num_axes = axes_dim.size();
// for (int i = 0; i < pos_len; i++) {
// std::cout << trans_ids[0][i] << " " << trans_ids[1][i] << " " << trans_ids[2][i] << std::endl;
// }
int emb_dim = 0;
for (int d : axes_dim)
emb_dim += d / 2;
std::vector<std::vector<float>> emb(bs * pos_len, std::vector<float>(emb_dim * 2 * 2, 0.0));
int offset = 0;
for (int i = 0; i < num_axes; ++i) {
std::vector<std::vector<float>> rope_emb = rope(trans_ids[i], axes_dim[i], theta); // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
for (int b = 0; b < bs; ++b) {
for (int j = 0; j < pos_len; ++j) {
for (int k = 0; k < rope_emb[0].size(); ++k) {
emb[b * pos_len + j][offset + k] = rope_emb[j][k];
}
}
}
offset += rope_emb[0].size();
}
return flatten(emb);
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size,
int bs,
int axes_dim_num,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index,
float ref_index_scale) {
std::vector<std::vector<float>> ids;
uint64_t curr_h_offset = 0;
uint64_t curr_w_offset = 0;
int index = 1;
for (ggml_tensor* ref : ref_latents) {
uint64_t h_offset = 0;
uint64_t w_offset = 0;
if (!increase_ref_index) {
if (ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) {
w_offset = curr_w_offset;
} else {
h_offset = curr_h_offset;
}
}
auto ref_ids = gen_flux_img_ids(ref->ne[1],
ref->ne[0],
patch_size,
bs,
axes_dim_num,
static_cast<int>(index * ref_index_scale),
h_offset,
w_offset);
ids = concat_ids(ids, ref_ids, bs);
if (increase_ref_index) {
index++;
}
curr_h_offset = std::max(curr_h_offset, ref->ne[1] + h_offset);
curr_w_offset = std::max(curr_w_offset, ref->ne[0] + w_offset);
}
return ids;
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_ids(int h,
int w,
int patch_size,
int bs,
int axes_dim_num,
int context_len,
std::set<int> txt_arange_dims,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index,
float ref_index_scale) {
auto txt_ids = gen_flux_txt_ids(bs, context_len, axes_dim_num, txt_arange_dims);
auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);
auto ids = concat_ids(txt_ids, img_ids, bs);
if (ref_latents.size() > 0) {
auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, ref_index_scale);
ids = concat_ids(ids, refs_ids, bs);
}
return ids;
}
// Generate flux positional embeddings
__STATIC_INLINE__ std::vector<float> gen_flux_pe(int h,
int w,
int patch_size,
int bs,
int context_len,
std::set<int> txt_arange_dims,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index,
float ref_index_scale,
int theta,
const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_flux_ids(h,
w,
patch_size,
bs,
static_cast<int>(axes_dim.size()),
context_len,
txt_arange_dims,
ref_latents,
increase_ref_index,
ref_index_scale);
return embed_nd(ids, bs, theta, axes_dim);
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen_image_ids(int h,
int w,
int patch_size,
int bs,
int context_len,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index) {
int h_len = (h + (patch_size / 2)) / patch_size;
int w_len = (w + (patch_size / 2)) / patch_size;
int txt_id_start = std::max(h_len, w_len);
auto txt_ids = linspace<float>(txt_id_start, context_len + txt_id_start, context_len);
std::vector<std::vector<float>> txt_ids_repeated(bs * context_len, std::vector<float>(3));
for (int i = 0; i < bs; ++i) {
for (int j = 0; j < txt_ids.size(); ++j) {
txt_ids_repeated[i * txt_ids.size() + j] = {txt_ids[j], txt_ids[j], txt_ids[j]};
}
}
int axes_dim_num = 3;
auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);
auto ids = concat_ids(txt_ids_repeated, img_ids, bs);
if (ref_latents.size() > 0) {
auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, 1.f);
ids = concat_ids(ids, refs_ids, bs);
}
return ids;
}
// Generate qwen_image positional embeddings
__STATIC_INLINE__ std::vector<float> gen_qwen_image_pe(int h,
int w,
int patch_size,
int bs,
int context_len,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index,
int theta,
const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index);
return embed_nd(ids, bs, theta, axes_dim);
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_vid_ids(int t,
int h,
int w,
int pt,
int ph,
int pw,
int bs,
int t_offset = 0,
int h_offset = 0,
int w_offset = 0) {
int t_len = (t + (pt / 2)) / pt;
int h_len = (h + (ph / 2)) / ph;
int w_len = (w + (pw / 2)) / pw;
std::vector<std::vector<float>> vid_ids(t_len * h_len * w_len, std::vector<float>(3, 0.0));
std::vector<float> t_ids = linspace<float>(t_offset, t_len - 1 + t_offset, t_len);
std::vector<float> h_ids = linspace<float>(h_offset, h_len - 1 + h_offset, h_len);
std::vector<float> w_ids = linspace<float>(w_offset, w_len - 1 + w_offset, w_len);
for (int i = 0; i < t_len; ++i) {
for (int j = 0; j < h_len; ++j) {
for (int k = 0; k < w_len; ++k) {
int idx = i * h_len * w_len + j * w_len + k;
vid_ids[idx][0] = t_ids[i];
vid_ids[idx][1] = h_ids[j];
vid_ids[idx][2] = w_ids[k];
}
}
}
std::vector<std::vector<float>> vid_ids_repeated(bs * vid_ids.size(), std::vector<float>(3));
for (int i = 0; i < bs; ++i) {
for (int j = 0; j < vid_ids.size(); ++j) {
vid_ids_repeated[i * vid_ids.size() + j] = vid_ids[j];
}
}
return vid_ids_repeated;
}
// Generate wan positional embeddings
__STATIC_INLINE__ std::vector<float> gen_wan_pe(int t,
int h,
int w,
int pt,
int ph,
int pw,
int bs,
int theta,
const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_vid_ids(t, h, w, pt, ph, pw, bs);
return embed_nd(ids, bs, theta, axes_dim);
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen2vl_ids(int grid_h,
int grid_w,
int merge_size,
const std::vector<int>& window_index) {
std::vector<std::vector<float>> ids(grid_h * grid_w, std::vector<float>(2, 0.0));
int index = 0;
for (int ih = 0; ih < grid_h; ih += merge_size) {
for (int iw = 0; iw < grid_w; iw += merge_size) {
for (int iy = 0; iy < merge_size; iy++) {
for (int ix = 0; ix < merge_size; ix++) {
int inverse_index = window_index[index / (merge_size * merge_size)];
int i = inverse_index * (merge_size * merge_size) + index % (merge_size * merge_size);
GGML_ASSERT(i < grid_h * grid_w);
ids[i][0] = ih + iy;
ids[i][1] = iw + ix;
index++;
}
}
}
}
return ids;
}
// Generate qwen2vl positional embeddings
__STATIC_INLINE__ std::vector<float> gen_qwen2vl_pe(int grid_h,
int grid_w,
int merge_size,
const std::vector<int>& window_index,
int theta,
const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_qwen2vl_ids(grid_h, grid_w, merge_size, window_index);
return embed_nd(ids, 1, theta, axes_dim);
}
__STATIC_INLINE__ int bound_mod(int a, int m) {
return (m - (a % m)) % m;
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_z_image_ids(int h,
int w,
int patch_size,
int bs,
int context_len,
int seq_multi_of,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index) {
int padded_context_len = context_len + bound_mod(context_len, seq_multi_of);
auto txt_ids = std::vector<std::vector<float>>(bs * padded_context_len, std::vector<float>(3, 0.0f));
for (int i = 0; i < bs * padded_context_len; i++) {
txt_ids[i][0] = (i % padded_context_len) + 1.f;
}
int axes_dim_num = 3;
int index = padded_context_len + 1;
auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, index);
int img_pad_len = bound_mod(static_cast<int>(img_ids.size() / bs), seq_multi_of);
if (img_pad_len > 0) {
std::vector<std::vector<float>> img_pad_ids(bs * img_pad_len, std::vector<float>(3, 0.f));
img_ids = concat_ids(img_ids, img_pad_ids, bs);
}
auto ids = concat_ids(txt_ids, img_ids, bs);
// ignore ref_latents for now
return ids;
}
// Generate z_image positional embeddings
__STATIC_INLINE__ std::vector<float> gen_z_image_pe(int h,
int w,
int patch_size,
int bs,
int context_len,
int seq_multi_of,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index,
int theta,
const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_z_image_ids(h, w, patch_size, bs, context_len, seq_multi_of, ref_latents, increase_ref_index);
return embed_nd(ids, bs, theta, axes_dim);
}
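// apply_rope treats each (x_0, x_1) pair along d_head as a 2-vector and
// multiplies it by the per-position 2x2 rotation packed in pe (see rope()
// above): x_out = x_0 * pe[0] + x_1 * pe[1], broadcast over heads.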
__STATIC_INLINE__ struct ggml_tensor* apply_rope(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* pe,
bool rope_interleaved = true) {
// x: [N, L, n_head, d_head]
// pe: [L, d_head/2, 2, 2], [[cos, -sin], [sin, cos]]
int64_t d_head = x->ne[0];
int64_t n_head = x->ne[1];
int64_t L = x->ne[2];
int64_t N = x->ne[3];
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, n_head, L, d_head]
if (rope_interleaved) {
x = ggml_reshape_4d(ctx, x, 2, d_head / 2, L, n_head * N); // [N * n_head, L, d_head/2, 2]
x = ggml_cont(ctx, ggml_permute(ctx, x, 3, 0, 1, 2)); // [2, N * n_head, L, d_head/2]
} else {
x = ggml_reshape_4d(ctx, x, d_head / 2, 2, L, n_head * N); // [N * n_head, L, 2, d_head/2]
x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 3, 1)); // [2, N * n_head, L, d_head/2]
}
int64_t offset = x->nb[2] * x->ne[2];
auto x_0 = ggml_view_3d(ctx, x, x->ne[0], x->ne[1], x->ne[2], x->nb[1], x->nb[2], offset * 0); // [N * n_head, L, d_head/2]
auto x_1 = ggml_view_3d(ctx, x, x->ne[0], x->ne[1], x->ne[2], x->nb[1], x->nb[2], offset * 1); // [N * n_head, L, d_head/2]
x_0 = ggml_reshape_4d(ctx, x_0, 1, x_0->ne[0], x_0->ne[1], x_0->ne[2]); // [N * n_head, L, d_head/2, 1]
x_1 = ggml_reshape_4d(ctx, x_1, 1, x_1->ne[0], x_1->ne[1], x_1->ne[2]); // [N * n_head, L, d_head/2, 1]
auto temp_x = ggml_new_tensor_4d(ctx, x_0->type, 2, x_0->ne[1], x_0->ne[2], x_0->ne[3]);
x_0 = ggml_repeat(ctx, x_0, temp_x); // [N * n_head, L, d_head/2, 2]
x_1 = ggml_repeat(ctx, x_1, temp_x); // [N * n_head, L, d_head/2, 2]
pe = ggml_cont(ctx, ggml_permute(ctx, pe, 3, 0, 1, 2)); // [2, L, d_head/2, 2]
offset = pe->nb[2] * pe->ne[2];
auto pe_0 = ggml_view_3d(ctx, pe, pe->ne[0], pe->ne[1], pe->ne[2], pe->nb[1], pe->nb[2], offset * 0); // [L, d_head/2, 2]
auto pe_1 = ggml_view_3d(ctx, pe, pe->ne[0], pe->ne[1], pe->ne[2], pe->nb[1], pe->nb[2], offset * 1); // [L, d_head/2, 2]
auto x_out = ggml_add_inplace(ctx, ggml_mul(ctx, x_0, pe_0), ggml_mul(ctx, x_1, pe_1)); // [N * n_head, L, d_head/2, 2]
if (!rope_interleaved) {
x_out = ggml_cont(ctx, ggml_permute(ctx, x_out, 1, 0, 2, 3)); // [N * n_head, L, 2, d_head/2]
}
x_out = ggml_reshape_3d(ctx, x_out, d_head, L, n_head * N); // [N*n_head, L, d_head]
return x_out;
}
__STATIC_INLINE__ struct ggml_tensor* attention(GGMLRunnerContext* ctx,
struct ggml_tensor* q,
struct ggml_tensor* k,
struct ggml_tensor* v,
struct ggml_tensor* pe,
struct ggml_tensor* mask,
float kv_scale = 1.0f,
bool rope_interleaved = true) {
// q,k,v: [N, L, n_head, d_head]
// pe: [L, d_head/2, 2, 2]
// return: [N, L, n_head*d_head]
q = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved); // [N*n_head, L, d_head]
k = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved); // [N*n_head, L, d_head]
auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, v->ne[1], mask, false, true, ctx->flash_attn_enabled, kv_scale); // [N, L, n_head*d_head]
return x;
}
}; // namespace Rope
#endif // __ROPE_HPP__

File diff suppressed because it is too large

stable-diffusion.h

@ -30,33 +30,48 @@ extern "C" {
enum rng_type_t { enum rng_type_t {
STD_DEFAULT_RNG, STD_DEFAULT_RNG,
CUDA_RNG CUDA_RNG,
CPU_RNG,
RNG_TYPE_COUNT
}; };
enum sample_method_t { enum sample_method_t {
EULER_A, EULER_SAMPLE_METHOD,
EULER, EULER_A_SAMPLE_METHOD,
HEUN, HEUN_SAMPLE_METHOD,
DPM2, DPM2_SAMPLE_METHOD,
DPMPP2S_A, DPMPP2S_A_SAMPLE_METHOD,
DPMPP2M, DPMPP2M_SAMPLE_METHOD,
DPMPP2Mv2, DPMPP2Mv2_SAMPLE_METHOD,
IPNDM, IPNDM_SAMPLE_METHOD,
IPNDM_V, IPNDM_V_SAMPLE_METHOD,
LCM, LCM_SAMPLE_METHOD,
DDIM_TRAILING, DDIM_TRAILING_SAMPLE_METHOD,
TCD, TCD_SAMPLE_METHOD,
N_SAMPLE_METHODS SAMPLE_METHOD_COUNT
}; };
enum schedule_t { enum scheduler_t {
DEFAULT, DISCRETE_SCHEDULER,
DISCRETE, KARRAS_SCHEDULER,
KARRAS, EXPONENTIAL_SCHEDULER,
EXPONENTIAL, AYS_SCHEDULER,
AYS, GITS_SCHEDULER,
GITS, SGM_UNIFORM_SCHEDULER,
N_SCHEDULES SIMPLE_SCHEDULER,
SMOOTHSTEP_SCHEDULER,
LCM_SCHEDULER,
SCHEDULER_COUNT
};
enum prediction_t {
EPS_PRED,
V_PRED,
EDM_V_PRED,
FLOW_PRED,
FLUX_FLOW_PRED,
FLUX2_FLOW_PRED,
PREDICTION_COUNT
}; };
// same as enum ggml_type // same as enum ggml_type
@ -100,11 +115,10 @@ enum sd_type_t {
// SD_TYPE_IQ4_NL_4_4 = 36, // SD_TYPE_IQ4_NL_4_4 = 36,
// SD_TYPE_IQ4_NL_4_8 = 37, // SD_TYPE_IQ4_NL_4_8 = 37,
// SD_TYPE_IQ4_NL_8_8 = 38, // SD_TYPE_IQ4_NL_8_8 = 38,
SD_TYPE_COUNT = 39, SD_TYPE_MXFP4 = 39, // MXFP4 (1 block)
SD_TYPE_COUNT = 40,
}; };
SD_API const char* sd_type_name(enum sd_type_t type);
enum sd_log_level_t { enum sd_log_level_t {
SD_LOG_DEBUG, SD_LOG_DEBUG,
SD_LOG_INFO, SD_LOG_INFO,
@ -112,13 +126,75 @@ enum sd_log_level_t {
SD_LOG_ERROR SD_LOG_ERROR
}; };
typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); enum preview_t {
typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); PREVIEW_NONE,
PREVIEW_PROJ,
PREVIEW_TAE,
PREVIEW_VAE,
PREVIEW_COUNT
};
SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); enum lora_apply_mode_t {
SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); LORA_APPLY_AUTO,
SD_API int32_t get_num_physical_cores(); LORA_APPLY_IMMEDIATELY,
SD_API const char* sd_get_system_info(); LORA_APPLY_AT_RUNTIME,
LORA_APPLY_MODE_COUNT,
};
typedef struct {
bool enabled;
int tile_size_x;
int tile_size_y;
float target_overlap;
float rel_size_x;
float rel_size_y;
} sd_tiling_params_t;
typedef struct {
const char* name;
const char* path;
} sd_embedding_t;
typedef struct {
const char* model_path;
const char* clip_l_path;
const char* clip_g_path;
const char* clip_vision_path;
const char* t5xxl_path;
const char* llm_path;
const char* llm_vision_path;
const char* diffusion_model_path;
const char* high_noise_diffusion_model_path;
const char* vae_path;
const char* taesd_path;
const char* control_net_path;
const char* lora_model_dir;
const sd_embedding_t* embeddings;
uint32_t embedding_count;
const char* photo_maker_path;
const char* tensor_type_rules;
bool vae_decode_only;
bool free_params_immediately;
int n_threads;
enum sd_type_t wtype;
enum rng_type_t rng_type;
enum rng_type_t sampler_rng_type;
enum prediction_t prediction;
enum lora_apply_mode_t lora_apply_mode;
bool offload_params_to_cpu;
bool keep_clip_on_cpu;
bool keep_control_net_on_cpu;
bool keep_vae_on_cpu;
bool diffusion_flash_attn;
bool tae_preview_only;
bool diffusion_conv_direct;
bool vae_conv_direct;
bool force_sdxl_vae_conv_scale;
bool chroma_use_dit_mask;
bool chroma_use_t5_mask;
int chroma_t5_mask_pad;
float flow_shift;
} sd_ctx_params_t;
typedef struct { typedef struct {
uint32_t width; uint32_t width;
@ -127,146 +203,175 @@ typedef struct {
uint8_t* data; uint8_t* data;
} sd_image_t; } sd_image_t;
typedef struct {
int* layers;
size_t layer_count;
float layer_start;
float layer_end;
float scale;
} sd_slg_params_t;
typedef struct {
float txt_cfg;
float img_cfg;
float distilled_guidance;
sd_slg_params_t slg;
} sd_guidance_params_t;
typedef struct {
sd_guidance_params_t guidance;
enum scheduler_t scheduler;
enum sample_method_t sample_method;
int sample_steps;
float eta;
int shifted_timestep;
} sd_sample_params_t;
typedef struct {
sd_image_t* id_images;
int id_images_count;
const char* id_embed_path;
float style_strength;
} sd_pm_params_t; // photo maker
typedef struct {
bool enabled;
float reuse_threshold;
float start_percent;
float end_percent;
} sd_easycache_params_t;
typedef struct {
bool is_high_noise;
float multiplier;
const char* path;
} sd_lora_t;
typedef struct {
const sd_lora_t* loras;
uint32_t lora_count;
const char* prompt;
const char* negative_prompt;
int clip_skip;
sd_image_t init_image;
sd_image_t* ref_images;
int ref_images_count;
bool auto_resize_ref_image;
bool increase_ref_index;
sd_image_t mask_image;
int width;
int height;
sd_sample_params_t sample_params;
float strength;
int64_t seed;
int batch_count;
sd_image_t control_image;
float control_strength;
sd_pm_params_t pm_params;
sd_tiling_params_t vae_tiling_params;
sd_easycache_params_t easycache;
} sd_img_gen_params_t;
typedef struct {
const sd_lora_t* loras;
uint32_t lora_count;
const char* prompt;
const char* negative_prompt;
int clip_skip;
sd_image_t init_image;
sd_image_t end_image;
sd_image_t* control_frames;
int control_frames_size;
int width;
int height;
sd_sample_params_t sample_params;
sd_sample_params_t high_noise_sample_params;
float moe_boundary;
float strength;
int64_t seed;
int video_frames;
float vace_strength;
sd_easycache_params_t easycache;
} sd_vid_gen_params_t;
typedef struct sd_ctx_t sd_ctx_t; typedef struct sd_ctx_t sd_ctx_t;
SD_API sd_ctx_t* new_sd_ctx(const char* model_path, typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
const char* clip_l_path, typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
const char* clip_g_path, typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, bool is_noisy, void* data);
const char* t5xxl_path,
const char* diffusion_model_path,
const char* vae_path,
const char* taesd_path,
const char* control_net_path_c_str,
const char* lora_model_dir,
const char* embed_dir_c_str,
const char* stacked_id_embed_dir_c_str,
bool vae_decode_only,
bool vae_tiling,
bool free_params_immediately,
int n_threads,
enum sd_type_t wtype,
enum rng_type_t rng_type,
enum schedule_t s,
bool keep_clip_on_cpu,
bool keep_control_net_cpu,
bool keep_vae_on_cpu,
bool diffusion_flash_attn,
bool chroma_use_dit_mask,
bool chroma_use_t5_mask,
int chroma_t5_mask_pad);
SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy, void* data);
SD_API int32_t sd_get_num_physical_cores();
SD_API const char* sd_get_system_info();
SD_API const char* sd_type_name(enum sd_type_t type);
SD_API enum sd_type_t str_to_sd_type(const char* str);
SD_API const char* sd_rng_type_name(enum rng_type_t rng_type);
SD_API enum rng_type_t str_to_rng_type(const char* str);
SD_API const char* sd_sample_method_name(enum sample_method_t sample_method);
SD_API enum sample_method_t str_to_sample_method(const char* str);
SD_API const char* sd_scheduler_name(enum scheduler_t scheduler);
SD_API enum scheduler_t str_to_scheduler(const char* str);
SD_API const char* sd_prediction_name(enum prediction_t prediction);
SD_API enum prediction_t str_to_prediction(const char* str);
SD_API const char* sd_preview_name(enum preview_t preview);
SD_API enum preview_t str_to_preview(const char* str);
SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode);
SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str);
SD_API void sd_easycache_params_init(sd_easycache_params_t* easycache_params);
SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, SD_API void sd_sample_params_init(sd_sample_params_t* sample_params);
const char* prompt, SD_API char* sd_sample_params_to_str(const sd_sample_params_t* sample_params);
const char* negative_prompt,
-                           int clip_skip,
-                           float cfg_scale,
-                           float guidance,
-                           float eta,
-                           int width,
-                           int height,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           int64_t seed,
-                           int batch_count,
-                           const sd_image_t* control_cond,
-                           float control_strength,
-                           float style_strength,
-                           bool normalize_input,
-                           const char* input_id_images_path,
-                           int* skip_layers,
-                           size_t skip_layers_count,
-                           float slg_scale,
-                           float skip_layer_start,
-                           float skip_layer_end);
-SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
-                           sd_image_t init_image,
-                           sd_image_t mask_image,
-                           const char* prompt,
-                           const char* negative_prompt,
-                           int clip_skip,
-                           float cfg_scale,
-                           float guidance,
-                           float eta,
-                           int width,
-                           int height,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           float strength,
-                           int64_t seed,
-                           int batch_count,
-                           const sd_image_t* control_cond,
-                           float control_strength,
-                           float style_strength,
-                           bool normalize_input,
-                           const char* input_id_images_path,
-                           int* skip_layers,
-                           size_t skip_layers_count,
-                           float slg_scale,
-                           float skip_layer_start,
-                           float skip_layer_end);
-SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
-                           sd_image_t init_image,
-                           int width,
-                           int height,
-                           int video_frames,
-                           int motion_bucket_id,
-                           int fps,
-                           float augmentation_level,
-                           float min_cfg,
-                           float cfg_scale,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           float strength,
-                           int64_t seed);
-SD_API sd_image_t* edit(sd_ctx_t* sd_ctx,
-                        sd_image_t* ref_images,
-                        int ref_images_count,
-                        const char* prompt,
-                        const char* negative_prompt,
-                        int clip_skip,
-                        float cfg_scale,
-                        float guidance,
-                        float eta,
-                        int width,
-                        int height,
-                        enum sample_method_t sample_method,
-                        int sample_steps,
-                        float strength,
-                        int64_t seed,
-                        int batch_count,
-                        const sd_image_t* control_cond,
-                        float control_strength,
-                        float style_strength,
-                        bool normalize_input,
-                        int* skip_layers,
-                        size_t skip_layers_count,
-                        float slg_scale,
-                        float skip_layer_start,
-                        float skip_layer_end);
+SD_API enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx);
+SD_API enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx);
+SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
+SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
+SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
+SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
+SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out);
 typedef struct upscaler_ctx_t upscaler_ctx_t;
 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
-                                        int n_threads);
+                                        bool offload_params_to_cpu,
+                                        bool direct,
+                                        int n_threads,
+                                        int tile_size);
 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
-SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
-SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type);
-SD_API uint8_t* preprocess_canny(uint8_t* img,
-                                 int width,
-                                 int height,
-                                 float high_threshold,
-                                 float low_threshold,
-                                 float weak,
-                                 float strong,
-                                 bool inverse);
+SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
+                          sd_image_t input_image,
+                          uint32_t upscale_factor);
+SD_API int get_upscale_factor(upscaler_ctx_t* upscaler_ctx);
+SD_API bool convert(const char* input_path,
+                    const char* vae_path,
+                    const char* output_path,
+                    enum sd_type_t output_type,
+                    const char* tensor_type_rules);
+SD_API bool preprocess_canny(sd_image_t image,
+                             float high_threshold,
+                             float low_threshold,
+                             float weak,
+                             float strong,
+                             bool inverse);
+SD_API const char* sd_commit(void);
+SD_API const char* sd_version(void);
 #ifdef __cplusplus
 }
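Taken together, the header changes above retire the long positional-argument entry points (txt2img/img2img/img2vid/edit) in favor of params structs plus generate_image/generate_video. A minimal calling sketch, assuming an sd_ctx_t created elsewhere; the sd_img_gen_params_t field names used below (prompt, width, height) are assumptions, so check the struct definition in stable-diffusion.h for the exact layout:

    #include "stable-diffusion.h"

    /* Returns 0 on success. */
    int run_txt2img(sd_ctx_t* sd_ctx) {
        sd_img_gen_params_t params;
        sd_img_gen_params_init(&params); /* always fill defaults before overriding */
        params.prompt = "a lovely cat";  /* assumed field name */
        params.width  = 512;             /* assumed field name */
        params.height = 512;             /* assumed field name */

        sd_image_t* images = generate_image(sd_ctx, &params);
        if (images == NULL) {
            return 1; /* generation failed */
        }
        /* ... consume the batch, then release it as the library documents ... */
        return 0;
    }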

t5.hpp (271 changes)

@@ -1,7 +1,7 @@
 #ifndef __T5_HPP__
 #define __T5_HPP__
-#include <float.h>
+#include <cfloat>
 #include <limits>
 #include <map>
 #include <memory>
@@ -124,7 +124,10 @@ protected:
             return;
         }
         std::string piece = item[0];
+        if (piece.empty()) {
+            piece = "<empty_token>";
+        }
         float score = item[1];
         piece_score_pairs.emplace_back(piece, score);
     }
 }
@@ -147,6 +150,7 @@ protected:
     std::vector<const char*> key(pieces->size());
     std::vector<int> value(pieces->size());
     for (size_t i = 0; i < pieces->size(); ++i) {
+        // LOG_DEBUG("%s %d", (*pieces)[i].first.c_str(), (*pieces)[i].second);
         key[i]   = (*pieces)[i].first.data();  // sorted piece.
         value[i] = (*pieces)[i].second;        // vocab_id
     }
@@ -335,9 +339,9 @@ protected:
     }
 public:
-    explicit T5UniGramTokenizer(const std::string& json_str = "") {
-        if (json_str.size() != 0) {
-            InitializePieces(json_str);
+    explicit T5UniGramTokenizer(bool is_umt5 = false) {
+        if (is_umt5) {
+            InitializePieces(ModelLoader::load_umt5_tokenizer_json());
         } else {
             InitializePieces(ModelLoader::load_t5_tokenizer_json());
         }
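Usage after this change is a constructor flag rather than raw tokenizer JSON:

    T5UniGramTokenizer t5_tok;          // stock T5 vocab via load_t5_tokenizer_json()
    T5UniGramTokenizer umt5_tok(true);  // UMT5 vocab via load_umt5_tokenizer_json()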
@@ -457,8 +461,8 @@ protected:
     int64_t hidden_size;
     float eps;
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+        enum ggml_type wtype = GGML_TYPE_F32;
         params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size);
     }
@@ -468,10 +472,10 @@ public:
         : hidden_size(hidden_size),
           eps(eps) {}
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
         struct ggml_tensor* w = params["weight"];
-        x = ggml_rms_norm(ctx, x, eps);
-        x = ggml_mul(ctx, x, w);
+        x = ggml_rms_norm(ctx->ggml_ctx, x, eps);
+        x = ggml_mul(ctx->ggml_ctx, x, w);
         return x;
     }
 };
@@ -483,13 +487,13 @@ public:
         blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
     }
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
         // x: [N, n_token, model_dim]
         auto wi = std::dynamic_pointer_cast<Linear>(blocks["wi"]);
         auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
         x = wi->forward(ctx, x);
-        x = ggml_relu_inplace(ctx, x);
+        x = ggml_relu_inplace(ctx->ggml_ctx, x);
         x = wo->forward(ctx, x);
         return x;
     }
@@ -500,18 +504,20 @@ public:
     T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) {
         blocks["wi_0"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
         blocks["wi_1"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
-        blocks["wo"]   = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
+        float scale = 1.f / 32.f;
+        // The purpose of the scale here is to prevent NaN issues on some backends (CUDA, ...).
+        blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false, false, false, scale));
     }
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
         // x: [N, n_token, model_dim]
         auto wi_0 = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]);
         auto wi_1 = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]);
         auto wo   = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
-        auto hidden_gelu   = ggml_gelu_inplace(ctx, wi_0->forward(ctx, x));
+        auto hidden_gelu   = ggml_gelu_inplace(ctx->ggml_ctx, wi_0->forward(ctx, x));
         auto hidden_linear = wi_1->forward(ctx, x);
-        x = ggml_mul_inplace(ctx, hidden_gelu, hidden_linear);
+        x = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear);
         x = wo->forward(ctx, x);
         return x;
     }
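The new scale argument keeps the math identical while taming f16 intermediates. A sketch of the usual trick the comment above describes (how Linear applies the argument internally is an assumption here): fold the factor into the stored weight, then undo it on the output, so activations inside the matmul stay 32x smaller:

    // w_scaled is the checkpoint weight already multiplied by `scale` at load time.
    struct ggml_tensor* scaled_linear(struct ggml_context* ctx,
                                      struct ggml_tensor* w_scaled,  // [in, out]
                                      struct ggml_tensor* x,         // [in, n_token]
                                      float scale) {
        struct ggml_tensor* y = ggml_mul_mat(ctx, w_scaled, x);  // small intermediates
        return ggml_scale(ctx, y, 1.0f / scale);                 // restore the magnitude
    }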
@@ -524,14 +530,14 @@ public:
         blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
     }
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
         // x: [N, n_token, model_dim]
         auto DenseReluDense = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]);
         auto layer_norm     = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
         auto forwarded_states = layer_norm->forward(ctx, x);
         forwarded_states      = DenseReluDense->forward(ctx, forwarded_states);
-        x = ggml_add_inplace(ctx, forwarded_states, x);
+        x = ggml_add_inplace(ctx->ggml_ctx, forwarded_states, x);
         return x;
     }
 };
@@ -563,21 +569,21 @@ public:
         }
     }
-    struct ggml_tensor* compute_bias(struct ggml_context* ctx,
+    struct ggml_tensor* compute_bias(GGMLRunnerContext* ctx,
                                      struct ggml_tensor* relative_position_bucket) {
         auto relative_attention_bias = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]);
         auto values = relative_attention_bias->forward(ctx, relative_position_bucket);  // shape (query_length, key_length, num_heads)
-        values = ggml_cont(ctx, ggml_permute(ctx, values, 2, 0, 1, 3));  // shape (1, num_heads, query_length, key_length)
+        values = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, values, 2, 0, 1, 3));  // shape (1, num_heads, query_length, key_length)
         return values;
     }
     // x: [N, n_token, model_dim]
-    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
+    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                                                 struct ggml_tensor* x,
-                                                                struct ggml_tensor* past_bias = NULL,
-                                                                struct ggml_tensor* mask = NULL,
-                                                                struct ggml_tensor* relative_position_bucket = NULL) {
+                                                                struct ggml_tensor* past_bias = nullptr,
+                                                                struct ggml_tensor* mask = nullptr,
+                                                                struct ggml_tensor* relative_position_bucket = nullptr) {
         auto q_proj = std::dynamic_pointer_cast<Linear>(blocks["q"]);
         auto k_proj = std::dynamic_pointer_cast<Linear>(blocks["k"]);
         auto v_proj = std::dynamic_pointer_cast<Linear>(blocks["v"]);
@@ -590,21 +596,21 @@ public:
         auto k = k_proj->forward(ctx, x);
         auto v = v_proj->forward(ctx, x);
-        if (using_relative_attention_bias && relative_position_bucket != NULL) {
+        if (using_relative_attention_bias && relative_position_bucket != nullptr) {
             past_bias = compute_bias(ctx, relative_position_bucket);
         }
-        if (past_bias != NULL) {
-            if (mask != NULL) {
-                mask = ggml_repeat(ctx, mask, past_bias);
-                mask = ggml_add(ctx, mask, past_bias);
+        if (past_bias != nullptr) {
+            if (mask != nullptr) {
+                mask = ggml_repeat(ctx->ggml_ctx, mask, past_bias);
+                mask = ggml_add(ctx->ggml_ctx, mask, past_bias);
             } else {
                 mask = past_bias;
             }
         }
-        k = ggml_scale_inplace(ctx, k, sqrt(d_head));
-        x = ggml_nn_attention_ext(ctx, q, k, v, num_heads, mask);  // [N, n_token, d_head * n_head]
+        k = ggml_scale_inplace(ctx->ggml_ctx, k, sqrt(d_head));
+        x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask);  // [N, n_token, d_head * n_head]
         x = out_proj->forward(ctx, x);  // [N, n_token, model_dim]
         return {x, past_bias};
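One detail worth noting in this hunk: T5 attention is unscaled by design, while the shared attention helper presumably divides by sqrt(d_head) (an inference from its use here, not something this diff states). Pre-multiplying k by sqrt(d_head) cancels that factor exactly:

    softmax( q (k * sqrt(d))^T / sqrt(d) ) v  =  softmax( q k^T ) v,   with d = d_head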
@@ -622,11 +628,11 @@ public:
         blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
     }
-    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
+    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                                                 struct ggml_tensor* x,
-                                                                struct ggml_tensor* past_bias = NULL,
-                                                                struct ggml_tensor* mask = NULL,
-                                                                struct ggml_tensor* relative_position_bucket = NULL) {
+                                                                struct ggml_tensor* past_bias = nullptr,
+                                                                struct ggml_tensor* mask = nullptr,
+                                                                struct ggml_tensor* relative_position_bucket = nullptr) {
         // x: [N, n_token, model_dim]
         auto SelfAttention = std::dynamic_pointer_cast<T5Attention>(blocks["SelfAttention"]);
         auto layer_norm    = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
@@ -636,7 +642,7 @@ public:
         auto output = ret.first;
         past_bias   = ret.second;
-        x = ggml_add_inplace(ctx, output, x);
+        x = ggml_add_inplace(ctx->ggml_ctx, output, x);
         return {x, past_bias};
     }
 };
@@ -648,11 +654,11 @@ public:
         blocks["layer.1"] = std::shared_ptr<GGMLBlock>(new T5LayerFF(model_dim, ff_dim));
     }
-    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
+    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                                                 struct ggml_tensor* x,
-                                                                struct ggml_tensor* past_bias = NULL,
-                                                                struct ggml_tensor* mask = NULL,
-                                                                struct ggml_tensor* relative_position_bucket = NULL) {
+                                                                struct ggml_tensor* past_bias = nullptr,
+                                                                struct ggml_tensor* mask = nullptr,
+                                                                struct ggml_tensor* relative_position_bucket = nullptr) {
         // x: [N, n_token, model_dim]
         auto layer_0 = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]);
         auto layer_1 = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]);
@@ -673,20 +679,21 @@ public:
             int64_t model_dim,
             int64_t inner_dim,
             int64_t ff_dim,
-            int64_t num_heads)
+            int64_t num_heads,
+            bool relative_attention = true)
         : num_layers(num_layers) {
         for (int i = 0; i < num_layers; i++) {
-            blocks["block." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new T5Block(model_dim, inner_dim, ff_dim, num_heads, i == 0));
+            blocks["block." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new T5Block(model_dim, inner_dim, ff_dim, num_heads, (!relative_attention || i == 0)));
         }
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
     }
-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* x,
-                                struct ggml_tensor* past_bias = NULL,
-                                struct ggml_tensor* attention_mask = NULL,
-                                struct ggml_tensor* relative_position_bucket = NULL) {
+                                struct ggml_tensor* past_bias = nullptr,
+                                struct ggml_tensor* attention_mask = nullptr,
+                                struct ggml_tensor* relative_position_bucket = nullptr) {
         // x: [N, n_token, model_dim]
         for (int i = 0; i < num_layers; i++) {
             auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]);
@@ -703,22 +710,37 @@ public:
     }
 };
+struct T5Params {
+    int64_t num_layers = 24;
+    int64_t model_dim  = 4096;
+    int64_t ff_dim     = 10240;
+    int64_t num_heads  = 64;
+    int64_t vocab_size = 32128;
+    bool relative_attention = true;
+};
 struct T5 : public GGMLBlock {
+    T5Params params;
 public:
-    T5(int64_t num_layers,
-       int64_t model_dim,
-       int64_t ff_dim,
-       int64_t num_heads,
-       int64_t vocab_size) {
-        blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(num_layers, model_dim, model_dim, ff_dim, num_heads));
-        blocks["shared"]  = std::shared_ptr<GGMLBlock>(new Embedding(vocab_size, model_dim));
+    T5() {}
+    T5(T5Params params)
+        : params(params) {
+        blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(params.num_layers,
+                                                                   params.model_dim,
+                                                                   params.model_dim,
+                                                                   params.ff_dim,
+                                                                   params.num_heads,
+                                                                   params.relative_attention));
+        blocks["shared"]  = std::shared_ptr<GGMLBlock>(new Embedding(params.vocab_size,
+                                                                     params.model_dim));
     }
-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* input_ids,
-                                struct ggml_tensor* past_bias = NULL,
-                                struct ggml_tensor* attention_mask = NULL,
-                                struct ggml_tensor* relative_position_bucket = NULL) {
+                                struct ggml_tensor* past_bias = nullptr,
+                                struct ggml_tensor* attention_mask = nullptr,
+                                struct ggml_tensor* relative_position_bucket = nullptr) {
         // input_ids: [N, n_token]
         auto shared = std::dynamic_pointer_cast<Embedding>(blocks["shared"]);
@@ -731,22 +753,25 @@ public:
 };
 struct T5Runner : public GGMLRunner {
+    T5Params params;
     T5 model;
     std::vector<int> relative_position_bucket_vec;
     T5Runner(ggml_backend_t backend,
-             std::map<std::string, enum ggml_type>& tensor_types,
+             bool offload_params_to_cpu,
+             const String2TensorStorage& tensor_storage_map,
              const std::string prefix,
-             int64_t num_layers = 24,
-             int64_t model_dim = 4096,
-             int64_t ff_dim = 10240,
-             int64_t num_heads = 64,
-             int64_t vocab_size = 32128)
-        : GGMLRunner(backend), model(num_layers, model_dim, ff_dim, num_heads, vocab_size) {
-        model.init(params_ctx, tensor_types, prefix);
+             bool is_umt5 = false)
+        : GGMLRunner(backend, offload_params_to_cpu) {
+        if (is_umt5) {
+            params.vocab_size         = 256384;
+            params.relative_attention = false;
+        }
+        model = T5(params);
+        model.init(params_ctx, tensor_storage_map, prefix);
     }
-    std::string get_desc() {
+    std::string get_desc() override {
         return "t5";
     }
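The defaults in T5Params are the T5-XXL geometry, and the is_umt5 branch above amounts to a second preset. A sketch of the two configurations, using only names from the structs in this diff:

    T5Params t5_cfg;                       // vocab_size = 32128, relative_attention = true:
                                           //   only block 0 owns a relative-attention bias
    T5Params umt5_cfg;
    umt5_cfg.vocab_size         = 256384;  // UMT5 sentencepiece vocabulary
    umt5_cfg.relative_attention = false;   // every block computes its own position bias
                                           //   (see the T5Stack constructor above)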
@@ -754,22 +779,23 @@ struct T5Runner : public GGMLRunner {
         model.get_param_tensors(tensors, prefix);
     }
-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* relative_position_bucket,
-                                struct ggml_tensor* attention_mask = NULL) {
+                                struct ggml_tensor* attention_mask = nullptr) {
         size_t N       = input_ids->ne[1];
         size_t n_token = input_ids->ne[0];
-        auto hidden_states = model.forward(ctx, input_ids, NULL, attention_mask, relative_position_bucket);  // [N, n_token, model_dim]
+        auto hidden_states = model.forward(ctx, input_ids, nullptr, attention_mask, relative_position_bucket);  // [N, n_token, model_dim]
         return hidden_states;
     }
     struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
-                                    struct ggml_tensor* attention_mask = NULL) {
+                                    struct ggml_tensor* attention_mask = nullptr) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
         input_ids      = to_backend(input_ids);
+        attention_mask = to_backend(attention_mask);
         relative_position_bucket_vec = compute_relative_position_bucket(input_ids->ne[0], input_ids->ne[0]);
@@ -786,22 +812,23 @@ struct T5Runner : public GGMLRunner {
                                            input_ids->ne[0]);
         set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data());
-        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, relative_position_bucket, attention_mask);
+        auto runner_ctx = get_context();
+        struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask);
         ggml_build_forward_expand(gf, hidden_states);
         return gf;
     }
-    void compute(const int n_threads,
+    bool compute(const int n_threads,
                  struct ggml_tensor* input_ids,
                  struct ggml_tensor* attention_mask,
                  ggml_tensor** output,
-                 ggml_context* output_ctx = NULL) {
+                 ggml_context* output_ctx = nullptr) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
             return build_graph(input_ids, attention_mask);
         };
-        GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
     }
     static std::vector<int> _relative_position_bucket(const std::vector<int>& relative_position,
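Since compute() now returns bool instead of void, callers can propagate graph-execution failures instead of crashing. A hedged sketch (runner, n_threads, input_ids, attention_mask, and work_ctx are stand-ins for whatever the caller already has in scope):

    ggml_tensor* out = nullptr;
    if (!runner.compute(n_threads, input_ids, attention_mask, &out, work_ctx)) {
        LOG_ERROR("t5 compute failed");
        return false;
    }
    // out now holds the [N, n_token, model_dim] hidden states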
@@ -876,17 +903,12 @@ struct T5Embedder {
     T5UniGramTokenizer tokenizer;
     T5Runner model;
-    static std::map<std::string, enum ggml_type> empty_tensor_types;
     T5Embedder(ggml_backend_t backend,
-               std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
+               bool offload_params_to_cpu,
+               const String2TensorStorage& tensor_storage_map = {},
                const std::string prefix = "",
-               int64_t num_layers = 24,
-               int64_t model_dim = 4096,
-               int64_t ff_dim = 10240,
-               int64_t num_heads = 64,
-               int64_t vocab_size = 32128)
-        : model(backend, tensor_types, prefix, num_layers, model_dim, ff_dim, num_heads, vocab_size) {
+               bool is_umt5 = false)
+        : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) {
     }
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
@@ -941,32 +963,29 @@ struct T5Embedder {
     void test() {
         struct ggml_init_params params;
         params.mem_size   = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
-        params.mem_buffer = NULL;
+        params.mem_buffer = nullptr;
         params.no_alloc   = false;
         struct ggml_context* work_ctx = ggml_init(params);
-        GGML_ASSERT(work_ctx != NULL);
+        GGML_ASSERT(work_ctx != nullptr);
         {
-            // cpu f16: pass
-            // cpu f32: pass
-            // cuda f16: nan
-            // cuda f32: pass
-            // cuda q8_0: nan
-            // TODO: fix cuda nan
             std::string text("a lovely cat");
-            auto tokens_and_weights = tokenize(text, 77, true);
+            // std::string text("一只可爱的猫");  // umt5 Chinese test
+            auto tokens_and_weights = tokenize(text, 512, true);
             std::vector<int>& tokens    = std::get<0>(tokens_and_weights);
             std::vector<float>& weights = std::get<1>(tokens_and_weights);
+            std::vector<float>& masks   = std::get<2>(tokens_and_weights);
             for (auto token : tokens) {
                 printf("%d ", token);
            }
            printf("\n");
            auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
-            struct ggml_tensor* out = NULL;
+            auto attention_mask = vector_to_ggml_tensor(work_ctx, masks);
+            struct ggml_tensor* out = nullptr;
            int t0 = ggml_time_ms();
-            model.compute(8, input_ids, NULL, &out, work_ctx);
+            model.compute(8, input_ids, attention_mask, &out, work_ctx);
            int t1 = ggml_time_ms();
            print_ggml_tensor(out);
@@ -975,32 +994,42 @@ struct T5Embedder {
     static void load_from_file_and_test(const std::string& file_path) {
+        // cpu f16: pass
+        // cpu f32: pass
+        // cuda f16: pass
+        // cuda f32: pass
+        // cuda q8_0: pass
         // ggml_backend_t backend = ggml_backend_cuda_init(0);
         ggml_backend_t backend = ggml_backend_cpu_init();
-        ggml_type model_data_type = GGML_TYPE_F32;
-        std::shared_ptr<T5Embedder> t5 = std::shared_ptr<T5Embedder>(new T5Embedder(backend));
-        {
-            LOG_INFO("loading from '%s'", file_path.c_str());
-            t5->alloc_params_buffer();
-            std::map<std::string, ggml_tensor*> tensors;
-            t5->get_param_tensors(tensors, "");
-            ModelLoader model_loader;
-            if (!model_loader.init_from_file(file_path)) {
-                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
-                return;
-            }
-            bool success = model_loader.load_tensors(tensors, backend);
-            if (!success) {
-                LOG_ERROR("load tensors from model loader failed");
-                return;
-            }
-            LOG_INFO("t5 model loaded");
-        }
+        ggml_type model_data_type = GGML_TYPE_F16;
+        ModelLoader model_loader;
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
+            LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
+            return;
+        }
+        auto& tensor_storage_map = model_loader.get_tensor_storage_map();
+        for (auto& [name, tensor_storage] : tensor_storage_map) {
+            if (ends_with(name, "weight")) {
+                tensor_storage.expected_type = model_data_type;
+            }
+        }
+        std::shared_ptr<T5Embedder> t5 = std::make_shared<T5Embedder>(backend, false, tensor_storage_map, "", true);
+        t5->alloc_params_buffer();
+        std::map<std::string, ggml_tensor*> tensors;
+        t5->get_param_tensors(tensors, "");
+        bool success = model_loader.load_tensors(tensors);
+        if (!success) {
+            LOG_ERROR("load tensors from model loader failed");
+            return;
+        }
+        LOG_INFO("t5 model loaded");
         t5->test();
     }
 };
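For reference, the rewritten test entry point is invoked the same way as before, e.g. from a scratch main() while debugging the text encoder (the path below is illustrative, not a file shipped with the repo):

    T5Embedder::load_from_file_and_test("umt5-xxl.safetensors");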

tae.hpp (52 changes)

@@ -29,7 +29,7 @@ public:
         }
     }
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
         // x: [n, n_in, h, w]
         // return: [n, n_out, h, w]
@@ -38,9 +38,9 @@ public:
         auto conv_4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.4"]);
         auto h = conv_0->forward(ctx, x);
-        h = ggml_relu_inplace(ctx, h);
+        h = ggml_relu_inplace(ctx->ggml_ctx, h);
         h = conv_2->forward(ctx, h);
-        h = ggml_relu_inplace(ctx, h);
+        h = ggml_relu_inplace(ctx->ggml_ctx, h);
         h = conv_4->forward(ctx, h);
         if (n_in != n_out) {
@@ -49,8 +49,8 @@ public:
             x = skip->forward(ctx, x);
         }
-        h = ggml_add(ctx, h, x);
-        h = ggml_relu_inplace(ctx, h);
+        h = ggml_add(ctx->ggml_ctx, h, x);
+        h = ggml_relu_inplace(ctx->ggml_ctx, h);
         return h;
     }
 };
@@ -86,7 +86,7 @@ public:
         blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1}));
     }
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
         // x: [n, in_channels, h, w]
         // return: [n, z_channels, h/8, w/8]
@@ -136,20 +136,20 @@ public:
         blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
     }
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* z) override {
         // z: [n, z_channels, h, w]
         // return: [n, out_channels, h*8, w*8]
-        auto h = ggml_scale(ctx, z, 1.0f / 3.0f);
-        h = ggml_tanh_inplace(ctx, h);
-        h = ggml_scale(ctx, h, 3.0f);
+        auto h = ggml_scale(ctx->ggml_ctx, z, 1.0f / 3.0f);
+        h = ggml_tanh_inplace(ctx->ggml_ctx, h);
+        h = ggml_scale(ctx->ggml_ctx, h, 3.0f);
         for (int i = 0; i < num_blocks * 3 + 10; i++) {
             if (blocks.find(std::to_string(i)) == blocks.end()) {
                 if (i == 1) {
-                    h = ggml_relu_inplace(ctx, h);
+                    h = ggml_relu_inplace(ctx->ggml_ctx, h);
                 } else {
-                    h = ggml_upscale(ctx, h, 2, GGML_SCALE_MODE_NEAREST);
+                    h = ggml_upscale(ctx->ggml_ctx, h, 2, GGML_SCALE_MODE_NEAREST);
                 }
                 continue;
             }
@@ -180,12 +180,12 @@ public:
         }
     }
-    struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) {
+    struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) {
         auto decoder = std::dynamic_pointer_cast<TinyDecoder>(blocks["decoder.layers"]);
         return decoder->forward(ctx, z);
     }
-    struct ggml_tensor* encode(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* encode(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
         auto encoder = std::dynamic_pointer_cast<TinyEncoder>(blocks["encoder.layers"]);
         return encoder->forward(ctx, x);
     }
@@ -196,21 +196,22 @@ struct TinyAutoEncoder : public GGMLRunner {
     bool decode_only = false;
     TinyAutoEncoder(ggml_backend_t backend,
-                    std::map<std::string, enum ggml_type>& tensor_types,
+                    bool offload_params_to_cpu,
+                    const String2TensorStorage& tensor_storage_map,
                     const std::string prefix,
                     bool decoder_only = true,
                     SDVersion version = VERSION_SD1)
         : decode_only(decoder_only),
           taesd(decoder_only, version),
-          GGMLRunner(backend) {
-        taesd.init(params_ctx, tensor_types, prefix);
+          GGMLRunner(backend, offload_params_to_cpu) {
+        taesd.init(params_ctx, tensor_storage_map, prefix);
     }
-    std::string get_desc() {
+    std::string get_desc() override {
         return "taesd";
     }
-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false");
         alloc_params_buffer();
         std::map<std::string, ggml_tensor*> taesd_tensors;
@@ -221,12 +222,12 @@ struct TinyAutoEncoder : public GGMLRunner {
         }
         ModelLoader model_loader;
-        if (!model_loader.init_from_file(file_path)) {
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
             LOG_ERROR("init taesd model loader from file failed: '%s'", file_path.c_str());
             return false;
         }
-        bool success = model_loader.load_tensors(taesd_tensors, backend, ignore_tensors);
+        bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors, n_threads);
         if (!success) {
             LOG_ERROR("load tae tensors from model loader failed");
@@ -240,21 +241,22 @@ struct TinyAutoEncoder : public GGMLRunner {
     struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
         z = to_backend(z);
-        struct ggml_tensor* out = decode_graph ? taesd.decode(compute_ctx, z) : taesd.encode(compute_ctx, z);
+        auto runner_ctx = get_context();
+        struct ggml_tensor* out = decode_graph ? taesd.decode(&runner_ctx, z) : taesd.encode(&runner_ctx, z);
         ggml_build_forward_expand(gf, out);
         return gf;
     }
-    void compute(const int n_threads,
+    bool compute(const int n_threads,
                  struct ggml_tensor* z,
                  bool decode_graph,
                  struct ggml_tensor** output,
-                 struct ggml_context* output_ctx = NULL) {
+                 struct ggml_context* output_ctx = nullptr) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
             return build_graph(z, decode_graph);
         };
-        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
     }
 };

thirdparty/darts.h (vendored, 3 changes)

@@ -4,6 +4,7 @@
 #include <cstdio>
 #include <exception>
 #include <new>
+#include <iostream>
 #define DARTS_VERSION "0.32"
@@ -1140,9 +1141,11 @@ inline void DawgBuilder::insert(const char *key, std::size_t length,
   if (value < 0) {
     DARTS_THROW("failed to insert key: negative value");
   } else if (length == 0) {
+    std::cout << value << std::endl;
     DARTS_THROW("failed to insert key: zero-length key");
   }
   id_type id = 0;
   std::size_t key_pos = 0;

Some files were not shown because too many files have changed in this diff.