Compare commits

...

123 Commits

Author SHA1 Message Date
leejet
545fac4f3f
refactor: simplify sample cache flow (#1350) 2026-03-17 00:28:03 +08:00
Tay
5265a5efa1
perf(z-image): switch to fused SwiGLU kernel (#1302) 2026-03-17 00:27:46 +08:00
leejet
84cbd88df1
style: remove redundant struct qualifiers for consistent C/C++ type usage (#1349) 2026-03-16 22:17:22 +08:00
Daniele
997bb11fb6
fix: correct encoder channels for flux2 (#1346) 2026-03-16 22:16:43 +08:00
leejet
862a6586cb
feat: add embedded WebUI (#1207) 2026-03-16 00:26:57 +08:00
leejet
61d8331ef3 ci: avoid cuda docker build timeout by using -j16 2026-03-15 18:39:29 +08:00
leejet
acc3bf1fdc
refactor: optimize the VAE architecture (#1345) 2026-03-15 16:57:42 +08:00
Kevin Nause
83eabd7c01
ci: add CUDA Dockerfile (#1314) 2026-03-15 16:46:01 +08:00
Wagner Bruna
630ee03f23
refactor: move all cache parameter defaults to the library (#1327) 2026-03-15 16:43:46 +08:00
Wagner Bruna
f6968bc589
chore: remove SD_FAST_SOFTMAX build flag (#1338) 2026-03-15 16:42:47 +08:00
rmatif
adfef62900
feat: add generic DiT support to spectrum cache (#1336) 2026-03-15 16:41:05 +08:00
JusteLeo
6fa7ca9317
docs: add Anima2 gguf download link to anima.md (#1335) 2026-03-15 16:40:14 +08:00
leejet
d6dd6d7b55
refactor: remove ununsed encode_video (#1332) 2026-03-10 00:36:09 +08:00
rmatif
dea4980f4e
feat: add spectrum caching method (#1322) 2026-03-10 00:35:32 +08:00
leejet
c8fb3d2458
fix: resolve SD1 Pix2Pix issue (#1329) 2026-03-08 00:28:05 +08:00
stduhpf
3d33caaef8
fix: make tiling work better when using circular (#1299) 2026-03-08 00:25:07 +08:00
WinkelCode
9b424db0f4
ci: change workflow owner of "actions-commit-hash" from "pr-mpt" to "prompt" (#1323) 2026-03-08 00:23:23 +08:00
rmatif
d95062737e
fix: ucache: normalize reuse error (#1313) 2026-03-04 23:50:45 +08:00
Korsar13
7c880f80c7
fix: avoid sd-server memory leak (#1316) 2026-03-04 23:47:38 +08:00
leejet
aaa8a51bd8 docs: update sd-cli/sd-server docs 2026-03-04 00:41:17 +08:00
leejet
ba35dd734e
refactor: introduce ggml_ext_zeros_like/ggml_ext_ones_like (#1312) 2026-03-04 00:36:52 +08:00
bssrdf
d41f5fff69
perf: improved flux attention qkv unpacking (#1306) 2026-03-04 00:36:32 +08:00
Korsar13
810ef0cf76
fix: reset weight adapter for models if no loras in request (#1307) 2026-03-04 00:34:07 +08:00
leejet
5792c66879
feat: support some non-standard Anima weight names (#1305) 2026-03-01 22:01:29 +08:00
Wagner Bruna
39d54702a6
feat: accept legacy image parameter on v1/images/edits (#1270) 2026-03-01 22:00:50 +08:00
Wagner Bruna
60889bc9a1
fix: correct sdapi LoRA file handling (#1276) 2026-03-01 21:57:06 +08:00
leejet
e64baa3611
refactor: reuse DiT's patchify/unpatchify functions (#1304) 2026-03-01 21:44:51 +08:00
leejet
cec4aedcfd docs: add anima docs 2026-03-01 15:32:25 +08:00
rmatif
4cdfff5ff2
feat: add Anima support (#1296) 2026-03-01 15:23:18 +08:00
leejet
0752cc9d3a
fix: resolve image quality degradation issue (#1297) 2026-02-26 00:26:21 +08:00
Wagner Bruna
b314d80ad0
feat: turn flow_shift into a generation parameter (#1289)
* feat: turn flow_shift into a generation parameter

* format code

* simplify set_shift/set_parameters

* fix sd_sample_params_to_str

* remove unused variable

* update docs

---------

Co-authored-by: leejet <leejet714@gmail.com>
2026-02-26 00:26:04 +08:00
leejet
c9cd49701a
fix: safely handle whitespace and consecutive newlines (#1288) 2026-02-19 20:54:42 +08:00
akleine
c5eb1e4137
fix: avoid black images if using an invalid VAE (for SDXL) (#1273) 2026-02-19 20:54:18 +08:00
leejet
636d3cb6ff
refactor: reorganize the vocab file structure (#1271) 2026-02-11 00:44:17 +08:00
Wagner Bruna
adea272225
feat(server): use image and command-line dimensions by default on server (#1262) 2026-02-11 00:42:50 +08:00
Mario Limonciello
45ce78a3ae
ci: correct rocm artifact of linux (#1269) 2026-02-10 23:19:28 +08:00
leejet
28ef93c0e1
refactor: reorganize the file structure (#1266) 2026-02-10 23:13:35 +08:00
leejet
3296545090
feat: add extra_c_crossattns support for llm embedder (#1265) 2026-02-10 00:00:17 +08:00
akleine
d60fb27560
fix: avoid unwanted file extension changes (#1257) 2026-02-09 23:59:43 +08:00
Wagner Bruna
c7ccafbd6f
fix: correct sdapi handling of cfg_scale and steps (#1260) 2026-02-09 23:34:19 +08:00
stduhpf
aa0b899397
fix: improve handling of VAE decode failures (#1222) 2026-02-09 23:29:41 +08:00
Mario Limonciello
5e264372ce
ci: add a github action to generate a Linux ROCm artifact (#1258) 2026-02-09 23:23:06 +08:00
leejet
f0f641a142
feat(server): add lora support to sdapi (#1256) 2026-02-08 00:11:16 +08:00
stduhpf
9f56833e14
feat: optimize LoKr at runtime (#1233) 2026-02-08 00:08:09 +08:00
Roj234
65891d74cc
fix: avoid the issue of NaN for qwen-image on certain devices (#1249) 2026-02-04 23:49:05 +08:00
leejet
f957fa3d2a
feat: add --fa option (#1242) 2026-02-01 21:44:54 +08:00
leejet
c252e03c6b sync: update ggml 2026-02-01 20:54:23 +08:00
rmatif
e63daba33d
feat: add res_multistep, res_2s sampler and bong tangent scheduler (#1234) 2026-02-01 20:05:27 +08:00
stduhpf
3959109281
fix: improve LoCon support with other naming conventions (#1239) 2026-02-01 20:00:16 +08:00
leejet
e411520407 docs: add z-image-base example 2026-01-28 21:47:36 +08:00
leejet
43e829f219
refactor: unify the processing of attention mask (#1230) 2026-01-26 00:33:34 +08:00
leejet
7837232631
perf: make dit faster (#1228) 2026-01-25 22:50:10 +08:00
Equious
4ccce027b2
fix: correct mask and control image loading in cli (#1229) 2026-01-25 22:47:52 +08:00
leejet
fa61ea744d
fix: set default lora_model_dir to . (#1224) 2026-01-23 22:13:59 +08:00
leejet
5e4579c11d
feat: use image width and height when not explicitly set (#1206) 2026-01-22 23:54:41 +08:00
Wagner Bruna
329571131d
chore: clarify warning about missing model files (#1219) 2026-01-21 22:34:11 +08:00
leejet
a48b4a3ade docs: add FLUX.2-klein support to news 2026-01-19 23:56:50 +08:00
stduhpf
b87fe13afd
feat: support new chroma radiance "x0_x32_proto" (#1209) 2026-01-19 23:51:26 +08:00
Oleg Skutte
e50e1f253d
feat: add taef2 support (#1211) 2026-01-19 23:39:36 +08:00
leejet
c6206fb351 fix: set VAE conv scale for all SDXL variants 2026-01-19 23:21:48 +08:00
akleine
639091fbe9
feat: add support for Segmind's Vega model (#1195) 2026-01-19 23:15:47 +08:00
leejet
9293016c9d docs: update esrgan.md 2026-01-19 23:00:50 +08:00
leejet
2efd19978d
fix: use Unix timestamp for field instead of ISO string (#1205) 2026-01-19 00:21:29 +08:00
Wagner Bruna
61659ef299
feat: add basic sdapi support to sd-server (#1197)
* feat: add basic sdapi support to sd-server

Compatible with AUTOMATIC1111 / Forge.

* fix img2img with no mask

* add more parameter validation

* eliminate MSVC warnings

---------

Co-authored-by: leejet <leejet714@gmail.com>
2026-01-19 00:21:11 +08:00
leejet
9565c7f6bd
add support for flux2 klein (#1193)
* add support for flux2 klein 4b

* add support for flux2 klein 8b

* use attention_mask in Flux.2 klein LLMEmbedder

* update docs
2026-01-18 01:17:33 +08:00
Wagner Bruna
fbce16e02d
fix: avoid undefined behavior on image mask allocation failure (#1198) 2026-01-18 01:14:56 +08:00
akleine
7010bb4dff
feat: support for SDXS-512 model (#1180)
* feat: add U-Net specials of SDXS

* docs: update distilled_sd.md for SDXS-512

* feat: for SDXS use AutoencoderTiny as the primary VAE

* docs: update distilled_sd.md for SDXS-512

* fix: SDXS code cleaning after review by stduhpf

* format code

* fix sdxs with --taesd-preview-only

---------

Co-authored-by: leejet <leejet714@gmail.com>
2026-01-14 01:14:57 +08:00
Wagner Bruna
48d3161a8d
feat: add sd-server API support for steps, sampler and scheduler (#1173) 2026-01-14 00:34:27 +08:00
Weiqi Gao
271b594e74
sync: update ggml (#1187) 2026-01-14 00:28:55 +08:00
leejet
885e62ea82
refactor: replace ggml_ext_attention with ggml_ext_attention_ext (#1185) 2026-01-11 16:34:13 +08:00
rmatif
0e52afc651
feat: enable vae tiling for vid gen (#1152)
* enable vae tiling for vid gen

* format code

* eliminate compilation warning

---------

Co-authored-by: leejet <leejet714@gmail.com>
2026-01-08 23:23:05 +08:00
leejet
27b5f17401 ci: only push Docker images on master or release 2026-01-08 23:03:32 +08:00
Flavio Bizzarri
dfe6d6c664
fix: missing newline after seed in sd_img_gen_params_to_str (#1183) 2026-01-08 22:52:22 +08:00
leejet
9be0b91927 docs: fix safetensors file extension notation 2026-01-06 23:31:03 +08:00
evanreichard
e7e83ed4d1
fix(server): use has_file for mask multipart detection (#1178) 2026-01-06 23:16:05 +08:00
Wagner Bruna
c5602a676c
feat: prioritize gguf and safetensors formats for embeddings and LoRAs (#1169) 2026-01-05 23:58:09 +08:00
Nuno
c34730d9b4
chore: downgrade ubuntu base image in musa container image (#1176)
Signed-off-by: rare-magma <rare-magma@posteo.eu>
2026-01-05 23:56:34 +08:00
Nuno
fdcacc1ebb
ci: cancel old github action runs (#1172)
* ci: cancel old github action runs

Signed-off-by: rare-magma <rare-magma@posteo.eu>

* ci: adjust concurrency to avoid canceling non-PR workflows

---------

Signed-off-by: rare-magma <rare-magma@posteo.eu>
Co-authored-by: leejet <leejet714@gmail.com>
2026-01-05 23:52:34 +08:00
Nuno
496ec9421e
chore: add Linux Vulkan build and Docker image workflows (#1164) 2026-01-05 23:42:12 +08:00
leejet
05006cd6e1
chore: use CMAKE_BUILD_TYPE (#1175) 2026-01-05 23:29:22 +08:00
leejet
b90b1ee9cf
chore: eliminate compilation warnings under MSVC (#1170) 2026-01-04 22:26:57 +08:00
leejet
2cef4badb8 chore: use Release build for windows-latest-cmake 2026-01-04 22:26:09 +08:00
Daniele
a119a4da9a
fix: avoid issues when sigma_min is close to 0 (#1138) 2026-01-04 22:05:01 +08:00
Jay4242
6eefd2d49a
feat: support random seed flag (#1163) 2026-01-04 21:57:50 +08:00
leejet
4ff2c8c74b
refactor: simplify logic for saving results (#1149) 2025-12-28 23:27:27 +08:00
leejet
51bd9c8004 chore: reformat named cache params description into single line 2025-12-28 22:53:07 +08:00
Wagner Bruna
d0d836ae74
feat: support mmap for model loading (#1059) 2025-12-28 22:38:29 +08:00
leejet
a2d83dd0c8
refactor: move pmid condition logic into get_pmid_condition (#1148) 2025-12-27 16:48:15 +08:00
Wagner Bruna
cc107714d7
fix: consistently pass 2nd-order samplers half steps as negatives (#1095) 2025-12-27 15:54:18 +08:00
leejet
37c9860b79
fix: handle redirected UTF-8 output correctly on Windows (#1147) 2025-12-27 15:43:19 +08:00
leejet
ccb6b0ac9d
feat: add __index_timestep_zero__ support (#1146) 2025-12-26 22:07:40 +08:00
Weiqi Gao
df4efe26bd
feat: add png sequence output for vid_gen (#1117) 2025-12-26 22:06:13 +08:00
leejet
860a78e248
fix: avoid crash when using taesd for preview only (#1141) 2025-12-24 23:30:12 +08:00
leejet
a0adcfb148
feat: add support for qwen image edit 2511 (#1096) 2025-12-24 23:00:08 +08:00
leejet
3d5fdd7b37
feat: add support for more underline loras (#1135) 2025-12-24 22:59:23 +08:00
Weiqi Gao
3e6c428c27
chore: use Ninja on Windows to speed up build process (#1120) 2025-12-24 22:53:17 +08:00
张春乔
96fcb13fc0
feat: add --serve-html-path option to example server (#1123) 2025-12-24 22:43:09 +08:00
leejet
3e812460cf
fix: correct ggml_pad_ext (#1133) 2025-12-23 21:37:07 +08:00
leejet
98916e8256 docs: update README.md 2025-12-22 23:58:28 +08:00
rmatif
298b11069f
feat: add more caching methods (#1066) 2025-12-22 23:52:11 +08:00
leejet
30a91138f8 fix: add the missing } 2025-12-21 21:53:38 +08:00
leejet
c6937ba44a fix: correct the parsing of --convert-name opotion 2025-12-21 21:47:50 +08:00
leejet
ca5b1969a8
feat: do not convert tensor names by default in convert mode (#1122) 2025-12-21 18:40:10 +08:00
Phylliida Dev
50ff966445
feat: add seamless texture generation support (#914)
* global bool

* reworked circular to global flag

* cleaner implementation of tiling support in sd cpp

* cleaned rope

* working simplified but still need wraps

* Further clean of rope

* resolve flux conflict

* switch to pad op circular only

* Set ggml to most recent

* Revert ggml temp

* Update ggml to most recent

* Revert unneded flux change

* move circular flag to the GGMLRunnerContext

* Pass through circular param in all places where conv is called

* fix of constant and minor cleanup

* Added back --circular option

* Conv2d circular in vae and various models

* Fix temporal padding for qwen image and other vaes

* Z Image circular tiling

* x and y axis seamless only

* First attempt at chroma seamless x and y

* refactor into pure x and y, almost there

* Fix crash on chroma

* Refactor into cleaner variable choices

* Removed redundant set_circular_enabled

* Sync ggml

* simplify circular parameter

* format code

* no need to perform circular pad on the clip

* simplify circular_axes setting

* unify function naming

* remove unnecessary member variables

* simplify rope

---------

Co-authored-by: Phylliida <phylliidadev@gmail.com>
Co-authored-by: leejet <leejet714@gmail.com>
2025-12-21 18:06:47 +08:00
leejet
88ec9d30b1
feat: add scale_rope support (#1121) 2025-12-21 15:40:21 +08:00
stduhpf
60abda56e0
feat: select vulkan device with env variable (#629) 2025-12-21 15:35:38 +08:00
stduhpf
23fce0bd84
feat: add support for Chroma Radiance x0 (#1091)
* Add x0 Flux pred (+prepare for others)

* Fix convert models with empty tensors

* patch_32 exp support attempt

* improve support for patch_32

* follow official pipeline

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-12-20 00:55:57 +08:00
Wagner Bruna
7c88c4765c
chore: give feedback about cfg values smaller than 1 (#1088) 2025-12-19 23:41:52 +08:00
Weiqi Gao
1f77545cf8
docs: document usage of tae for VRAM reduction using wan (#1108) 2025-12-19 23:31:09 +08:00
leejet
8e9f3a4d9e
feat: add support for underline style lora of flux (#1103)
* feat: add support for underline style lora of flux

* add support for underline style lora of t5

* add more protected tokens
2025-12-18 21:44:16 +08:00
Wagner Bruna
78e15bd4af
feat: default to LCM scheduler for LCM sampling (#1109)
* feat: default to LCM scheduler for LCM sampling

* fix bug and attempt to get default scheduler for vid_gen when none is set

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-12-18 21:43:39 +08:00
Daniele
97cf2efe45
feat: add KL Optimal scheduler (#1098) 2025-12-18 21:02:55 +08:00
leejet
bda7fab9f2 chore: remove unused debug code 2025-12-17 23:43:37 +08:00
leejet
c2e18c86e8
fix: make flash attn work with high noise diffusion model (#1111) 2025-12-17 23:28:59 +08:00
leejet
c3ad6a13e1
refactor: optimize the printing of version log (#1102) 2025-12-16 23:11:27 +08:00
leejet
ebe9d26a72
feat: supports correct UTF-8 printing on windows (#1101) 2025-12-16 23:00:41 +08:00
stduhpf
9fa7f415df
feat: add taehv support for Wan/Qwen (#937) 2025-12-16 22:57:34 +08:00
akleine
a23262dfde
fix: added a clean exit in ModelLoader::load_tensors if OOM (#1097) 2025-12-16 22:45:10 +08:00
Wagner Bruna
e687913bf1
chore: remove lora_model_dir parameter (#1100) 2025-12-16 22:37:45 +08:00
Wagner Bruna
200cb6f2ca
fix: avoid crash with VAE tiling and certain image sizes (#1090) 2025-12-15 23:51:40 +08:00
leejet
43a70e819b
fix: add lora info to image metadata (#1086) 2025-12-14 01:24:15 +08:00
Kirill A. Korinsky
614f8736df
sync: update ggml (#1082) 2025-12-14 01:23:34 +08:00
stduhpf
d96b4152d6
perf: optimize ggml_ext_chunk (#1084) 2025-12-14 01:22:41 +08:00
98 changed files with 11323 additions and 4750 deletions

View File

@ -1,4 +1,5 @@
build*/ build*/
docs/
test/ test/
.cache/ .cache/

View File

@ -21,11 +21,13 @@ on:
"**/*.c", "**/*.c",
"**/*.cpp", "**/*.cpp",
"**/*.cu", "**/*.cu",
"examples/server/frontend/**",
] ]
pull_request: pull_request:
types: [opened, synchronize, reopened] types: [opened, synchronize, reopened]
paths: paths:
[ [
".github/workflows/**",
"**/CMakeLists.txt", "**/CMakeLists.txt",
"**/Makefile", "**/Makefile",
"**/*.h", "**/*.h",
@ -33,11 +35,16 @@ on:
"**/*.c", "**/*.c",
"**/*.cpp", "**/*.cpp",
"**/*.cu", "**/*.cu",
"examples/server/frontend/**",
] ]
env: env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }} BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs: jobs:
ubuntu-latest-cmake: ubuntu-latest-cmake:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@ -49,6 +56,16 @@ jobs:
with: with:
submodules: recursive submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Dependencies - name: Dependencies
id: depends id: depends
run: | run: |
@ -66,7 +83,7 @@ jobs:
- name: Get commit hash - name: Get commit hash
id: commit id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2 uses: prompt/actions-commit-hash@v2
- name: Fetch system info - name: Fetch system info
id: system-info id: system-info
@ -92,6 +109,143 @@ jobs:
path: | path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
ubuntu-latest-cmake-vulkan:
runs-on: ubuntu-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libvulkan-dev glslc
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DSD_BUILD_SHARED_LIBS=ON -DSD_VULKAN=ON
cmake --build . --config Release
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
run: |
echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip
build-and-push-docker-images:
name: Build and push container images
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
id-token: write
attestations: write
artifact-metadata: write
strategy:
matrix:
variant: [musa, sycl, vulkan, cuda]
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
steps:
- name: Checkout
uses: actions/checkout@v6
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to the container registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata for Docker
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@v1.3.1
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false
- name: Build and push Docker image
id: build-push
uses: docker/build-push-action@v6
with:
platforms: linux/amd64
push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
file: Dockerfile.${{ matrix.variant }}
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}
labels: ${{ steps.meta.outputs.labels }}
annotations: ${{ steps.meta.outputs.annotations }}
macOS-latest-cmake: macOS-latest-cmake:
runs-on: macos-latest runs-on: macos-latest
@ -102,6 +256,16 @@ jobs:
with: with:
submodules: recursive submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Dependencies - name: Dependencies
id: depends id: depends
run: | run: |
@ -119,7 +283,7 @@ jobs:
- name: Get commit hash - name: Get commit hash
id: commit id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2 uses: prompt/actions-commit-hash@v2
- name: Fetch system info - name: Fetch system info
id: system-info id: system-info
@ -146,7 +310,7 @@ jobs:
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
windows-latest-cmake: windows-latest-cmake:
runs-on: windows-2025 runs-on: windows-2022
env: env:
VULKAN_VERSION: 1.4.328.1 VULKAN_VERSION: 1.4.328.1
@ -163,8 +327,8 @@ jobs:
- build: "avx512" - build: "avx512"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON" defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "cuda12" - build: "cuda12"
defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120'" defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120' -DCMAKE_CUDA_FLAGS='-Xcudafe \"--diag_suppress=177\" -Xcudafe \"--diag_suppress=550\"'"
- build: 'vulkan' - build: "vulkan"
defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON" defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
steps: steps:
- name: Clone - name: Clone
@ -173,6 +337,16 @@ jobs:
with: with:
submodules: recursive submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Install cuda-toolkit - name: Install cuda-toolkit
id: cuda-toolkit id: cuda-toolkit
if: ${{ matrix.build == 'cuda12' }} if: ${{ matrix.build == 'cuda12' }}
@ -191,13 +365,17 @@ jobs:
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
- name: Activate MSVC environment
id: msvc_dev_cmd
uses: ilammy/msvc-dev-cmd@v1
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
mkdir build mkdir build
cd build cd build
cmake .. ${{ matrix.defines }} cmake .. -DCMAKE_CXX_FLAGS='/bigobj' -G Ninja -DCMAKE_C_COMPILER=cl.exe -DCMAKE_CXX_COMPILER=cl.exe -DCMAKE_BUILD_TYPE=Release ${{ matrix.defines }}
cmake --build . --config Release cmake --build .
- name: Check AVX512F support - name: Check AVX512F support
id: check_avx512f id: check_avx512f
@ -215,7 +393,7 @@ jobs:
- name: Get commit hash - name: Get commit hash
id: commit id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2 uses: prompt/actions-commit-hash@v2
- name: Pack artifacts - name: Pack artifacts
id: pack_artifacts id: pack_artifacts
@ -274,6 +452,16 @@ jobs:
with: with:
submodules: recursive submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Cache ROCm Installation - name: Cache ROCm Installation
id: cache-rocm id: cache-rocm
uses: actions/cache@v4 uses: actions/cache@v4
@ -338,7 +526,7 @@ jobs:
- name: Get commit hash - name: Get commit hash
id: commit id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2 uses: prompt/actions-commit-hash@v2
- name: Pack artifacts - name: Pack artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -360,6 +548,156 @@ jobs:
path: | path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
ubuntu-latest-rocm:
runs-on: ubuntu-latest
container: rocm/dev-ubuntu-24.04:7.2
env:
ROCM_VERSION: "7.2"
UBUNTU_VERSION: "24.04"
GPU_TARGETS: "gfx1151;gfx1150;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
steps:
- run: apt-get update && apt-get install -y git
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Free disk space
run: |
# Remove preinstalled SDKs and caches not needed for this job
sudo rm -rf /usr/share/dotnet || true
sudo rm -rf /usr/local/lib/android || true
sudo rm -rf /opt/ghc || true
sudo rm -rf /usr/local/.ghcup || true
sudo rm -rf /opt/hostedtoolcache || true
# Remove old package lists and caches
sudo rm -rf /var/lib/apt/lists/* || true
sudo apt clean
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt install -y \
cmake \
hip-dev \
hipblas-dev \
ninja-build \
rocm-dev \
zip
# Clean apt caches to recover disk space
sudo apt clean
sudo rm -rf /var/lib/apt/lists/* || true
- name: Setup ROCm Environment
run: |
# Add ROCm to PATH for current session
echo "/opt/rocm/bin" >> $GITHUB_PATH
# Build regex pattern from ${{ env.GPU_TARGETS }} (match target as substring)
TARGET_REGEX="($(printf '%s' "${{ env.GPU_TARGETS }}" | sed 's/;/|/g'))"
# Remove library files for architectures we're not building for to save disk space
echo "Cleaning up unneeded architecture files..."
cd /opt/rocm/lib/rocblas/library
# Keep only our target architectures
for file in *; do
if printf '%s' "$file" | grep -q 'gfx'; then
if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
echo "Removing $file" &&
sudo rm -f "$file";
fi
fi
done
cd /opt/rocm/lib/hipblaslt/library
for file in *; do
if printf '%s' "$file" | grep -q 'gfx'; then
if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
echo "Removing $file" &&
sudo rm -f "$file";
fi
fi
done
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -G Ninja \
-DCMAKE_CXX_COMPILER=amdclang++ \
-DCMAKE_C_COMPILER=amdclang \
-DCMAKE_BUILD_TYPE=Release \
-DSD_HIPBLAS=ON \
-DGPU_TARGETS="${{ env.GPU_TARGETS }}" \
-DAMDGPU_TARGETS="${{ env.GPU_TARGETS }}" \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DSD_BUILD_SHARED_LIBS=ON
cmake --build . --config Release
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
- name: Prepare artifacts
id: prepare_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
# Copy licenses
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
# Move ROCm runtime libraries (to avoid double space consumption)
sudo mv /opt/rocm/lib/librocsparse.so* ./build/bin/
sudo mv /opt/rocm/lib/libhsa-runtime64.so* ./build/bin/
sudo mv /opt/rocm/lib/libamdhip64.so* ./build/bin/
sudo mv /opt/rocm/lib/libhipblas.so* ./build/bin/
sudo mv /opt/rocm/lib/libhipblaslt.so* ./build/bin/
sudo mv /opt/rocm/lib/librocblas.so* ./build/bin/
sudo mv /opt/rocm/lib/rocblas/ ./build/bin/
sudo mv /opt/rocm/lib/hipblaslt/ ./build/bin/
- name: Fetch system info
id: system-info
run: |
echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip ./build/bin
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
release: release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -367,6 +705,9 @@ jobs:
needs: needs:
- ubuntu-latest-cmake - ubuntu-latest-cmake
- ubuntu-latest-cmake-vulkan
- ubuntu-latest-rocm
- build-and-push-docker-images
- macOS-latest-cmake - macOS-latest-cmake
- windows-latest-cmake - windows-latest-cmake
- windows-latest-cmake-hip - windows-latest-cmake-hip
@ -392,7 +733,7 @@ jobs:
- name: Get commit hash - name: Get commit hash
id: commit id: commit
uses: pr-mpt/actions-commit-hash@v2 uses: prompt/actions-commit-hash@v2
- name: Create release - name: Create release
id: create_release id: create_release

3
.gitmodules vendored
View File

@ -1,3 +1,6 @@
[submodule "ggml"] [submodule "ggml"]
path = ggml path = ggml
url = https://github.com/ggml-org/ggml.git url = https://github.com/ggml-org/ggml.git
[submodule "examples/server/frontend"]
path = examples/server/frontend
url = https://github.com/leejet/stable-ui.git

View File

@ -8,6 +8,11 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif() endif()
if (MSVC)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
endif()
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
@ -31,7 +36,6 @@ option(SD_VULKAN "sd: vulkan backend" OFF)
option(SD_OPENCL "sd: opencl backend" OFF) option(SD_OPENCL "sd: opencl backend" OFF)
option(SD_SYCL "sd: sycl backend" OFF) option(SD_SYCL "sd: sycl backend" OFF)
option(SD_MUSA "sd: musa backend" OFF) option(SD_MUSA "sd: musa backend" OFF)
option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF) option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF) option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF)
option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF) option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
@ -65,26 +69,22 @@ if (SD_HIPBLAS)
message("-- Use HIPBLAS as backend stable-diffusion") message("-- Use HIPBLAS as backend stable-diffusion")
set(GGML_HIP ON) set(GGML_HIP ON)
add_definitions(-DSD_USE_CUDA) add_definitions(-DSD_USE_CUDA)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
endif () endif ()
if(SD_MUSA) if(SD_MUSA)
message("-- Use MUSA as backend stable-diffusion") message("-- Use MUSA as backend stable-diffusion")
set(GGML_MUSA ON) set(GGML_MUSA ON)
add_definitions(-DSD_USE_CUDA) add_definitions(-DSD_USE_CUDA)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
endif() endif()
set(SD_LIB stable-diffusion) set(SD_LIB stable-diffusion)
file(GLOB SD_LIB_SOURCES file(GLOB SD_LIB_SOURCES
"*.h" "src/*.h"
"*.cpp" "src/*.cpp"
"*.hpp" "src/*.hpp"
"src/vocab/*.h"
"src/vocab/*.cpp"
) )
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH) find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
@ -114,7 +114,7 @@ endif()
message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}") message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
set_property( set_property(
SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/version.cpp
APPEND PROPERTY COMPILE_DEFINITIONS APPEND PROPERTY COMPILE_DEFINITIONS
SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION} SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
) )
@ -177,6 +177,7 @@ endif()
add_subdirectory(thirdparty) add_subdirectory(thirdparty)
target_link_libraries(${SD_LIB} PUBLIC ggml zip) target_link_libraries(${SD_LIB} PUBLIC ggml zip)
target_include_directories(${SD_LIB} PUBLIC . include)
target_include_directories(${SD_LIB} PUBLIC . thirdparty) target_include_directories(${SD_LIB} PUBLIC . thirdparty)
target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17) target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)
@ -185,7 +186,7 @@ if (SD_BUILD_EXAMPLES)
add_subdirectory(examples) add_subdirectory(examples)
endif() endif()
set(SD_PUBLIC_HEADERS stable-diffusion.h) set(SD_PUBLIC_HEADERS include/stable-diffusion.h)
set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}") set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}")
install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER) install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER)

View File

@ -1,4 +1,4 @@
ARG UBUNTU_VERSION=22.04 ARG UBUNTU_VERSION=24.04
FROM ubuntu:$UBUNTU_VERSION AS build FROM ubuntu:$UBUNTU_VERSION AS build
@ -18,5 +18,6 @@ RUN apt-get update && \
apt-get clean apt-get clean
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
ENTRYPOINT [ "/sd-cli" ] ENTRYPOINT [ "/sd-cli" ]

25
Dockerfile.cuda Normal file
View File

@ -0,0 +1,25 @@
ARG CUDA_VERSION=12.6.3
ARG UBUNTU_VERSION=24.04
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake
WORKDIR /sd.cpp
COPY . .
ARG CUDACXX=/usr/local/cuda/bin/nvcc
RUN cmake . -B ./build -DSD_CUDA=ON
RUN cmake --build ./build --config Release -j$(nproc)
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime
RUN apt-get update && \
apt-get install --yes --no-install-recommends libgomp1 && \
apt-get clean
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
ENTRYPOINT [ "/sd-cli" ]

View File

@ -19,5 +19,6 @@ RUN mkdir build && cd build && \
FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 as runtime FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 as runtime
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
ENTRYPOINT [ "/sd-cli" ] ENTRYPOINT [ "/sd-cli" ]

View File

@ -15,5 +15,6 @@ RUN mkdir build && cd build && \
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
ENTRYPOINT [ "/sd-cli" ] ENTRYPOINT [ "/sd-cli" ]

23
Dockerfile.vulkan Normal file
View File

@ -0,0 +1,23 @@
ARG UBUNTU_VERSION=24.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake libvulkan-dev glslc
WORKDIR /sd.cpp
COPY . .
RUN cmake . -B ./build -DSD_VULKAN=ON
RUN cmake --build ./build --config Release --parallel
FROM ubuntu:$UBUNTU_VERSION AS runtime
RUN apt-get update && \
apt-get install --yes --no-install-recommends libgomp1 libvulkan1 mesa-vulkan-drivers && \
apt-get clean
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
ENTRYPOINT [ "/sd-cli" ]

View File

@ -15,6 +15,9 @@ API and command-line option may change frequently.***
## 🔥Important News ## 🔥Important News
* **2026/01/18** 🚀 stable-diffusion.cpp now supports **FLUX.2-klein**
👉 Details: [PR #1193](https://github.com/leejet/stable-diffusion.cpp/pull/1193)
* **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image** * **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image**
👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020) 👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020)
@ -43,16 +46,17 @@ API and command-line option may change frequently.***
- SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) - SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
- [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md) - [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
- [SD3/SD3.5](./docs/sd3.md) - [SD3/SD3.5](./docs/sd3.md)
- [FlUX.1-dev/FlUX.1-schnell](./docs/flux.md) - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
- [FLUX.2-dev](./docs/flux2.md) - [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
- [Chroma](./docs/chroma.md) - [Chroma](./docs/chroma.md)
- [Chroma1-Radiance](./docs/chroma_radiance.md) - [Chroma1-Radiance](./docs/chroma_radiance.md)
- [Qwen Image](./docs/qwen_image.md) - [Qwen Image](./docs/qwen_image.md)
- [Z-Image](./docs/z_image.md) - [Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md) - [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- Image Edit Models - Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md) - [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md) - [Qwen Image Edit series](./docs/qwen_image_edit.md)
- Video Models - Video Models
- [Wan2.1/Wan2.2](./docs/wan.md) - [Wan2.1/Wan2.2](./docs/wan.md)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support. - [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
@ -70,7 +74,7 @@ API and command-line option may change frequently.***
- SYCL - SYCL
- Supported weight formats - Supported weight formats
- Pytorch checkpoint (`.ckpt` or `.pth`) - Pytorch checkpoint (`.ckpt` or `.pth`)
- Safetensors (`./safetensors`) - Safetensors (`.safetensors`)
- GGUF (`.gguf`) - GGUF (`.gguf`)
- Supported platforms - Supported platforms
- Linux - Linux
@ -127,15 +131,16 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
- [SD1.x/SD2.x/SDXL](./docs/sd.md) - [SD1.x/SD2.x/SDXL](./docs/sd.md)
- [SD3/SD3.5](./docs/sd3.md) - [SD3/SD3.5](./docs/sd3.md)
- [FlUX.1-dev/FlUX.1-schnell](./docs/flux.md) - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
- [FLUX.2-dev](./docs/flux2.md) - [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
- [FLUX.1-Kontext-dev](./docs/kontext.md) - [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Chroma](./docs/chroma.md) - [Chroma](./docs/chroma.md)
- [🔥Qwen Image](./docs/qwen_image.md) - [🔥Qwen Image](./docs/qwen_image.md)
- [🔥Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md) - [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
- [🔥Wan2.1/Wan2.2](./docs/wan.md) - [🔥Wan2.1/Wan2.2](./docs/wan.md)
- [🔥Z-Image](./docs/z_image.md) - [🔥Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md) - [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- [LoRA](./docs/lora.md) - [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md) - [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md) - [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
@ -143,6 +148,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
- [Using TAESD to faster decoding](./docs/taesd.md) - [Using TAESD to faster decoding](./docs/taesd.md)
- [Docker](./docs/docker.md) - [Docker](./docs/docker.md)
- [Quantization and GGUF](./docs/quantization_and_gguf.md) - [Quantization and GGUF](./docs/quantization_and_gguf.md)
- [Inference acceleration via caching](./docs/caching.md)
## Bindings ## Bindings

BIN
assets/anima/example.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 230 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 510 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 455 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 511 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 491 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 464 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 552 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 450 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 870 KiB

21
docs/anima.md Normal file
View File

@ -0,0 +1,21 @@
# How to Use
## Download weights
- Download Anima
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/Bedovyy/Anima-GGUF/tree/main
- gguf Anima2: https://huggingface.co/JusteLeo/Anima2-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/vae
- Download Qwen3-0.6B-Base
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/mradermacher/Qwen3-0.6B-Base-GGUF/tree/main
## Examples
```sh
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\anima-preview.safetensors --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_06b_base.safetensors -p "a lovely cat holding a sign says 'anima.cpp'" --cfg-scale 6.0 --sampling-method euler -v --offload-to-cpu --diffusion-fa
```
<img alt="anima image example" src="../assets/anima/example.png" />

141
docs/caching.md Normal file
View File

@ -0,0 +1,141 @@
## Caching
Caching methods accelerate diffusion inference by reusing intermediate computations when changes between steps are small.
### Cache Modes
| Mode | Target | Description |
|------|--------|-------------|
| `ucache` | UNET models | Condition-level caching with error tracking |
| `easycache` | DiT models | Condition-level cache |
| `dbcache` | DiT models | Block-level L1 residual threshold |
| `taylorseer` | DiT models | Taylor series approximation |
| `cache-dit` | DiT models | Combined DBCache + TaylorSeer |
| `spectrum` | UNET models | Chebyshev + Taylor output forecasting |
### UCache (UNET Models)
UCache caches the residual difference (output - input) and reuses it when input changes are below threshold.
```bash
sd-cli -m model.safetensors -p "a cat" --cache-mode ucache --cache-option "threshold=1.5"
```
#### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `threshold` | Error threshold for reuse decision | 1.0 |
| `start` | Start caching at this percent of steps | 0.15 |
| `end` | Stop caching at this percent of steps | 0.95 |
| `decay` | Error decay rate (0-1) | 1.0 |
| `relative` | Scale threshold by output norm (0/1) | 1 |
| `reset` | Reset error after computing (0/1) | 1 |
#### Reset Parameter
The `reset` parameter controls error accumulation behavior:
- `reset=1` (default): Resets accumulated error after each computed step. More aggressive caching, works well with most samplers.
- `reset=0`: Keeps error accumulated. More conservative, recommended for `euler_a` sampler.
### EasyCache (DiT Models)
Condition-level caching for DiT models. Caches and reuses outputs when input changes are below threshold.
```bash
--cache-mode easycache --cache-option "threshold=0.3"
```
#### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `threshold` | Input change threshold for reuse | 0.2 |
| `start` | Start caching at this percent of steps | 0.15 |
| `end` | Stop caching at this percent of steps | 0.95 |
### Cache-DIT (DiT Models)
For DiT models like FLUX and QWEN, use block-level caching modes.
#### DBCache
Caches blocks based on L1 residual difference threshold:
```bash
--cache-mode dbcache --cache-option "threshold=0.25,warmup=4"
```
#### TaylorSeer
Uses Taylor series approximation to predict block outputs:
```bash
--cache-mode taylorseer
```
#### Cache-DIT (Combined)
Combines DBCache and TaylorSeer:
```bash
--cache-mode cache-dit
```
#### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `Fn` | Front blocks to always compute | 8 |
| `Bn` | Back blocks to always compute | 0 |
| `threshold` | L1 residual difference threshold | 0.08 |
| `warmup` | Steps before caching starts | 8 |
#### SCM Options
Steps Computation Mask controls which steps can be cached:
```bash
--scm-mask "1,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1"
```
Mask values: `1` = compute, `0` = can cache.
| Policy | Description |
|--------|-------------|
| `dynamic` | Check threshold before caching |
| `static` | Always cache on cacheable steps |
```bash
--scm-policy dynamic
```
### Spectrum (UNET Models)
Spectrum uses Chebyshev polynomial fitting blended with Taylor extrapolation to predict denoised outputs, skipping entire UNet forward passes. Based on the paper [Spectrum: Adaptive Spectral Feature Forecasting for Efficient Diffusion Sampling](https://github.com/tingyu215/Spectrum).
```bash
sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum
```
#### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `w` | Chebyshev vs Taylor blend weight (0=Taylor, 1=Chebyshev) | 0.40 |
| `m` | Chebyshev polynomial degree | 3 |
| `lam` | Ridge regression regularization | 1.0 |
| `window` | Initial window size (compute every N steps) | 2 |
| `flex` | Window growth per computed step after warmup | 0.50 |
| `warmup` | Steps to always compute before caching starts | 4 |
| `stop` | Stop caching at this fraction of total steps | 0.9 |
```
### Performance Tips
- Start with default thresholds and adjust based on output quality
- Lower threshold = better quality, less speedup
- Higher threshold = more speedup, potential quality loss
- More steps generally means more caching opportunities

View File

@ -1,8 +1,8 @@
# Running distilled models: SSD1B and SDx.x with tiny U-Nets # Running distilled models: SSD1B, Vega and SDx.x with tiny U-Nets
## Preface ## Preface
These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B U-Net contains only one middle block and fewer attention layers in its up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1. These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B and Vega U-Net contains only one middle block and fewer attention layers in its up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
Similarly, SD1.x- and SD2.x-style models with a tiny U-Net consist of only 6 U-Net blocks, leading to very small files and time savings of up to 50%. For more information, see the paper: https://arxiv.org/pdf/2305.15798.pdf. Similarly, SD1.x- and SD2.x-style models with a tiny U-Net consist of only 6 U-Net blocks, leading to very small files and time savings of up to 50%. For more information, see the paper: https://arxiv.org/pdf/2305.15798.pdf.
## SSD1B ## SSD1B
@ -17,7 +17,17 @@ Useful LoRAs are also available:
* https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors * https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors
* https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors * https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors
These files can be used out-of-the-box, unlike the models described in the next section. ## Vega
Segmind's Vega model is available online here:
* https://huggingface.co/segmind/Segmind-Vega/resolve/main/segmind-vega.safetensors
VegaRT is an example for an LCM-LoRA:
* https://huggingface.co/segmind/Segmind-VegaRT/resolve/main/pytorch_lora_weights.safetensors
Both files can be used out-of-the-box, unlike the models described in next sections.
## SD1.x, SD2.x with tiny U-Nets ## SD1.x, SD2.x with tiny U-Nets
@ -83,7 +93,7 @@ python convert_diffusers_to_original_stable_diffusion.py \
The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above. The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
### Another available .ckpt file: ##### Another available .ckpt file:
* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
@ -97,3 +107,31 @@ for key, value in ckpt['state_dict'].items():
ckpt['state_dict'][key] = value.contiguous() ckpt['state_dict'][key] = value.contiguous()
torch.save(ckpt, "tinySDdistilled_fixed.ckpt") torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
``` ```
### SDXS-512
Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of U-Net part and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of default _AutoEncoderKL_ for the VAE part.
##### 1. Download the diffusers model from Hugging Face using Python:
```python
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
pipe.save_pretrained(save_directory="sdxs")
```
##### 2. Create a safetensors file
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
```
##### 3. Run the model as follows:
```bash
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
--cfg-scale 1 --steps 1
```
Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here.

View File

@ -1,15 +1,39 @@
## Docker # Docker
### Building using Docker ## Run CLI
```shell
docker run --rm -v /path/to/models:/models -v /path/to/output/:/output ghcr.io/leejet/stable-diffusion.cpp:master [args...]
# For example
# docker run --rm -v ./models:/models -v ./build:/output ghcr.io/leejet/stable-diffusion.cpp:master -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
```
## Run server
```shell
docker run --rm --init -v /path/to/models:/models -v /path/to/output/:/output -p "1234:1234" --entrypoint "/sd-server" ghcr.io/leejet/stable-diffusion.cpp:master [args...]
# For example
# docker run --rm --init -v ./models:/models -v ./build:/output -p "1234:1234" --entrypoint "/sd-server" ghcr.io/leejet/stable-diffusion.cpp:master -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
```
## Building using Docker
```shell ```shell
docker build -t sd . docker build -t sd .
``` ```
### Run ## Building variants using Docker
Vulkan:
```shell ```shell
docker run -v /path/to/models:/models -v /path/to/output/:/output sd-cli [args...] docker build -f Dockerfile.vulkan -t sd .
```
## Run locally built image's CLI
```shell
docker run --rm -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
# For example # For example
# docker run -v ./models:/models -v ./build:/output sd-cli -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png # docker run --rm -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
``` ```

View File

@ -1,6 +1,6 @@
## Using ESRGAN to upscale results ## Using ESRGAN to upscale results
You can use ESRGAN to upscale the generated images. At the moment, only the [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth) model is supported. Support for more models of this architecture will be added soon. You can use ESRGAN—such as the model [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth)—to upscale the generated images and improve their overall resolution and clarity.
- Specify the model path using the `--upscale-model PATH` parameter. example: - Specify the model path using the `--upscale-model PATH` parameter. example:

View File

@ -1,6 +1,8 @@
# How to Use # How to Use
## Download weights ## Flux.2-dev
### Download weights
- Download FLUX.2-dev - Download FLUX.2-dev
- gguf: https://huggingface.co/city96/FLUX.2-dev-gguf/tree/main - gguf: https://huggingface.co/city96/FLUX.2-dev-gguf/tree/main
@ -9,7 +11,7 @@
- Download Mistral-Small-3.2-24B-Instruct-2506-GGUF - Download Mistral-Small-3.2-24B-Instruct-2506-GGUF
- gguf: https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/tree/main - gguf: https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/tree/main
## Examples ### Examples
``` ```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux2-dev-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Mistral-Small-3.2-24B-Instruct-2506-Q4_K_M.gguf -r .\kontext_input.png -p "change 'flux.cpp' to 'flux2-dev.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu .\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux2-dev-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Mistral-Small-3.2-24B-Instruct-2506-Q4_K_M.gguf -r .\kontext_input.png -p "change 'flux.cpp' to 'flux2-dev.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu
@ -17,5 +19,74 @@
<img alt="flux2 example" src="../assets/flux2/example.png" /> <img alt="flux2 example" src="../assets/flux2/example.png" />
## Flux.2 klein 4B / Flux.2 klein base 4B
### Download weights
- Download FLUX.2-klein-4B
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-4B
- gguf: https://huggingface.co/leejet/FLUX.2-klein-4B-GGUF/tree/main
- Download FLUX.2-klein-base-4B
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-base-4B
- gguf: https://huggingface.co/leejet/FLUX.2-klein-base-4B-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
- Download Qwen3 4b
- safetensors: https://huggingface.co/Comfy-Org/flux2-klein-4B/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/unsloth/Qwen3-4B-GGUF/tree/main
### Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 4 -v --offload-to-cpu --diffusion-fa
```
<img alt="flux2-klein-4b" src="../assets/flux2/flux2-klein-4b.png" />
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -r .\kontext_input.png -p "change 'flux.cpp' to 'klein.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu --steps 4
```
<img alt="flux2-klein-4b-edit" src="../assets/flux2/flux2-klein-4b-edit.png" />
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-base-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "a lovely cat" --cfg-scale 4.0 --steps 20 -v --offload-to-cpu --diffusion-fa
```
<img alt="flux2-klein-base-4b" src="../assets/flux2/flux2-klein-base-4b.png" />
## Flux.2 klein 9B / Flux.2 klein base 9B
### Download weights
- Download FLUX.2-klein-9B
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-9B
- gguf: https://huggingface.co/leejet/FLUX.2-klein-9B-GGUF/tree/main
- Download FLUX.2-klein-base-9B
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-base-9B
- gguf: https://huggingface.co/leejet/FLUX.2-klein-base-9B-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
- Download Qwen3 8B
- safetensors: https://huggingface.co/Comfy-Org/flux2-klein-9B/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/unsloth/Qwen3-8B-GGUF/tree/main
### Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 4 -v --offload-to-cpu --diffusion-fa
```
<img alt="flux2-klein-9b" src="../assets/flux2/flux2-klein-9b.png" />
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -r .\kontext_input.png -p "change 'flux.cpp' to 'klein.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu --steps 4
```
<img alt="flux2-klein-9b-edit" src="../assets/flux2/flux2-klein-9b-edit.png" />
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-base-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -p "a lovely cat" --cfg-scale 4.0 --steps 20 -v --offload-to-cpu --diffusion-fa
```
<img alt="flux2-klein-base-9b" src="../assets/flux2/flux2-klein-base-9b.png" />

View File

@ -9,6 +9,9 @@
- Qwen Image Edit 2509 - Qwen Image Edit 2509
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models - safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Qwen-Image-Edit-2509-GGUF/tree/main - gguf: https://huggingface.co/QuantStack/Qwen-Image-Edit-2509-GGUF/tree/main
- Qwen Image Edit 2511
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/tree/main
- Download vae - Download vae
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/vae - safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/vae
- Download qwen_2.5_vl 7b - Download qwen_2.5_vl 7b
@ -32,4 +35,14 @@
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'" .\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
``` ```
<img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" /> <img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />
### Qwen Image Edit 2511
To use the new Qwen Image Edit 2511 mode, the `--qwen-image-zero-cond-t` flag must be enabled; otherwise, image editing quality will degrade significantly.
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-edit-2511-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --qwen-image-zero-cond-t
```
<img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2511.png" />

View File

@ -14,4 +14,26 @@ curl -L -O https://huggingface.co/madebyollin/taesd/resolve/main/diffusion_pytor
```bash ```bash
sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
``` ```
### Qwen-Image and wan (TAEHV)
sd.cpp also supports [TAEHV](https://github.com/madebyollin/taehv) (#937), which can be used for Qwen-Image and wan.
- For **Qwen-Image and wan2.1 and wan2.2-A14B**, download the wan2.1 tae [safetensors weights](https://github.com/madebyollin/taehv/blob/main/safetensors/taew2_1.safetensors)
Or curl
```bash
curl -L -O https://github.com/madebyollin/taehv/raw/refs/heads/main/safetensors/taew2_1.safetensors
```
- For **wan2.2-TI2V-5B**, use the wan2.2 tae [safetensors weights](https://github.com/madebyollin/taehv/blob/main/safetensors/taew2_2.safetensors)
Or curl
```bash
curl -L -O https://github.com/madebyollin/taehv/raw/refs/heads/main/safetensors/taew2_2.safetensors
```
Then simply replace the `--vae xxx.safetensors` with `--tae xxx.safetensors` in the commands. If it still out of VRAM, add `--vae-conv-direct` to your command though might be slower.

View File

@ -39,6 +39,9 @@
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors - safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors
- wan_2.2_vae (for Wan2.2 TI2V 5B only) - wan_2.2_vae (for Wan2.2 TI2V 5B only)
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/blob/main/split_files/vae/wan2.2_vae.safetensors - safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/blob/main/split_files/vae/wan2.2_vae.safetensors
> Wan models vae requires really much VRAM! If you do not have enough VRAM, please try tae instead, though the results may be poorer. For tae usage, please refer to [taesd](taesd.md)
- Download umt5_xxl - Download umt5_xxl
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/text_encoders/umt5_xxl_fp16.safetensors - safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/text_encoders/umt5_xxl_fp16.safetensors
- gguf: https://huggingface.co/city96/umt5-xxl-encoder-gguf/tree/main - gguf: https://huggingface.co/city96/umt5-xxl-encoder-gguf/tree/main

View File

@ -7,6 +7,9 @@ You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or ev
- Download Z-Image-Turbo - Download Z-Image-Turbo
- safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/diffusion_models - safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/leejet/Z-Image-Turbo-GGUF/tree/main - gguf: https://huggingface.co/leejet/Z-Image-Turbo-GGUF/tree/main
- Download Z-Image
- safetensors: https://huggingface.co/Comfy-Org/z_image/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/unsloth/Z-Image-GGUF/tree/main
- Download vae - Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main - safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
- Download Qwen3 4b - Download Qwen3 4b
@ -15,12 +18,22 @@ You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or ev
## Examples ## Examples
### Z-Image-Turbo
``` ```
.\bin\Release\sd-cli.exe --diffusion-model z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512 .\bin\Release\sd-cli.exe --diffusion-model z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
``` ```
<img width="256" alt="z-image example" src="../assets/z_image/q3_K.png" /> <img width="256" alt="z-image example" src="../assets/z_image/q3_K.png" />
### Z-Image-Base
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\z_image_bf16.safetensors --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
```
<img width="256" alt="z-image example" src="../assets/z_image/base_bf16.png" />
## Comparison of Different Quantization Types ## Comparison of Different Quantization Types
| bf16 | q8_0 | q6_K | q5_0 | q4_K | q4_0 | q3_K | q2_K| | bf16 | q8_0 | q6_K | q5_0 | q4_K | q4_0 | q3_K | q2_K|

View File

@ -4,11 +4,14 @@
usage: ./bin/sd-cli [options] usage: ./bin/sd-cli [options]
CLI Options: CLI Options:
-o, --output <string> path to write result image to (default: ./output.png) -o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image sequences (default:
./output.png) (eg. output_%03d.png)
--preview-path <string> path to write preview image to (default: ./preview.png) --preview-path <string> path to write preview image to (default: ./preview.png)
--preview-interval <int> interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at --preview-interval <int> interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
every step) every step)
--output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
--canny apply canny preprocessor (edge detection) --canny apply canny preprocessor (edge detection)
--convert-name convert tensor name (for convert mode)
-v, --verbose print extra info -v, --verbose print extra info
--color colors the logging tags according to level --color colors the logging tags according to level
--taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae) --taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae)
@ -31,6 +34,7 @@ Context Options:
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model --high-noise-diffusion-model <string> path to the standalone high noise diffusion model
--vae <string> path to standalone vae model --vae <string> path to standalone vae model
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality) --taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--tae <string> alias of --taesd
--control-net <string> path to control net model --control-net <string> path to control net model
--embd-dir <string> embeddings directory --embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory --lora-model-dir <string> lora model directory
@ -41,17 +45,22 @@ Context Options:
CPU physical cores CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --chroma-t5-mask-pad <int> t5 mask pad size of chroma
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5) --vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--vae-tiling process vae in tiles to reduce memory usage --vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
--mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram) --control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram) --clip-on-cpu keep clip in cpu (for low vram)
--vae-on-cpu keep vae in cpu (for low vram) --vae-on-cpu keep vae in cpu (for low vram)
--diffusion-fa use flash attention in the diffusion model --fa use flash attention
--diffusion-fa use flash attention in the diffusion model only
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model --diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
--vae-conv-direct use ggml_conv2d_direct in the vae model --vae-conv-direct use ggml_conv2d_direct in the vae model
--circular enable circular padding for convolutions
--circularx enable circular RoPE wrapping on x-axis (width) only
--circulary enable circular RoPE wrapping on y-axis (height) only
--chroma-disable-dit-mask disable dit mask for chroma --chroma-disable-dit-mask disable dit mask for chroma
--qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma --chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file type of the weight file
@ -92,6 +101,7 @@ Generation Options:
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for --timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
NitroSD-Vibrant NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1) --upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
--cfg-scale <float> unconditional guidance scale: (default: 7.0) --cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) --img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5) --guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
@ -100,6 +110,7 @@ Generation Options:
--skip-layer-start <float> SLG enabling point (default: 0.01) --skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2) --skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> eta in DDIM, only for DDIM and TCD (default: 0) --eta <float> eta in DDIM, only for DDIM and TCD (default: 0)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0) --high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) --high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5) --high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
@ -116,14 +127,23 @@ Generation Options:
--disable-auto-resize-ref-image disable auto resize of ref images --disable-auto-resize-ref-image disable auto resize of ref images
-s, --seed RNG seed (default: 42, use random seed for < 0) -s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise) tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm], euler_a otherwise
default: discrete --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
kl_optimal, lcm, bong_tangent], default: discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0"). --sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9]) --skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times) -r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--easycache enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95) --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level),
'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=;
spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples:
"threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static'
``` ```

View File

@ -172,9 +172,9 @@ int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int
// Write '00dc' chunk (video frame) // Write '00dc' chunk (video frame)
fwrite("00dc", 4, 1, f); fwrite("00dc", 4, 1, f);
write_u32_le(f, jpeg_data.size); write_u32_le(f, (uint32_t)jpeg_data.size);
index[i].offset = ftell(f) - 8; index[i].offset = ftell(f) - 8;
index[i].size = jpeg_data.size; index[i].size = (uint32_t)jpeg_data.size;
fwrite(jpeg_data.buf, 1, jpeg_data.size, f); fwrite(jpeg_data.buf, 1, jpeg_data.size, f);
// Align to even byte size // Align to even byte size

View File

@ -26,13 +26,16 @@ const char* previews_str[] = {
"vae", "vae",
}; };
std::regex format_specifier_regex("(?:[^%]|^)(?:%%)*(%\\d{0,3}d)");
struct SDCliParams { struct SDCliParams {
SDMode mode = IMG_GEN; SDMode mode = IMG_GEN;
std::string output_path = "output.png"; std::string output_path = "output.png";
int output_begin_idx = -1;
bool verbose = false; bool verbose = false;
bool version = false;
bool canny_preprocess = false; bool canny_preprocess = false;
bool convert_name = false;
preview_t preview_method = PREVIEW_NONE; preview_t preview_method = PREVIEW_NONE;
int preview_interval = 1; int preview_interval = 1;
@ -50,7 +53,7 @@ struct SDCliParams {
options.string_options = { options.string_options = {
{"-o", {"-o",
"--output", "--output",
"path to write result image to (default: ./output.png)", "path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png)",
&output_path}, &output_path},
{"", {"",
"--preview-path", "--preview-path",
@ -63,6 +66,10 @@ struct SDCliParams {
"--preview-interval", "--preview-interval",
"interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at every step)", "interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at every step)",
&preview_interval}, &preview_interval},
{"",
"--output-begin-idx",
"starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)",
&output_begin_idx},
}; };
options.bool_options = { options.bool_options = {
@ -70,14 +77,14 @@ struct SDCliParams {
"--canny", "--canny",
"apply canny preprocessor (edge detection)", "apply canny preprocessor (edge detection)",
true, &canny_preprocess}, true, &canny_preprocess},
{"",
"--convert-name",
"convert tensor name (for convert mode)",
true, &convert_name},
{"-v", {"-v",
"--verbose", "--verbose",
"print extra info", "print extra info",
true, &verbose}, true, &verbose},
{"",
"--version",
"print stable-diffusion.cpp version",
true, &version},
{"", {"",
"--color", "--color",
"colors the logging tags according to level", "colors the logging tags according to level",
@ -106,9 +113,8 @@ struct SDCliParams {
} }
} }
if (mode_found == -1) { if (mode_found == -1) {
fprintf(stderr, LOG_ERROR("error: invalid mode %s, must be one of [%s]\n",
"error: invalid mode %s, must be one of [%s]\n", mode_c_str, SD_ALL_MODES_STR);
mode_c_str, SD_ALL_MODES_STR);
exit(1); exit(1);
} }
mode = (SDMode)mode_found; mode = (SDMode)mode_found;
@ -128,8 +134,7 @@ struct SDCliParams {
} }
} }
if (preview_found == -1) { if (preview_found == -1) {
fprintf(stderr, "error: preview method %s\n", LOG_ERROR("error: preview method %s", preview);
preview);
return -1; return -1;
} }
preview_method = (preview_t)preview_found; preview_method = (preview_t)preview_found;
@ -161,7 +166,7 @@ struct SDCliParams {
bool process_and_check() { bool process_and_check() {
if (output_path.length() == 0) { if (output_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: output_path\n"); LOG_ERROR("error: the following arguments are required: output_path");
return false; return false;
} }
@ -181,6 +186,7 @@ struct SDCliParams {
<< " verbose: " << (verbose ? "true" : "false") << ",\n" << " verbose: " << (verbose ? "true" : "false") << ",\n"
<< " color: " << (color ? "true" : "false") << ",\n" << " color: " << (color ? "true" : "false") << ",\n"
<< " canny_preprocess: " << (canny_preprocess ? "true" : "false") << ",\n" << " canny_preprocess: " << (canny_preprocess ? "true" : "false") << ",\n"
<< " convert_name: " << (convert_name ? "true" : "false") << ",\n"
<< " preview_method: " << previews_str[preview_method] << ",\n" << " preview_method: " << previews_str[preview_method] << ",\n"
<< " preview_interval: " << preview_interval << ",\n" << " preview_interval: " << preview_interval << ",\n"
<< " preview_path: \"" << preview_path << "\",\n" << " preview_path: \"" << preview_path << "\",\n"
@ -219,20 +225,8 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP
} }
} }
static std::string sd_basename(const std::string& path) {
size_t pos = path.find_last_of('/');
if (pos != std::string::npos) {
return path.substr(pos + 1);
}
pos = path.find_last_of('\\');
if (pos != std::string::npos) {
return path.substr(pos + 1);
}
return path;
}
std::string get_image_params(const SDCliParams& cli_params, const SDContextParams& ctx_params, const SDGenerationParams& gen_params, int64_t seed) { std::string get_image_params(const SDCliParams& cli_params, const SDContextParams& ctx_params, const SDGenerationParams& gen_params, int64_t seed) {
std::string parameter_string = gen_params.prompt + "\n"; std::string parameter_string = gen_params.prompt_with_lora + "\n";
if (gen_params.negative_prompt.size() != 0) { if (gen_params.negative_prompt.size() != 0) {
parameter_string += "Negative prompt: " + gen_params.negative_prompt + "\n"; parameter_string += "Negative prompt: " + gen_params.negative_prompt + "\n";
} }
@ -251,7 +245,7 @@ std::string get_image_params(const SDCliParams& cli_params, const SDContextParam
parameter_string += "Guidance: " + std::to_string(gen_params.sample_params.guidance.distilled_guidance) + ", "; parameter_string += "Guidance: " + std::to_string(gen_params.sample_params.guidance.distilled_guidance) + ", ";
parameter_string += "Eta: " + std::to_string(gen_params.sample_params.eta) + ", "; parameter_string += "Eta: " + std::to_string(gen_params.sample_params.eta) + ", ";
parameter_string += "Seed: " + std::to_string(seed) + ", "; parameter_string += "Seed: " + std::to_string(seed) + ", ";
parameter_string += "Size: " + std::to_string(gen_params.width) + "x" + std::to_string(gen_params.height) + ", "; parameter_string += "Size: " + std::to_string(gen_params.get_resolved_width()) + "x" + std::to_string(gen_params.get_resolved_height()) + ", ";
parameter_string += "Model: " + sd_basename(ctx_params.model_path) + ", "; parameter_string += "Model: " + sd_basename(ctx_params.model_path) + ", ";
parameter_string += "RNG: " + std::string(sd_rng_type_name(ctx_params.rng_type)) + ", "; parameter_string += "RNG: " + std::string(sd_rng_type_name(ctx_params.rng_type)) + ", ";
if (ctx_params.sampler_rng_type != RNG_TYPE_COUNT) { if (ctx_params.sampler_rng_type != RNG_TYPE_COUNT) {
@ -288,47 +282,9 @@ std::string get_image_params(const SDCliParams& cli_params, const SDContextParam
return parameter_string; return parameter_string;
} }
/* Enables Printing the log level tag in color using ANSI escape codes */
void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
SDCliParams* cli_params = (SDCliParams*)data; SDCliParams* cli_params = (SDCliParams*)data;
int tag_color; log_print(level, log, cli_params->verbose, cli_params->color);
const char* level_str;
FILE* out_stream = (level == SD_LOG_ERROR) ? stderr : stdout;
if (!log || (!cli_params->verbose && level <= SD_LOG_DEBUG)) {
return;
}
switch (level) {
case SD_LOG_DEBUG:
tag_color = 37;
level_str = "DEBUG";
break;
case SD_LOG_INFO:
tag_color = 34;
level_str = "INFO";
break;
case SD_LOG_WARN:
tag_color = 35;
level_str = "WARN";
break;
case SD_LOG_ERROR:
tag_color = 31;
level_str = "ERROR";
break;
default: /* Potential future-proofing */
tag_color = 33;
level_str = "?????";
break;
}
if (cli_params->color == true) {
fprintf(out_stream, "\033[%d;1m[%-5s]\033[0m ", tag_color, level_str);
} else {
fprintf(out_stream, "[%-5s] ", level_str);
}
fputs(log, out_stream);
fflush(out_stream);
} }
bool load_images_from_dir(const std::string dir, bool load_images_from_dir(const std::string dir,
@ -338,7 +294,7 @@ bool load_images_from_dir(const std::string dir,
int max_image_num = 0, int max_image_num = 0,
bool verbose = false) { bool verbose = false) {
if (!fs::exists(dir) || !fs::is_directory(dir)) { if (!fs::exists(dir) || !fs::is_directory(dir)) {
fprintf(stderr, "'%s' is not a valid directory\n", dir.c_str()); LOG_ERROR("'%s' is not a valid directory\n", dir.c_str());
return false; return false;
} }
@ -360,14 +316,12 @@ bool load_images_from_dir(const std::string dir,
std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
if (ext == ".jpg" || ext == ".jpeg" || ext == ".png" || ext == ".bmp") { if (ext == ".jpg" || ext == ".jpeg" || ext == ".png" || ext == ".bmp") {
if (verbose) { LOG_DEBUG("load image %zu from '%s'", images.size(), path.c_str());
printf("load image %zu from '%s'\n", images.size(), path.c_str());
}
int width = 0; int width = 0;
int height = 0; int height = 0;
uint8_t* image_buffer = load_image_from_file(path.c_str(), width, height, expected_width, expected_height); uint8_t* image_buffer = load_image_from_file(path.c_str(), width, height, expected_width, expected_height);
if (image_buffer == nullptr) { if (image_buffer == nullptr) {
fprintf(stderr, "load image from '%s' failed\n", path.c_str()); LOG_ERROR("load image from '%s' failed", path.c_str());
return false; return false;
} }
@ -397,6 +351,129 @@ void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy,
} }
} }
std::string format_frame_idx(std::string pattern, int frame_idx) {
std::smatch match;
std::string result = pattern;
while (std::regex_search(result, match, format_specifier_regex)) {
std::string specifier = match.str(1);
char buffer[32];
snprintf(buffer, sizeof(buffer), specifier.c_str(), frame_idx);
result.replace(match.position(1), match.length(1), buffer);
}
// Then replace all '%%' with '%'
size_t pos = 0;
while ((pos = result.find("%%", pos)) != std::string::npos) {
result.replace(pos, 2, "%");
pos += 1;
}
return result;
}
bool save_results(const SDCliParams& cli_params,
const SDContextParams& ctx_params,
const SDGenerationParams& gen_params,
sd_image_t* results,
int num_results) {
if (results == nullptr || num_results <= 0) {
return false;
}
namespace fs = std::filesystem;
fs::path out_path = cli_params.output_path;
if (!out_path.parent_path().empty()) {
std::error_code ec;
fs::create_directories(out_path.parent_path(), ec);
if (ec) {
LOG_ERROR("failed to create directory '%s': %s",
out_path.parent_path().string().c_str(), ec.message().c_str());
return false;
}
}
fs::path base_path = out_path;
fs::path ext = out_path.has_extension() ? out_path.extension() : fs::path{};
std::string ext_lower = ext.string();
std::transform(ext_lower.begin(), ext_lower.end(), ext_lower.begin(), ::tolower);
bool is_jpg = (ext_lower == ".jpg" || ext_lower == ".jpeg" || ext_lower == ".jpe");
if (!ext.empty()) {
if (is_jpg || ext_lower == ".png") {
base_path.replace_extension();
}
}
int output_begin_idx = cli_params.output_begin_idx;
if (output_begin_idx < 0) {
output_begin_idx = 0;
}
auto write_image = [&](const fs::path& path, int idx) {
const sd_image_t& img = results[idx];
if (!img.data)
return false;
std::string params = get_image_params(cli_params, ctx_params, gen_params, gen_params.seed + idx);
int ok = 0;
if (is_jpg) {
ok = stbi_write_jpg(path.string().c_str(), img.width, img.height, img.channel, img.data, 90, params.c_str());
} else {
ok = stbi_write_png(path.string().c_str(), img.width, img.height, img.channel, img.data, 0, params.c_str());
}
LOG_INFO("save result image %d to '%s' (%s)", idx, path.string().c_str(), ok ? "success" : "failure");
return ok != 0;
};
int sucessful_reults = 0;
if (std::regex_search(cli_params.output_path, format_specifier_regex)) {
if (!is_jpg && ext_lower != ".png")
ext = ".png";
fs::path pattern = base_path;
pattern += ext;
for (int i = 0; i < num_results; ++i) {
fs::path img_path = format_frame_idx(pattern.string(), output_begin_idx + i);
if (write_image(img_path, i)) {
sucessful_reults++;
}
}
LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
return sucessful_reults != 0;
}
if (cli_params.mode == VID_GEN && num_results > 1) {
if (ext_lower != ".avi")
ext = ".avi";
fs::path video_path = base_path;
video_path += ext;
if (create_mjpg_avi_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps) == 0) {
LOG_INFO("save result MJPG AVI video to '%s'", video_path.string().c_str());
return true;
} else {
LOG_ERROR("Failed to save result MPG AVI video to '%s'", video_path.string().c_str());
return false;
}
}
if (!is_jpg && ext_lower != ".png")
ext = ".png";
for (int i = 0; i < num_results; ++i) {
fs::path img_path = base_path;
if (num_results > 1) {
img_path += "_" + std::to_string(output_begin_idx + i);
}
img_path += ext;
if (write_image(img_path, i)) {
sucessful_reults++;
}
}
LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
return sucessful_reults != 0;
}
int main(int argc, const char* argv[]) { int main(int argc, const char* argv[]) {
if (argc > 1 && std::string(argv[1]) == "--version") { if (argc > 1 && std::string(argv[1]) == "--version") {
std::cout << version_string() << "\n"; std::cout << version_string() << "\n";
@ -408,9 +485,6 @@ int main(int argc, const char* argv[]) {
SDGenerationParams gen_params; SDGenerationParams gen_params;
parse_args(argc, argv, cli_params, ctx_params, gen_params); parse_args(argc, argv, cli_params, ctx_params, gen_params);
if (cli_params.verbose || cli_params.version) {
std::cout << version_string() << "\n";
}
if (gen_params.video_frames > 4) { if (gen_params.video_frames > 4) {
size_t last_dot_pos = cli_params.preview_path.find_last_of("."); size_t last_dot_pos = cli_params.preview_path.find_last_of(".");
std::string base_path = cli_params.preview_path; std::string base_path = cli_params.preview_path;
@ -429,6 +503,8 @@ int main(int argc, const char* argv[]) {
cli_params.preview_fps /= 4; cli_params.preview_fps /= 4;
sd_set_log_callback(sd_log_cb, (void*)&cli_params); sd_set_log_callback(sd_log_cb, (void*)&cli_params);
log_verbose = cli_params.verbose;
log_color = cli_params.color;
sd_set_preview_callback(step_callback, sd_set_preview_callback(step_callback,
cli_params.preview_method, cli_params.preview_method,
cli_params.preview_interval, cli_params.preview_interval,
@ -436,40 +512,39 @@ int main(int argc, const char* argv[]) {
cli_params.preview_noisy, cli_params.preview_noisy,
(void*)&cli_params); (void*)&cli_params);
if (cli_params.verbose) { LOG_DEBUG("version: %s", version_string().c_str());
printf("%s", sd_get_system_info()); LOG_DEBUG("%s", sd_get_system_info());
printf("%s\n", cli_params.to_string().c_str()); LOG_DEBUG("%s", cli_params.to_string().c_str());
printf("%s\n", ctx_params.to_string().c_str()); LOG_DEBUG("%s", ctx_params.to_string().c_str());
printf("%s\n", gen_params.to_string().c_str()); LOG_DEBUG("%s", gen_params.to_string().c_str());
}
if (cli_params.mode == CONVERT) { if (cli_params.mode == CONVERT) {
bool success = convert(ctx_params.model_path.c_str(), bool success = convert(ctx_params.model_path.c_str(),
ctx_params.vae_path.c_str(), ctx_params.vae_path.c_str(),
cli_params.output_path.c_str(), cli_params.output_path.c_str(),
ctx_params.wtype, ctx_params.wtype,
ctx_params.tensor_type_rules.c_str()); ctx_params.tensor_type_rules.c_str(),
cli_params.convert_name);
if (!success) { if (!success) {
fprintf(stderr, LOG_ERROR("convert '%s'/'%s' to '%s' failed",
"convert '%s'/'%s' to '%s' failed\n", ctx_params.model_path.c_str(),
ctx_params.model_path.c_str(), ctx_params.vae_path.c_str(),
ctx_params.vae_path.c_str(), cli_params.output_path.c_str());
cli_params.output_path.c_str());
return 1; return 1;
} else { } else {
printf("convert '%s'/'%s' to '%s' success\n", LOG_INFO("convert '%s'/'%s' to '%s' success",
ctx_params.model_path.c_str(), ctx_params.model_path.c_str(),
ctx_params.vae_path.c_str(), ctx_params.vae_path.c_str(),
cli_params.output_path.c_str()); cli_params.output_path.c_str());
return 0; return 0;
} }
} }
bool vae_decode_only = true; bool vae_decode_only = true;
sd_image_t init_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr}; sd_image_t init_image = {0, 0, 3, nullptr};
sd_image_t end_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr}; sd_image_t end_image = {0, 0, 3, nullptr};
sd_image_t control_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr}; sd_image_t control_image = {0, 0, 3, nullptr};
sd_image_t mask_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 1, nullptr}; sd_image_t mask_image = {0, 0, 1, nullptr};
std::vector<sd_image_t> ref_images; std::vector<sd_image_t> ref_images;
std::vector<sd_image_t> pmid_images; std::vector<sd_image_t> pmid_images;
std::vector<sd_image_t> control_frames; std::vector<sd_image_t> control_frames;
@ -496,58 +571,80 @@ int main(int argc, const char* argv[]) {
control_frames.clear(); control_frames.clear();
}; };
auto load_image_and_update_size = [&](const std::string& path,
sd_image_t& image,
bool resize_image = true,
int expected_channel = 3) -> bool {
int expected_width = 0;
int expected_height = 0;
if (resize_image && gen_params.width_and_height_are_set()) {
expected_width = gen_params.width;
expected_height = gen_params.height;
}
if (!load_sd_image_from_file(&image, path.c_str(), expected_width, expected_height, expected_channel)) {
LOG_ERROR("load image from '%s' failed", path.c_str());
release_all_resources();
return false;
}
gen_params.set_width_and_height_if_unset(image.width, image.height);
return true;
};
if (gen_params.init_image_path.size() > 0) { if (gen_params.init_image_path.size() > 0) {
vae_decode_only = false; vae_decode_only = false;
if (!load_image_and_update_size(gen_params.init_image_path, init_image)) {
int width = 0;
int height = 0;
init_image.data = load_image_from_file(gen_params.init_image_path.c_str(), width, height, gen_params.width, gen_params.height);
if (init_image.data == nullptr) {
fprintf(stderr, "load image from '%s' failed\n", gen_params.init_image_path.c_str());
release_all_resources();
return 1; return 1;
} }
} }
if (gen_params.end_image_path.size() > 0) { if (gen_params.end_image_path.size() > 0) {
vae_decode_only = false; vae_decode_only = false;
if (!load_image_and_update_size(gen_params.init_image_path, end_image)) {
int width = 0;
int height = 0;
end_image.data = load_image_from_file(gen_params.end_image_path.c_str(), width, height, gen_params.width, gen_params.height);
if (end_image.data == nullptr) {
fprintf(stderr, "load image from '%s' failed\n", gen_params.end_image_path.c_str());
release_all_resources();
return 1; return 1;
} }
} }
if (gen_params.ref_image_paths.size() > 0) {
vae_decode_only = false;
for (auto& path : gen_params.ref_image_paths) {
sd_image_t ref_image = {0, 0, 3, nullptr};
if (!load_image_and_update_size(path, ref_image, false)) {
return 1;
}
ref_images.push_back(ref_image);
}
}
if (gen_params.mask_image_path.size() > 0) { if (gen_params.mask_image_path.size() > 0) {
int c = 0; if (!load_sd_image_from_file(&mask_image,
int width = 0; gen_params.mask_image_path.c_str(),
int height = 0; gen_params.get_resolved_width(),
mask_image.data = load_image_from_file(gen_params.mask_image_path.c_str(), width, height, gen_params.width, gen_params.height, 1); gen_params.get_resolved_height(),
if (mask_image.data == nullptr) { 1)) {
fprintf(stderr, "load image from '%s' failed\n", gen_params.mask_image_path.c_str()); LOG_ERROR("load image from '%s' failed", gen_params.mask_image_path.c_str());
release_all_resources(); release_all_resources();
return 1; return 1;
} }
} else { } else {
mask_image.data = (uint8_t*)malloc(gen_params.width * gen_params.height); mask_image.data = (uint8_t*)malloc(gen_params.get_resolved_width() * gen_params.get_resolved_height());
memset(mask_image.data, 255, gen_params.width * gen_params.height);
if (mask_image.data == nullptr) { if (mask_image.data == nullptr) {
fprintf(stderr, "malloc mask image failed\n"); LOG_ERROR("malloc mask image failed");
release_all_resources(); release_all_resources();
return 1; return 1;
} }
mask_image.width = gen_params.get_resolved_width();
mask_image.height = gen_params.get_resolved_height();
memset(mask_image.data, 255, gen_params.get_resolved_width() * gen_params.get_resolved_height());
} }
if (gen_params.control_image_path.size() > 0) { if (gen_params.control_image_path.size() > 0) {
int width = 0; if (!load_sd_image_from_file(&control_image,
int height = 0; gen_params.control_image_path.c_str(),
control_image.data = load_image_from_file(gen_params.control_image_path.c_str(), width, height, gen_params.width, gen_params.height); gen_params.get_resolved_width(),
if (control_image.data == nullptr) { gen_params.get_resolved_height())) {
fprintf(stderr, "load image from '%s' failed\n", gen_params.control_image_path.c_str()); LOG_ERROR("load image from '%s' failed", gen_params.control_image_path.c_str());
release_all_resources(); release_all_resources();
return 1; return 1;
} }
@ -561,29 +658,11 @@ int main(int argc, const char* argv[]) {
} }
} }
if (gen_params.ref_image_paths.size() > 0) {
vae_decode_only = false;
for (auto& path : gen_params.ref_image_paths) {
int width = 0;
int height = 0;
uint8_t* image_buffer = load_image_from_file(path.c_str(), width, height);
if (image_buffer == nullptr) {
fprintf(stderr, "load image from '%s' failed\n", path.c_str());
release_all_resources();
return 1;
}
ref_images.push_back({(uint32_t)width,
(uint32_t)height,
3,
image_buffer});
}
}
if (!gen_params.control_video_path.empty()) { if (!gen_params.control_video_path.empty()) {
if (!load_images_from_dir(gen_params.control_video_path, if (!load_images_from_dir(gen_params.control_video_path,
control_frames, control_frames,
gen_params.width, gen_params.get_resolved_width(),
gen_params.height, gen_params.get_resolved_height(),
gen_params.video_frames, gen_params.video_frames,
cli_params.verbose)) { cli_params.verbose)) {
release_all_resources(); release_all_resources();
@ -616,7 +695,7 @@ int main(int argc, const char* argv[]) {
num_results = 1; num_results = 1;
results = (sd_image_t*)calloc(num_results, sizeof(sd_image_t)); results = (sd_image_t*)calloc(num_results, sizeof(sd_image_t));
if (results == nullptr) { if (results == nullptr) {
printf("failed to allocate results array\n"); LOG_INFO("failed to allocate results array");
release_all_resources(); release_all_resources();
return 1; return 1;
} }
@ -627,7 +706,7 @@ int main(int argc, const char* argv[]) {
sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params); sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params);
if (sd_ctx == nullptr) { if (sd_ctx == nullptr) {
printf("new_sd_ctx_t failed\n"); LOG_INFO("new_sd_ctx_t failed");
release_all_resources(); release_all_resources();
return 1; return 1;
} }
@ -641,7 +720,7 @@ int main(int argc, const char* argv[]) {
} }
if (gen_params.sample_params.scheduler == SCHEDULER_COUNT) { if (gen_params.sample_params.scheduler == SCHEDULER_COUNT) {
gen_params.sample_params.scheduler = sd_get_default_scheduler(sd_ctx); gen_params.sample_params.scheduler = sd_get_default_scheduler(sd_ctx, gen_params.sample_params.sample_method);
} }
if (cli_params.mode == IMG_GEN) { if (cli_params.mode == IMG_GEN) {
@ -657,8 +736,8 @@ int main(int argc, const char* argv[]) {
gen_params.auto_resize_ref_image, gen_params.auto_resize_ref_image,
gen_params.increase_ref_index, gen_params.increase_ref_index,
mask_image, mask_image,
gen_params.width, gen_params.get_resolved_width(),
gen_params.height, gen_params.get_resolved_height(),
gen_params.sample_params, gen_params.sample_params,
gen_params.strength, gen_params.strength,
gen_params.seed, gen_params.seed,
@ -672,7 +751,7 @@ int main(int argc, const char* argv[]) {
gen_params.pm_style_strength, gen_params.pm_style_strength,
}, // pm_params }, // pm_params
ctx_params.vae_tiling_params, ctx_params.vae_tiling_params,
gen_params.easycache_params, gen_params.cache_params,
}; };
results = generate_image(sd_ctx, &img_gen_params); results = generate_image(sd_ctx, &img_gen_params);
@ -688,8 +767,8 @@ int main(int argc, const char* argv[]) {
end_image, end_image,
control_frames.data(), control_frames.data(),
(int)control_frames.size(), (int)control_frames.size(),
gen_params.width, gen_params.get_resolved_width(),
gen_params.height, gen_params.get_resolved_height(),
gen_params.sample_params, gen_params.sample_params,
gen_params.high_noise_sample_params, gen_params.high_noise_sample_params,
gen_params.moe_boundary, gen_params.moe_boundary,
@ -697,14 +776,15 @@ int main(int argc, const char* argv[]) {
gen_params.seed, gen_params.seed,
gen_params.video_frames, gen_params.video_frames,
gen_params.vace_strength, gen_params.vace_strength,
gen_params.easycache_params, ctx_params.vae_tiling_params,
gen_params.cache_params,
}; };
results = generate_video(sd_ctx, &vid_gen_params, &num_results); results = generate_video(sd_ctx, &vid_gen_params, &num_results);
} }
if (results == nullptr) { if (results == nullptr) {
printf("generate failed\n"); LOG_ERROR("generate failed");
free_sd_ctx(sd_ctx); free_sd_ctx(sd_ctx);
return 1; return 1;
} }
@ -721,7 +801,7 @@ int main(int argc, const char* argv[]) {
gen_params.upscale_tile_size); gen_params.upscale_tile_size);
if (upscaler_ctx == nullptr) { if (upscaler_ctx == nullptr) {
printf("new_upscaler_ctx failed\n"); LOG_ERROR("new_upscaler_ctx failed");
} else { } else {
for (int i = 0; i < num_results; i++) { for (int i = 0; i < num_results; i++) {
if (results[i].data == nullptr) { if (results[i].data == nullptr) {
@ -731,7 +811,7 @@ int main(int argc, const char* argv[]) {
for (int u = 0; u < gen_params.upscale_repeats; ++u) { for (int u = 0; u < gen_params.upscale_repeats; ++u) {
sd_image_t upscaled_image = upscale(upscaler_ctx, current_image, upscale_factor); sd_image_t upscaled_image = upscale(upscaler_ctx, current_image, upscale_factor);
if (upscaled_image.data == nullptr) { if (upscaled_image.data == nullptr) {
printf("upscale failed\n"); LOG_ERROR("upscale failed");
break; break;
} }
free(current_image.data); free(current_image.data);
@ -742,67 +822,8 @@ int main(int argc, const char* argv[]) {
} }
} }
// create directory if not exists if (!save_results(cli_params, ctx_params, gen_params, results, num_results)) {
{ return 1;
const fs::path out_path = cli_params.output_path;
if (const fs::path out_dir = out_path.parent_path(); !out_dir.empty()) {
std::error_code ec;
fs::create_directories(out_dir, ec); // OK if already exists
if (ec) {
fprintf(stderr, "failed to create directory '%s': %s\n",
out_dir.string().c_str(), ec.message().c_str());
return 1;
}
}
}
std::string base_path;
std::string file_ext;
std::string file_ext_lower;
bool is_jpg;
size_t last_dot_pos = cli_params.output_path.find_last_of(".");
size_t last_slash_pos = std::min(cli_params.output_path.find_last_of("/"),
cli_params.output_path.find_last_of("\\"));
if (last_dot_pos != std::string::npos && (last_slash_pos == std::string::npos || last_dot_pos > last_slash_pos)) { // filename has extension
base_path = cli_params.output_path.substr(0, last_dot_pos);
file_ext = file_ext_lower = cli_params.output_path.substr(last_dot_pos);
std::transform(file_ext.begin(), file_ext.end(), file_ext_lower.begin(), ::tolower);
is_jpg = (file_ext_lower == ".jpg" || file_ext_lower == ".jpeg" || file_ext_lower == ".jpe");
} else {
base_path = cli_params.output_path;
file_ext = file_ext_lower = "";
is_jpg = false;
}
if (cli_params.mode == VID_GEN && num_results > 1) {
std::string vid_output_path = cli_params.output_path;
if (file_ext_lower == ".png") {
vid_output_path = base_path + ".avi";
}
create_mjpg_avi_from_sd_images(vid_output_path.c_str(), results, num_results, gen_params.fps);
printf("save result MJPG AVI video to '%s'\n", vid_output_path.c_str());
} else {
// appending ".png" to absent or unknown extension
if (!is_jpg && file_ext_lower != ".png") {
base_path += file_ext;
file_ext = ".png";
}
for (int i = 0; i < num_results; i++) {
if (results[i].data == nullptr) {
continue;
}
int write_ok;
std::string final_image_path = i > 0 ? base_path + "_" + std::to_string(i + 1) + file_ext : base_path + file_ext;
if (is_jpg) {
write_ok = stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
results[i].data, 90, get_image_params(cli_params, ctx_params, gen_params, gen_params.seed + i).c_str());
printf("save result JPEG image to '%s' (%s)\n", final_image_path.c_str(), write_ok == 0 ? "failure" : "success");
} else {
write_ok = stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
results[i].data, 0, get_image_params(cli_params, ctx_params, gen_params, gen_params.seed + i).c_str());
printf("save result PNG image to '%s' (%s)\n", final_image_path.c_str(), write_ok == 0 ? "failure" : "success");
}
}
} }
for (int i = 0; i < num_results; i++) { for (int i = 0; i < num_results; i++) {
@ -814,4 +835,4 @@ int main(int argc, const char* argv[]) {
release_all_resources(); release_all_resources();
return 0; return 0;
} }

View File

@ -86,6 +86,125 @@ static std::string argv_to_utf8(int index, const char** argv) {
#endif #endif
static void print_utf8(FILE* stream, const char* utf8) {
if (!utf8)
return;
#ifdef _WIN32
HANDLE h = (stream == stderr)
? GetStdHandle(STD_ERROR_HANDLE)
: GetStdHandle(STD_OUTPUT_HANDLE);
DWORD mode;
BOOL is_console = GetConsoleMode(h, &mode);
if (is_console) {
int wlen = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0);
if (wlen <= 0)
return;
wchar_t* wbuf = (wchar_t*)malloc(wlen * sizeof(wchar_t));
if (!wbuf)
return;
MultiByteToWideChar(CP_UTF8, 0, utf8, -1, wbuf, wlen);
DWORD written;
WriteConsoleW(h, wbuf, wlen - 1, &written, NULL);
free(wbuf);
} else {
DWORD written;
WriteFile(h, utf8, (DWORD)strlen(utf8), &written, NULL);
}
#else
fputs(utf8, stream);
#endif
}
static std::string sd_basename(const std::string& path) {
size_t pos = path.find_last_of('/');
if (pos != std::string::npos) {
return path.substr(pos + 1);
}
pos = path.find_last_of('\\');
if (pos != std::string::npos) {
return path.substr(pos + 1);
}
return path;
}
static void log_print(enum sd_log_level_t level, const char* log, bool verbose, bool color) {
int tag_color;
const char* level_str;
FILE* out_stream = (level == SD_LOG_ERROR) ? stderr : stdout;
if (!log || (!verbose && level <= SD_LOG_DEBUG)) {
return;
}
switch (level) {
case SD_LOG_DEBUG:
tag_color = 37;
level_str = "DEBUG";
break;
case SD_LOG_INFO:
tag_color = 34;
level_str = "INFO";
break;
case SD_LOG_WARN:
tag_color = 35;
level_str = "WARN";
break;
case SD_LOG_ERROR:
tag_color = 31;
level_str = "ERROR";
break;
default: /* Potential future-proofing */
tag_color = 33;
level_str = "?????";
break;
}
if (color) {
fprintf(out_stream, "\033[%d;1m[%-5s]\033[0m ", tag_color, level_str);
} else {
fprintf(out_stream, "[%-5s] ", level_str);
}
print_utf8(out_stream, log);
fflush(out_stream);
}
#define LOG_BUFFER_SIZE 4096
static bool log_verbose = false;
static bool log_color = false;
static void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...) {
va_list args;
va_start(args, format);
static char log_buffer[LOG_BUFFER_SIZE + 1];
int written = snprintf(log_buffer, LOG_BUFFER_SIZE, "%s:%-4d - ", sd_basename(file).c_str(), line);
if (written >= 0 && written < LOG_BUFFER_SIZE) {
vsnprintf(log_buffer + written, LOG_BUFFER_SIZE - written, format, args);
}
size_t len = strlen(log_buffer);
if (log_buffer[len - 1] != '\n') {
strncat(log_buffer, "\n", LOG_BUFFER_SIZE - len);
}
log_print(level, log_buffer, log_verbose, log_color);
va_end(args);
}
#define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_ERROR(format, ...) log_printf(SD_LOG_ERROR, __FILE__, __LINE__, format, ##__VA_ARGS__)
struct StringOption { struct StringOption {
std::string short_name; std::string short_name;
std::string long_name; std::string long_name;
@ -295,11 +414,11 @@ static bool parse_options(int argc, const char** argv, const std::vector<ArgOpti
} }
if (invalid_arg) { if (invalid_arg) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); LOG_ERROR("error: invalid parameter for argument: %s", arg.c_str());
return false; return false;
} }
if (!found_arg) { if (!found_arg) {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); LOG_ERROR("error: unknown argument: %s", arg.c_str());
return false; return false;
} }
} }
@ -326,7 +445,7 @@ struct SDContextParams {
std::string photo_maker_path; std::string photo_maker_path;
sd_type_t wtype = SD_TYPE_COUNT; sd_type_t wtype = SD_TYPE_COUNT;
std::string tensor_type_rules; std::string tensor_type_rules;
std::string lora_model_dir; std::string lora_model_dir = ".";
std::map<std::string, std::string> embedding_map; std::map<std::string, std::string> embedding_map;
std::vector<sd_embedding_t> embedding_vec; std::vector<sd_embedding_t> embedding_vec;
@ -334,17 +453,25 @@ struct SDContextParams {
rng_type_t rng_type = CUDA_RNG; rng_type_t rng_type = CUDA_RNG;
rng_type_t sampler_rng_type = RNG_TYPE_COUNT; rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
bool offload_params_to_cpu = false; bool offload_params_to_cpu = false;
bool enable_mmap = false;
bool control_net_cpu = false; bool control_net_cpu = false;
bool clip_on_cpu = false; bool clip_on_cpu = false;
bool vae_on_cpu = false; bool vae_on_cpu = false;
bool flash_attn = false;
bool diffusion_flash_attn = false; bool diffusion_flash_attn = false;
bool diffusion_conv_direct = false; bool diffusion_conv_direct = false;
bool vae_conv_direct = false; bool vae_conv_direct = false;
bool circular = false;
bool circular_x = false;
bool circular_y = false;
bool chroma_use_dit_mask = true; bool chroma_use_dit_mask = true;
bool chroma_use_t5_mask = false; bool chroma_use_t5_mask = false;
int chroma_t5_mask_pad = 1; int chroma_t5_mask_pad = 1;
bool qwen_image_zero_cond_t = false;
prediction_t prediction = PREDICTION_COUNT; prediction_t prediction = PREDICTION_COUNT;
lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;
@ -406,6 +533,10 @@ struct SDContextParams {
"--taesd", "--taesd",
"path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)", "path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)",
&taesd_path}, &taesd_path},
{"",
"--tae",
"alias of --taesd",
&taesd_path},
{"", {"",
"--control-net", "--control-net",
"path to control net model", "path to control net model",
@ -450,10 +581,6 @@ struct SDContextParams {
"--vae-tile-overlap", "--vae-tile-overlap",
"tile overlap for vae tiling, in fraction of tile size (default: 0.5)", "tile overlap for vae tiling, in fraction of tile size (default: 0.5)",
&vae_tiling_params.target_overlap}, &vae_tiling_params.target_overlap},
{"",
"--flow-shift",
"shift value for Flow models like SD3.x or WAN (default: auto)",
&flow_shift},
}; };
options.bool_options = { options.bool_options = {
@ -469,6 +596,10 @@ struct SDContextParams {
"--offload-to-cpu", "--offload-to-cpu",
"place the weights in RAM to save VRAM, and automatically load them into VRAM when needed", "place the weights in RAM to save VRAM, and automatically load them into VRAM when needed",
true, &offload_params_to_cpu}, true, &offload_params_to_cpu},
{"",
"--mmap",
"whether to memory-map model",
true, &enable_mmap},
{"", {"",
"--control-net-cpu", "--control-net-cpu",
"keep controlnet in cpu (for low vram)", "keep controlnet in cpu (for low vram)",
@ -481,9 +612,13 @@ struct SDContextParams {
"--vae-on-cpu", "--vae-on-cpu",
"keep vae in cpu (for low vram)", "keep vae in cpu (for low vram)",
true, &vae_on_cpu}, true, &vae_on_cpu},
{"",
"--fa",
"use flash attention",
true, &flash_attn},
{"", {"",
"--diffusion-fa", "--diffusion-fa",
"use flash attention in the diffusion model", "use flash attention in the diffusion model only",
true, &diffusion_flash_attn}, true, &diffusion_flash_attn},
{"", {"",
"--diffusion-conv-direct", "--diffusion-conv-direct",
@ -493,10 +628,26 @@ struct SDContextParams {
"--vae-conv-direct", "--vae-conv-direct",
"use ggml_conv2d_direct in the vae model", "use ggml_conv2d_direct in the vae model",
true, &vae_conv_direct}, true, &vae_conv_direct},
{"",
"--circular",
"enable circular padding for convolutions",
true, &circular},
{"",
"--circularx",
"enable circular RoPE wrapping on x-axis (width) only",
true, &circular_x},
{"",
"--circulary",
"enable circular RoPE wrapping on y-axis (height) only",
true, &circular_y},
{"", {"",
"--chroma-disable-dit-mask", "--chroma-disable-dit-mask",
"disable dit mask for chroma", "disable dit mask for chroma",
false, &chroma_use_dit_mask}, false, &chroma_use_dit_mask},
{"",
"--qwen-image-zero-cond-t",
"enable zero_cond_t for qwen image",
true, &qwen_image_zero_cond_t},
{"", {"",
"--chroma-enable-t5-mask", "--chroma-enable-t5-mask",
"enable t5 mask for chroma", "enable t5 mask for chroma",
@ -510,8 +661,8 @@ struct SDContextParams {
const char* arg = argv[index]; const char* arg = argv[index];
wtype = str_to_sd_type(arg); wtype = str_to_sd_type(arg);
if (wtype == SD_TYPE_COUNT) { if (wtype == SD_TYPE_COUNT) {
fprintf(stderr, "error: invalid weight format %s\n", LOG_ERROR("error: invalid weight format %s",
arg); arg);
return -1; return -1;
} }
return 1; return 1;
@ -524,8 +675,8 @@ struct SDContextParams {
const char* arg = argv[index]; const char* arg = argv[index];
rng_type = str_to_rng_type(arg); rng_type = str_to_rng_type(arg);
if (rng_type == RNG_TYPE_COUNT) { if (rng_type == RNG_TYPE_COUNT) {
fprintf(stderr, "error: invalid rng type %s\n", LOG_ERROR("error: invalid rng type %s",
arg); arg);
return -1; return -1;
} }
return 1; return 1;
@ -538,8 +689,8 @@ struct SDContextParams {
const char* arg = argv[index]; const char* arg = argv[index];
sampler_rng_type = str_to_rng_type(arg); sampler_rng_type = str_to_rng_type(arg);
if (sampler_rng_type == RNG_TYPE_COUNT) { if (sampler_rng_type == RNG_TYPE_COUNT) {
fprintf(stderr, "error: invalid sampler rng type %s\n", LOG_ERROR("error: invalid sampler rng type %s",
arg); arg);
return -1; return -1;
} }
return 1; return 1;
@ -552,8 +703,8 @@ struct SDContextParams {
const char* arg = argv[index]; const char* arg = argv[index];
prediction = str_to_prediction(arg); prediction = str_to_prediction(arg);
if (prediction == PREDICTION_COUNT) { if (prediction == PREDICTION_COUNT) {
fprintf(stderr, "error: invalid prediction type %s\n", LOG_ERROR("error: invalid prediction type %s",
arg); arg);
return -1; return -1;
} }
return 1; return 1;
@ -566,8 +717,8 @@ struct SDContextParams {
const char* arg = argv[index]; const char* arg = argv[index];
lora_apply_mode = str_to_lora_apply_mode(arg); lora_apply_mode = str_to_lora_apply_mode(arg);
if (lora_apply_mode == LORA_APPLY_MODE_COUNT) { if (lora_apply_mode == LORA_APPLY_MODE_COUNT) {
fprintf(stderr, "error: invalid lora apply model %s\n", LOG_ERROR("error: invalid lora apply model %s",
arg); arg);
return -1; return -1;
} }
return 1; return 1;
@ -659,7 +810,7 @@ struct SDContextParams {
} }
void build_embedding_map() { void build_embedding_map() {
static const std::vector<std::string> valid_ext = {".pt", ".safetensors", ".gguf"}; static const std::vector<std::string> valid_ext = {".gguf", ".safetensors", ".pt"};
if (!fs::exists(embedding_dir) || !fs::is_directory(embedding_dir)) { if (!fs::exists(embedding_dir) || !fs::is_directory(embedding_dir)) {
return; return;
@ -691,13 +842,13 @@ struct SDContextParams {
bool process_and_check(SDMode mode) { bool process_and_check(SDMode mode) {
if (mode != UPSCALE && model_path.length() == 0 && diffusion_model_path.length() == 0) { if (mode != UPSCALE && model_path.length() == 0 && diffusion_model_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n"); LOG_ERROR("error: the following arguments are required: model_path/diffusion_model\n");
return false; return false;
} }
if (mode == UPSCALE) { if (mode == UPSCALE) {
if (esrgan_path.length() == 0) { if (esrgan_path.length() == 0) {
fprintf(stderr, "error: upscale mode needs an upscaler model (--upscale-model)\n"); LOG_ERROR("error: upscale mode needs an upscaler model (--upscale-model)\n");
return false; return false;
} }
} }
@ -748,15 +899,20 @@ struct SDContextParams {
<< " photo_maker_path: \"" << photo_maker_path << "\",\n" << " photo_maker_path: \"" << photo_maker_path << "\",\n"
<< " rng_type: " << sd_rng_type_name(rng_type) << ",\n" << " rng_type: " << sd_rng_type_name(rng_type) << ",\n"
<< " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
<< " flow_shift: " << (std::isinf(flow_shift) ? "INF" : std::to_string(flow_shift)) << "\n"
<< " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
<< " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
<< " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
<< " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n"
<< " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n"
<< " flash_attn: " << (flash_attn ? "true" : "false") << ",\n"
<< " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n"
<< " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n"
<< " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n" << " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n"
<< " circular: " << (circular ? "true" : "false") << ",\n"
<< " circular_x: " << (circular_x ? "true" : "false") << ",\n"
<< " circular_y: " << (circular_y ? "true" : "false") << ",\n"
<< " chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n" << " chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n"
<< " qwen_image_zero_cond_t: " << (qwen_image_zero_cond_t ? "true" : "false") << ",\n"
<< " chroma_use_t5_mask: " << (chroma_use_t5_mask ? "true" : "false") << ",\n" << " chroma_use_t5_mask: " << (chroma_use_t5_mask ? "true" : "false") << ",\n"
<< " chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n" << " chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n"
<< " prediction: " << sd_prediction_name(prediction) << ",\n" << " prediction: " << sd_prediction_name(prediction) << ",\n"
@ -796,7 +952,6 @@ struct SDContextParams {
vae_path.c_str(), vae_path.c_str(),
taesd_path.c_str(), taesd_path.c_str(),
control_net_path.c_str(), control_net_path.c_str(),
lora_model_dir.c_str(),
embedding_vec.data(), embedding_vec.data(),
static_cast<uint32_t>(embedding_vec.size()), static_cast<uint32_t>(embedding_vec.size()),
photo_maker_path.c_str(), photo_maker_path.c_str(),
@ -810,18 +965,22 @@ struct SDContextParams {
prediction, prediction,
lora_apply_mode, lora_apply_mode,
offload_params_to_cpu, offload_params_to_cpu,
enable_mmap,
clip_on_cpu, clip_on_cpu,
control_net_cpu, control_net_cpu,
vae_on_cpu, vae_on_cpu,
flash_attn,
diffusion_flash_attn, diffusion_flash_attn,
taesd_preview, taesd_preview,
diffusion_conv_direct, diffusion_conv_direct,
vae_conv_direct, vae_conv_direct,
circular || circular_x,
circular || circular_y,
force_sdxl_vae_conv_scale, force_sdxl_vae_conv_scale,
chroma_use_dit_mask, chroma_use_dit_mask,
chroma_use_t5_mask, chroma_use_t5_mask,
chroma_t5_mask_pad, chroma_t5_mask_pad,
flow_shift, qwen_image_zero_cond_t,
}; };
return sd_ctx_params; return sd_ctx_params;
} }
@ -863,10 +1022,11 @@ static bool is_absolute_path(const std::string& p) {
struct SDGenerationParams { struct SDGenerationParams {
std::string prompt; std::string prompt;
std::string prompt_with_lora; // for metadata record only
std::string negative_prompt; std::string negative_prompt;
int clip_skip = -1; // <= 0 represents unspecified int clip_skip = -1; // <= 0 represents unspecified
int width = 512; int width = -1;
int height = 512; int height = -1;
int batch_count = 1; int batch_count = 1;
std::string init_image_path; std::string init_image_path;
std::string end_image_path; std::string end_image_path;
@ -885,8 +1045,11 @@ struct SDGenerationParams {
std::vector<float> custom_sigmas; std::vector<float> custom_sigmas;
std::string easycache_option; std::string cache_mode;
sd_easycache_params_t easycache_params; std::string cache_option;
std::string scm_mask;
bool scm_policy_dynamic = true;
sd_cache_params_t cache_params{};
float moe_boundary = 0.875f; float moe_boundary = 0.875f;
int video_frames = 1; int video_frames = 1;
@ -1036,6 +1199,10 @@ struct SDGenerationParams {
"--eta", "--eta",
"eta in DDIM, only for DDIM and TCD (default: 0)", "eta in DDIM, only for DDIM and TCD (default: 0)",
&sample_params.eta}, &sample_params.eta},
{"",
"--flow-shift",
"shift value for Flow models like SD3.x or WAN (default: auto)",
&sample_params.flow_shift},
{"", {"",
"--high-noise-cfg-scale", "--high-noise-cfg-scale",
"(high noise) unconditional guidance scale: (default: 7.0)", "(high noise) unconditional guidance scale: (default: 7.0)",
@ -1114,8 +1281,8 @@ struct SDGenerationParams {
const char* arg = argv[index]; const char* arg = argv[index];
sample_params.sample_method = str_to_sample_method(arg); sample_params.sample_method = str_to_sample_method(arg);
if (sample_params.sample_method == SAMPLE_METHOD_COUNT) { if (sample_params.sample_method == SAMPLE_METHOD_COUNT) {
fprintf(stderr, "error: invalid sample method %s\n", LOG_ERROR("error: invalid sample method %s",
arg); arg);
return -1; return -1;
} }
return 1; return 1;
@ -1128,8 +1295,8 @@ struct SDGenerationParams {
const char* arg = argv[index]; const char* arg = argv[index];
high_noise_sample_params.sample_method = str_to_sample_method(arg); high_noise_sample_params.sample_method = str_to_sample_method(arg);
if (high_noise_sample_params.sample_method == SAMPLE_METHOD_COUNT) { if (high_noise_sample_params.sample_method == SAMPLE_METHOD_COUNT) {
fprintf(stderr, "error: invalid high noise sample method %s\n", LOG_ERROR("error: invalid high noise sample method %s",
arg); arg);
return -1; return -1;
} }
return 1; return 1;
@ -1142,8 +1309,8 @@ struct SDGenerationParams {
const char* arg = argv[index]; const char* arg = argv[index];
sample_params.scheduler = str_to_scheduler(arg); sample_params.scheduler = str_to_scheduler(arg);
if (sample_params.scheduler == SCHEDULER_COUNT) { if (sample_params.scheduler == SCHEDULER_COUNT) {
fprintf(stderr, "error: invalid scheduler %s\n", LOG_ERROR("error: invalid scheduler %s",
arg); arg);
return -1; return -1;
} }
return 1; return 1;
@ -1223,18 +1390,18 @@ struct SDGenerationParams {
if (!item.empty()) { if (!item.empty()) {
try { try {
custom_sigmas.push_back(std::stof(item)); custom_sigmas.push_back(std::stof(item));
} catch (const std::invalid_argument& e) { } catch (const std::invalid_argument&) {
fprintf(stderr, "error: invalid float value '%s' in --sigmas\n", item.c_str()); LOG_ERROR("error: invalid float value '%s' in --sigmas", item.c_str());
return -1; return -1;
} catch (const std::out_of_range& e) { } catch (const std::out_of_range&) {
fprintf(stderr, "error: float value '%s' out of range in --sigmas\n", item.c_str()); LOG_ERROR("error: float value '%s' out of range in --sigmas", item.c_str());
return -1; return -1;
} }
} }
} }
if (custom_sigmas.empty() && !sigmas_str.empty()) { if (custom_sigmas.empty() && !sigmas_str.empty()) {
fprintf(stderr, "error: could not parse any sigma values from '%s'\n", argv[index]); LOG_ERROR("error: could not parse any sigma values from '%s'", argv[index]);
return -1; return -1;
} }
return 1; return 1;
@ -1248,36 +1415,49 @@ struct SDGenerationParams {
return 1; return 1;
}; };
auto on_easycache_arg = [&](int argc, const char** argv, int index) { auto on_cache_mode_arg = [&](int argc, const char** argv, int index) {
const std::string default_values = "0.2,0.15,0.95"; if (++index >= argc) {
auto looks_like_value = [](const std::string& token) { return -1;
if (token.empty()) { }
return false; cache_mode = argv_to_utf8(index, argv);
} if (cache_mode != "easycache" && cache_mode != "ucache" &&
if (token[0] != '-') { cache_mode != "dbcache" && cache_mode != "taylorseer" && cache_mode != "cache-dit" && cache_mode != "spectrum") {
return true; fprintf(stderr, "error: invalid cache mode '%s', must be 'easycache', 'ucache', 'dbcache', 'taylorseer', 'cache-dit', or 'spectrum'\n", cache_mode.c_str());
} return -1;
if (token.size() == 1) { }
return false; return 1;
} };
unsigned char next = static_cast<unsigned char>(token[1]);
return std::isdigit(next) || token[1] == '.';
};
std::string option_value; auto on_cache_option_arg = [&](int argc, const char** argv, int index) {
int consumed = 0; if (++index >= argc) {
if (index + 1 < argc) { return -1;
std::string next_arg = argv[index + 1];
if (looks_like_value(next_arg)) {
option_value = argv_to_utf8(index + 1, argv);
consumed = 1;
}
} }
if (option_value.empty()) { cache_option = argv_to_utf8(index, argv);
option_value = default_values; return 1;
};
auto on_scm_mask_arg = [&](int argc, const char** argv, int index) {
if (++index >= argc) {
return -1;
} }
easycache_option = option_value; scm_mask = argv_to_utf8(index, argv);
return consumed; return 1;
};
auto on_scm_policy_arg = [&](int argc, const char** argv, int index) {
if (++index >= argc) {
return -1;
}
std::string policy = argv_to_utf8(index, argv);
if (policy == "dynamic") {
scm_policy_dynamic = true;
} else if (policy == "static") {
scm_policy_dynamic = false;
} else {
fprintf(stderr, "error: invalid scm policy '%s', must be 'dynamic' or 'static'\n", policy.c_str());
return -1;
}
return 1;
}; };
options.manual_options = { options.manual_options = {
@ -1287,17 +1467,17 @@ struct SDGenerationParams {
on_seed_arg}, on_seed_arg},
{"", {"",
"--sampling-method", "--sampling-method",
"sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] " "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s] "
"(default: euler for Flux/SD3/Wan, euler_a otherwise)", "(default: euler for Flux/SD3/Wan, euler_a otherwise)",
on_sample_method_arg}, on_sample_method_arg},
{"", {"",
"--high-noise-sampling-method", "--high-noise-sampling-method",
"(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]" "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s]"
" default: euler for Flux/SD3/Wan, euler_a otherwise", " default: euler for Flux/SD3/Wan, euler_a otherwise",
on_high_noise_sample_method_arg}, on_high_noise_sample_method_arg},
{"", {"",
"--scheduler", "--scheduler",
"denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm], default: discrete", "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default: discrete",
on_scheduler_arg}, on_scheduler_arg},
{"", {"",
"--sigmas", "--sigmas",
@ -1316,9 +1496,21 @@ struct SDGenerationParams {
"reference image for Flux Kontext models (can be used multiple times)", "reference image for Flux Kontext models (can be used multiple times)",
on_ref_image_arg}, on_ref_image_arg},
{"", {"",
"--easycache", "--cache-mode",
"enable EasyCache for DiT models with optional \"threshold,start_percent,end_percent\" (default: 0.2,0.15,0.95)", "caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)",
on_easycache_arg}, on_cache_mode_arg},
{"",
"--cache-option",
"named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples: \"threshold=0.25\" or \"threshold=1.5,reset=0\"",
on_cache_option_arg},
{"",
"--scm-mask",
"SCM steps mask for cache-dit: comma-separated 0/1 (e.g., \"1,1,1,0,0,1,0,0,1,0\") - 1=compute, 0=can cache",
on_scm_mask_arg},
{"",
"--scm-policy",
"SCM policy: 'dynamic' (default) or 'static'",
on_scm_policy_arg},
}; };
@ -1330,7 +1522,7 @@ struct SDGenerationParams {
try { try {
j = json::parse(json_str); j = json::parse(json_str);
} catch (...) { } catch (...) {
fprintf(stderr, "json parse failed %s\n", json_str.c_str()); LOG_ERROR("json parse failed %s", json_str.c_str());
return false; return false;
} }
@ -1361,7 +1553,9 @@ struct SDGenerationParams {
load_if_exists("prompt", prompt); load_if_exists("prompt", prompt);
load_if_exists("negative_prompt", negative_prompt); load_if_exists("negative_prompt", negative_prompt);
load_if_exists("easycache_option", easycache_option); load_if_exists("cache_mode", cache_mode);
load_if_exists("cache_option", cache_option);
load_if_exists("scm_mask", scm_mask);
load_if_exists("clip_skip", clip_skip); load_if_exists("clip_skip", clip_skip);
load_if_exists("width", width); load_if_exists("width", width);
@ -1384,9 +1578,30 @@ struct SDGenerationParams {
load_if_exists("skip_layers", skip_layers); load_if_exists("skip_layers", skip_layers);
load_if_exists("high_noise_skip_layers", high_noise_skip_layers); load_if_exists("high_noise_skip_layers", high_noise_skip_layers);
load_if_exists("steps", sample_params.sample_steps);
load_if_exists("high_noise_steps", high_noise_sample_params.sample_steps);
load_if_exists("cfg_scale", sample_params.guidance.txt_cfg); load_if_exists("cfg_scale", sample_params.guidance.txt_cfg);
load_if_exists("img_cfg_scale", sample_params.guidance.img_cfg); load_if_exists("img_cfg_scale", sample_params.guidance.img_cfg);
load_if_exists("guidance", sample_params.guidance.distilled_guidance); load_if_exists("guidance", sample_params.guidance.distilled_guidance);
load_if_exists("flow_shift", sample_params.flow_shift);
auto load_sampler_if_exists = [&](const char* key, enum sample_method_t& out) {
if (j.contains(key) && j[key].is_string()) {
enum sample_method_t tmp = str_to_sample_method(j[key].get<std::string>().c_str());
if (tmp != SAMPLE_METHOD_COUNT) {
out = tmp;
}
}
};
load_sampler_if_exists("sample_method", sample_params.sample_method);
load_sampler_if_exists("high_noise_sample_method", high_noise_sample_params.sample_method);
if (j.contains("scheduler") && j["scheduler"].is_string()) {
enum scheduler_t tmp = str_to_scheduler(j["scheduler"].get<std::string>().c_str());
if (tmp != SCHEDULER_COUNT) {
sample_params.scheduler = tmp;
}
}
return true; return true;
} }
@ -1396,7 +1611,7 @@ struct SDGenerationParams {
return; return;
} }
static const std::regex re(R"(<lora:([^:>]+):([^>]+)>)"); static const std::regex re(R"(<lora:([^:>]+):([^>]+)>)");
static const std::vector<std::string> valid_ext = {".pt", ".safetensors", ".gguf"}; static const std::vector<std::string> valid_ext = {".gguf", ".safetensors", ".pt"};
std::smatch m; std::smatch m;
std::string tmp = prompt; std::string tmp = prompt;
@ -1439,7 +1654,7 @@ struct SDGenerationParams {
} }
} }
if (!found) { if (!found) {
printf("can not found lora %s\n", final_path.lexically_normal().string().c_str()); LOG_WARN("can not found lora %s", final_path.lexically_normal().string().c_str());
tmp = m.suffix().str(); tmp = m.suffix().str();
prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only); prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only);
continue; continue;
@ -1475,19 +1690,27 @@ struct SDGenerationParams {
} }
} }
bool process_and_check(SDMode mode, const std::string& lora_model_dir) { bool width_and_height_are_set() const {
if (width <= 0) { return width > 0 && height > 0;
fprintf(stderr, "error: the width must be greater than 0\n"); }
return false;
}
if (height <= 0) { void set_width_and_height_if_unset(int w, int h) {
fprintf(stderr, "error: the height must be greater than 0\n"); if (!width_and_height_are_set()) {
return false; LOG_INFO("set width x height to %d x %d", w, h);
width = w;
height = h;
} }
}
int get_resolved_width() const { return (width > 0) ? width : 512; }
int get_resolved_height() const { return (height > 0) ? height : 512; }
bool process_and_check(SDMode mode, const std::string& lora_model_dir) {
prompt_with_lora = prompt;
if (sample_params.sample_steps <= 0) { if (sample_params.sample_steps <= 0) {
fprintf(stderr, "error: the sample_steps must be greater than 0\n"); LOG_ERROR("error: the sample_steps must be greater than 0\n");
return false; return false;
} }
@ -1496,61 +1719,116 @@ struct SDGenerationParams {
} }
if (strength < 0.f || strength > 1.f) { if (strength < 0.f || strength > 1.f) {
fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n"); LOG_ERROR("error: can only work with strength in [0.0, 1.0]\n");
return false; return false;
} }
if (!easycache_option.empty()) { sd_cache_params_init(&cache_params);
float values[3] = {0.0f, 0.0f, 0.0f};
std::stringstream ss(easycache_option); auto parse_named_params = [&](const std::string& opt_str) -> bool {
std::stringstream ss(opt_str);
std::string token; std::string token;
int idx = 0;
while (std::getline(ss, token, ',')) { while (std::getline(ss, token, ',')) {
auto trim = [](std::string& s) { size_t eq_pos = token.find('=');
const char* whitespace = " \t\r\n"; if (eq_pos == std::string::npos) {
auto start = s.find_first_not_of(whitespace); LOG_ERROR("error: cache option '%s' missing '=' separator", token.c_str());
if (start == std::string::npos) {
s.clear();
return;
}
auto end = s.find_last_not_of(whitespace);
s = s.substr(start, end - start + 1);
};
trim(token);
if (token.empty()) {
fprintf(stderr, "error: invalid easycache option '%s'\n", easycache_option.c_str());
return false;
}
if (idx >= 3) {
fprintf(stderr, "error: easycache expects exactly 3 comma-separated values (threshold,start,end)\n");
return false; return false;
} }
std::string key = token.substr(0, eq_pos);
std::string val = token.substr(eq_pos + 1);
try { try {
values[idx] = std::stof(token); if (key == "threshold") {
if (cache_mode == "easycache" || cache_mode == "ucache") {
cache_params.reuse_threshold = std::stof(val);
} else {
cache_params.residual_diff_threshold = std::stof(val);
}
} else if (key == "start") {
cache_params.start_percent = std::stof(val);
} else if (key == "end") {
cache_params.end_percent = std::stof(val);
} else if (key == "decay") {
cache_params.error_decay_rate = std::stof(val);
} else if (key == "relative") {
cache_params.use_relative_threshold = (std::stof(val) != 0.0f);
} else if (key == "reset") {
cache_params.reset_error_on_compute = (std::stof(val) != 0.0f);
} else if (key == "Fn" || key == "fn") {
cache_params.Fn_compute_blocks = std::stoi(val);
} else if (key == "Bn" || key == "bn") {
cache_params.Bn_compute_blocks = std::stoi(val);
} else if (key == "warmup") {
if (cache_mode == "spectrum") {
cache_params.spectrum_warmup_steps = std::stoi(val);
} else {
cache_params.max_warmup_steps = std::stoi(val);
}
} else if (key == "w") {
cache_params.spectrum_w = std::stof(val);
} else if (key == "m") {
cache_params.spectrum_m = std::stoi(val);
} else if (key == "lam") {
cache_params.spectrum_lam = std::stof(val);
} else if (key == "window") {
cache_params.spectrum_window_size = std::stoi(val);
} else if (key == "flex") {
cache_params.spectrum_flex_window = std::stof(val);
} else if (key == "stop") {
cache_params.spectrum_stop_percent = std::stof(val);
} else {
LOG_ERROR("error: unknown cache parameter '%s'", key.c_str());
return false;
}
} catch (const std::exception&) { } catch (const std::exception&) {
fprintf(stderr, "error: invalid easycache value '%s'\n", token.c_str()); LOG_ERROR("error: invalid value '%s' for parameter '%s'", val.c_str(), key.c_str());
return false; return false;
} }
idx++;
} }
if (idx != 3) { return true;
fprintf(stderr, "error: easycache expects exactly 3 comma-separated values (threshold,start,end)\n"); };
return false;
if (!cache_mode.empty()) {
if (cache_mode == "easycache") {
cache_params.mode = SD_CACHE_EASYCACHE;
} else if (cache_mode == "ucache") {
cache_params.mode = SD_CACHE_UCACHE;
} else if (cache_mode == "dbcache") {
cache_params.mode = SD_CACHE_DBCACHE;
} else if (cache_mode == "taylorseer") {
cache_params.mode = SD_CACHE_TAYLORSEER;
} else if (cache_mode == "cache-dit") {
cache_params.mode = SD_CACHE_CACHE_DIT;
} else if (cache_mode == "spectrum") {
cache_params.mode = SD_CACHE_SPECTRUM;
} }
if (values[0] < 0.0f) {
fprintf(stderr, "error: easycache threshold must be non-negative\n"); if (!cache_option.empty()) {
return false; if (!parse_named_params(cache_option)) {
return false;
}
} }
if (values[1] < 0.0f || values[1] >= 1.0f || values[2] <= 0.0f || values[2] > 1.0f || values[1] >= values[2]) {
fprintf(stderr, "error: easycache start/end percents must satisfy 0.0 <= start < end <= 1.0\n"); if (cache_mode == "easycache" || cache_mode == "ucache") {
return false; if (cache_params.reuse_threshold < 0.0f) {
LOG_ERROR("error: cache threshold must be non-negative");
return false;
}
if (cache_params.start_percent < 0.0f || cache_params.start_percent >= 1.0f ||
cache_params.end_percent <= 0.0f || cache_params.end_percent > 1.0f ||
cache_params.start_percent >= cache_params.end_percent) {
LOG_ERROR("error: cache start/end percents must satisfy 0.0 <= start < end <= 1.0");
return false;
}
} }
easycache_params.enabled = true; }
easycache_params.reuse_threshold = values[0];
easycache_params.start_percent = values[1]; if (cache_params.mode == SD_CACHE_DBCACHE ||
easycache_params.end_percent = values[2]; cache_params.mode == SD_CACHE_TAYLORSEER ||
} else { cache_params.mode == SD_CACHE_CACHE_DIT) {
easycache_params.enabled = false; if (!scm_mask.empty()) {
cache_params.scm_mask = scm_mask.c_str();
}
cache_params.scm_policy_dynamic = scm_policy_dynamic;
} }
sample_params.guidance.slg.layers = skip_layers.data(); sample_params.guidance.slg.layers = skip_layers.data();
@ -1582,7 +1860,7 @@ struct SDGenerationParams {
if (mode == UPSCALE) { if (mode == UPSCALE) {
if (init_image_path.length() == 0) { if (init_image_path.length() == 0) {
fprintf(stderr, "error: upscale mode needs an init image (--init-img)\n"); LOG_ERROR("error: upscale mode needs an init image (--init-img)\n");
return false; return false;
} }
} }
@ -1652,12 +1930,13 @@ struct SDGenerationParams {
<< " high_noise_skip_layers: " << vec_to_string(high_noise_skip_layers) << ",\n" << " high_noise_skip_layers: " << vec_to_string(high_noise_skip_layers) << ",\n"
<< " high_noise_sample_params: " << high_noise_sample_params_str << ",\n" << " high_noise_sample_params: " << high_noise_sample_params_str << ",\n"
<< " custom_sigmas: " << vec_to_string(custom_sigmas) << ",\n" << " custom_sigmas: " << vec_to_string(custom_sigmas) << ",\n"
<< " easycache_option: \"" << easycache_option << "\",\n" << " cache_mode: \"" << cache_mode << "\",\n"
<< " easycache: " << " cache_option: \"" << cache_option << "\",\n"
<< (easycache_params.enabled ? "enabled" : "disabled") << " cache: "
<< " (threshold=" << easycache_params.reuse_threshold << (cache_params.mode != SD_CACHE_DISABLED ? "enabled" : "disabled")
<< ", start=" << easycache_params.start_percent << " (threshold=" << cache_params.reuse_threshold
<< ", end=" << easycache_params.end_percent << "),\n" << ", start=" << cache_params.start_percent
<< ", end=" << cache_params.end_percent << "),\n"
<< " moe_boundary: " << moe_boundary << ",\n" << " moe_boundary: " << moe_boundary << ",\n"
<< " video_frames: " << video_frames << ",\n" << " video_frames: " << video_frames << ",\n"
<< " fps: " << fps << ",\n" << " fps: " << fps << ",\n"
@ -1697,13 +1976,13 @@ uint8_t* load_image_common(bool from_memory,
image_buffer = (uint8_t*)stbi_load(image_path_or_bytes, &width, &height, &c, expected_channel); image_buffer = (uint8_t*)stbi_load(image_path_or_bytes, &width, &height, &c, expected_channel);
} }
if (image_buffer == nullptr) { if (image_buffer == nullptr) {
fprintf(stderr, "load image from '%s' failed\n", image_path); LOG_ERROR("load image from '%s' failed", image_path);
return nullptr; return nullptr;
} }
if (c < expected_channel) { if (c < expected_channel) {
fprintf(stderr, fprintf(stderr,
"the number of channels for the input image must be >= %d," "the number of channels for the input image must be >= %d,"
"but got %d channels, image_path = %s\n", "but got %d channels, image_path = %s",
expected_channel, expected_channel,
c, c,
image_path); image_path);
@ -1711,12 +1990,12 @@ uint8_t* load_image_common(bool from_memory,
return nullptr; return nullptr;
} }
if (width <= 0) { if (width <= 0) {
fprintf(stderr, "error: the width of image must be greater than 0, image_path = %s\n", image_path); LOG_ERROR("error: the width of image must be greater than 0, image_path = %s", image_path);
free(image_buffer); free(image_buffer);
return nullptr; return nullptr;
} }
if (height <= 0) { if (height <= 0) {
fprintf(stderr, "error: the height of image must be greater than 0, image_path = %s\n", image_path); LOG_ERROR("error: the height of image must be greater than 0, image_path = %s", image_path);
free(image_buffer); free(image_buffer);
return nullptr; return nullptr;
} }
@ -1738,10 +2017,10 @@ uint8_t* load_image_common(bool from_memory,
} }
if (crop_x != 0 || crop_y != 0) { if (crop_x != 0 || crop_y != 0) {
printf("crop input image from %dx%d to %dx%d, image_path = %s\n", width, height, crop_w, crop_h, image_path); LOG_INFO("crop input image from %dx%d to %dx%d, image_path = %s", width, height, crop_w, crop_h, image_path);
uint8_t* cropped_image_buffer = (uint8_t*)malloc(crop_w * crop_h * expected_channel); uint8_t* cropped_image_buffer = (uint8_t*)malloc(crop_w * crop_h * expected_channel);
if (cropped_image_buffer == nullptr) { if (cropped_image_buffer == nullptr) {
fprintf(stderr, "error: allocate memory for crop\n"); LOG_ERROR("error: allocate memory for crop\n");
free(image_buffer); free(image_buffer);
return nullptr; return nullptr;
} }
@ -1757,13 +2036,13 @@ uint8_t* load_image_common(bool from_memory,
image_buffer = cropped_image_buffer; image_buffer = cropped_image_buffer;
} }
printf("resize input image from %dx%d to %dx%d\n", width, height, expected_width, expected_height); LOG_INFO("resize input image from %dx%d to %dx%d", width, height, expected_width, expected_height);
int resized_height = expected_height; int resized_height = expected_height;
int resized_width = expected_width; int resized_width = expected_width;
uint8_t* resized_image_buffer = (uint8_t*)malloc(resized_height * resized_width * expected_channel); uint8_t* resized_image_buffer = (uint8_t*)malloc(resized_height * resized_width * expected_channel);
if (resized_image_buffer == nullptr) { if (resized_image_buffer == nullptr) {
fprintf(stderr, "error: allocate memory for resize input image\n"); LOG_ERROR("error: allocate memory for resize input image\n");
free(image_buffer); free(image_buffer);
return nullptr; return nullptr;
} }
@ -1790,6 +2069,22 @@ uint8_t* load_image_from_file(const char* image_path,
return load_image_common(false, image_path, 0, width, height, expected_width, expected_height, expected_channel); return load_image_common(false, image_path, 0, width, height, expected_width, expected_height, expected_channel);
} }
bool load_sd_image_from_file(sd_image_t* image,
const char* image_path,
int expected_width = 0,
int expected_height = 0,
int expected_channel = 3) {
int width;
int height;
image->data = load_image_common(false, image_path, 0, width, height, expected_width, expected_height, expected_channel);
if (image->data == nullptr) {
return false;
}
image->width = width;
image->height = height;
return true;
}
uint8_t* load_image_from_memory(const char* image_bytes, uint8_t* load_image_from_memory(const char* image_bytes,
int len, int len,
int& width, int& width,
@ -1798,4 +2093,4 @@ uint8_t* load_image_from_memory(const char* image_bytes,
int expected_height = 0, int expected_height = 0,
int expected_channel = 3) { int expected_channel = 3) {
return load_image_common(true, image_bytes, len, width, height, expected_width, expected_height, expected_channel); return load_image_common(true, image_bytes, len, width, height, expected_width, expected_height, expected_channel);
} }

View File

@ -1,6 +1,73 @@
set(TARGET sd-server) set(TARGET sd-server)
option(SD_SERVER_BUILD_FRONTEND "Build server frontend with pnpm" ON)
set(FRONTEND_DIR "${CMAKE_CURRENT_SOURCE_DIR}/frontend")
set(GENERATED_HTML_HEADER "${FRONTEND_DIR}/dist/gen_index_html.h")
set(HAVE_FRONTEND_BUILD OFF)
if(SD_SERVER_BUILD_FRONTEND AND EXISTS "${FRONTEND_DIR}")
if(WIN32)
find_program(PNPM_EXECUTABLE NAMES pnpm.cmd pnpm)
else()
find_program(PNPM_EXECUTABLE NAMES pnpm)
endif()
if(PNPM_EXECUTABLE)
message(STATUS "Frontend dir found: ${FRONTEND_DIR}")
message(STATUS "pnpm found: ${PNPM_EXECUTABLE}")
set(HAVE_FRONTEND_BUILD ON)
add_custom_target(${TARGET}_frontend_install
COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" install
WORKING_DIRECTORY "${FRONTEND_DIR}"
COMMENT "Installing frontend dependencies"
VERBATIM
)
add_custom_target(${TARGET}_frontend_build
COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" run build
WORKING_DIRECTORY "${FRONTEND_DIR}"
COMMENT "Building frontend"
VERBATIM
)
add_custom_target(${TARGET}_frontend_header
COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" run build:header
WORKING_DIRECTORY "${FRONTEND_DIR}"
COMMENT "Generating gen_index_html.h"
VERBATIM
)
add_dependencies(${TARGET}_frontend_build ${TARGET}_frontend_install)
add_dependencies(${TARGET}_frontend_header ${TARGET}_frontend_build)
add_custom_target(${TARGET}_frontend
DEPENDS ${TARGET}_frontend_header
)
set_source_files_properties("${GENERATED_HTML_HEADER}" PROPERTIES GENERATED TRUE)
else()
message(WARNING "pnpm not found, frontend build disabled")
endif()
else()
message(STATUS "Frontend disabled or directory not found: ${FRONTEND_DIR}")
endif()
add_executable(${TARGET} main.cpp) add_executable(${TARGET} main.cpp)
if(HAVE_FRONTEND_BUILD)
add_dependencies(${TARGET} ${TARGET}_frontend)
target_sources(${TARGET} PRIVATE "${GENERATED_HTML_HEADER}")
target_include_directories(${TARGET} PRIVATE "${FRONTEND_DIR}/dist")
target_compile_definitions(${TARGET} PRIVATE HAVE_INDEX_HTML)
message(STATUS "HAVE_INDEX_HTML enabled")
else()
message(STATUS "HAVE_INDEX_HTML disabled")
endif()
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PUBLIC c_std_11 cxx_std_17) target_compile_features(${TARGET} PUBLIC c_std_11 cxx_std_17)

View File

@ -1,14 +1,104 @@
# Frontend
## Build with Frontend
The server can optionally build the web frontend and embed it into the binary as `gen_index_html.h`.
### Requirements
Install the following tools:
* **Node.js** ≥ 22.18
https://nodejs.org/
* **pnpm** ≥ 10
Install via npm:
```bash
npm install -g pnpm
```
Verify installation:
```bash
node -v
pnpm -v
```
### Install frontend dependencies
Go to the frontend directory and install dependencies:
```bash
cd examples/server/frontend
pnpm install
```
### Build the server with CMake
Enable the frontend build option when configuring CMake:
```bash
cmake -B build -DSD_SERVER_BUILD_FRONTEND=ON
cmake --build build --config Release
```
If `pnpm` is available, the build system will automatically run:
```
pnpm run build
pnpm run build:header
```
and embed the generated frontend into the server binary.
## Frontend Repository
The web frontend is maintained in a **separate repository**, https://github.com/leejet/stable-ui.
If you want to modify the UI or frontend logic, please submit pull requests to the **frontend repository**.
This repository (`stable-diffusion.cpp`) only vendors the frontend periodically. Changes from the frontend repo are synchronized:
* approximately **every 12 weeks**, or
* when there are **major frontend updates**
Because of this, frontend changes will **not appear here immediately** after being merged upstream.
## Using an external frontend
By default, the server uses the **embedded frontend** generated during the build (`gen_index_html.h`).
You can also serve a custom frontend file instead of the embedded one by using:
```bash
--serve-html-path <path-to-index.html>
```
For example:
```bash
sd-server --serve-html-path ./index.html
```
In this case, the server will load and serve the specified `index.html` file instead of the embedded frontend. This is useful when:
* developing or testing frontend changes
* using a custom UI
* avoiding rebuilding the binary after frontend modifications
# Run # Run
``` ```
usage: ./bin/sd-server [options] usage: ./bin/sd-server [options]
Svr Options: Svr Options:
-l, --listen-ip <string> server listen ip (default: 127.0.0.1) -l, --listen-ip <string> server listen ip (default: 127.0.0.1)
--listen-port <int> server listen port (default: 1234) --serve-html-path <string> path to HTML file to serve at root (optional)
-v, --verbose print extra info --listen-port <int> server listen port (default: 1234)
--color colors the logging tags according to level -v, --verbose print extra info
-h, --help show this help message and exit --color colors the logging tags according to level
-h, --help show this help message and exit
Context Options: Context Options:
-m, --model <string> path to full model -m, --model <string> path to full model
@ -24,6 +114,7 @@ Context Options:
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model --high-noise-diffusion-model <string> path to the standalone high noise diffusion model
--vae <string> path to standalone vae model --vae <string> path to standalone vae model
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality) --taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--tae <string> alias of --taesd
--control-net <string> path to control net model --control-net <string> path to control net model
--embd-dir <string> embeddings directory --embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory --lora-model-dir <string> lora model directory
@ -34,17 +125,22 @@ Context Options:
CPU physical cores CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --chroma-t5-mask-pad <int> t5 mask pad size of chroma
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5) --vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--vae-tiling process vae in tiles to reduce memory usage --vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
--mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram) --control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram) --clip-on-cpu keep clip in cpu (for low vram)
--vae-on-cpu keep vae in cpu (for low vram) --vae-on-cpu keep vae in cpu (for low vram)
--diffusion-fa use flash attention in the diffusion model --fa use flash attention
--diffusion-fa use flash attention in the diffusion model only
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model --diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
--vae-conv-direct use ggml_conv2d_direct in the vae model --vae-conv-direct use ggml_conv2d_direct in the vae model
--circular enable circular padding for convolutions
--circularx enable circular RoPE wrapping on x-axis (width) only
--circulary enable circular RoPE wrapping on y-axis (height) only
--chroma-disable-dit-mask disable dit mask for chroma --chroma-disable-dit-mask disable dit mask for chroma
--qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma --chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file type of the weight file
@ -94,6 +190,7 @@ Default Generation Options:
--skip-layer-start <float> SLG enabling point (default: 0.01) --skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2) --skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> eta in DDIM, only for DDIM and TCD (default: 0) --eta <float> eta in DDIM, only for DDIM and TCD (default: 0)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0) --high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) --high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5) --high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
@ -110,14 +207,21 @@ Default Generation Options:
--disable-auto-resize-ref-image disable auto resize of ref images --disable-auto-resize-ref-image disable auto resize of ref images
-s, --seed RNG seed (default: 42, use random seed for < 0) -s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise) tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm], euler_a otherwise
default: discrete --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
kl_optimal, lcm, bong_tangent], default: discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0"). --sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9]) --skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times) -r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--easycache enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95) --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
``` --cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
"threshold=0.25" or "threshold=1.5,reset=0"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static'
```

@ -0,0 +1 @@
Subproject commit 1a34176cd6d39ad3a226b2b69047e71f6797f6bc

View File

@ -13,6 +13,10 @@
#include "common/common.hpp" #include "common/common.hpp"
#ifdef HAVE_INDEX_HTML
#include "frontend/dist/gen_index_html.h"
#endif
namespace fs = std::filesystem; namespace fs = std::filesystem;
// ----------------------- helpers ----------------------- // ----------------------- helpers -----------------------
@ -44,7 +48,7 @@ inline bool is_base64(unsigned char c) {
} }
std::vector<uint8_t> base64_decode(const std::string& encoded_string) { std::vector<uint8_t> base64_decode(const std::string& encoded_string) {
int in_len = encoded_string.size(); int in_len = static_cast<int>(encoded_string.size());
int i = 0; int i = 0;
int j = 0; int j = 0;
int in_ = 0; int in_ = 0;
@ -86,27 +90,13 @@ std::vector<uint8_t> base64_decode(const std::string& encoded_string) {
return ret; return ret;
} }
std::string iso_timestamp_now() {
using namespace std::chrono;
auto now = system_clock::now();
std::time_t t = system_clock::to_time_t(now);
std::tm tm{};
#ifdef _MSC_VER
gmtime_s(&tm, &t);
#else
gmtime_r(&t, &tm);
#endif
std::ostringstream oss;
oss << std::put_time(&tm, "%Y-%m-%dT%H:%M:%SZ");
return oss.str();
}
struct SDSvrParams { struct SDSvrParams {
std::string listen_ip = "127.0.0.1"; std::string listen_ip = "127.0.0.1";
int listen_port = 1234; int listen_port = 1234;
bool normal_exit = false; std::string serve_html_path;
bool verbose = false; bool normal_exit = false;
bool color = false; bool verbose = false;
bool color = false;
ArgOptions get_options() { ArgOptions get_options() {
ArgOptions options; ArgOptions options;
@ -115,7 +105,11 @@ struct SDSvrParams {
{"-l", {"-l",
"--listen-ip", "--listen-ip",
"server listen ip (default: 127.0.0.1)", "server listen ip (default: 127.0.0.1)",
&listen_ip}}; &listen_ip},
{"",
"--serve-html-path",
"path to HTML file to serve at root (optional)",
&serve_html_path}};
options.int_options = { options.int_options = {
{"", {"",
@ -151,12 +145,17 @@ struct SDSvrParams {
bool process_and_check() { bool process_and_check() {
if (listen_ip.empty()) { if (listen_ip.empty()) {
fprintf(stderr, "error: the following arguments are required: listen_ip\n"); LOG_ERROR("error: the following arguments are required: listen_ip");
return false; return false;
} }
if (listen_port < 0 || listen_port > 65535) { if (listen_port < 0 || listen_port > 65535) {
fprintf(stderr, "error: listen_port should be in the range [0, 65535]\n"); LOG_ERROR("error: listen_port should be in the range [0, 65535]");
return false;
}
if (!serve_html_path.empty() && !fs::exists(serve_html_path)) {
LOG_ERROR("error: serve_html_path file does not exist: %s", serve_html_path.c_str());
return false; return false;
} }
return true; return true;
@ -167,6 +166,7 @@ struct SDSvrParams {
oss << "SDSvrParams {\n" oss << "SDSvrParams {\n"
<< " listen_ip: " << listen_ip << ",\n" << " listen_ip: " << listen_ip << ",\n"
<< " listen_port: \"" << listen_port << "\",\n" << " listen_port: \"" << listen_port << "\",\n"
<< " serve_html_path: \"" << serve_html_path << "\",\n"
<< "}"; << "}";
return oss.str(); return oss.str();
} }
@ -191,12 +191,18 @@ void parse_args(int argc, const char** argv, SDSvrParams& svr_params, SDContextP
exit(svr_params.normal_exit ? 0 : 1); exit(svr_params.normal_exit ? 0 : 1);
} }
const bool random_seed_requested = default_gen_params.seed < 0;
if (!svr_params.process_and_check() || if (!svr_params.process_and_check() ||
!ctx_params.process_and_check(IMG_GEN) || !ctx_params.process_and_check(IMG_GEN) ||
!default_gen_params.process_and_check(IMG_GEN, ctx_params.lora_model_dir)) { !default_gen_params.process_and_check(IMG_GEN, ctx_params.lora_model_dir)) {
print_usage(argc, argv, options_vec); print_usage(argc, argv, options_vec);
exit(1); exit(1);
} }
if (random_seed_requested) {
default_gen_params.seed = -1;
}
} }
std::string extract_and_remove_sd_cpp_extra_args(std::string& text) { std::string extract_and_remove_sd_cpp_extra_args(std::string& text) {
@ -256,74 +262,109 @@ std::vector<uint8_t> write_image_to_vector(
return buffer; return buffer;
} }
/* Enables Printing the log level tag in color using ANSI escape codes */
void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
SDSvrParams* svr_params = (SDSvrParams*)data; SDSvrParams* svr_params = (SDSvrParams*)data;
int tag_color; log_print(level, log, svr_params->verbose, svr_params->color);
const char* level_str; }
FILE* out_stream = (level == SD_LOG_ERROR) ? stderr : stdout;
if (!log || (!svr_params->verbose && level <= SD_LOG_DEBUG)) { struct LoraEntry {
return; std::string name;
} std::string path;
std::string fullpath;
};
switch (level) { void free_results(sd_image_t* result_images, int num_results) {
case SD_LOG_DEBUG: if (result_images) {
tag_color = 37; for (int i = 0; i < num_results; ++i) {
level_str = "DEBUG"; if (result_images[i].data) {
break; stbi_image_free(result_images[i].data);
case SD_LOG_INFO: result_images[i].data = nullptr;
tag_color = 34; }
level_str = "INFO"; }
break;
case SD_LOG_WARN:
tag_color = 35;
level_str = "WARN";
break;
case SD_LOG_ERROR:
tag_color = 31;
level_str = "ERROR";
break;
default: /* Potential future-proofing */
tag_color = 33;
level_str = "?????";
break;
} }
free(result_images);
if (svr_params->color == true) {
fprintf(out_stream, "\033[%d;1m[%-5s]\033[0m ", tag_color, level_str);
} else {
fprintf(out_stream, "[%-5s] ", level_str);
}
fputs(log, out_stream);
fflush(out_stream);
} }
int main(int argc, const char** argv) { int main(int argc, const char** argv) {
if (argc > 1 && std::string(argv[1]) == "--version") {
std::cout << version_string() << "\n";
return EXIT_SUCCESS;
}
SDSvrParams svr_params; SDSvrParams svr_params;
SDContextParams ctx_params; SDContextParams ctx_params;
SDGenerationParams default_gen_params; SDGenerationParams default_gen_params;
parse_args(argc, argv, svr_params, ctx_params, default_gen_params); parse_args(argc, argv, svr_params, ctx_params, default_gen_params);
sd_set_log_callback(sd_log_cb, (void*)&svr_params); sd_set_log_callback(sd_log_cb, (void*)&svr_params);
log_verbose = svr_params.verbose;
log_color = svr_params.color;
if (svr_params.verbose) { LOG_DEBUG("version: %s", version_string().c_str());
printf("%s", sd_get_system_info()); LOG_DEBUG("%s", sd_get_system_info());
printf("%s\n", svr_params.to_string().c_str()); LOG_DEBUG("%s", svr_params.to_string().c_str());
printf("%s\n", ctx_params.to_string().c_str()); LOG_DEBUG("%s", ctx_params.to_string().c_str());
printf("%s\n", default_gen_params.to_string().c_str()); LOG_DEBUG("%s", default_gen_params.to_string().c_str());
}
sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(false, false, false); sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(false, false, false);
sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params); sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params);
if (sd_ctx == nullptr) { if (sd_ctx == nullptr) {
printf("new_sd_ctx_t failed\n"); LOG_ERROR("new_sd_ctx_t failed");
return 1; return 1;
} }
std::mutex sd_ctx_mutex; std::mutex sd_ctx_mutex;
std::vector<LoraEntry> lora_cache;
std::mutex lora_mutex;
auto refresh_lora_cache = [&]() {
std::vector<LoraEntry> new_cache;
fs::path lora_dir = ctx_params.lora_model_dir;
if (fs::exists(lora_dir) && fs::is_directory(lora_dir)) {
auto is_lora_ext = [](const fs::path& p) {
auto ext = p.extension().string();
std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
return ext == ".gguf" || ext == ".pt" || ext == ".pth" || ext == ".safetensors";
};
for (auto& entry : fs::recursive_directory_iterator(lora_dir)) {
if (!entry.is_regular_file())
continue;
const fs::path& p = entry.path();
if (!is_lora_ext(p))
continue;
LoraEntry e;
e.name = p.stem().u8string();
e.fullpath = p.u8string();
std::string rel = p.lexically_relative(lora_dir).u8string();
std::replace(rel.begin(), rel.end(), '\\', '/');
e.path = rel;
new_cache.push_back(std::move(e));
}
}
std::sort(new_cache.begin(), new_cache.end(),
[](const LoraEntry& a, const LoraEntry& b) {
return a.path < b.path;
});
{
std::lock_guard<std::mutex> lock(lora_mutex);
lora_cache = std::move(new_cache);
}
};
auto get_lora_full_path = [&](const std::string& path) -> std::string {
std::lock_guard<std::mutex> lock(lora_mutex);
auto it = std::find_if(lora_cache.begin(), lora_cache.end(),
[&](const LoraEntry& e) { return e.path == path; });
return (it != lora_cache.end()) ? it->fullpath : "";
};
httplib::Server svr; httplib::Server svr;
svr.set_pre_routing_handler([](const httplib::Request& req, httplib::Response& res) { svr.set_pre_routing_handler([](const httplib::Request& req, httplib::Response& res) {
@ -343,9 +384,26 @@ int main(int argc, const char** argv) {
return httplib::Server::HandlerResponse::Unhandled; return httplib::Server::HandlerResponse::Unhandled;
}); });
// health // index html
std::string index_html;
#ifdef HAVE_INDEX_HTML
index_html.assign(reinterpret_cast<const char*>(index_html_bytes), index_html_size);
#else
index_html = "Stable Diffusion Server is running";
#endif
svr.Get("/", [&](const httplib::Request&, httplib::Response& res) { svr.Get("/", [&](const httplib::Request&, httplib::Response& res) {
res.set_content(R"({"ok":true,"service":"sd-cpp-http"})", "application/json"); if (!svr_params.serve_html_path.empty()) {
std::ifstream file(svr_params.serve_html_path);
if (file) {
std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
res.set_content(content, "text/html");
} else {
res.status = 500;
res.set_content("Error: Unable to read HTML file", "text/plain");
}
} else {
res.set_content(index_html, "text/html");
}
}); });
// models endpoint (minimal) // models endpoint (minimal)
@ -371,8 +429,8 @@ int main(int argc, const char** argv) {
std::string size = j.value("size", ""); std::string size = j.value("size", "");
std::string output_format = j.value("output_format", "png"); std::string output_format = j.value("output_format", "png");
int output_compression = j.value("output_compression", 100); int output_compression = j.value("output_compression", 100);
int width = 512; int width = default_gen_params.width > 0 ? default_gen_params.width : 512;
int height = 512; int height = default_gen_params.width > 0 ? default_gen_params.height : 512;
if (!size.empty()) { if (!size.empty()) {
auto pos = size.find('x'); auto pos = size.find('x');
if (pos != std::string::npos) { if (pos != std::string::npos) {
@ -409,7 +467,7 @@ int main(int argc, const char** argv) {
} }
json out; json out;
out["created"] = iso_timestamp_now(); out["created"] = static_cast<long long>(std::time(nullptr));
out["data"] = json::array(); out["data"] = json::array();
out["output_format"] = output_format; out["output_format"] = output_format;
@ -425,15 +483,16 @@ int main(int argc, const char** argv) {
return; return;
} }
if (gen_params.sample_params.sample_steps > 100)
gen_params.sample_params.sample_steps = 100;
if (!gen_params.process_and_check(IMG_GEN, "")) { if (!gen_params.process_and_check(IMG_GEN, "")) {
res.status = 400; res.status = 400;
res.set_content(R"({"error":"invalid params"})", "application/json"); res.set_content(R"({"error":"invalid params"})", "application/json");
return; return;
} }
if (svr_params.verbose) { LOG_DEBUG("%s\n", gen_params.to_string().c_str());
printf("%s\n", gen_params.to_string().c_str());
}
sd_image_t init_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr}; sd_image_t init_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
sd_image_t control_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr}; sd_image_t control_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
@ -467,7 +526,7 @@ int main(int argc, const char** argv) {
gen_params.pm_style_strength, gen_params.pm_style_strength,
}, // pm_params }, // pm_params
ctx_params.vae_tiling_params, ctx_params.vae_tiling_params,
gen_params.easycache_params, gen_params.cache_params,
}; };
sd_image_t* results = nullptr; sd_image_t* results = nullptr;
@ -490,7 +549,7 @@ int main(int argc, const char** argv) {
results[i].channel, results[i].channel,
output_compression); output_compression);
if (image_bytes.empty()) { if (image_bytes.empty()) {
printf("write image to mem failed\n"); LOG_ERROR("write image to mem failed");
continue; continue;
} }
@ -500,6 +559,7 @@ int main(int argc, const char** argv) {
item["b64_json"] = b64; item["b64_json"] = b64;
out["data"].push_back(item); out["data"].push_back(item);
} }
free_results(results, num_results);
res.set_content(out.dump(), "application/json"); res.set_content(out.dump(), "application/json");
res.status = 200; res.status = 200;
@ -530,8 +590,9 @@ int main(int argc, const char** argv) {
std::string sd_cpp_extra_args_str = extract_and_remove_sd_cpp_extra_args(prompt); std::string sd_cpp_extra_args_str = extract_and_remove_sd_cpp_extra_args(prompt);
size_t image_count = req.form.get_file_count("image[]"); size_t image_count = req.form.get_file_count("image[]");
if (image_count == 0) { bool has_legacy_image = req.form.has_file("image");
if (image_count == 0 && !has_legacy_image) {
res.status = 400; res.status = 400;
res.set_content(R"({"error":"at least one image[] required"})", "application/json"); res.set_content(R"({"error":"at least one image[] required"})", "application/json");
return; return;
@ -542,9 +603,13 @@ int main(int argc, const char** argv) {
auto file = req.form.get_file("image[]", i); auto file = req.form.get_file("image[]", i);
images_bytes.emplace_back(file.content.begin(), file.content.end()); images_bytes.emplace_back(file.content.begin(), file.content.end());
} }
if (image_count == 0 && has_legacy_image) {
auto file = req.form.get_file("image");
images_bytes.emplace_back(file.content.begin(), file.content.end());
}
std::vector<uint8_t> mask_bytes; std::vector<uint8_t> mask_bytes;
if (req.form.has_field("mask")) { if (req.form.has_file("mask")) {
auto file = req.form.get_file("mask"); auto file = req.form.get_file("mask");
mask_bytes.assign(file.content.begin(), file.content.end()); mask_bytes.assign(file.content.begin(), file.content.end());
} }
@ -559,7 +624,7 @@ int main(int argc, const char** argv) {
n = std::clamp(n, 1, 8); n = std::clamp(n, 1, 8);
std::string size = req.form.get_field("size"); std::string size = req.form.get_field("size");
int width = 512, height = 512; int width = -1, height = -1;
if (!size.empty()) { if (!size.empty()) {
auto pos = size.find('x'); auto pos = size.find('x');
if (pos != std::string::npos) { if (pos != std::string::npos) {
@ -605,28 +670,45 @@ int main(int argc, const char** argv) {
return; return;
} }
if (gen_params.sample_params.sample_steps > 100)
gen_params.sample_params.sample_steps = 100;
if (!gen_params.process_and_check(IMG_GEN, "")) { if (!gen_params.process_and_check(IMG_GEN, "")) {
res.status = 400; res.status = 400;
res.set_content(R"({"error":"invalid params"})", "application/json"); res.set_content(R"({"error":"invalid params"})", "application/json");
return; return;
} }
if (svr_params.verbose) { LOG_DEBUG("%s\n", gen_params.to_string().c_str());
printf("%s\n", gen_params.to_string().c_str());
}
sd_image_t init_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr}; sd_image_t init_image = {0, 0, 3, nullptr};
sd_image_t control_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr}; sd_image_t control_image = {0, 0, 3, nullptr};
std::vector<sd_image_t> pmid_images; std::vector<sd_image_t> pmid_images;
auto get_resolved_width = [&gen_params, &default_gen_params]() -> int {
if (gen_params.width > 0)
return gen_params.width;
if (default_gen_params.width > 0)
return default_gen_params.width;
return 512;
};
auto get_resolved_height = [&gen_params, &default_gen_params]() -> int {
if (gen_params.height > 0)
return gen_params.height;
if (default_gen_params.height > 0)
return default_gen_params.height;
return 512;
};
std::vector<sd_image_t> ref_images; std::vector<sd_image_t> ref_images;
ref_images.reserve(images_bytes.size()); ref_images.reserve(images_bytes.size());
for (auto& bytes : images_bytes) { for (auto& bytes : images_bytes) {
int img_w = width; int img_w;
int img_h = height; int img_h;
uint8_t* raw_pixels = load_image_from_memory( uint8_t* raw_pixels = load_image_from_memory(
reinterpret_cast<const char*>(bytes.data()), reinterpret_cast<const char*>(bytes.data()),
bytes.size(), static_cast<int>(bytes.size()),
img_w, img_h, img_w, img_h,
width, height, 3); width, height, 3);
@ -635,22 +717,31 @@ int main(int argc, const char** argv) {
} }
sd_image_t img{(uint32_t)img_w, (uint32_t)img_h, 3, raw_pixels}; sd_image_t img{(uint32_t)img_w, (uint32_t)img_h, 3, raw_pixels};
gen_params.set_width_and_height_if_unset(img.width, img.height);
ref_images.push_back(img); ref_images.push_back(img);
} }
sd_image_t mask_image = {0}; sd_image_t mask_image = {0};
if (!mask_bytes.empty()) { if (!mask_bytes.empty()) {
int mask_w = width; int expected_width = 0;
int mask_h = height; int expected_height = 0;
if (gen_params.width_and_height_are_set()) {
expected_width = gen_params.width;
expected_height = gen_params.height;
}
int mask_w;
int mask_h;
uint8_t* mask_raw = load_image_from_memory( uint8_t* mask_raw = load_image_from_memory(
reinterpret_cast<const char*>(mask_bytes.data()), reinterpret_cast<const char*>(mask_bytes.data()),
mask_bytes.size(), static_cast<int>(mask_bytes.size()),
mask_w, mask_h, mask_w, mask_h,
width, height, 1); expected_width, expected_height, 1);
mask_image = {(uint32_t)mask_w, (uint32_t)mask_h, 1, mask_raw}; mask_image = {(uint32_t)mask_w, (uint32_t)mask_h, 1, mask_raw};
gen_params.set_width_and_height_if_unset(mask_image.width, mask_image.height);
} else { } else {
mask_image.width = width; mask_image.width = get_resolved_width();
mask_image.height = height; mask_image.height = get_resolved_height();
mask_image.channel = 1; mask_image.channel = 1;
mask_image.data = nullptr; mask_image.data = nullptr;
} }
@ -667,8 +758,8 @@ int main(int argc, const char** argv) {
gen_params.auto_resize_ref_image, gen_params.auto_resize_ref_image,
gen_params.increase_ref_index, gen_params.increase_ref_index,
mask_image, mask_image,
gen_params.width, get_resolved_width(),
gen_params.height, get_resolved_height(),
gen_params.sample_params, gen_params.sample_params,
gen_params.strength, gen_params.strength,
gen_params.seed, gen_params.seed,
@ -682,7 +773,7 @@ int main(int argc, const char** argv) {
gen_params.pm_style_strength, gen_params.pm_style_strength,
}, // pm_params }, // pm_params
ctx_params.vae_tiling_params, ctx_params.vae_tiling_params,
gen_params.easycache_params, gen_params.cache_params,
}; };
sd_image_t* results = nullptr; sd_image_t* results = nullptr;
@ -695,7 +786,7 @@ int main(int argc, const char** argv) {
} }
json out; json out;
out["created"] = iso_timestamp_now(); out["created"] = static_cast<long long>(std::time(nullptr));
out["data"] = json::array(); out["data"] = json::array();
out["output_format"] = output_format; out["output_format"] = output_format;
@ -713,6 +804,7 @@ int main(int argc, const char** argv) {
item["b64_json"] = b64; item["b64_json"] = b64;
out["data"].push_back(item); out["data"].push_back(item);
} }
free_results(results, num_results);
res.set_content(out.dump(), "application/json"); res.set_content(out.dump(), "application/json");
res.status = 200; res.status = 200;
@ -735,7 +827,409 @@ int main(int argc, const char** argv) {
} }
}); });
printf("listening on: %s:%d\n", svr_params.listen_ip.c_str(), svr_params.listen_port); // sdapi endpoints (AUTOMATIC1111 / Forge)
auto sdapi_any2img = [&](const httplib::Request& req, httplib::Response& res, bool img2img) {
try {
if (req.body.empty()) {
res.status = 400;
res.set_content(R"({"error":"empty body"})", "application/json");
return;
}
json j = json::parse(req.body);
std::string prompt = j.value("prompt", "");
std::string negative_prompt = j.value("negative_prompt", "");
int width = j.value("width", 512);
int height = j.value("height", 512);
int steps = j.value("steps", default_gen_params.sample_params.sample_steps);
float cfg_scale = j.value("cfg_scale", default_gen_params.sample_params.guidance.txt_cfg);
int64_t seed = j.value("seed", -1);
int batch_size = j.value("batch_size", 1);
int clip_skip = j.value("clip_skip", -1);
std::string sampler_name = j.value("sampler_name", "");
std::string scheduler_name = j.value("scheduler", "");
auto bad = [&](const std::string& msg) {
res.status = 400;
res.set_content("{\"error\":\"" + msg + "\"}", "application/json");
return;
};
if (width <= 0 || height <= 0) {
return bad("width and height must be positive");
}
if (steps < 1 || steps > 150) {
return bad("steps must be in range [1, 150]");
}
if (batch_size < 1 || batch_size > 8) {
return bad("batch_size must be in range [1, 8]");
}
if (cfg_scale < 0.f) {
return bad("cfg_scale must be positive");
}
if (prompt.empty()) {
return bad("prompt required");
}
std::vector<sd_lora_t> sd_loras;
std::vector<std::string> lora_path_storage;
if (j.contains("lora") && j["lora"].is_array()) {
for (const auto& item : j["lora"]) {
if (!item.is_object()) {
continue;
}
std::string path = item.value("path", "");
float multiplier = item.value("multiplier", 1.0f);
bool is_high_noise = item.value("is_high_noise", false);
if (path.empty()) {
return bad("lora.path required");
}
std::string fullpath = get_lora_full_path(path);
if (fullpath.empty()) {
return bad("invalid lora path: " + path);
}
lora_path_storage.push_back(fullpath);
sd_lora_t l;
l.is_high_noise = is_high_noise;
l.multiplier = multiplier;
l.path = lora_path_storage.back().c_str();
sd_loras.push_back(l);
}
}
auto get_sample_method = [](std::string name) -> enum sample_method_t {
enum sample_method_t result = str_to_sample_method(name.c_str());
if (result != SAMPLE_METHOD_COUNT) return result;
// some applications use a hardcoded sampler list
std::transform(name.begin(), name.end(), name.begin(),
[](unsigned char c) { return std::tolower(c); });
static const std::unordered_map<std::string_view, sample_method_t> hardcoded{
{"euler a", EULER_A_SAMPLE_METHOD},
{"k_euler_a", EULER_A_SAMPLE_METHOD},
{"euler", EULER_SAMPLE_METHOD},
{"k_euler", EULER_SAMPLE_METHOD},
{"heun", HEUN_SAMPLE_METHOD},
{"k_heun", HEUN_SAMPLE_METHOD},
{"dpm2", DPM2_SAMPLE_METHOD},
{"k_dpm_2", DPM2_SAMPLE_METHOD},
{"lcm", LCM_SAMPLE_METHOD},
{"ddim", DDIM_TRAILING_SAMPLE_METHOD},
{"dpm++ 2m", DPMPP2M_SAMPLE_METHOD},
{"k_dpmpp_2m", DPMPP2M_SAMPLE_METHOD},
{"res multistep", RES_MULTISTEP_SAMPLE_METHOD},
{"k_res_multistep", RES_MULTISTEP_SAMPLE_METHOD},
{"res 2s", RES_2S_SAMPLE_METHOD},
{"k_res_2s", RES_2S_SAMPLE_METHOD}};
auto it = hardcoded.find(name);
if (it != hardcoded.end()) return it->second;
return SAMPLE_METHOD_COUNT;
};
enum sample_method_t sample_method = get_sample_method(sampler_name);
enum scheduler_t scheduler = str_to_scheduler(scheduler_name.c_str());
SDGenerationParams gen_params = default_gen_params;
gen_params.prompt = prompt;
gen_params.negative_prompt = negative_prompt;
gen_params.seed = seed;
gen_params.sample_params.sample_steps = steps;
gen_params.batch_count = batch_size;
gen_params.sample_params.guidance.txt_cfg = cfg_scale;
if (clip_skip > 0) {
gen_params.clip_skip = clip_skip;
}
if (sample_method != SAMPLE_METHOD_COUNT) {
gen_params.sample_params.sample_method = sample_method;
}
if (scheduler != SCHEDULER_COUNT) {
gen_params.sample_params.scheduler = scheduler;
}
// re-read to avoid applying 512 as default before the provided
// images and/or server command-line
gen_params.width = j.value("width", -1);
gen_params.height = j.value("height", -1);
LOG_DEBUG("%s\n", gen_params.to_string().c_str());
sd_image_t init_image = {0, 0, 3, nullptr};
sd_image_t control_image = {0, 0, 3, nullptr};
sd_image_t mask_image = {0, 0, 1, nullptr};
std::vector<uint8_t> mask_data;
std::vector<sd_image_t> pmid_images;
std::vector<sd_image_t> ref_images;
auto get_resolved_width = [&gen_params, &default_gen_params]() -> int {
if (gen_params.width > 0)
return gen_params.width;
if (default_gen_params.width > 0)
return default_gen_params.width;
return 512;
};
auto get_resolved_height = [&gen_params, &default_gen_params]() -> int {
if (gen_params.height > 0)
return gen_params.height;
if (default_gen_params.height > 0)
return default_gen_params.height;
return 512;
};
auto decode_image = [&gen_params](sd_image_t& image, std::string encoded) -> bool {
// remove data URI prefix if present ("data:image/png;base64,")
auto comma_pos = encoded.find(',');
if (comma_pos != std::string::npos) {
encoded = encoded.substr(comma_pos + 1);
}
std::vector<uint8_t> img_data = base64_decode(encoded);
if (!img_data.empty()) {
int expected_width = 0;
int expected_height = 0;
if (gen_params.width_and_height_are_set()) {
expected_width = gen_params.width;
expected_height = gen_params.height;
}
int img_w;
int img_h;
uint8_t* raw_data = load_image_from_memory(
(const char*)img_data.data(), (int)img_data.size(),
img_w, img_h,
expected_width, expected_height, image.channel);
if (raw_data) {
image = {(uint32_t)img_w, (uint32_t)img_h, image.channel, raw_data};
gen_params.set_width_and_height_if_unset(image.width, image.height);
return true;
}
}
return false;
};
if (img2img) {
if (j.contains("init_images") && j["init_images"].is_array() && !j["init_images"].empty()) {
std::string encoded = j["init_images"][0].get<std::string>();
decode_image(init_image, encoded);
}
if (j.contains("mask") && j["mask"].is_string()) {
std::string encoded = j["mask"].get<std::string>();
decode_image(mask_image, encoded);
bool inpainting_mask_invert = j.value("inpainting_mask_invert", 0) != 0;
if (inpainting_mask_invert && mask_image.data != nullptr) {
for (uint32_t i = 0; i < mask_image.width * mask_image.height; i++) {
mask_image.data[i] = 255 - mask_image.data[i];
}
}
} else {
int m_width = get_resolved_width();
int m_height = get_resolved_height();
mask_data = std::vector<uint8_t>(m_width * m_height, 255);
mask_image.width = m_width;
mask_image.height = m_height;
mask_image.channel = 1;
mask_image.data = mask_data.data();
}
float denoising_strength = j.value("denoising_strength", -1.f);
if (denoising_strength >= 0.f) {
denoising_strength = std::min(denoising_strength, 1.0f);
gen_params.strength = denoising_strength;
}
}
if (j.contains("extra_images") && j["extra_images"].is_array()) {
for (auto extra_image : j["extra_images"]) {
std::string encoded = extra_image.get<std::string>();
sd_image_t tmp_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
if (decode_image(tmp_image, encoded)) {
ref_images.push_back(tmp_image);
}
}
}
sd_img_gen_params_t img_gen_params = {
sd_loras.data(),
static_cast<uint32_t>(sd_loras.size()),
gen_params.prompt.c_str(),
gen_params.negative_prompt.c_str(),
gen_params.clip_skip,
init_image,
ref_images.data(),
(int)ref_images.size(),
gen_params.auto_resize_ref_image,
gen_params.increase_ref_index,
mask_image,
get_resolved_width(),
get_resolved_height(),
gen_params.sample_params,
gen_params.strength,
gen_params.seed,
gen_params.batch_count,
control_image,
gen_params.control_strength,
{
pmid_images.data(),
(int)pmid_images.size(),
gen_params.pm_id_embed_path.c_str(),
gen_params.pm_style_strength,
}, // pm_params
ctx_params.vae_tiling_params,
gen_params.cache_params,
};
sd_image_t* results = nullptr;
int num_results = 0;
{
std::lock_guard<std::mutex> lock(sd_ctx_mutex);
results = generate_image(sd_ctx, &img_gen_params);
num_results = gen_params.batch_count;
}
json out;
out["images"] = json::array();
out["parameters"] = j; // TODO should return changed defaults
out["info"] = "";
for (int i = 0; i < num_results; i++) {
if (results[i].data == nullptr) {
continue;
}
auto image_bytes = write_image_to_vector(ImageFormat::PNG,
results[i].data,
results[i].width,
results[i].height,
results[i].channel);
if (image_bytes.empty()) {
LOG_ERROR("write image to mem failed");
continue;
}
std::string b64 = base64_encode(image_bytes);
out["images"].push_back(b64);
}
free_results(results, num_results);
res.set_content(out.dump(), "application/json");
res.status = 200;
if (init_image.data) {
stbi_image_free(init_image.data);
}
if (mask_image.data && mask_data.empty()) {
stbi_image_free(mask_image.data);
}
for (auto ref_image : ref_images) {
stbi_image_free(ref_image.data);
}
} catch (const std::exception& e) {
res.status = 500;
json err;
err["error"] = "server_error";
err["message"] = e.what();
res.set_content(err.dump(), "application/json");
}
};
svr.Post("/sdapi/v1/txt2img", [&](const httplib::Request& req, httplib::Response& res) {
sdapi_any2img(req, res, false);
});
svr.Post("/sdapi/v1/img2img", [&](const httplib::Request& req, httplib::Response& res) {
sdapi_any2img(req, res, true);
});
svr.Get("/sdapi/v1/loras", [&](const httplib::Request&, httplib::Response& res) {
refresh_lora_cache();
json result = json::array();
{
std::lock_guard<std::mutex> lock(lora_mutex);
for (const auto& e : lora_cache) {
json item;
item["name"] = e.name;
item["path"] = e.path;
result.push_back(item);
}
}
res.set_content(result.dump(), "application/json");
});
svr.Get("/sdapi/v1/samplers", [&](const httplib::Request&, httplib::Response& res) {
std::vector<std::string> sampler_names;
sampler_names.push_back("default");
for (int i = 0; i < SAMPLE_METHOD_COUNT; i++) {
sampler_names.push_back(sd_sample_method_name((sample_method_t)i));
}
json r = json::array();
for (auto name : sampler_names) {
json entry;
entry["name"] = name;
entry["aliases"] = json::array({name});
entry["options"] = json::object();
r.push_back(entry);
}
res.set_content(r.dump(), "application/json");
});
svr.Get("/sdapi/v1/schedulers", [&](const httplib::Request&, httplib::Response& res) {
std::vector<std::string> scheduler_names;
scheduler_names.push_back("default");
for (int i = 0; i < SCHEDULER_COUNT; i++) {
scheduler_names.push_back(sd_scheduler_name((scheduler_t)i));
}
json r = json::array();
for (auto name : scheduler_names) {
json entry;
entry["name"] = name;
entry["label"] = name;
r.push_back(entry);
}
res.set_content(r.dump(), "application/json");
});
svr.Get("/sdapi/v1/sd-models", [&](const httplib::Request&, httplib::Response& res) {
fs::path model_path = ctx_params.model_path;
json entry;
entry["title"] = model_path.stem();
entry["model_name"] = model_path.stem();
entry["filename"] = model_path.filename();
entry["hash"] = "8888888888";
entry["sha256"] = "8888888888888888888888888888888888888888888888888888888888888888";
entry["config"] = nullptr;
json r = json::array();
r.push_back(entry);
res.set_content(r.dump(), "application/json");
});
svr.Get("/sdapi/v1/options", [&](const httplib::Request&, httplib::Response& res) {
fs::path model_path = ctx_params.model_path;
json r;
r["samples_format"] = "png";
r["sd_model_checkpoint"] = model_path.stem();
res.set_content(r.dump(), "application/json");
});
LOG_INFO("listening on: %s:%d\n", svr_params.listen_ip.c_str(), svr_params.listen_port);
svr.listen(svr_params.listen_ip, svr_params.listen_port); svr.listen(svr_params.listen_ip, svr_params.listen_port);
// cleanup // cleanup

View File

@ -1,4 +1,4 @@
for f in *.cpp *.h *.hpp examples/cli/*.cpp examples/common/*.hpp examples/cli/*.h examples/server/*.cpp; do for f in src/*.cpp src/*.h src/*.hpp src/vocab/*.h src/vocab/*.cpp examples/cli/*.cpp examples/common/*.hpp examples/cli/*.h examples/server/*.cpp; do
[[ "$f" == vocab* ]] && continue [[ "$f" == vocab* ]] && continue
echo "formatting '$f'" echo "formatting '$f'"
# if [ "$f" != "stable-diffusion.h" ]; then # if [ "$f" != "stable-diffusion.h" ]; then

2
ggml

@ -1 +1 @@
Subproject commit 2d3876d554551d35c06dccc5852be50d5fd2a275 Subproject commit a8db410a252c8c8f2d120c6f2e7133ebe032f35d

View File

@ -48,6 +48,8 @@ enum sample_method_t {
LCM_SAMPLE_METHOD, LCM_SAMPLE_METHOD,
DDIM_TRAILING_SAMPLE_METHOD, DDIM_TRAILING_SAMPLE_METHOD,
TCD_SAMPLE_METHOD, TCD_SAMPLE_METHOD,
RES_MULTISTEP_SAMPLE_METHOD,
RES_2S_SAMPLE_METHOD,
SAMPLE_METHOD_COUNT SAMPLE_METHOD_COUNT
}; };
@ -60,7 +62,9 @@ enum scheduler_t {
SGM_UNIFORM_SCHEDULER, SGM_UNIFORM_SCHEDULER,
SIMPLE_SCHEDULER, SIMPLE_SCHEDULER,
SMOOTHSTEP_SCHEDULER, SMOOTHSTEP_SCHEDULER,
KL_OPTIMAL_SCHEDULER,
LCM_SCHEDULER, LCM_SCHEDULER,
BONG_TANGENT_SCHEDULER,
SCHEDULER_COUNT SCHEDULER_COUNT
}; };
@ -168,7 +172,6 @@ typedef struct {
const char* vae_path; const char* vae_path;
const char* taesd_path; const char* taesd_path;
const char* control_net_path; const char* control_net_path;
const char* lora_model_dir;
const sd_embedding_t* embeddings; const sd_embedding_t* embeddings;
uint32_t embedding_count; uint32_t embedding_count;
const char* photo_maker_path; const char* photo_maker_path;
@ -182,18 +185,22 @@ typedef struct {
enum prediction_t prediction; enum prediction_t prediction;
enum lora_apply_mode_t lora_apply_mode; enum lora_apply_mode_t lora_apply_mode;
bool offload_params_to_cpu; bool offload_params_to_cpu;
bool enable_mmap;
bool keep_clip_on_cpu; bool keep_clip_on_cpu;
bool keep_control_net_on_cpu; bool keep_control_net_on_cpu;
bool keep_vae_on_cpu; bool keep_vae_on_cpu;
bool flash_attn;
bool diffusion_flash_attn; bool diffusion_flash_attn;
bool tae_preview_only; bool tae_preview_only;
bool diffusion_conv_direct; bool diffusion_conv_direct;
bool vae_conv_direct; bool vae_conv_direct;
bool circular_x;
bool circular_y;
bool force_sdxl_vae_conv_scale; bool force_sdxl_vae_conv_scale;
bool chroma_use_dit_mask; bool chroma_use_dit_mask;
bool chroma_use_t5_mask; bool chroma_use_t5_mask;
int chroma_t5_mask_pad; int chroma_t5_mask_pad;
float flow_shift; bool qwen_image_zero_cond_t;
} sd_ctx_params_t; } sd_ctx_params_t;
typedef struct { typedef struct {
@ -227,6 +234,7 @@ typedef struct {
int shifted_timestep; int shifted_timestep;
float* custom_sigmas; float* custom_sigmas;
int custom_sigmas_count; int custom_sigmas_count;
float flow_shift;
} sd_sample_params_t; } sd_sample_params_t;
typedef struct { typedef struct {
@ -236,12 +244,42 @@ typedef struct {
float style_strength; float style_strength;
} sd_pm_params_t; // photo maker } sd_pm_params_t; // photo maker
enum sd_cache_mode_t {
SD_CACHE_DISABLED = 0,
SD_CACHE_EASYCACHE,
SD_CACHE_UCACHE,
SD_CACHE_DBCACHE,
SD_CACHE_TAYLORSEER,
SD_CACHE_CACHE_DIT,
SD_CACHE_SPECTRUM,
};
typedef struct { typedef struct {
bool enabled; enum sd_cache_mode_t mode;
float reuse_threshold; float reuse_threshold;
float start_percent; float start_percent;
float end_percent; float end_percent;
} sd_easycache_params_t; float error_decay_rate;
bool use_relative_threshold;
bool reset_error_on_compute;
int Fn_compute_blocks;
int Bn_compute_blocks;
float residual_diff_threshold;
int max_warmup_steps;
int max_cached_steps;
int max_continuous_cached_steps;
int taylorseer_n_derivatives;
int taylorseer_skip_interval;
const char* scm_mask;
bool scm_policy_dynamic;
float spectrum_w;
int spectrum_m;
float spectrum_lam;
int spectrum_window_size;
float spectrum_flex_window;
int spectrum_warmup_steps;
float spectrum_stop_percent;
} sd_cache_params_t;
typedef struct { typedef struct {
bool is_high_noise; bool is_high_noise;
@ -271,7 +309,7 @@ typedef struct {
float control_strength; float control_strength;
sd_pm_params_t pm_params; sd_pm_params_t pm_params;
sd_tiling_params_t vae_tiling_params; sd_tiling_params_t vae_tiling_params;
sd_easycache_params_t easycache; sd_cache_params_t cache;
} sd_img_gen_params_t; } sd_img_gen_params_t;
typedef struct { typedef struct {
@ -293,7 +331,8 @@ typedef struct {
int64_t seed; int64_t seed;
int video_frames; int video_frames;
float vace_strength; float vace_strength;
sd_easycache_params_t easycache; sd_tiling_params_t vae_tiling_params;
sd_cache_params_t cache;
} sd_vid_gen_params_t; } sd_vid_gen_params_t;
typedef struct sd_ctx_t sd_ctx_t; typedef struct sd_ctx_t sd_ctx_t;
@ -323,7 +362,7 @@ SD_API enum preview_t str_to_preview(const char* str);
SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode); SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode);
SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str); SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str);
SD_API void sd_easycache_params_init(sd_easycache_params_t* easycache_params); SD_API void sd_cache_params_init(sd_cache_params_t* cache_params);
SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params); SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params); SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
@ -335,7 +374,7 @@ SD_API void sd_sample_params_init(sd_sample_params_t* sample_params);
SD_API char* sd_sample_params_to_str(const sd_sample_params_t* sample_params); SD_API char* sd_sample_params_to_str(const sd_sample_params_t* sample_params);
SD_API enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx); SD_API enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx);
SD_API enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx); SD_API enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_method_t sample_method);
SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params); SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params); SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
@ -363,7 +402,8 @@ SD_API bool convert(const char* input_path,
const char* vae_path, const char* vae_path,
const char* output_path, const char* output_path,
enum sd_type_t output_type, enum sd_type_t output_type,
const char* tensor_type_rules); const char* tensor_type_rules,
bool convert_name);
SD_API bool preprocess_canny(sd_image_t image, SD_API bool preprocess_canny(sd_image_t image,
float high_threshold, float high_threshold,

View File

@ -1,88 +1,88 @@
import os import os
import sys import sys
import numpy as np import numpy as np
import torch import torch
from diffusers.utils import load_image from diffusers.utils import load_image
# pip install insightface==0.7.3 # pip install insightface==0.7.3
from insightface.app import FaceAnalysis from insightface.app import FaceAnalysis
from insightface.data import get_image as ins_get_image from insightface.data import get_image as ins_get_image
from safetensors.torch import save_file from safetensors.torch import save_file
### ###
# https://github.com/cubiq/ComfyUI_IPAdapter_plus/issues/165#issue-2055829543 # https://github.com/cubiq/ComfyUI_IPAdapter_plus/issues/165#issue-2055829543
### ###
class FaceAnalysis2(FaceAnalysis): class FaceAnalysis2(FaceAnalysis):
# NOTE: allows setting det_size for each detection call. # NOTE: allows setting det_size for each detection call.
# the model allows it but the wrapping code from insightface # the model allows it but the wrapping code from insightface
# doesn't show it, and people end up loading duplicate models # doesn't show it, and people end up loading duplicate models
# for different sizes where there is absolutely no need to # for different sizes where there is absolutely no need to
def get(self, img, max_num=0, det_size=(640, 640)): def get(self, img, max_num=0, det_size=(640, 640)):
if det_size is not None: if det_size is not None:
self.det_model.input_size = det_size self.det_model.input_size = det_size
return super().get(img, max_num) return super().get(img, max_num)
def analyze_faces(face_analysis: FaceAnalysis, img_data: np.ndarray, det_size=(640, 640)): def analyze_faces(face_analysis: FaceAnalysis, img_data: np.ndarray, det_size=(640, 640)):
# NOTE: try detect faces, if no faces detected, lower det_size until it does # NOTE: try detect faces, if no faces detected, lower det_size until it does
detection_sizes = [None] + [(size, size) for size in range(640, 256, -64)] + [(256, 256)] detection_sizes = [None] + [(size, size) for size in range(640, 256, -64)] + [(256, 256)]
for size in detection_sizes: for size in detection_sizes:
faces = face_analysis.get(img_data, det_size=size) faces = face_analysis.get(img_data, det_size=size)
if len(faces) > 0: if len(faces) > 0:
return faces return faces
return [] return []
if __name__ == "__main__": if __name__ == "__main__":
#face_detector = FaceAnalysis2(providers=['CUDAExecutionProvider'], allowed_modules=['detection', 'recognition']) #face_detector = FaceAnalysis2(providers=['CUDAExecutionProvider'], allowed_modules=['detection', 'recognition'])
face_detector = FaceAnalysis2(providers=['CPUExecutionProvider'], allowed_modules=['detection', 'recognition']) face_detector = FaceAnalysis2(providers=['CPUExecutionProvider'], allowed_modules=['detection', 'recognition'])
face_detector.prepare(ctx_id=0, det_size=(640, 640)) face_detector.prepare(ctx_id=0, det_size=(640, 640))
#input_folder_name = './scarletthead_woman' #input_folder_name = './scarletthead_woman'
input_folder_name = sys.argv[1] input_folder_name = sys.argv[1]
image_basename_list = os.listdir(input_folder_name) image_basename_list = os.listdir(input_folder_name)
image_path_list = sorted([os.path.join(input_folder_name, basename) for basename in image_basename_list]) image_path_list = sorted([os.path.join(input_folder_name, basename) for basename in image_basename_list])
input_id_images = [] input_id_images = []
for image_path in image_path_list: for image_path in image_path_list:
input_id_images.append(load_image(image_path)) input_id_images.append(load_image(image_path))
id_embed_list = [] id_embed_list = []
for img in input_id_images: for img in input_id_images:
img = np.array(img) img = np.array(img)
img = img[:, :, ::-1] img = img[:, :, ::-1]
faces = analyze_faces(face_detector, img) faces = analyze_faces(face_detector, img)
if len(faces) > 0: if len(faces) > 0:
id_embed_list.append(torch.from_numpy((faces[0]['embedding']))) id_embed_list.append(torch.from_numpy((faces[0]['embedding'])))
if len(id_embed_list) == 0: if len(id_embed_list) == 0:
raise ValueError(f"No face detected in input image pool") raise ValueError(f"No face detected in input image pool")
id_embeds = torch.stack(id_embed_list) id_embeds = torch.stack(id_embed_list)
# for r in id_embeds: # for r in id_embeds:
# print(r) # print(r)
# #torch.save(id_embeds, input_folder_name+'/id_embeds.pt'); # #torch.save(id_embeds, input_folder_name+'/id_embeds.pt');
# weights = dict() # weights = dict()
# weights["id_embeds"] = id_embeds # weights["id_embeds"] = id_embeds
# save_file(weights, input_folder_name+'/id_embeds.safetensors') # save_file(weights, input_folder_name+'/id_embeds.safetensors')
binary_data = id_embeds.numpy().tobytes() binary_data = id_embeds.numpy().tobytes()
two = 4 two = 4
zero = 0 zero = 0
one = 1 one = 1
tensor_name = "id_embeds" tensor_name = "id_embeds"
# Write binary data to a file # Write binary data to a file
with open(input_folder_name+'/id_embeds.bin', "wb") as f: with open(input_folder_name+'/id_embeds.bin', "wb") as f:
f.write(two.to_bytes(4, byteorder='little')) f.write(two.to_bytes(4, byteorder='little'))
f.write((len(tensor_name)).to_bytes(4, byteorder='little')) f.write((len(tensor_name)).to_bytes(4, byteorder='little'))
f.write(zero.to_bytes(4, byteorder='little')) f.write(zero.to_bytes(4, byteorder='little'))
f.write((id_embeds.shape[1]).to_bytes(4, byteorder='little')) f.write((id_embeds.shape[1]).to_bytes(4, byteorder='little'))
f.write((id_embeds.shape[0]).to_bytes(4, byteorder='little')) f.write((id_embeds.shape[0]).to_bytes(4, byteorder='little'))
f.write(one.to_bytes(4, byteorder='little')) f.write(one.to_bytes(4, byteorder='little'))
f.write(one.to_bytes(4, byteorder='little')) f.write(one.to_bytes(4, byteorder='little'))
f.write(tensor_name.encode('ascii')) f.write(tensor_name.encode('ascii'))
f.write(binary_data) f.write(binary_data)

686
src/anima.hpp Normal file
View File

@ -0,0 +1,686 @@
#ifndef __ANIMA_HPP__
#define __ANIMA_HPP__
#include <cmath>
#include <memory>
#include <utility>
#include <vector>
#include "common_block.hpp"
#include "flux.hpp"
#include "rope.hpp"
namespace Anima {
constexpr int ANIMA_GRAPH_SIZE = 65536;
__STATIC_INLINE__ ggml_tensor* apply_gate(ggml_context* ctx,
ggml_tensor* x,
ggml_tensor* gate) {
gate = ggml_reshape_3d(ctx, gate, gate->ne[0], 1, gate->ne[1]); // [N, 1, C]
return ggml_mul(ctx, x, gate);
}
struct XEmbedder : public GGMLBlock {
public:
XEmbedder(int64_t in_dim, int64_t out_dim) {
blocks["proj.1"] = std::make_shared<Linear>(in_dim, out_dim, false);
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj.1"]);
return proj->forward(ctx, x);
}
};
struct TimestepEmbedder : public GGMLBlock {
public:
TimestepEmbedder(int64_t in_dim, int64_t out_dim) {
blocks["1.linear_1"] = std::make_shared<Linear>(in_dim, in_dim, false);
blocks["1.linear_2"] = std::make_shared<Linear>(in_dim, out_dim, false);
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["1.linear_1"]);
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["1.linear_2"]);
x = linear_1->forward(ctx, x);
x = ggml_silu_inplace(ctx->ggml_ctx, x);
x = linear_2->forward(ctx, x);
return x;
}
};
struct AdaLayerNormZero : public GGMLBlock {
protected:
int64_t in_features;
public:
AdaLayerNormZero(int64_t in_features, int64_t hidden_features = 256)
: in_features(in_features) {
blocks["norm"] = std::make_shared<LayerNorm>(in_features, 1e-6f, false, false);
blocks["1"] = std::make_shared<Linear>(in_features, hidden_features, false);
blocks["2"] = std::make_shared<Linear>(hidden_features, 3 * in_features, false);
}
std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
ggml_tensor* hidden_states,
ggml_tensor* embedded_timestep,
ggml_tensor* temb = nullptr) {
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["1"]);
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["2"]);
auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep);
emb = linear_1->forward(ctx, emb);
emb = linear_2->forward(ctx, emb); // [N, 3*C]
if (temb != nullptr) {
emb = ggml_add(ctx->ggml_ctx, emb, temb);
}
auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 3, 0);
auto shift = emb_chunks[0];
auto scale = emb_chunks[1];
auto gate = emb_chunks[2];
auto x = norm->forward(ctx, hidden_states);
x = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
return {x, gate};
}
};
struct AdaLayerNorm : public GGMLBlock {
protected:
int64_t embedding_dim;
public:
AdaLayerNorm(int64_t in_features, int64_t hidden_features = 256)
: embedding_dim(in_features) {
blocks["norm"] = std::make_shared<LayerNorm>(in_features, 1e-6f, false, false);
blocks["1"] = std::make_shared<Linear>(in_features, hidden_features, false);
blocks["2"] = std::make_shared<Linear>(hidden_features, 2 * in_features, false);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* hidden_states,
ggml_tensor* embedded_timestep,
ggml_tensor* temb = nullptr) {
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["1"]);
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["2"]);
auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep);
emb = linear_1->forward(ctx, emb);
emb = linear_2->forward(ctx, emb); // [N, 2*C]
if (temb != nullptr) {
auto temb_2c = ggml_view_2d(ctx->ggml_ctx, temb, 2 * embedding_dim, temb->ne[1], temb->nb[1], 0);
emb = ggml_add(ctx->ggml_ctx, emb, temb_2c);
}
auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 2, 0);
auto shift = emb_chunks[0];
auto scale = emb_chunks[1];
auto x = norm->forward(ctx, hidden_states);
x = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
return x;
}
};
struct AnimaAttention : public GGMLBlock {
protected:
int64_t num_heads;
int64_t head_dim;
std::string out_proj_name;
public:
AnimaAttention(int64_t query_dim,
int64_t context_dim,
int64_t num_heads,
int64_t head_dim,
const std::string& out_proj_name = "output_proj")
: num_heads(num_heads), head_dim(head_dim), out_proj_name(out_proj_name) {
int64_t inner_dim = num_heads * head_dim;
blocks["q_proj"] = std::make_shared<Linear>(query_dim, inner_dim, false);
blocks["k_proj"] = std::make_shared<Linear>(context_dim, inner_dim, false);
blocks["v_proj"] = std::make_shared<Linear>(context_dim, inner_dim, false);
blocks["q_norm"] = std::make_shared<RMSNorm>(head_dim, 1e-6f);
blocks["k_norm"] = std::make_shared<RMSNorm>(head_dim, 1e-6f);
blocks[this->out_proj_name] = std::make_shared<Linear>(inner_dim, query_dim, false);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* hidden_states,
ggml_tensor* encoder_hidden_states = nullptr,
ggml_tensor* pe_q = nullptr,
ggml_tensor* pe_k = nullptr) {
if (encoder_hidden_states == nullptr) {
encoder_hidden_states = hidden_states;
}
auto q_proj = std::dynamic_pointer_cast<Linear>(blocks["q_proj"]);
auto k_proj = std::dynamic_pointer_cast<Linear>(blocks["k_proj"]);
auto v_proj = std::dynamic_pointer_cast<Linear>(blocks["v_proj"]);
auto q_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
auto k_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
auto out_proj = std::dynamic_pointer_cast<Linear>(blocks[out_proj_name]);
auto q = q_proj->forward(ctx, hidden_states);
auto k = k_proj->forward(ctx, encoder_hidden_states);
auto v = v_proj->forward(ctx, encoder_hidden_states);
int64_t N = q->ne[2];
int64_t L_q = q->ne[1];
int64_t L_k = k->ne[1];
auto q4 = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim, num_heads, L_q, N); // [N, L_q, H, D]
auto k4 = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_heads, L_k, N); // [N, L_k, H, D]
auto v4 = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_heads, L_k, N); // [N, L_k, H, D]
q4 = q_norm->forward(ctx, q4);
k4 = k_norm->forward(ctx, k4);
ggml_tensor* attn_out = nullptr;
if (pe_q != nullptr || pe_k != nullptr) {
if (pe_q == nullptr) {
pe_q = pe_k;
}
if (pe_k == nullptr) {
pe_k = pe_q;
}
auto q_rope = Rope::apply_rope(ctx->ggml_ctx, q4, pe_q, false);
auto k_rope = Rope::apply_rope(ctx->ggml_ctx, k4, pe_k, false);
attn_out = ggml_ext_attention_ext(ctx->ggml_ctx,
ctx->backend,
q_rope,
k_rope,
v4,
num_heads,
nullptr,
true,
ctx->flash_attn_enabled);
} else {
auto q_flat = ggml_reshape_3d(ctx->ggml_ctx, q4, head_dim * num_heads, L_q, N);
auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k4, head_dim * num_heads, L_k, N);
attn_out = ggml_ext_attention_ext(ctx->ggml_ctx,
ctx->backend,
q_flat,
k_flat,
v,
num_heads,
nullptr,
false,
ctx->flash_attn_enabled);
}
return out_proj->forward(ctx, attn_out);
}
};
struct AnimaMLP : public GGMLBlock {
public:
AnimaMLP(int64_t dim, int64_t hidden_dim) {
blocks["layer1"] = std::make_shared<Linear>(dim, hidden_dim, false);
blocks["layer2"] = std::make_shared<Linear>(hidden_dim, dim, false);
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto layer1 = std::dynamic_pointer_cast<Linear>(blocks["layer1"]);
auto layer2 = std::dynamic_pointer_cast<Linear>(blocks["layer2"]);
x = layer1->forward(ctx, x);
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
x = layer2->forward(ctx, x);
return x;
}
};
struct AdapterMLP : public GGMLBlock {
public:
AdapterMLP(int64_t dim, int64_t hidden_dim) {
blocks["0"] = std::make_shared<Linear>(dim, hidden_dim, true);
blocks["2"] = std::make_shared<Linear>(hidden_dim, dim, true);
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto layer0 = std::dynamic_pointer_cast<Linear>(blocks["0"]);
auto layer2 = std::dynamic_pointer_cast<Linear>(blocks["2"]);
x = layer0->forward(ctx, x);
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
x = layer2->forward(ctx, x);
return x;
}
};
struct LLMAdapterBlock : public GGMLBlock {
public:
LLMAdapterBlock(int64_t model_dim = 1024, int64_t source_dim = 1024, int64_t num_heads = 16, int64_t head_dim = 64) {
blocks["norm_self_attn"] = std::make_shared<RMSNorm>(model_dim, 1e-6f);
blocks["self_attn"] = std::make_shared<AnimaAttention>(model_dim, model_dim, num_heads, head_dim, "o_proj");
blocks["norm_cross_attn"] = std::make_shared<RMSNorm>(model_dim, 1e-6f);
blocks["cross_attn"] = std::make_shared<AnimaAttention>(model_dim, source_dim, num_heads, head_dim, "o_proj");
blocks["norm_mlp"] = std::make_shared<RMSNorm>(model_dim, 1e-6f);
blocks["mlp"] = std::make_shared<AdapterMLP>(model_dim, model_dim * 4);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* context,
ggml_tensor* target_pe,
ggml_tensor* context_pe) {
auto norm_self_attn = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_self_attn"]);
auto self_attn = std::dynamic_pointer_cast<AnimaAttention>(blocks["self_attn"]);
auto norm_cross_attn = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_cross_attn"]);
auto cross_attn = std::dynamic_pointer_cast<AnimaAttention>(blocks["cross_attn"]);
auto norm_mlp = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_mlp"]);
auto mlp = std::dynamic_pointer_cast<AdapterMLP>(blocks["mlp"]);
auto h = norm_self_attn->forward(ctx, x);
h = self_attn->forward(ctx, h, nullptr, target_pe, target_pe);
x = ggml_add(ctx->ggml_ctx, x, h);
h = norm_cross_attn->forward(ctx, x);
h = cross_attn->forward(ctx, h, context, target_pe, context_pe);
x = ggml_add(ctx->ggml_ctx, x, h);
h = norm_mlp->forward(ctx, x);
h = mlp->forward(ctx, h);
x = ggml_add(ctx->ggml_ctx, x, h);
return x;
}
};
struct LLMAdapter : public GGMLBlock {
protected:
int num_layers;
public:
LLMAdapter(int64_t source_dim = 1024,
int64_t target_dim = 1024,
int64_t model_dim = 1024,
int num_layers = 6,
int num_heads = 16)
: num_layers(num_layers) {
int64_t head_dim = model_dim / num_heads;
blocks["embed"] = std::make_shared<Embedding>(32128, target_dim);
for (int i = 0; i < num_layers; i++) {
blocks["blocks." + std::to_string(i)] =
std::make_shared<LLMAdapterBlock>(model_dim, source_dim, num_heads, head_dim);
}
blocks["out_proj"] = std::make_shared<Linear>(model_dim, target_dim, true);
blocks["norm"] = std::make_shared<RMSNorm>(target_dim, 1e-6f);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* source_hidden_states,
ggml_tensor* target_input_ids,
ggml_tensor* target_pe,
ggml_tensor* source_pe) {
GGML_ASSERT(target_input_ids != nullptr);
if (ggml_n_dims(target_input_ids) == 1) {
target_input_ids = ggml_reshape_2d(ctx->ggml_ctx, target_input_ids, target_input_ids->ne[0], 1);
}
auto embed = std::dynamic_pointer_cast<Embedding>(blocks["embed"]);
auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["out_proj"]);
auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
auto x = embed->forward(ctx, target_input_ids); // [N, target_len, target_dim]
for (int i = 0; i < num_layers; i++) {
auto block = std::dynamic_pointer_cast<LLMAdapterBlock>(blocks["blocks." + std::to_string(i)]);
x = block->forward(ctx, x, source_hidden_states, target_pe, source_pe);
}
x = out_proj->forward(ctx, x);
x = norm->forward(ctx, x);
return x;
}
};
struct TransformerBlock : public GGMLBlock {
public:
TransformerBlock(int64_t hidden_size,
int64_t text_embed_dim,
int64_t num_heads,
int64_t head_dim,
int64_t mlp_ratio = 4,
int64_t adaln_lora_dim = 256) {
blocks["adaln_modulation_self_attn"] = std::make_shared<AdaLayerNormZero>(hidden_size, adaln_lora_dim);
blocks["self_attn"] = std::make_shared<AnimaAttention>(hidden_size, hidden_size, num_heads, head_dim);
blocks["adaln_modulation_cross_attn"] = std::make_shared<AdaLayerNormZero>(hidden_size, adaln_lora_dim);
blocks["cross_attn"] = std::make_shared<AnimaAttention>(hidden_size, text_embed_dim, num_heads, head_dim);
blocks["adaln_modulation_mlp"] = std::make_shared<AdaLayerNormZero>(hidden_size, adaln_lora_dim);
blocks["mlp"] = std::make_shared<AnimaMLP>(hidden_size, hidden_size * mlp_ratio);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* hidden_states,
ggml_tensor* encoder_hidden_states,
ggml_tensor* embedded_timestep,
ggml_tensor* temb,
ggml_tensor* image_pe) {
auto norm1 = std::dynamic_pointer_cast<AdaLayerNormZero>(blocks["adaln_modulation_self_attn"]);
auto attn1 = std::dynamic_pointer_cast<AnimaAttention>(blocks["self_attn"]);
auto norm2 = std::dynamic_pointer_cast<AdaLayerNormZero>(blocks["adaln_modulation_cross_attn"]);
auto attn2 = std::dynamic_pointer_cast<AnimaAttention>(blocks["cross_attn"]);
auto norm3 = std::dynamic_pointer_cast<AdaLayerNormZero>(blocks["adaln_modulation_mlp"]);
auto mlp = std::dynamic_pointer_cast<AnimaMLP>(blocks["mlp"]);
auto [normed1, gate1] = norm1->forward(ctx, hidden_states, embedded_timestep, temb);
auto h = attn1->forward(ctx, normed1, nullptr, image_pe, image_pe);
hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate1));
auto [normed2, gate2] = norm2->forward(ctx, hidden_states, embedded_timestep, temb);
h = attn2->forward(ctx, normed2, encoder_hidden_states, nullptr, nullptr);
hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate2));
auto [normed3, gate3] = norm3->forward(ctx, hidden_states, embedded_timestep, temb);
h = mlp->forward(ctx, normed3);
hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate3));
return hidden_states;
}
};
struct FinalLayer : public GGMLBlock {
protected:
int64_t hidden_size;
int64_t patch_size;
int64_t out_channels;
public:
FinalLayer(int64_t hidden_size, int64_t patch_size, int64_t out_channels)
: hidden_size(hidden_size), patch_size(patch_size), out_channels(out_channels) {
blocks["adaln_modulation"] = std::make_shared<AdaLayerNorm>(hidden_size, 256);
blocks["linear"] = std::make_shared<Linear>(hidden_size, patch_size * patch_size * out_channels, false);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* hidden_states,
ggml_tensor* embedded_timestep,
ggml_tensor* temb) {
auto adaln = std::dynamic_pointer_cast<AdaLayerNorm>(blocks["adaln_modulation"]);
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
hidden_states = adaln->forward(ctx, hidden_states, embedded_timestep, temb);
hidden_states = linear->forward(ctx, hidden_states);
return hidden_states;
}
};
struct AnimaNet : public GGMLBlock {
public:
int64_t in_channels = 16;
int64_t out_channels = 16;
int64_t hidden_size = 2048;
int64_t text_embed_dim = 1024;
int64_t num_heads = 16;
int64_t head_dim = 128;
int patch_size = 2;
int64_t num_layers = 28;
std::vector<int> axes_dim = {44, 42, 42};
int theta = 10000;
public:
AnimaNet() = default;
explicit AnimaNet(int64_t num_layers)
: num_layers(num_layers) {
blocks["x_embedder"] = std::make_shared<XEmbedder>((in_channels + 1) * patch_size * patch_size, hidden_size);
blocks["t_embedder"] = std::make_shared<TimestepEmbedder>(hidden_size, hidden_size * 3);
blocks["t_embedding_norm"] = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
for (int i = 0; i < num_layers; i++) {
blocks["blocks." + std::to_string(i)] = std::make_shared<TransformerBlock>(hidden_size,
text_embed_dim,
num_heads,
head_dim);
}
blocks["final_layer"] = std::make_shared<FinalLayer>(hidden_size, patch_size, out_channels);
blocks["llm_adapter"] = std::make_shared<LLMAdapter>(1024, 1024, 1024, 6, 16);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* timestep,
ggml_tensor* encoder_hidden_states,
ggml_tensor* image_pe,
ggml_tensor* t5_ids = nullptr,
ggml_tensor* t5_weights = nullptr,
ggml_tensor* adapter_q_pe = nullptr,
ggml_tensor* adapter_k_pe = nullptr) {
GGML_ASSERT(x->ne[3] == 1);
auto x_embedder = std::dynamic_pointer_cast<XEmbedder>(blocks["x_embedder"]);
auto t_embedder = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);
auto t_embedding_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["t_embedding_norm"]);
auto final_layer = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
auto llm_adapter = std::dynamic_pointer_cast<LLMAdapter>(blocks["llm_adapter"]);
int64_t W = x->ne[0];
int64_t H = x->ne[1];
auto padding_mask = ggml_ext_zeros(ctx->ggml_ctx, x->ne[0], x->ne[1], 1, x->ne[3]);
x = ggml_concat(ctx->ggml_ctx, x, padding_mask, 2); // [N, C + 1, H, W]
x = DiT::pad_and_patchify(ctx, x, patch_size, patch_size); // [N, h*w, (C+1)*ph*pw]
x = x_embedder->forward(ctx, x);
auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(hidden_size));
auto temb = t_embedder->forward(ctx, timestep_proj);
auto embedded_timestep = t_embedding_norm->forward(ctx, timestep_proj);
if (t5_ids != nullptr) {
auto adapted_context = llm_adapter->forward(ctx, encoder_hidden_states, t5_ids, adapter_q_pe, adapter_k_pe);
if (t5_weights != nullptr) {
auto w = t5_weights;
if (ggml_n_dims(w) == 1) {
w = ggml_reshape_3d(ctx->ggml_ctx, w, 1, w->ne[0], 1);
}
w = ggml_repeat_4d(ctx->ggml_ctx, w, adapted_context->ne[0], adapted_context->ne[1], adapted_context->ne[2], 1);
adapted_context = ggml_mul(ctx->ggml_ctx, adapted_context, w);
}
if (adapted_context->ne[1] < 512) {
auto pad_ctx = ggml_ext_zeros(ctx->ggml_ctx,
adapted_context->ne[0],
512 - adapted_context->ne[1],
adapted_context->ne[2],
1);
adapted_context = ggml_concat(ctx->ggml_ctx, adapted_context, pad_ctx, 1);
} else if (adapted_context->ne[1] > 512) {
adapted_context = ggml_ext_slice(ctx->ggml_ctx, adapted_context, 1, 0, 512);
}
encoder_hidden_states = adapted_context;
}
for (int i = 0; i < num_layers; i++) {
auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["blocks." + std::to_string(i)]);
x = block->forward(ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe);
}
x = final_layer->forward(ctx, x, embedded_timestep, temb); // [N, h*w, ph*pw*C]
x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, false); // [N, C, H, W]
return x;
}
};
struct AnimaRunner : public GGMLRunner {
public:
std::vector<float> image_pe_vec;
std::vector<float> adapter_q_pe_vec;
std::vector<float> adapter_k_pe_vec;
AnimaNet net;
AnimaRunner(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
: GGMLRunner(backend, offload_params_to_cpu) {
int64_t num_layers = 0;
std::string layer_tag = prefix + ".net.blocks.";
for (const auto& kv : tensor_storage_map) {
const std::string& tensor_name = kv.first;
size_t pos = tensor_name.find(layer_tag);
if (pos == std::string::npos) {
continue;
}
size_t start = pos + layer_tag.size();
size_t end = tensor_name.find('.', start);
if (end == std::string::npos) {
continue;
}
int64_t layer_id = atoll(tensor_name.substr(start, end - start).c_str());
num_layers = std::max(num_layers, layer_id + 1);
}
if (num_layers <= 0) {
num_layers = 28;
}
LOG_INFO("anima net layers: %" PRId64, num_layers);
net = AnimaNet(num_layers);
net.init(params_ctx, tensor_storage_map, prefix + ".net");
}
std::string get_desc() override {
return "anima";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
net.get_param_tensors(tensors, prefix + ".net");
}
static std::vector<float> gen_1d_rope_pe_vec(int64_t seq_len, int dim, float theta = 10000.f) {
std::vector<float> pos(seq_len);
for (int64_t i = 0; i < seq_len; i++) {
pos[i] = static_cast<float>(i);
}
auto rope_emb = Rope::rope(pos, dim, theta);
return Rope::flatten(rope_emb);
}
static float calc_ntk_factor(float extrapolation_ratio, int axis_dim) {
if (extrapolation_ratio == 1.0f || axis_dim <= 2) {
return 1.0f;
}
return std::pow(extrapolation_ratio, static_cast<float>(axis_dim) / static_cast<float>(axis_dim - 2));
}
static std::vector<float> gen_anima_image_pe_vec(int bs,
int h,
int w,
int patch_size,
int theta,
const std::vector<int>& axes_dim,
float h_extrapolation_ratio,
float w_extrapolation_ratio,
float t_extrapolation_ratio) {
static const std::vector<ggml_tensor*> empty_ref_latents;
auto ids = Rope::gen_flux_ids(h,
w,
patch_size,
bs,
static_cast<int>(axes_dim.size()),
0,
{},
empty_ref_latents,
false,
1.0f);
std::vector<float> axis_thetas = {
static_cast<float>(theta) * calc_ntk_factor(t_extrapolation_ratio, axes_dim[0]),
static_cast<float>(theta) * calc_ntk_factor(h_extrapolation_ratio, axes_dim[1]),
static_cast<float>(theta) * calc_ntk_factor(w_extrapolation_ratio, axes_dim[2]),
};
return Rope::embed_nd(ids, bs, axis_thetas, axes_dim);
}
ggml_cgraph* build_graph(ggml_tensor* x,
ggml_tensor* timesteps,
ggml_tensor* context,
ggml_tensor* t5_ids = nullptr,
ggml_tensor* t5_weights = nullptr) {
GGML_ASSERT(x->ne[3] == 1);
ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE);
x = to_backend(x);
timesteps = to_backend(timesteps);
context = to_backend(context);
t5_ids = to_backend(t5_ids);
t5_weights = to_backend(t5_weights);
int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size;
int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size;
int64_t h_pad = x->ne[1] + pad_h;
int64_t w_pad = x->ne[0] + pad_w;
image_pe_vec = gen_anima_image_pe_vec(1,
static_cast<int>(h_pad),
static_cast<int>(w_pad),
static_cast<int>(net.patch_size),
net.theta,
net.axes_dim,
4.0f,
4.0f,
1.0f);
int64_t image_pos_len = static_cast<int64_t>(image_pe_vec.size()) / (2 * 2 * (net.head_dim / 2));
auto image_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, net.head_dim / 2, image_pos_len);
set_backend_tensor_data(image_pe, image_pe_vec.data());
ggml_tensor* adapter_q_pe = nullptr;
ggml_tensor* adapter_k_pe = nullptr;
if (t5_ids != nullptr) {
int64_t target_len = t5_ids->ne[0];
int64_t source_len = context->ne[1];
adapter_q_pe_vec = gen_1d_rope_pe_vec(target_len, 64, 10000.f);
adapter_k_pe_vec = gen_1d_rope_pe_vec(source_len, 64, 10000.f);
int64_t target_pos_len = static_cast<int64_t>(adapter_q_pe_vec.size()) / (2 * 2 * 32);
int64_t source_pos_len = static_cast<int64_t>(adapter_k_pe_vec.size()) / (2 * 2 * 32);
adapter_q_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, target_pos_len);
adapter_k_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, source_pos_len);
set_backend_tensor_data(adapter_q_pe, adapter_q_pe_vec.data());
set_backend_tensor_data(adapter_k_pe, adapter_k_pe_vec.data());
}
auto runner_ctx = get_context();
auto out = net.forward(&runner_ctx,
x,
timesteps,
context,
image_pe,
t5_ids,
t5_weights,
adapter_q_pe,
adapter_k_pe);
ggml_build_forward_expand(gf, out);
return gf;
}
bool compute(int n_threads,
ggml_tensor* x,
ggml_tensor* timesteps,
ggml_tensor* context,
ggml_tensor* t5_ids = nullptr,
ggml_tensor* t5_weights = nullptr,
ggml_tensor** output = nullptr,
ggml_context* output_ctx = nullptr) {
auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x, timesteps, context, t5_ids, t5_weights);
};
return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
};
} // namespace Anima
#endif // __ANIMA_HPP__

View File

@ -1,8 +1,7 @@
#ifndef __VAE_HPP__ #ifndef __AUTO_ENCODER_KL_HPP__
#define __VAE_HPP__ #define __AUTO_ENCODER_KL_HPP__
#include "common.hpp" #include "vae.hpp"
#include "ggml_extend.hpp"
/*================================================== AutoEncoderKL ===================================================*/ /*================================================== AutoEncoderKL ===================================================*/
@ -30,7 +29,7 @@ public:
} }
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [N, in_channels, h, w] // x: [N, in_channels, h, w]
// t_emb is always None // t_emb is always None
auto norm1 = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm1"]); auto norm1 = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm1"]);
@ -66,7 +65,7 @@ protected:
int64_t in_channels; int64_t in_channels;
bool use_linear; bool use_linear;
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") { void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
auto iter = tensor_storage_map.find(prefix + "proj_out.weight"); auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
if (iter != tensor_storage_map.end()) { if (iter != tensor_storage_map.end()) {
if (iter->second.n_dims == 4 && use_linear) { if (iter->second.n_dims == 4 && use_linear) {
@ -102,7 +101,7 @@ public:
} }
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [N, in_channels, h, w] // x: [N, in_channels, h, w]
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]); auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
auto q_proj = std::dynamic_pointer_cast<UnaryBlock>(blocks["q"]); auto q_proj = std::dynamic_pointer_cast<UnaryBlock>(blocks["q"]);
@ -127,8 +126,6 @@ public:
q = q_proj->forward(ctx, h_); // [N, h * w, in_channels] q = q_proj->forward(ctx, h_); // [N, h * w, in_channels]
k = k_proj->forward(ctx, h_); // [N, h * w, in_channels] k = k_proj->forward(ctx, h_); // [N, h * w, in_channels]
v = v_proj->forward(ctx, h_); // [N, h * w, in_channels] v = v_proj->forward(ctx, h_); // [N, h * w, in_channels]
v = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, v, 1, 0, 2, 3)); // [N, in_channels, h * w]
} else { } else {
q = q_proj->forward(ctx, h_); // [N, in_channels, h, w] q = q_proj->forward(ctx, h_); // [N, in_channels, h, w]
q = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, q, 1, 2, 0, 3)); // [N, h, w, in_channels] q = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, q, 1, 2, 0, 3)); // [N, h, w, in_channels]
@ -138,11 +135,12 @@ public:
k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 1, 2, 0, 3)); // [N, h, w, in_channels] k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 1, 2, 0, 3)); // [N, h, w, in_channels]
k = ggml_reshape_3d(ctx->ggml_ctx, k, c, h * w, n); // [N, h * w, in_channels] k = ggml_reshape_3d(ctx->ggml_ctx, k, c, h * w, n); // [N, h * w, in_channels]
v = v_proj->forward(ctx, h_); // [N, in_channels, h, w] v = v_proj->forward(ctx, h_); // [N, in_channels, h, w]
v = ggml_reshape_3d(ctx->ggml_ctx, v, h * w, c, n); // [N, in_channels, h * w] v = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, v, 1, 2, 0, 3)); // [N, h, w, in_channels]
v = ggml_reshape_3d(ctx->ggml_ctx, v, c, h * w, n); // [N, h * w, in_channels]
} }
h_ = ggml_ext_attention(ctx->ggml_ctx, q, k, v, false); // [N, h * w, in_channels] h_ = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, 1, nullptr, false, ctx->flash_attn_enabled);
if (use_linear) { if (use_linear) {
h_ = proj_out->forward(ctx, h_); // [N, h * w, in_channels] h_ = proj_out->forward(ctx, h_); // [N, h * w, in_channels]
@ -166,27 +164,27 @@ public:
AE3DConv(int64_t in_channels, AE3DConv(int64_t in_channels,
int64_t out_channels, int64_t out_channels,
std::pair<int, int> kernel_size, std::pair<int, int> kernel_size,
int64_t video_kernel_size = 3, int video_kernel_size = 3,
std::pair<int, int> stride = {1, 1}, std::pair<int, int> stride = {1, 1},
std::pair<int, int> padding = {0, 0}, std::pair<int, int> padding = {0, 0},
std::pair<int, int> dilation = {1, 1}, std::pair<int, int> dilation = {1, 1},
bool bias = true) bool bias = true)
: Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) { : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) {
int64_t kernel_padding = video_kernel_size / 2; int kernel_padding = video_kernel_size / 2;
blocks["time_mix_conv"] = std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(out_channels, blocks["time_mix_conv"] = std::shared_ptr<GGMLBlock>(new Conv3d(out_channels,
out_channels, out_channels,
video_kernel_size, {video_kernel_size, 1, 1},
1, {1, 1, 1},
kernel_padding)); {kernel_padding, 0, 0}));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x) override { ggml_tensor* x) override {
// timesteps always None // timesteps always None
// skip_video always False // skip_video always False
// x: [N, IC, IH, IW] // x: [N, IC, IH, IW]
// result: [N, OC, OH, OW] // result: [N, OC, OH, OW]
auto time_mix_conv = std::dynamic_pointer_cast<Conv3dnx1x1>(blocks["time_mix_conv"]); auto time_mix_conv = std::dynamic_pointer_cast<Conv3d>(blocks["time_mix_conv"]);
x = Conv2d::forward(ctx, x); x = Conv2d::forward(ctx, x);
// timesteps = x.shape[0] // timesteps = x.shape[0]
@ -210,7 +208,7 @@ public:
class VideoResnetBlock : public ResnetBlock { class VideoResnetBlock : public ResnetBlock {
protected: protected:
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_storage_map, GGML_TYPE_F32); enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_storage_map, GGML_TYPE_F32);
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
} }
@ -229,7 +227,7 @@ public:
blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true)); blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [N, in_channels, h, w] aka [b*t, in_channels, h, w] // x: [N, in_channels, h, w] aka [b*t, in_channels, h, w]
// return: [N, out_channels, h, w] aka [b*t, out_channels, h, w] // return: [N, out_channels, h, w] aka [b*t, out_channels, h, w]
// t_emb is always None // t_emb is always None
@ -254,8 +252,8 @@ public:
float alpha = get_alpha(); float alpha = get_alpha();
x = ggml_add(ctx->ggml_ctx, x = ggml_add(ctx->ggml_ctx,
ggml_scale(ctx->ggml_ctx, x, alpha), ggml_ext_scale(ctx->ggml_ctx, x, alpha),
ggml_scale(ctx->ggml_ctx, x_mix, 1.0f - alpha)); ggml_ext_scale(ctx->ggml_ctx, x_mix, 1.0f - alpha));
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w) x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
@ -319,7 +317,7 @@ public:
blocks["conv_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1})); blocks["conv_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}));
} }
virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, in_channels, h, w] // x: [N, in_channels, h, w]
auto conv_in = std::dynamic_pointer_cast<Conv2d>(blocks["conv_in"]); auto conv_in = std::dynamic_pointer_cast<Conv2d>(blocks["conv_in"]);
@ -409,8 +407,8 @@ public:
z_channels(z_channels), z_channels(z_channels),
video_decoder(video_decoder), video_decoder(video_decoder),
video_kernel_size(video_kernel_size) { video_kernel_size(video_kernel_size) {
size_t num_resolutions = ch_mult.size(); int num_resolutions = static_cast<int>(ch_mult.size());
int block_in = ch * ch_mult[num_resolutions - 1]; int block_in = ch * ch_mult[num_resolutions - 1];
blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1})); blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}));
@ -437,7 +435,7 @@ public:
blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1}); blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1});
} }
virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* z) { virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* z) {
// z: [N, z_channels, h, w] // z: [N, z_channels, h, w]
// alpha is always 0 // alpha is always 0
// merge_strategy is always learned // merge_strategy is always learned
@ -461,7 +459,7 @@ public:
h = mid_block_2->forward(ctx, h); // [N, block_in, h, w] h = mid_block_2->forward(ctx, h); // [N, block_in, h, w]
// upsampling // upsampling
size_t num_resolutions = ch_mult.size(); int num_resolutions = static_cast<int>(ch_mult.size());
for (int i = num_resolutions - 1; i >= 0; i--) { for (int i = num_resolutions - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) { for (int j = 0; j < num_res_blocks + 1; j++) {
std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j); std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j);
@ -485,7 +483,7 @@ public:
}; };
// ldm.models.autoencoder.AutoencoderKL // ldm.models.autoencoder.AutoencoderKL
class AutoencodingEngine : public GGMLBlock { class AutoEncoderKLModel : public GGMLBlock {
protected: protected:
SDVersion version; SDVersion version;
bool decode_only = true; bool decode_only = true;
@ -504,7 +502,7 @@ protected:
} dd_config; } dd_config;
public: public:
AutoencodingEngine(SDVersion version = VERSION_SD1, AutoEncoderKLModel(SDVersion version = VERSION_SD1,
bool decode_only = true, bool decode_only = true,
bool use_linear_projection = false, bool use_linear_projection = false,
bool use_video_decoder = false) bool use_video_decoder = false)
@ -551,7 +549,7 @@ public:
} }
} }
struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) { ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) {
// z: [N, z_channels, h, w] // z: [N, z_channels, h, w]
if (sd_version_is_flux2(version)) { if (sd_version_is_flux2(version)) {
// [N, C*p*p, h, w] -> [N, C, h*p, w*p] // [N, C*p*p, h, w] -> [N, C, h*p, w*p]
@ -583,7 +581,7 @@ public:
return h; return h;
} }
struct ggml_tensor* encode(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* encode(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, in_channels, h, w] // x: [N, in_channels, h, w]
auto encoder = std::dynamic_pointer_cast<Encoder>(blocks["encoder"]); auto encoder = std::dynamic_pointer_cast<Encoder>(blocks["encoder"]);
@ -612,48 +610,21 @@ public:
} }
return z; return z;
} }
};
struct VAE : public GGMLRunner { int get_encoder_output_channels() {
VAE(ggml_backend_t backend, bool offload_params_to_cpu) int factor = dd_config.double_z ? 2 : 1;
: GGMLRunner(backend, offload_params_to_cpu) {} if (sd_version_is_flux2(version)) {
virtual bool compute(const int n_threads, return dd_config.z_channels * 4;
struct ggml_tensor* z,
bool decode_graph,
struct ggml_tensor** output,
struct ggml_context* output_ctx) = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0;
virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
};
struct FakeVAE : public VAE {
FakeVAE(ggml_backend_t backend, bool offload_params_to_cpu)
: VAE(backend, offload_params_to_cpu) {}
bool compute(const int n_threads,
struct ggml_tensor* z,
bool decode_graph,
struct ggml_tensor** output,
struct ggml_context* output_ctx) override {
if (*output == nullptr && output_ctx != nullptr) {
*output = ggml_dup_tensor(output_ctx, z);
} }
ggml_ext_tensor_iter(z, [&](ggml_tensor* z, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { return dd_config.z_channels * factor;
float value = ggml_ext_tensor_get_f32(z, i0, i1, i2, i3);
ggml_ext_tensor_set_f32(*output, value, i0, i1, i2, i3);
});
return true;
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) override {}
std::string get_desc() override {
return "fake_vae";
} }
}; };
struct AutoEncoderKL : public VAE { struct AutoEncoderKL : public VAE {
bool decode_only = true; float scale_factor = 1.f;
AutoencodingEngine ae; float shift_factor = 0.f;
bool decode_only = true;
AutoEncoderKLModel ae;
AutoEncoderKL(ggml_backend_t backend, AutoEncoderKL(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
@ -662,7 +633,23 @@ struct AutoEncoderKL : public VAE {
bool decode_only = false, bool decode_only = false,
bool use_video_decoder = false, bool use_video_decoder = false,
SDVersion version = VERSION_SD1) SDVersion version = VERSION_SD1)
: decode_only(decode_only), VAE(backend, offload_params_to_cpu) { : decode_only(decode_only), VAE(version, backend, offload_params_to_cpu) {
if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
scale_factor = 0.18215f;
shift_factor = 0.f;
} else if (sd_version_is_sdxl(version)) {
scale_factor = 0.13025f;
shift_factor = 0.f;
} else if (sd_version_is_sd3(version)) {
scale_factor = 1.5305f;
shift_factor = 0.0609f;
} else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) {
scale_factor = 0.3611f;
shift_factor = 0.1159f;
} else if (sd_version_is_flux2(version)) {
scale_factor = 1.0f;
shift_factor = 0.f;
}
bool use_linear_projection = false; bool use_linear_projection = false;
for (const auto& [name, tensor_storage] : tensor_storage_map) { for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) { if (!starts_with(name, prefix)) {
@ -675,7 +662,7 @@ struct AutoEncoderKL : public VAE {
break; break;
} }
} }
ae = AutoencodingEngine(version, decode_only, use_linear_projection, use_video_decoder); ae = AutoEncoderKLModel(version, decode_only, use_linear_projection, use_video_decoder);
ae.init(params_ctx, tensor_storage_map, prefix); ae.init(params_ctx, tensor_storage_map, prefix);
} }
@ -694,31 +681,31 @@ struct AutoEncoderKL : public VAE {
return "vae"; return "vae";
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) override { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) override {
ae.get_param_tensors(tensors, prefix); ae.get_param_tensors(tensors, prefix);
} }
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); ggml_cgraph* gf = ggml_new_graph(compute_ctx);
z = to_backend(z); z = to_backend(z);
auto runner_ctx = get_context(); auto runner_ctx = get_context();
struct ggml_tensor* out = decode_graph ? ae.decode(&runner_ctx, z) : ae.encode(&runner_ctx, z); ggml_tensor* out = decode_graph ? ae.decode(&runner_ctx, z) : ae.encode(&runner_ctx, z);
ggml_build_forward_expand(gf, out); ggml_build_forward_expand(gf, out);
return gf; return gf;
} }
bool compute(const int n_threads, bool _compute(const int n_threads,
struct ggml_tensor* z, ggml_tensor* z,
bool decode_graph, bool decode_graph,
struct ggml_tensor** output, ggml_tensor** output,
struct ggml_context* output_ctx = nullptr) override { ggml_context* output_ctx = nullptr) override {
GGML_ASSERT(!decode_only || decode_graph); GGML_ASSERT(!decode_only || decode_graph);
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(z, decode_graph); return build_graph(z, decode_graph);
}; };
// ggml_set_f32(z, 0.5f); // ggml_set_f32(z, 0.5f);
@ -726,13 +713,183 @@ struct AutoEncoderKL : public VAE {
return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
} }
ggml_tensor* gaussian_latent_sample(ggml_context* work_ctx, ggml_tensor* moments, std::shared_ptr<RNG> rng) {
// ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
ggml_tensor* latents = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
ggml_tensor* noise = ggml_dup_tensor(work_ctx, latents);
ggml_ext_im_set_randn_f32(noise, rng);
{
float mean = 0;
float logvar = 0;
float value = 0;
float std_ = 0;
for (int i = 0; i < latents->ne[3]; i++) {
for (int j = 0; j < latents->ne[2]; j++) {
for (int k = 0; k < latents->ne[1]; k++) {
for (int l = 0; l < latents->ne[0]; l++) {
mean = ggml_ext_tensor_get_f32(moments, l, k, j, i);
logvar = ggml_ext_tensor_get_f32(moments, l, k, j + (int)latents->ne[2], i);
logvar = std::max(-30.0f, std::min(logvar, 20.0f));
std_ = std::exp(0.5f * logvar);
value = mean + std_ * ggml_ext_tensor_get_f32(noise, l, k, j, i);
// printf("%d %d %d %d -> %f\n", i, j, k, l, value);
ggml_ext_tensor_set_f32(latents, value, l, k, j, i);
}
}
}
}
}
return latents;
}
ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr<RNG> rng) {
if (sd_version_is_flux2(version)) {
return vae_output;
} else if (version == VERSION_SD1_PIX2PIX) {
return ggml_view_3d(work_ctx,
vae_output,
vae_output->ne[0],
vae_output->ne[1],
vae_output->ne[2] / 2,
vae_output->nb[1],
vae_output->nb[2],
0);
} else {
return gaussian_latent_sample(work_ctx, vae_output, rng);
}
}
void get_latents_mean_std_vec(ggml_tensor* latents, int channel_dim, std::vector<float>& latents_mean_vec, std::vector<float>& latents_std_vec) {
// flux2
if (sd_version_is_flux2(version)) {
GGML_ASSERT(latents->ne[channel_dim] == 128);
latents_mean_vec = {-0.0676f, -0.0715f, -0.0753f, -0.0745f, 0.0223f, 0.0180f, 0.0142f, 0.0184f,
-0.0001f, -0.0063f, -0.0002f, -0.0031f, -0.0272f, -0.0281f, -0.0276f, -0.0290f,
-0.0769f, -0.0672f, -0.0902f, -0.0892f, 0.0168f, 0.0152f, 0.0079f, 0.0086f,
0.0083f, 0.0015f, 0.0003f, -0.0043f, -0.0439f, -0.0419f, -0.0438f, -0.0431f,
-0.0102f, -0.0132f, -0.0066f, -0.0048f, -0.0311f, -0.0306f, -0.0279f, -0.0180f,
0.0030f, 0.0015f, 0.0126f, 0.0145f, 0.0347f, 0.0338f, 0.0337f, 0.0283f,
0.0020f, 0.0047f, 0.0047f, 0.0050f, 0.0123f, 0.0081f, 0.0081f, 0.0146f,
0.0681f, 0.0679f, 0.0767f, 0.0732f, -0.0462f, -0.0474f, -0.0392f, -0.0511f,
-0.0528f, -0.0477f, -0.0470f, -0.0517f, -0.0317f, -0.0316f, -0.0345f, -0.0283f,
0.0510f, 0.0445f, 0.0578f, 0.0458f, -0.0412f, -0.0458f, -0.0487f, -0.0467f,
-0.0088f, -0.0106f, -0.0088f, -0.0046f, -0.0376f, -0.0432f, -0.0436f, -0.0499f,
0.0118f, 0.0166f, 0.0203f, 0.0279f, 0.0113f, 0.0129f, 0.0016f, 0.0072f,
-0.0118f, -0.0018f, -0.0141f, -0.0054f, -0.0091f, -0.0138f, -0.0145f, -0.0187f,
0.0323f, 0.0305f, 0.0259f, 0.0300f, 0.0540f, 0.0614f, 0.0495f, 0.0590f,
-0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f,
-0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 0.0054f, 0.0163f, 0.0104f};
latents_std_vec = {
1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f,
1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f,
1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f,
1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f,
1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f,
1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f,
1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f,
1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f,
1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f,
1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f,
1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f,
1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f,
1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f,
1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f,
1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f,
1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f};
} else {
GGML_ABORT("unknown version %d", version);
}
}
ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) {
ggml_tensor* vae_latents = ggml_dup(work_ctx, latents);
if (sd_version_is_flux2(version)) {
int channel_dim = 2;
std::vector<float> latents_mean_vec;
std::vector<float> latents_std_vec;
get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec);
float mean;
float std_;
for (int i = 0; i < latents->ne[3]; i++) {
if (channel_dim == 3) {
mean = latents_mean_vec[i];
std_ = latents_std_vec[i];
}
for (int j = 0; j < latents->ne[2]; j++) {
if (channel_dim == 2) {
mean = latents_mean_vec[j];
std_ = latents_std_vec[j];
}
for (int k = 0; k < latents->ne[1]; k++) {
for (int l = 0; l < latents->ne[0]; l++) {
float value = ggml_ext_tensor_get_f32(latents, l, k, j, i);
value = value * std_ / scale_factor + mean;
ggml_ext_tensor_set_f32(vae_latents, value, l, k, j, i);
}
}
}
}
} else {
ggml_ext_tensor_iter(latents, [&](ggml_tensor* latents, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
float value = ggml_ext_tensor_get_f32(latents, i0, i1, i2, i3);
value = (value / scale_factor) + shift_factor;
ggml_ext_tensor_set_f32(vae_latents, value, i0, i1, i2, i3);
});
}
return vae_latents;
}
ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) {
ggml_tensor* diffusion_latents = ggml_dup(work_ctx, latents);
if (sd_version_is_flux2(version)) {
int channel_dim = 2;
std::vector<float> latents_mean_vec;
std::vector<float> latents_std_vec;
get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec);
float mean;
float std_;
for (int i = 0; i < latents->ne[3]; i++) {
if (channel_dim == 3) {
mean = latents_mean_vec[i];
std_ = latents_std_vec[i];
}
for (int j = 0; j < latents->ne[2]; j++) {
if (channel_dim == 2) {
mean = latents_mean_vec[j];
std_ = latents_std_vec[j];
}
for (int k = 0; k < latents->ne[1]; k++) {
for (int l = 0; l < latents->ne[0]; l++) {
float value = ggml_ext_tensor_get_f32(latents, l, k, j, i);
value = (value - mean) * scale_factor / std_;
ggml_ext_tensor_set_f32(diffusion_latents, value, l, k, j, i);
}
}
}
}
} else {
ggml_ext_tensor_iter(latents, [&](ggml_tensor* latents, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
float value = ggml_ext_tensor_get_f32(latents, i0, i1, i2, i3);
value = (value - shift_factor) * scale_factor;
ggml_ext_tensor_set_f32(diffusion_latents, value, i0, i1, i2, i3);
});
}
return diffusion_latents;
}
int get_encoder_output_channels(int input_channels) {
return ae.get_encoder_output_channels();
}
void test() { void test() {
struct ggml_init_params params; ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
params.mem_buffer = nullptr; params.mem_buffer = nullptr;
params.no_alloc = false; params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params); ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != nullptr); GGML_ASSERT(work_ctx != nullptr);
{ {
@ -743,14 +900,14 @@ struct AutoEncoderKL : public VAE {
auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 64, 64, 3, 2); auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 64, 64, 3, 2);
ggml_set_f32(x, 0.5f); ggml_set_f32(x, 0.5f);
print_ggml_tensor(x); print_ggml_tensor(x);
struct ggml_tensor* out = nullptr; ggml_tensor* out = nullptr;
int t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
compute(8, x, false, &out, work_ctx); _compute(8, x, false, &out, work_ctx);
int t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
print_ggml_tensor(out); print_ggml_tensor(out);
LOG_DEBUG("encode test done in %dms", t1 - t0); LOG_DEBUG("encode test done in %lldms", t1 - t0);
} }
if (false) { if (false) {
@ -761,16 +918,16 @@ struct AutoEncoderKL : public VAE {
auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
ggml_set_f32(z, 0.5f); ggml_set_f32(z, 0.5f);
print_ggml_tensor(z); print_ggml_tensor(z);
struct ggml_tensor* out = nullptr; ggml_tensor* out = nullptr;
int t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
compute(8, z, true, &out, work_ctx); _compute(8, z, true, &out, work_ctx);
int t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
print_ggml_tensor(out); print_ggml_tensor(out);
LOG_DEBUG("decode test done in %dms", t1 - t0); LOG_DEBUG("decode test done in %lldms", t1 - t0);
} }
}; };
}; };
#endif #endif // __AUTO_ENCODER_KL_HPP__

894
src/cache_dit.hpp Normal file
View File

@ -0,0 +1,894 @@
#ifndef __CACHE_DIT_HPP__
#define __CACHE_DIT_HPP__
#include <algorithm>
#include <cmath>
#include <limits>
#include <string>
#include <unordered_map>
#include <vector>
#include "ggml_extend.hpp"
struct DBCacheConfig {
bool enabled = false;
int Fn_compute_blocks = 8;
int Bn_compute_blocks = 0;
float residual_diff_threshold = 0.08f;
int max_warmup_steps = 8;
int max_cached_steps = -1;
int max_continuous_cached_steps = -1;
float max_accumulated_residual_diff = -1.0f;
std::vector<int> steps_computation_mask;
bool scm_policy_dynamic = true;
};
struct TaylorSeerConfig {
bool enabled = false;
int n_derivatives = 1;
int max_warmup_steps = 2;
int skip_interval_steps = 1;
};
struct CacheDitConfig {
DBCacheConfig dbcache;
TaylorSeerConfig taylorseer;
int double_Fn_blocks = -1;
int double_Bn_blocks = -1;
int single_Fn_blocks = -1;
int single_Bn_blocks = -1;
};
struct TaylorSeerState {
int n_derivatives = 1;
int current_step = -1;
int last_computed_step = -1;
std::vector<std::vector<float>> dY_prev;
std::vector<std::vector<float>> dY_current;
void init(int n_deriv, size_t hidden_size) {
n_derivatives = n_deriv;
int order = n_derivatives + 1;
dY_prev.resize(order);
dY_current.resize(order);
for (int i = 0; i < order; i++) {
dY_prev[i].clear();
dY_current[i].clear();
}
current_step = -1;
last_computed_step = -1;
}
void reset() {
for (auto& v : dY_prev)
v.clear();
for (auto& v : dY_current)
v.clear();
current_step = -1;
last_computed_step = -1;
}
bool can_approximate() const {
return last_computed_step >= n_derivatives && !dY_prev.empty() && !dY_prev[0].empty();
}
void update_derivatives(const float* Y, size_t size, int step) {
int order = n_derivatives + 1;
dY_prev = dY_current;
dY_current[0].resize(size);
for (size_t i = 0; i < size; i++) {
dY_current[0][i] = Y[i];
}
int window = step - last_computed_step;
if (window <= 0)
window = 1;
for (int d = 0; d < n_derivatives; d++) {
if (!dY_prev[d].empty() && dY_prev[d].size() == size) {
dY_current[d + 1].resize(size);
for (size_t i = 0; i < size; i++) {
dY_current[d + 1][i] = (dY_current[d][i] - dY_prev[d][i]) / static_cast<float>(window);
}
} else {
dY_current[d + 1].clear();
}
}
current_step = step;
last_computed_step = step;
}
void approximate(float* output, size_t size, int target_step) const {
if (!can_approximate() || dY_prev[0].size() != size) {
return;
}
int elapsed = target_step - last_computed_step;
if (elapsed <= 0)
elapsed = 1;
std::fill(output, output + size, 0.0f);
float factorial = 1.0f;
int order = static_cast<int>(dY_prev.size());
for (int o = 0; o < order; o++) {
if (dY_prev[o].empty() || dY_prev[o].size() != size)
continue;
if (o > 0)
factorial *= static_cast<float>(o);
float coeff = ::powf(static_cast<float>(elapsed), static_cast<float>(o)) / factorial;
for (size_t i = 0; i < size; i++) {
output[i] += coeff * dY_prev[o][i];
}
}
}
};
struct BlockCacheEntry {
std::vector<float> residual_img;
std::vector<float> residual_txt;
std::vector<float> residual;
std::vector<float> prev_img;
std::vector<float> prev_txt;
std::vector<float> prev_output;
bool has_prev = false;
};
struct CacheDitState {
CacheDitConfig config;
bool initialized = false;
int total_double_blocks = 0;
int total_single_blocks = 0;
size_t hidden_size = 0;
int current_step = -1;
int total_steps = 0;
int warmup_remaining = 0;
std::vector<int> cached_steps;
int continuous_cached_steps = 0;
float accumulated_residual_diff = 0.0f;
std::vector<BlockCacheEntry> double_block_cache;
std::vector<BlockCacheEntry> single_block_cache;
std::vector<float> Fn_residual_img;
std::vector<float> Fn_residual_txt;
std::vector<float> prev_Fn_residual_img;
std::vector<float> prev_Fn_residual_txt;
bool has_prev_Fn_residual = false;
std::vector<float> Bn_buffer_img;
std::vector<float> Bn_buffer_txt;
std::vector<float> Bn_buffer;
bool has_Bn_buffer = false;
TaylorSeerState taylor_state;
bool can_cache_this_step = false;
bool is_caching_this_step = false;
int total_blocks_computed = 0;
int total_blocks_cached = 0;
void init(const CacheDitConfig& cfg, int num_double_blocks, int num_single_blocks, size_t h_size) {
config = cfg;
total_double_blocks = num_double_blocks;
total_single_blocks = num_single_blocks;
hidden_size = h_size;
initialized = cfg.dbcache.enabled || cfg.taylorseer.enabled;
if (!initialized)
return;
warmup_remaining = cfg.dbcache.max_warmup_steps;
double_block_cache.resize(total_double_blocks);
single_block_cache.resize(total_single_blocks);
if (cfg.taylorseer.enabled) {
taylor_state.init(cfg.taylorseer.n_derivatives, h_size);
}
reset_runtime();
}
void reset_runtime() {
current_step = -1;
total_steps = 0;
warmup_remaining = config.dbcache.max_warmup_steps;
cached_steps.clear();
continuous_cached_steps = 0;
accumulated_residual_diff = 0.0f;
for (auto& entry : double_block_cache) {
entry.residual_img.clear();
entry.residual_txt.clear();
entry.prev_img.clear();
entry.prev_txt.clear();
entry.has_prev = false;
}
for (auto& entry : single_block_cache) {
entry.residual.clear();
entry.prev_output.clear();
entry.has_prev = false;
}
Fn_residual_img.clear();
Fn_residual_txt.clear();
prev_Fn_residual_img.clear();
prev_Fn_residual_txt.clear();
has_prev_Fn_residual = false;
Bn_buffer_img.clear();
Bn_buffer_txt.clear();
Bn_buffer.clear();
has_Bn_buffer = false;
taylor_state.reset();
can_cache_this_step = false;
is_caching_this_step = false;
total_blocks_computed = 0;
total_blocks_cached = 0;
}
bool enabled() const {
return initialized && (config.dbcache.enabled || config.taylorseer.enabled);
}
void begin_step(int step_index, float sigma = 0.0f) {
if (!enabled())
return;
if (step_index == current_step)
return;
current_step = step_index;
total_steps++;
bool in_warmup = warmup_remaining > 0;
if (in_warmup) {
warmup_remaining--;
}
bool scm_allows_cache = true;
if (!config.dbcache.steps_computation_mask.empty()) {
if (step_index < static_cast<int>(config.dbcache.steps_computation_mask.size())) {
scm_allows_cache = (config.dbcache.steps_computation_mask[step_index] == 0);
if (!config.dbcache.scm_policy_dynamic && scm_allows_cache) {
can_cache_this_step = true;
is_caching_this_step = false;
return;
}
}
}
bool max_cached_ok = (config.dbcache.max_cached_steps < 0) ||
(static_cast<int>(cached_steps.size()) < config.dbcache.max_cached_steps);
bool max_cont_ok = (config.dbcache.max_continuous_cached_steps < 0) ||
(continuous_cached_steps < config.dbcache.max_continuous_cached_steps);
bool accum_ok = (config.dbcache.max_accumulated_residual_diff < 0.0f) ||
(accumulated_residual_diff < config.dbcache.max_accumulated_residual_diff);
can_cache_this_step = !in_warmup && scm_allows_cache && max_cached_ok && max_cont_ok && accum_ok && has_prev_Fn_residual;
is_caching_this_step = false;
}
void end_step(bool was_cached) {
if (was_cached) {
cached_steps.push_back(current_step);
continuous_cached_steps++;
} else {
continuous_cached_steps = 0;
}
}
static float calculate_residual_diff(const float* prev, const float* curr, size_t size) {
if (size == 0)
return 0.0f;
float sum_diff = 0.0f;
float sum_abs = 0.0f;
for (size_t i = 0; i < size; i++) {
sum_diff += std::fabs(prev[i] - curr[i]);
sum_abs += std::fabs(prev[i]);
}
return sum_diff / (sum_abs + 1e-6f);
}
static float calculate_residual_diff(const std::vector<float>& prev, const std::vector<float>& curr) {
if (prev.size() != curr.size() || prev.empty())
return 1.0f;
return calculate_residual_diff(prev.data(), curr.data(), prev.size());
}
int get_double_Fn_blocks() const {
return (config.double_Fn_blocks >= 0) ? config.double_Fn_blocks : config.dbcache.Fn_compute_blocks;
}
int get_double_Bn_blocks() const {
return (config.double_Bn_blocks >= 0) ? config.double_Bn_blocks : config.dbcache.Bn_compute_blocks;
}
int get_single_Fn_blocks() const {
return (config.single_Fn_blocks >= 0) ? config.single_Fn_blocks : config.dbcache.Fn_compute_blocks;
}
int get_single_Bn_blocks() const {
return (config.single_Bn_blocks >= 0) ? config.single_Bn_blocks : config.dbcache.Bn_compute_blocks;
}
bool is_Fn_double_block(int block_idx) const {
return block_idx < get_double_Fn_blocks();
}
bool is_Bn_double_block(int block_idx) const {
int Bn = get_double_Bn_blocks();
return Bn > 0 && block_idx >= (total_double_blocks - Bn);
}
bool is_Mn_double_block(int block_idx) const {
return !is_Fn_double_block(block_idx) && !is_Bn_double_block(block_idx);
}
bool is_Fn_single_block(int block_idx) const {
return block_idx < get_single_Fn_blocks();
}
bool is_Bn_single_block(int block_idx) const {
int Bn = get_single_Bn_blocks();
return Bn > 0 && block_idx >= (total_single_blocks - Bn);
}
bool is_Mn_single_block(int block_idx) const {
return !is_Fn_single_block(block_idx) && !is_Bn_single_block(block_idx);
}
void store_Fn_residual(const float* img, const float* txt, size_t img_size, size_t txt_size, const float* input_img, const float* input_txt) {
Fn_residual_img.resize(img_size);
Fn_residual_txt.resize(txt_size);
for (size_t i = 0; i < img_size; i++) {
Fn_residual_img[i] = img[i] - input_img[i];
}
for (size_t i = 0; i < txt_size; i++) {
Fn_residual_txt[i] = txt[i] - input_txt[i];
}
}
bool check_cache_decision() {
if (!can_cache_this_step) {
is_caching_this_step = false;
return false;
}
if (!has_prev_Fn_residual || prev_Fn_residual_img.empty()) {
is_caching_this_step = false;
return false;
}
float diff_img = calculate_residual_diff(prev_Fn_residual_img, Fn_residual_img);
float diff_txt = calculate_residual_diff(prev_Fn_residual_txt, Fn_residual_txt);
float diff = (diff_img + diff_txt) / 2.0f;
if (diff < config.dbcache.residual_diff_threshold) {
is_caching_this_step = true;
accumulated_residual_diff += diff;
return true;
}
is_caching_this_step = false;
return false;
}
void update_prev_Fn_residual() {
prev_Fn_residual_img = Fn_residual_img;
prev_Fn_residual_txt = Fn_residual_txt;
has_prev_Fn_residual = !prev_Fn_residual_img.empty();
}
void store_double_block_residual(int block_idx, const float* img, const float* txt, size_t img_size, size_t txt_size, const float* prev_img, const float* prev_txt) {
if (block_idx < 0 || block_idx >= static_cast<int>(double_block_cache.size()))
return;
BlockCacheEntry& entry = double_block_cache[block_idx];
entry.residual_img.resize(img_size);
entry.residual_txt.resize(txt_size);
for (size_t i = 0; i < img_size; i++) {
entry.residual_img[i] = img[i] - prev_img[i];
}
for (size_t i = 0; i < txt_size; i++) {
entry.residual_txt[i] = txt[i] - prev_txt[i];
}
entry.prev_img.resize(img_size);
entry.prev_txt.resize(txt_size);
for (size_t i = 0; i < img_size; i++) {
entry.prev_img[i] = img[i];
}
for (size_t i = 0; i < txt_size; i++) {
entry.prev_txt[i] = txt[i];
}
entry.has_prev = true;
}
void apply_double_block_cache(int block_idx, float* img, float* txt, size_t img_size, size_t txt_size) {
if (block_idx < 0 || block_idx >= static_cast<int>(double_block_cache.size()))
return;
const BlockCacheEntry& entry = double_block_cache[block_idx];
if (entry.residual_img.size() != img_size || entry.residual_txt.size() != txt_size)
return;
for (size_t i = 0; i < img_size; i++) {
img[i] += entry.residual_img[i];
}
for (size_t i = 0; i < txt_size; i++) {
txt[i] += entry.residual_txt[i];
}
total_blocks_cached++;
}
void store_single_block_residual(int block_idx, const float* output, size_t size, const float* input) {
if (block_idx < 0 || block_idx >= static_cast<int>(single_block_cache.size()))
return;
BlockCacheEntry& entry = single_block_cache[block_idx];
entry.residual.resize(size);
for (size_t i = 0; i < size; i++) {
entry.residual[i] = output[i] - input[i];
}
entry.prev_output.resize(size);
for (size_t i = 0; i < size; i++) {
entry.prev_output[i] = output[i];
}
entry.has_prev = true;
}
void apply_single_block_cache(int block_idx, float* output, size_t size) {
if (block_idx < 0 || block_idx >= static_cast<int>(single_block_cache.size()))
return;
const BlockCacheEntry& entry = single_block_cache[block_idx];
if (entry.residual.size() != size)
return;
for (size_t i = 0; i < size; i++) {
output[i] += entry.residual[i];
}
total_blocks_cached++;
}
void store_Bn_buffer(const float* img, const float* txt, size_t img_size, size_t txt_size, const float* Bn_start_img, const float* Bn_start_txt) {
Bn_buffer_img.resize(img_size);
Bn_buffer_txt.resize(txt_size);
for (size_t i = 0; i < img_size; i++) {
Bn_buffer_img[i] = img[i] - Bn_start_img[i];
}
for (size_t i = 0; i < txt_size; i++) {
Bn_buffer_txt[i] = txt[i] - Bn_start_txt[i];
}
has_Bn_buffer = true;
}
void apply_Bn_buffer(float* img, float* txt, size_t img_size, size_t txt_size) {
if (!has_Bn_buffer)
return;
if (Bn_buffer_img.size() != img_size || Bn_buffer_txt.size() != txt_size)
return;
for (size_t i = 0; i < img_size; i++) {
img[i] += Bn_buffer_img[i];
}
for (size_t i = 0; i < txt_size; i++) {
txt[i] += Bn_buffer_txt[i];
}
}
void taylor_update(const float* hidden_state, size_t size) {
if (!config.taylorseer.enabled)
return;
taylor_state.update_derivatives(hidden_state, size, current_step);
}
bool taylor_can_approximate() const {
return config.taylorseer.enabled && taylor_state.can_approximate();
}
void taylor_approximate(float* output, size_t size) {
if (!config.taylorseer.enabled)
return;
taylor_state.approximate(output, size, current_step);
}
bool should_use_taylor_this_step() const {
if (!config.taylorseer.enabled)
return false;
if (current_step < config.taylorseer.max_warmup_steps)
return false;
int interval = config.taylorseer.skip_interval_steps;
if (interval <= 0)
interval = 1;
return (current_step % (interval + 1)) != 0;
}
void log_metrics() const {
if (!enabled())
return;
int total_blocks = total_blocks_computed + total_blocks_cached;
float cache_ratio = (total_blocks > 0) ? (static_cast<float>(total_blocks_cached) / total_blocks * 100.0f) : 0.0f;
float step_cache_ratio = (total_steps > 0) ? (static_cast<float>(cached_steps.size()) / total_steps * 100.0f) : 0.0f;
LOG_INFO("CacheDIT: steps_cached=%zu/%d (%.1f%%), blocks_cached=%d/%d (%.1f%%), accum_diff=%.4f",
cached_steps.size(), total_steps, step_cache_ratio,
total_blocks_cached, total_blocks, cache_ratio,
accumulated_residual_diff);
}
std::string get_summary() const {
char buf[256];
snprintf(buf, sizeof(buf),
"CacheDIT[thresh=%.2f]: cached %zu/%d steps, %d/%d blocks",
config.dbcache.residual_diff_threshold,
cached_steps.size(), total_steps,
total_blocks_cached, total_blocks_computed + total_blocks_cached);
return std::string(buf);
}
};
inline std::vector<int> parse_scm_mask(const std::string& mask_str) {
std::vector<int> mask;
if (mask_str.empty())
return mask;
size_t pos = 0;
size_t start = 0;
while ((pos = mask_str.find(',', start)) != std::string::npos) {
std::string token = mask_str.substr(start, pos - start);
mask.push_back(std::stoi(token));
start = pos + 1;
}
if (start < mask_str.length()) {
mask.push_back(std::stoi(mask_str.substr(start)));
}
return mask;
}
inline std::vector<int> generate_scm_mask(
const std::vector<int>& compute_bins,
const std::vector<int>& cache_bins,
int total_steps) {
std::vector<int> mask;
size_t c_idx = 0, cache_idx = 0;
while (static_cast<int>(mask.size()) < total_steps) {
if (c_idx < compute_bins.size()) {
for (int i = 0; i < compute_bins[c_idx] && static_cast<int>(mask.size()) < total_steps; i++) {
mask.push_back(1);
}
c_idx++;
}
if (cache_idx < cache_bins.size()) {
for (int i = 0; i < cache_bins[cache_idx] && static_cast<int>(mask.size()) < total_steps; i++) {
mask.push_back(0);
}
cache_idx++;
}
if (c_idx >= compute_bins.size() && cache_idx >= cache_bins.size())
break;
}
if (!mask.empty()) {
mask.back() = 1;
}
return mask;
}
inline void parse_dbcache_options(const std::string& opts, DBCacheConfig& cfg) {
if (opts.empty())
return;
int Fn = 8, Bn = 0, warmup = 8, max_cached = -1, max_cont = -1;
float thresh = 0.08f;
sscanf(opts.c_str(), "%d,%d,%f,%d,%d,%d",
&Fn, &Bn, &thresh, &warmup, &max_cached, &max_cont);
cfg.Fn_compute_blocks = Fn;
cfg.Bn_compute_blocks = Bn;
cfg.residual_diff_threshold = thresh;
cfg.max_warmup_steps = warmup;
cfg.max_cached_steps = max_cached;
cfg.max_continuous_cached_steps = max_cont;
}
inline void parse_taylorseer_options(const std::string& opts, TaylorSeerConfig& cfg) {
if (opts.empty())
return;
int n_deriv = 1, warmup = 2, interval = 1;
sscanf(opts.c_str(), "%d,%d,%d", &n_deriv, &warmup, &interval);
cfg.n_derivatives = n_deriv;
cfg.max_warmup_steps = warmup;
cfg.skip_interval_steps = interval;
}
struct CacheDitConditionState {
DBCacheConfig config;
TaylorSeerConfig taylor_config;
bool initialized = false;
int current_step_index = -1;
bool step_active = false;
bool skip_current_step = false;
bool initial_step = true;
int warmup_remaining = 0;
std::vector<int> cached_steps;
int continuous_cached_steps = 0;
float accumulated_residual_diff = 0.0f;
int total_steps_skipped = 0;
const void* anchor_condition = nullptr;
struct CacheEntry {
std::vector<float> diff;
std::vector<float> prev_input;
std::vector<float> prev_output;
bool has_prev = false;
};
std::unordered_map<const void*, CacheEntry> cache_diffs;
TaylorSeerState taylor_state;
float start_sigma = std::numeric_limits<float>::max();
float end_sigma = 0.0f;
void reset_runtime() {
current_step_index = -1;
step_active = false;
skip_current_step = false;
initial_step = true;
warmup_remaining = config.max_warmup_steps;
cached_steps.clear();
continuous_cached_steps = 0;
accumulated_residual_diff = 0.0f;
total_steps_skipped = 0;
anchor_condition = nullptr;
cache_diffs.clear();
taylor_state.reset();
}
void init(const DBCacheConfig& dbcfg, const TaylorSeerConfig& tcfg) {
config = dbcfg;
taylor_config = tcfg;
initialized = dbcfg.enabled || tcfg.enabled;
reset_runtime();
if (taylor_config.enabled) {
taylor_state.init(taylor_config.n_derivatives, 0);
}
}
void set_sigmas(const std::vector<float>& sigmas) {
if (!initialized || sigmas.size() < 2)
return;
float start_percent = 0.15f;
float end_percent = 0.95f;
size_t n_steps = sigmas.size() - 1;
size_t start_step = static_cast<size_t>(start_percent * n_steps);
size_t end_step = static_cast<size_t>(end_percent * n_steps);
if (start_step >= n_steps)
start_step = n_steps - 1;
if (end_step >= n_steps)
end_step = n_steps - 1;
start_sigma = sigmas[start_step];
end_sigma = sigmas[end_step];
if (start_sigma < end_sigma) {
std::swap(start_sigma, end_sigma);
}
}
bool enabled() const {
return initialized && (config.enabled || taylor_config.enabled);
}
void begin_step(int step_index, float sigma) {
if (!enabled())
return;
if (step_index == current_step_index)
return;
current_step_index = step_index;
skip_current_step = false;
step_active = false;
if (sigma > start_sigma)
return;
if (!(sigma > end_sigma))
return;
step_active = true;
if (warmup_remaining > 0) {
warmup_remaining--;
return;
}
if (!config.steps_computation_mask.empty()) {
if (step_index < static_cast<int>(config.steps_computation_mask.size())) {
if (config.steps_computation_mask[step_index] == 1) {
return;
}
}
}
if (config.max_cached_steps >= 0 &&
static_cast<int>(cached_steps.size()) >= config.max_cached_steps) {
return;
}
if (config.max_continuous_cached_steps >= 0 &&
continuous_cached_steps >= config.max_continuous_cached_steps) {
return;
}
}
bool step_is_active() const {
return enabled() && step_active;
}
bool is_step_skipped() const {
return enabled() && step_active && skip_current_step;
}
bool has_cache(const void* cond) const {
auto it = cache_diffs.find(cond);
return it != cache_diffs.end() && !it->second.diff.empty();
}
void update_cache(const void* cond, const float* input, const float* output, size_t size) {
CacheEntry& entry = cache_diffs[cond];
entry.diff.resize(size);
for (size_t i = 0; i < size; i++) {
entry.diff[i] = output[i] - input[i];
}
entry.prev_input.resize(size);
entry.prev_output.resize(size);
for (size_t i = 0; i < size; i++) {
entry.prev_input[i] = input[i];
entry.prev_output[i] = output[i];
}
entry.has_prev = true;
}
void apply_cache(const void* cond, const float* input, float* output, size_t size) {
auto it = cache_diffs.find(cond);
if (it == cache_diffs.end() || it->second.diff.empty())
return;
if (it->second.diff.size() != size)
return;
for (size_t i = 0; i < size; i++) {
output[i] = input[i] + it->second.diff[i];
}
}
bool before_condition(const void* cond, ggml_tensor* input, ggml_tensor* output, float sigma, int step_index) {
if (!enabled() || step_index < 0)
return false;
if (step_index != current_step_index) {
begin_step(step_index, sigma);
}
if (!step_active)
return false;
if (initial_step) {
anchor_condition = cond;
initial_step = false;
}
bool is_anchor = (cond == anchor_condition);
if (skip_current_step) {
if (has_cache(cond)) {
apply_cache(cond, (float*)input->data, (float*)output->data,
static_cast<size_t>(ggml_nelements(output)));
return true;
}
return false;
}
if (!is_anchor)
return false;
auto it = cache_diffs.find(cond);
if (it == cache_diffs.end() || !it->second.has_prev)
return false;
size_t ne = static_cast<size_t>(ggml_nelements(input));
if (it->second.prev_input.size() != ne)
return false;
float* input_data = (float*)input->data;
float diff = CacheDitState::calculate_residual_diff(
it->second.prev_input.data(), input_data, ne);
float effective_threshold = config.residual_diff_threshold;
if (config.Fn_compute_blocks > 0) {
float fn_confidence = 1.0f + 0.02f * (config.Fn_compute_blocks - 8);
fn_confidence = std::max(0.5f, std::min(2.0f, fn_confidence));
effective_threshold *= fn_confidence;
}
if (config.Bn_compute_blocks > 0) {
float bn_quality = 1.0f - 0.03f * config.Bn_compute_blocks;
bn_quality = std::max(0.5f, std::min(1.0f, bn_quality));
effective_threshold *= bn_quality;
}
if (diff < effective_threshold) {
skip_current_step = true;
total_steps_skipped++;
cached_steps.push_back(current_step_index);
continuous_cached_steps++;
accumulated_residual_diff += diff;
apply_cache(cond, input_data, (float*)output->data, ne);
return true;
}
continuous_cached_steps = 0;
return false;
}
void after_condition(const void* cond, ggml_tensor* input, ggml_tensor* output) {
if (!step_is_active())
return;
size_t ne = static_cast<size_t>(ggml_nelements(output));
update_cache(cond, (float*)input->data, (float*)output->data, ne);
if (cond == anchor_condition && taylor_config.enabled) {
taylor_state.update_derivatives((float*)output->data, ne, current_step_index);
}
}
void log_metrics() const {
if (!enabled())
return;
LOG_INFO("CacheDIT: steps_skipped=%d/%d (%.1f%%), accum_residual_diff=%.4f",
total_steps_skipped,
current_step_index + 1,
(current_step_index > 0) ? (100.0f * total_steps_skipped / (current_step_index + 1)) : 0.0f,
accumulated_residual_diff);
}
};
#endif

View File

@ -4,6 +4,7 @@
#include "ggml_extend.hpp" #include "ggml_extend.hpp"
#include "model.h" #include "model.h"
#include "tokenize_util.h" #include "tokenize_util.h"
#include "vocab/vocab.h"
/*================================================== CLIPTokenizer ===================================================*/ /*================================================== CLIPTokenizer ===================================================*/
@ -110,7 +111,7 @@ public:
if (merges_utf8_str.size() > 0) { if (merges_utf8_str.size() > 0) {
load_from_merges(merges_utf8_str); load_from_merges(merges_utf8_str);
} else { } else {
load_from_merges(ModelLoader::load_merges()); load_from_merges(load_clip_merges());
} }
add_special_token("<|startoftext|>"); add_special_token("<|startoftext|>");
add_special_token("<|endoftext|>"); add_special_token("<|endoftext|>");
@ -296,7 +297,7 @@ public:
size_t max_length = 0, size_t max_length = 0,
bool padding = false) { bool padding = false) {
if (max_length > 0 && padding) { if (max_length > 0 && padding) {
size_t n = std::ceil(tokens.size() * 1.0 / (max_length - 2)); size_t n = static_cast<size_t>(std::ceil(tokens.size() * 1.0 / (max_length - 2)));
if (n == 0) { if (n == 0) {
n = 1; n = 1;
} }
@ -472,16 +473,16 @@ public:
} }
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, n_token, d_model] // x: [N, n_token, d_model]
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]); auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]); auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
x = fc1->forward(ctx, x); x = fc1->forward(ctx, x);
if (use_gelu) { if (use_gelu) {
x = ggml_gelu_inplace(ctx->ggml_ctx, x); x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
} else { } else {
x = ggml_gelu_quick_inplace(ctx->ggml_ctx, x); x = ggml_ext_gelu_quick(ctx->ggml_ctx, x, true);
} }
x = fc2->forward(ctx, x); x = fc2->forward(ctx, x);
return x; return x;
@ -510,7 +511,7 @@ public:
blocks["mlp"] = std::shared_ptr<GGMLBlock>(new CLIPMLP(d_model, intermediate_size)); blocks["mlp"] = std::shared_ptr<GGMLBlock>(new CLIPMLP(d_model, intermediate_size));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, bool mask = true) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* mask = nullptr) {
// x: [N, n_token, d_model] // x: [N, n_token, d_model]
auto self_attn = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]); auto self_attn = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]);
auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]); auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]);
@ -525,10 +526,10 @@ public:
struct CLIPEncoder : public GGMLBlock { struct CLIPEncoder : public GGMLBlock {
protected: protected:
int64_t n_layer; int n_layer;
public: public:
CLIPEncoder(int64_t n_layer, CLIPEncoder(int n_layer,
int64_t d_model, int64_t d_model,
int64_t n_head, int64_t n_head,
int64_t intermediate_size, int64_t intermediate_size,
@ -540,10 +541,10 @@ public:
} }
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
int clip_skip = -1, ggml_tensor* mask = nullptr,
bool mask = true) { int clip_skip = -1) {
// x: [N, n_token, d_model] // x: [N, n_token, d_model]
int layer_idx = n_layer - 1; int layer_idx = n_layer - 1;
// LOG_DEBUG("clip_skip %d", clip_skip); // LOG_DEBUG("clip_skip %d", clip_skip);
@ -572,7 +573,7 @@ protected:
int64_t num_positions; int64_t num_positions;
bool force_clip_f32; bool force_clip_f32;
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type token_wtype = GGML_TYPE_F32; enum ggml_type token_wtype = GGML_TYPE_F32;
if (!force_clip_f32) { if (!force_clip_f32) {
token_wtype = get_type(prefix + "token_embedding.weight", tensor_storage_map, GGML_TYPE_F32); token_wtype = get_type(prefix + "token_embedding.weight", tensor_storage_map, GGML_TYPE_F32);
@ -596,13 +597,13 @@ public:
force_clip_f32(force_clip_f32) { force_clip_f32(force_clip_f32) {
} }
struct ggml_tensor* get_token_embed_weight() { ggml_tensor* get_token_embed_weight() {
return params["token_embedding.weight"]; return params["token_embedding.weight"];
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids, ggml_tensor* input_ids,
struct ggml_tensor* custom_embed_weight) { ggml_tensor* custom_embed_weight) {
// input_ids: [N, n_token] // input_ids: [N, n_token]
auto token_embed_weight = params["token_embedding.weight"]; auto token_embed_weight = params["token_embedding.weight"];
auto position_embed_weight = params["position_embedding.weight"]; auto position_embed_weight = params["position_embedding.weight"];
@ -623,13 +624,13 @@ public:
class CLIPVisionEmbeddings : public GGMLBlock { class CLIPVisionEmbeddings : public GGMLBlock {
protected: protected:
int64_t embed_dim; int64_t embed_dim;
int64_t num_channels; int num_channels;
int64_t patch_size; int patch_size;
int64_t image_size; int image_size;
int64_t num_patches; int num_patches;
int64_t num_positions; int64_t num_positions;
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type patch_wtype = GGML_TYPE_F16; enum ggml_type patch_wtype = GGML_TYPE_F16;
enum ggml_type class_wtype = GGML_TYPE_F32; enum ggml_type class_wtype = GGML_TYPE_F32;
enum ggml_type position_wtype = GGML_TYPE_F32; enum ggml_type position_wtype = GGML_TYPE_F32;
@ -641,9 +642,9 @@ protected:
public: public:
CLIPVisionEmbeddings(int64_t embed_dim, CLIPVisionEmbeddings(int64_t embed_dim,
int64_t num_channels = 3, int num_channels = 3,
int64_t patch_size = 14, int patch_size = 14,
int64_t image_size = 224) int image_size = 224)
: embed_dim(embed_dim), : embed_dim(embed_dim),
num_channels(num_channels), num_channels(num_channels),
patch_size(patch_size), patch_size(patch_size),
@ -652,7 +653,7 @@ public:
num_positions = num_patches + 1; num_positions = num_patches + 1;
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* pixel_values) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* pixel_values) {
// pixel_values: [N, num_channels, image_size, image_size] // pixel_values: [N, num_channels, image_size, image_size]
// return: [N, num_positions, embed_dim] // return: [N, num_positions, embed_dim]
GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels); GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels);
@ -662,20 +663,20 @@ public:
auto position_embed_weight = params["position_embedding.weight"]; auto position_embed_weight = params["position_embedding.weight"];
// concat(patch_embedding, class_embedding) + position_embedding // concat(patch_embedding, class_embedding) + position_embedding
struct ggml_tensor* patch_embedding; ggml_tensor* patch_embedding;
int64_t N = pixel_values->ne[3]; int64_t N = pixel_values->ne[3];
patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size] patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches] patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches]
patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim] patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim]
patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1] patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1]
struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N); ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N);
class_embedding = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding); // [N, embed_dim] class_embedding = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding); // [N, embed_dim]
class_embedding = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1] class_embedding = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
struct ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1] ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
x = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim] x = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
x = ggml_add(ctx->ggml_ctx, x, position_embed_weight); x = ggml_add(ctx->ggml_ctx, x, position_embed_weight);
return x; // [N, num_positions, embed_dim] return x; // [N, num_positions, embed_dim]
} }
}; };
@ -692,7 +693,7 @@ enum CLIPVersion {
class CLIPTextModel : public GGMLBlock { class CLIPTextModel : public GGMLBlock {
protected: protected:
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
if (version == OPEN_CLIP_VIT_BIGG_14) { if (version == OPEN_CLIP_VIT_BIGG_14) {
enum ggml_type wtype = GGML_TYPE_F32; enum ggml_type wtype = GGML_TYPE_F32;
params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size); params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
@ -733,24 +734,25 @@ public:
blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size)); blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
} }
struct ggml_tensor* get_token_embed_weight() { ggml_tensor* get_token_embed_weight() {
auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]); auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
return embeddings->get_token_embed_weight(); return embeddings->get_token_embed_weight();
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids, ggml_tensor* input_ids,
struct ggml_tensor* tkn_embeddings, ggml_tensor* tkn_embeddings,
size_t max_token_idx = 0, ggml_tensor* mask = nullptr,
bool return_pooled = false, size_t max_token_idx = 0,
int clip_skip = -1) { bool return_pooled = false,
int clip_skip = -1) {
// input_ids: [N, n_token] // input_ids: [N, n_token]
auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]); auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]); auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]); auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]);
auto x = embeddings->forward(ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size] auto x = embeddings->forward(ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size]
x = encoder->forward(ctx, x, return_pooled ? -1 : clip_skip, true); x = encoder->forward(ctx, x, mask, return_pooled ? -1 : clip_skip);
if (return_pooled || with_final_ln) { if (return_pooled || with_final_ln) {
x = final_layer_norm->forward(ctx, x); x = final_layer_norm->forward(ctx, x);
} }
@ -802,10 +804,10 @@ public:
blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size)); blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* pixel_values, ggml_tensor* pixel_values,
bool return_pooled = true, bool return_pooled = true,
int clip_skip = -1) { int clip_skip = -1) {
// pixel_values: [N, num_channels, image_size, image_size] // pixel_values: [N, num_channels, image_size, image_size]
auto embeddings = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]); auto embeddings = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
auto pre_layernorm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]); auto pre_layernorm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
@ -814,10 +816,11 @@ public:
auto x = embeddings->forward(ctx, pixel_values); // [N, num_positions, embed_dim] auto x = embeddings->forward(ctx, pixel_values); // [N, num_positions, embed_dim]
x = pre_layernorm->forward(ctx, x); x = pre_layernorm->forward(ctx, x);
x = encoder->forward(ctx, x, clip_skip, false); x = encoder->forward(ctx, x, nullptr, clip_skip);
// print_ggml_tensor(x, true, "ClipVisionModel x: ");
auto last_hidden_state = x; auto last_hidden_state = x;
x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size]
x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size]
GGML_ASSERT(x->ne[3] == 1); GGML_ASSERT(x->ne[3] == 1);
if (return_pooled) { if (return_pooled) {
@ -836,7 +839,7 @@ protected:
int64_t out_features; int64_t out_features;
bool transpose_weight; bool transpose_weight;
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32); enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
if (transpose_weight) { if (transpose_weight) {
params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features); params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
@ -853,8 +856,8 @@ public:
out_features(out_features), out_features(out_features),
transpose_weight(transpose_weight) {} transpose_weight(transpose_weight) {}
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
struct ggml_tensor* w = params["weight"]; ggml_tensor* w = params["weight"];
if (transpose_weight) { if (transpose_weight) {
w = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, w)); w = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, w));
} }
@ -883,10 +886,10 @@ public:
blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w)); blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* pixel_values, ggml_tensor* pixel_values,
bool return_pooled = true, bool return_pooled = true,
int clip_skip = -1) { int clip_skip = -1) {
// pixel_values: [N, num_channels, image_size, image_size] // pixel_values: [N, num_channels, image_size, image_size]
// return: [N, projection_dim] if return_pooled else [N, n_token, hidden_size] // return: [N, projection_dim] if return_pooled else [N, n_token, hidden_size]
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]); auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
@ -905,6 +908,8 @@ public:
struct CLIPTextModelRunner : public GGMLRunner { struct CLIPTextModelRunner : public GGMLRunner {
CLIPTextModel model; CLIPTextModel model;
std::vector<float> attention_mask_vec;
CLIPTextModelRunner(ggml_backend_t backend, CLIPTextModelRunner(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map, const String2TensorStorage& tensor_storage_map,
@ -931,16 +936,17 @@ struct CLIPTextModelRunner : public GGMLRunner {
return "clip"; return "clip";
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
model.get_param_tensors(tensors, prefix); model.get_param_tensors(tensors, prefix);
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids, ggml_tensor* input_ids,
struct ggml_tensor* embeddings, ggml_tensor* embeddings,
size_t max_token_idx = 0, ggml_tensor* mask,
bool return_pooled = false, size_t max_token_idx = 0,
int clip_skip = -1) { bool return_pooled = false,
int clip_skip = -1) {
size_t N = input_ids->ne[1]; size_t N = input_ids->ne[1];
size_t n_token = input_ids->ne[0]; size_t n_token = input_ids->ne[0];
if (input_ids->ne[0] > model.n_token) { if (input_ids->ne[0] > model.n_token) {
@ -948,20 +954,20 @@ struct CLIPTextModelRunner : public GGMLRunner {
input_ids = ggml_reshape_2d(ctx->ggml_ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token); input_ids = ggml_reshape_2d(ctx->ggml_ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
} }
return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip); return model.forward(ctx, input_ids, embeddings, mask, max_token_idx, return_pooled, clip_skip);
} }
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, ggml_cgraph* build_graph(ggml_tensor* input_ids,
int num_custom_embeddings = 0, int num_custom_embeddings = 0,
void* custom_embeddings_data = nullptr, void* custom_embeddings_data = nullptr,
size_t max_token_idx = 0, size_t max_token_idx = 0,
bool return_pooled = false, bool return_pooled = false,
int clip_skip = -1) { int clip_skip = -1) {
struct ggml_cgraph* gf = new_graph_custom(2048); ggml_cgraph* gf = new_graph_custom(2048);
input_ids = to_backend(input_ids); input_ids = to_backend(input_ids);
struct ggml_tensor* embeddings = nullptr; ggml_tensor* embeddings = nullptr;
if (num_custom_embeddings > 0 && custom_embeddings_data != nullptr) { if (num_custom_embeddings > 0 && custom_embeddings_data != nullptr) {
auto token_embed_weight = model.get_token_embed_weight(); auto token_embed_weight = model.get_token_embed_weight();
@ -975,9 +981,23 @@ struct CLIPTextModelRunner : public GGMLRunner {
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1); embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
} }
int n_tokens = static_cast<int>(input_ids->ne[0]);
attention_mask_vec.resize(n_tokens * n_tokens);
for (int i0 = 0; i0 < n_tokens; i0++) {
for (int i1 = 0; i1 < n_tokens; i1++) {
float value = 0.f;
if (i0 > i1) {
value = -INFINITY;
}
attention_mask_vec[i1 * n_tokens + i0] = value;
}
}
auto attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, n_tokens, n_tokens);
set_backend_tensor_data(attention_mask, attention_mask_vec.data());
auto runner_ctx = get_context(); auto runner_ctx = get_context();
struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip); ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, attention_mask, max_token_idx, return_pooled, clip_skip);
ggml_build_forward_expand(gf, hidden_states); ggml_build_forward_expand(gf, hidden_states);
@ -985,7 +1005,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
} }
bool compute(const int n_threads, bool compute(const int n_threads,
struct ggml_tensor* input_ids, ggml_tensor* input_ids,
int num_custom_embeddings, int num_custom_embeddings,
void* custom_embeddings_data, void* custom_embeddings_data,
size_t max_token_idx, size_t max_token_idx,
@ -993,7 +1013,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
int clip_skip, int clip_skip,
ggml_tensor** output, ggml_tensor** output,
ggml_context* output_ctx = nullptr) { ggml_context* output_ctx = nullptr) {
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip); return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
}; };
return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);

View File

@ -1,5 +1,5 @@
#ifndef __COMMON_HPP__ #ifndef __COMMON_BLOCK_HPP__
#define __COMMON_HPP__ #define __COMMON_BLOCK_HPP__
#include "ggml_extend.hpp" #include "ggml_extend.hpp"
@ -23,12 +23,12 @@ public:
} }
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, channels, h, w] // x: [N, channels, h, w]
if (vae_downsample) { if (vae_downsample) {
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]); auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); x = ggml_ext_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
x = conv->forward(ctx, x); x = conv->forward(ctx, x);
} else { } else {
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]); auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
@ -52,7 +52,7 @@ public:
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, channels, h, w] // x: [N, channels, h, w]
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]); auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
@ -80,7 +80,7 @@ protected:
std::pair<int, int> padding) { std::pair<int, int> padding) {
GGML_ASSERT(dims == 2 || dims == 3); GGML_ASSERT(dims == 2 || dims == 3);
if (dims == 3) { if (dims == 3) {
return std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(in_channels, out_channels, kernel_size.first, 1, padding.first)); return std::shared_ptr<GGMLBlock>(new Conv3d(in_channels, out_channels, {kernel_size.first, 1, 1}, {1, 1, 1}, {padding.first, 0, 0}));
} else { } else {
return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding)); return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
} }
@ -121,7 +121,7 @@ public:
} }
} }
virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) { virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* emb = nullptr) {
// For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml // For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
// [N, c, t, h, w] => [N, c, t, h * w] // [N, c, t, h, w] => [N, c, t, h * w]
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w] // x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
@ -188,17 +188,19 @@ public:
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out * 2)); blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out * 2));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [ne3, ne2, ne1, dim_in] // x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out] // return: [ne3, ne2, ne1, dim_out]
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]); auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
x = proj->forward(ctx, x); // [ne3, ne2, ne1, dim_out*2] x = proj->forward(ctx, x); // [ne3, ne2, ne1, dim_out*2]
auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0); auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0, false);
x = x_vec[0]; // [ne3, ne2, ne1, dim_out] x = x_vec[0]; // [ne3, ne2, ne1, dim_out]
auto gate = x_vec[1]; // [ne3, ne2, ne1, dim_out] auto gate = x_vec[1]; // [ne3, ne2, ne1, dim_out]
gate = ggml_gelu_inplace(ctx->ggml_ctx, gate); gate = ggml_cont(ctx->ggml_ctx, gate);
gate = ggml_ext_gelu(ctx->ggml_ctx, gate, true);
x = ggml_mul(ctx->ggml_ctx, x, gate); // [ne3, ne2, ne1, dim_out] x = ggml_mul(ctx->ggml_ctx, x, gate); // [ne3, ne2, ne1, dim_out]
@ -212,13 +214,13 @@ public:
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias)); blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [ne3, ne2, ne1, dim_in] // x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out] // return: [ne3, ne2, ne1, dim_out]
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]); auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
x = proj->forward(ctx, x); x = proj->forward(ctx, x);
x = ggml_gelu_inplace(ctx->ggml_ctx, x); x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
return x; return x;
} }
}; };
@ -256,7 +258,7 @@ public:
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale)); blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim] // x: [ne3, ne2, ne1, dim]
// return: [ne3, ne2, ne1, dim_out] // return: [ne3, ne2, ne1, dim_out]
@ -295,9 +297,9 @@ public:
// to_out_1 is nn.Dropout(), skip for inference // to_out_1 is nn.Dropout(), skip for inference
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* context) { ggml_tensor* context) {
// x: [N, n_token, query_dim] // x: [N, n_token, query_dim]
// context: [N, n_context, context_dim] // context: [N, n_context, context_dim]
// return: [N, n_token, query_dim] // return: [N, n_token, query_dim]
@ -315,7 +317,7 @@ public:
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim] auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim] auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim] x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim] x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
return x; return x;
@ -353,9 +355,9 @@ public:
} }
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* context) { ggml_tensor* context) {
// x: [N, n_token, query_dim] // x: [N, n_token, query_dim]
// context: [N, n_context, context_dim] // context: [N, n_context, context_dim]
// return: [N, n_token, query_dim] // return: [N, n_token, query_dim]
@ -404,7 +406,7 @@ protected:
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2 int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
bool use_linear = false; bool use_linear = false;
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") { void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
auto iter = tensor_storage_map.find(prefix + "proj_out.weight"); auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
if (iter != tensor_storage_map.end()) { if (iter != tensor_storage_map.end()) {
int64_t inner_dim = n_head * d_head; int64_t inner_dim = n_head * d_head;
@ -454,9 +456,9 @@ public:
} }
} }
virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, virtual ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* context) { ggml_tensor* context) {
// x: [N, in_channels, h, w] // x: [N, in_channels, h, w]
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)] // context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]); auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
@ -508,7 +510,7 @@ public:
class AlphaBlender : public GGMLBlock { class AlphaBlender : public GGMLBlock {
protected: protected:
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override { void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix // Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
enum ggml_type wtype = GGML_TYPE_F32; enum ggml_type wtype = GGML_TYPE_F32;
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
@ -528,23 +530,23 @@ public:
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern // since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x_spatial, ggml_tensor* x_spatial,
struct ggml_tensor* x_temporal) { ggml_tensor* x_temporal) {
// image_only_indicator is always tensor([0.]) // image_only_indicator is always tensor([0.])
float alpha = get_alpha(); float alpha = get_alpha();
auto x = ggml_add(ctx->ggml_ctx, auto x = ggml_add(ctx->ggml_ctx,
ggml_scale(ctx->ggml_ctx, x_spatial, alpha), ggml_ext_scale(ctx->ggml_ctx, x_spatial, alpha),
ggml_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha)); ggml_ext_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
return x; return x;
} }
}; };
class VideoResBlock : public ResBlock { class VideoResBlock : public ResBlock {
public: public:
VideoResBlock(int channels, VideoResBlock(int64_t channels,
int emb_channels, int64_t emb_channels,
int out_channels, int64_t out_channels,
std::pair<int, int> kernel_size = {3, 3}, std::pair<int, int> kernel_size = {3, 3},
int64_t video_kernel_size = 3, int64_t video_kernel_size = 3,
int dims = 2) // always 2 int dims = 2) // always 2
@ -553,10 +555,10 @@ public:
blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender()); blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* emb, ggml_tensor* emb,
int num_video_frames) { int num_video_frames) {
// x: [N, channels, h, w] aka [b*t, channels, h, w] // x: [N, channels, h, w] aka [b*t, channels, h, w]
// emb: [N, emb_channels] aka [b*t, emb_channels] // emb: [N, emb_channels] aka [b*t, emb_channels]
// image_only_indicator is always tensor([0.]) // image_only_indicator is always tensor([0.])
@ -588,4 +590,4 @@ public:
} }
}; };
#endif // __COMMON_HPP__ #endif // __COMMON_BLOCK_HPP__

108
src/common_dit.hpp Normal file
View File

@ -0,0 +1,108 @@
#ifndef __COMMON_DIT_HPP__
#define __COMMON_DIT_HPP__
#include "ggml_extend.hpp"
namespace DiT {
ggml_tensor* patchify(ggml_context* ctx,
ggml_tensor* x,
int pw,
int ph,
bool patch_last = true) {
// x: [N, C, H, W]
// return: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C]
int64_t N = x->ne[3];
int64_t C = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
int64_t h = H / ph;
int64_t w = W / pw;
GGML_ASSERT(h * ph == H && w * pw == W);
x = ggml_reshape_4d(ctx, x, pw, w, ph, h * C * N); // [N*C*h, ph, w, pw]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, w, ph, pw]
x = ggml_reshape_4d(ctx, x, pw * ph, w * h, C, N); // [N, C, h*w, ph*pw]
if (patch_last) {
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, h*w, C, ph*pw]
x = ggml_reshape_3d(ctx, x, pw * ph * C, w * h, N); // [N, h*w, C*ph*pw]
} else {
x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3)); // [N, h*w, C, ph*pw]
x = ggml_reshape_3d(ctx, x, C * pw * ph, w * h, N); // [N, h*w, ph*pw*C]
}
return x;
}
ggml_tensor* unpatchify(ggml_context* ctx,
ggml_tensor* x,
int64_t h,
int64_t w,
int ph,
int pw,
bool patch_last = true) {
// x: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C]
// return: [N, C, H, W]
int64_t N = x->ne[2];
int64_t C = x->ne[0] / ph / pw;
int64_t H = h * ph;
int64_t W = w * pw;
GGML_ASSERT(C * ph * pw == x->ne[0]);
if (patch_last) {
x = ggml_reshape_4d(ctx, x, pw * ph, C, w * h, N); // [N, h*w, C, ph*pw]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, C, h*w, ph*pw]
} else {
x = ggml_reshape_4d(ctx, x, C, pw * ph, w * h, N); // [N, h*w, ph*pw, C]
x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3)); // [N, C, h*w, ph*pw]
}
x = ggml_reshape_4d(ctx, x, pw, ph, w, h * C * N); // [N*C*h, w, ph, pw]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, ph, w, pw]
x = ggml_reshape_4d(ctx, x, W, H, C, N); // [N, C, h*ph, w*pw]
return x;
}
ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
ggml_tensor* x,
int ph,
int pw) {
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int pad_h = (ph - H % ph) % ph;
int pad_w = (pw - W % pw) % pw;
x = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
return x;
}
ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx,
ggml_tensor* x,
int ph,
int pw,
bool patch_last = true) {
x = pad_to_patch_size(ctx, x, ph, pw);
x = patchify(ctx->ggml_ctx, x, ph, pw, patch_last);
return x;
}
ggml_tensor* unpatchify_and_crop(ggml_context* ctx,
ggml_tensor* x,
int64_t H,
int64_t W,
int ph,
int pw,
bool patch_last = true) {
int pad_h = (ph - H % ph) % ph;
int pad_w = (pw - W % pw) % pw;
int64_t h = ((H + pad_h) / ph);
int64_t w = ((W + pad_w) / pw);
x = unpatchify(ctx, x, h, w, ph, pw, patch_last); // [N, C, H + pad_h, W + pad_w]
x = ggml_ext_slice(ctx, x, 1, 0, H); // [N, C, H, W + pad_w]
x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
return x;
}
} // namespace DiT
#endif // __COMMON_DIT_HPP__

View File

@ -6,13 +6,18 @@
#include "t5.hpp" #include "t5.hpp"
struct SDCondition { struct SDCondition {
struct ggml_tensor* c_crossattn = nullptr; // aka context ggml_tensor* c_crossattn = nullptr; // aka context
struct ggml_tensor* c_vector = nullptr; // aka y ggml_tensor* c_vector = nullptr; // aka y
struct ggml_tensor* c_concat = nullptr; ggml_tensor* c_concat = nullptr;
std::vector<ggml_tensor*> extra_c_crossattns;
SDCondition() = default; SDCondition() = default;
SDCondition(struct ggml_tensor* c_crossattn, struct ggml_tensor* c_vector, struct ggml_tensor* c_concat) SDCondition(ggml_tensor* c_crossattn,
: c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat) {} ggml_tensor* c_vector,
ggml_tensor* c_concat,
const std::vector<ggml_tensor*>& extra_c_crossattns = {})
: c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat), extra_c_crossattns(extra_c_crossattns) {}
}; };
struct ConditionerParams { struct ConditionerParams {
@ -32,8 +37,9 @@ struct Conditioner {
const ConditionerParams& conditioner_params) = 0; const ConditionerParams& conditioner_params) = 0;
virtual void alloc_params_buffer() = 0; virtual void alloc_params_buffer() = 0;
virtual void free_params_buffer() = 0; virtual void free_params_buffer() = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0; virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) = 0;
virtual size_t get_params_buffer_size() = 0; virtual size_t get_params_buffer_size() = 0;
virtual void set_flash_attention_enabled(bool enabled) = 0;
virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {} virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx, virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
int n_threads, int n_threads,
@ -86,7 +92,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
} }
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model"); text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
if (sd_version_is_sdxl(version)) { if (sd_version_is_sdxl(version)) {
text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model"); text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
@ -115,6 +121,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
return buffer_size; return buffer_size;
} }
void set_flash_attention_enabled(bool enabled) override {
text_model->set_flash_attention_enabled(enabled);
if (sd_version_is_sdxl(version)) {
text_model2->set_flash_attention_enabled(enabled);
}
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override { void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
text_model->set_weight_adapter(adapter); text_model->set_weight_adapter(adapter);
if (sd_version_is_sdxl(version)) { if (sd_version_is_sdxl(version)) {
@ -136,14 +149,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
} }
return true; return true;
} }
struct ggml_init_params params; ggml_init_params params;
params.mem_size = 100 * 1024 * 1024; // max for custom embeddings 100 MB params.mem_size = 100 * 1024 * 1024; // max for custom embeddings 100 MB
params.mem_buffer = nullptr; params.mem_buffer = nullptr;
params.no_alloc = false; params.no_alloc = false;
struct ggml_context* embd_ctx = ggml_init(params); ggml_context* embd_ctx = ggml_init(params);
struct ggml_tensor* embd = nullptr; ggml_tensor* embd = nullptr;
struct ggml_tensor* embd2 = nullptr; ggml_tensor* embd2 = nullptr;
auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) { auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
if (tensor_storage.ne[0] != text_model->model.hidden_size) { if (tensor_storage.ne[0] != text_model->model.hidden_size) {
if (text_model2) { if (text_model2) {
if (tensor_storage.ne[0] == text_model2->model.hidden_size) { if (tensor_storage.ne[0] == text_model2->model.hidden_size) {
@ -303,11 +316,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
int class_token = clean_input_ids[class_token_index[0]]; int class_token = clean_input_ids[class_token_index[0]];
class_idx = tokens_acc + class_token_index[0]; class_idx = tokens_acc + class_token_index[0];
std::vector<int> clean_input_ids_tmp; std::vector<int> clean_input_ids_tmp;
for (uint32_t i = 0; i < class_token_index[0]; i++) for (int i = 0; i < class_token_index[0]; i++)
clean_input_ids_tmp.push_back(clean_input_ids[i]); clean_input_ids_tmp.push_back(clean_input_ids[i]);
for (uint32_t i = 0; i < (pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs); i++) for (int i = 0; i < (pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs); i++)
clean_input_ids_tmp.push_back(class_token); clean_input_ids_tmp.push_back(class_token);
for (uint32_t i = class_token_index[0] + 1; i < clean_input_ids.size(); i++) for (int i = class_token_index[0] + 1; i < clean_input_ids.size(); i++)
clean_input_ids_tmp.push_back(clean_input_ids[i]); clean_input_ids_tmp.push_back(clean_input_ids[i]);
clean_input_ids.clear(); clean_input_ids.clear();
clean_input_ids = clean_input_ids_tmp; clean_input_ids = clean_input_ids_tmp;
@ -322,7 +335,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
tokenizer.pad_tokens(tokens, weights, max_length, padding); tokenizer.pad_tokens(tokens, weights, max_length, padding);
int offset = pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs; int offset = pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs;
for (uint32_t i = 0; i < tokens.size(); i++) { for (int i = 0; i < tokens.size(); i++) {
// if (class_idx + 1 <= i && i < class_idx + 1 + 2*num_input_imgs) // photomaker V2 has num_tokens(=2)*num_input_imgs // if (class_idx + 1 <= i && i < class_idx + 1 + 2*num_input_imgs) // photomaker V2 has num_tokens(=2)*num_input_imgs
if (class_idx + 1 <= i && i < class_idx + 1 + offset) // photomaker V2 has num_tokens(=2)*num_input_imgs if (class_idx + 1 <= i && i < class_idx + 1 + offset) // photomaker V2 has num_tokens(=2)*num_input_imgs
// hardcode for now // hardcode for now
@ -422,12 +435,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
int height, int height,
int adm_in_channels = -1, int adm_in_channels = -1,
bool zero_out_masked = false) { bool zero_out_masked = false) {
int64_t t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
struct ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size] ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size]
struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2] ggml_tensor* chunk_hidden_states = nullptr; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
struct ggml_tensor* chunk_hidden_states1 = nullptr; // [n_token, hidden_size] ggml_tensor* chunk_hidden_states1 = nullptr; // [n_token, hidden_size]
struct ggml_tensor* chunk_hidden_states2 = nullptr; // [n_token, hidden_size2] ggml_tensor* chunk_hidden_states2 = nullptr; // [n_token, hidden_size2]
struct ggml_tensor* pooled = nullptr; ggml_tensor* pooled = nullptr;
std::vector<float> hidden_states_vec; std::vector<float> hidden_states_vec;
if (clip_skip <= 0) { if (clip_skip <= 0) {
@ -442,9 +455,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
std::vector<float> chunk_weights(weights.begin() + chunk_idx * chunk_len, std::vector<float> chunk_weights(weights.begin() + chunk_idx * chunk_len,
weights.begin() + (chunk_idx + 1) * chunk_len); weights.begin() + (chunk_idx + 1) * chunk_len);
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
struct ggml_tensor* input_ids2 = nullptr; ggml_tensor* input_ids2 = nullptr;
size_t max_token_idx = 0; size_t max_token_idx = 0;
if (sd_version_is_sdxl(version)) { if (sd_version_is_sdxl(version)) {
auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID); auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID);
if (it != chunk_tokens.end()) { if (it != chunk_tokens.end()) {
@ -663,18 +676,18 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
return "clip_vision"; return "clip_vision";
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) {
vision_model.get_param_tensors(tensors, "cond_stage_model.transformer"); vision_model.get_param_tensors(tensors, "cond_stage_model.transformer");
} }
struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values, bool return_pooled, int clip_skip) { ggml_cgraph* build_graph(ggml_tensor* pixel_values, bool return_pooled, int clip_skip) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); ggml_cgraph* gf = ggml_new_graph(compute_ctx);
pixel_values = to_backend(pixel_values); pixel_values = to_backend(pixel_values);
auto runner_ctx = get_context(); auto runner_ctx = get_context();
struct ggml_tensor* hidden_states = vision_model.forward(&runner_ctx, pixel_values, return_pooled, clip_skip); ggml_tensor* hidden_states = vision_model.forward(&runner_ctx, pixel_values, return_pooled, clip_skip);
ggml_build_forward_expand(gf, hidden_states); ggml_build_forward_expand(gf, hidden_states);
@ -687,7 +700,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
int clip_skip, int clip_skip,
ggml_tensor** output, ggml_tensor** output,
ggml_context* output_ctx) { ggml_context* output_ctx) {
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(pixel_values, return_pooled, clip_skip); return build_graph(pixel_values, return_pooled, clip_skip);
}; };
return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
@ -733,7 +746,7 @@ struct SD3CLIPEmbedder : public Conditioner {
} }
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
if (clip_l) { if (clip_l) {
clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model"); clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
} }
@ -783,6 +796,18 @@ struct SD3CLIPEmbedder : public Conditioner {
return buffer_size; return buffer_size;
} }
void set_flash_attention_enabled(bool enabled) override {
if (clip_l) {
clip_l->set_flash_attention_enabled(enabled);
}
if (clip_g) {
clip_g->set_flash_attention_enabled(enabled);
}
if (t5) {
t5->set_flash_attention_enabled(enabled);
}
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override { void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
if (clip_l) { if (clip_l) {
clip_l->set_weight_adapter(adapter); clip_l->set_weight_adapter(adapter);
@ -884,15 +909,15 @@ struct SD3CLIPEmbedder : public Conditioner {
clip_skip = 2; clip_skip = 2;
} }
int64_t t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
struct ggml_tensor* hidden_states = nullptr; // [N, n_token*2, 4096] ggml_tensor* hidden_states = nullptr; // [N, n_token*2, 4096]
struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token*2, 4096] ggml_tensor* chunk_hidden_states = nullptr; // [n_token*2, 4096]
struct ggml_tensor* chunk_hidden_states_l = nullptr; // [n_token, hidden_size_l] ggml_tensor* chunk_hidden_states_l = nullptr; // [n_token, hidden_size_l]
struct ggml_tensor* chunk_hidden_states_g = nullptr; // [n_token, hidden_size_g] ggml_tensor* chunk_hidden_states_g = nullptr; // [n_token, hidden_size_g]
struct ggml_tensor* chunk_hidden_states_t5 = nullptr; // [n_token, hidden_size_t5] ggml_tensor* chunk_hidden_states_t5 = nullptr; // [n_token, hidden_size_t5]
struct ggml_tensor* pooled = nullptr; ggml_tensor* pooled = nullptr;
struct ggml_tensor* pooled_l = nullptr; // [768,] ggml_tensor* pooled_l = nullptr; // [768,]
struct ggml_tensor* pooled_g = nullptr; // [1280,] ggml_tensor* pooled_g = nullptr; // [1280,]
std::vector<float> hidden_states_vec; std::vector<float> hidden_states_vec;
size_t chunk_len = 77; size_t chunk_len = 77;
@ -1153,7 +1178,7 @@ struct FluxCLIPEmbedder : public Conditioner {
} }
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
if (clip_l) { if (clip_l) {
clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model"); clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
} }
@ -1191,6 +1216,15 @@ struct FluxCLIPEmbedder : public Conditioner {
return buffer_size; return buffer_size;
} }
void set_flash_attention_enabled(bool enabled) override {
if (clip_l) {
clip_l->set_flash_attention_enabled(enabled);
}
if (t5) {
t5->set_flash_attention_enabled(enabled);
}
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) { void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {
if (clip_l) { if (clip_l) {
clip_l->set_weight_adapter(adapter); clip_l->set_weight_adapter(adapter);
@ -1272,10 +1306,10 @@ struct FluxCLIPEmbedder : public Conditioner {
clip_skip = 2; clip_skip = 2;
} }
int64_t t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096] ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096]
struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096] ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096]
struct ggml_tensor* pooled = nullptr; // [768,] ggml_tensor* pooled = nullptr; // [768,]
std::vector<float> hidden_states_vec; std::vector<float> hidden_states_vec;
size_t chunk_count = std::max(clip_l_tokens.size() > 0 ? chunk_len : 0, t5_tokens.size()) / chunk_len; size_t chunk_count = std::max(clip_l_tokens.size() > 0 ? chunk_len : 0, t5_tokens.size()) / chunk_len;
@ -1414,7 +1448,7 @@ struct T5CLIPEmbedder : public Conditioner {
} }
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
if (t5) { if (t5) {
t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
} }
@ -1440,6 +1474,12 @@ struct T5CLIPEmbedder : public Conditioner {
return buffer_size; return buffer_size;
} }
void set_flash_attention_enabled(bool enabled) override {
if (t5) {
t5->set_flash_attention_enabled(enabled);
}
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override { void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
if (t5) { if (t5) {
t5->set_weight_adapter(adapter); t5->set_weight_adapter(adapter);
@ -1483,7 +1523,7 @@ struct T5CLIPEmbedder : public Conditioner {
return {t5_tokens, t5_weights, t5_mask}; return {t5_tokens, t5_weights, t5_mask};
} }
void modify_mask_to_attend_padding(struct ggml_tensor* mask, int max_seq_length, int num_extra_padding = 8) { void modify_mask_to_attend_padding(ggml_tensor* mask, int max_seq_length, int num_extra_padding = 8) {
float* mask_data = (float*)mask->data; float* mask_data = (float*)mask->data;
int num_pad = 0; int num_pad = 0;
for (int64_t i = 0; i < max_seq_length; i++) { for (int64_t i = 0; i < max_seq_length; i++) {
@ -1514,11 +1554,11 @@ struct T5CLIPEmbedder : public Conditioner {
auto& t5_weights = std::get<1>(token_and_weights); auto& t5_weights = std::get<1>(token_and_weights);
auto& t5_attn_mask_vec = std::get<2>(token_and_weights); auto& t5_attn_mask_vec = std::get<2>(token_and_weights);
int64_t t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096] ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096]
struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096] ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096]
struct ggml_tensor* pooled = nullptr; ggml_tensor* pooled = nullptr;
struct ggml_tensor* t5_attn_mask = vector_to_ggml_tensor(work_ctx, t5_attn_mask_vec); // [n_token] ggml_tensor* t5_attn_mask = vector_to_ggml_tensor(work_ctx, t5_attn_mask_vec); // [n_token]
std::vector<float> hidden_states_vec; std::vector<float> hidden_states_vec;
@ -1584,7 +1624,7 @@ struct T5CLIPEmbedder : public Conditioner {
chunk_hidden_states->ne[0], chunk_hidden_states->ne[0],
ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
modify_mask_to_attend_padding(t5_attn_mask, ggml_nelements(t5_attn_mask), mask_pad); modify_mask_to_attend_padding(t5_attn_mask, static_cast<int>(ggml_nelements(t5_attn_mask)), mask_pad);
return {hidden_states, t5_attn_mask, nullptr}; return {hidden_states, t5_attn_mask, nullptr};
} }
@ -1601,6 +1641,142 @@ struct T5CLIPEmbedder : public Conditioner {
} }
}; };
struct AnimaConditioner : public Conditioner {
std::shared_ptr<LLM::BPETokenizer> qwen_tokenizer;
T5UniGramTokenizer t5_tokenizer;
std::shared_ptr<LLM::LLMRunner> llm;
AnimaConditioner(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {}) {
qwen_tokenizer = std::make_shared<LLM::Qwen2Tokenizer>();
llm = std::make_shared<LLM::LLMRunner>(LLM::LLMArch::QWEN3,
backend,
offload_params_to_cpu,
tensor_storage_map,
"text_encoders.llm",
false);
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
llm->get_param_tensors(tensors, "text_encoders.llm");
}
void alloc_params_buffer() override {
llm->alloc_params_buffer();
}
void free_params_buffer() override {
llm->free_params_buffer();
}
size_t get_params_buffer_size() override {
return llm->get_params_buffer_size();
}
void set_flash_attention_enabled(bool enabled) override {
llm->set_flash_attention_enabled(enabled);
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
llm->set_weight_adapter(adapter);
}
std::tuple<std::vector<int>, std::vector<float>, std::vector<int>, std::vector<float>> tokenize(std::string text) {
auto parsed_attention = parse_prompt_attention(text);
{
std::stringstream ss;
ss << "[";
for (const auto& item : parsed_attention) {
ss << "['" << item.first << "', " << item.second << "], ";
}
ss << "]";
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
}
std::vector<int> qwen_tokens;
std::vector<float> qwen_weights;
std::vector<int> t5_tokens;
std::vector<float> t5_weights;
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
std::vector<int> curr_tokens = qwen_tokenizer->tokenize(curr_text, nullptr);
qwen_tokens.insert(qwen_tokens.end(), curr_tokens.begin(), curr_tokens.end());
// Anima uses uniform Qwen token weights.
qwen_weights.insert(qwen_weights.end(), curr_tokens.size(), 1.f);
}
if (qwen_tokens.empty()) {
qwen_tokens.push_back(151643); // qwen3 pad token
qwen_weights.push_back(1.f);
}
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
}
return {qwen_tokens, qwen_weights, t5_tokens, t5_weights};
}
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
const ConditionerParams& conditioner_params) override {
int64_t t0 = ggml_time_ms();
auto tokenized = tokenize(conditioner_params.text);
auto& qwen_tokens = std::get<0>(tokenized);
auto& qwen_weights = std::get<1>(tokenized);
auto& t5_tokens = std::get<2>(tokenized);
auto& t5_weights = std::get<3>(tokenized);
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, qwen_tokens);
ggml_tensor* hidden_states = nullptr; // [N, n_token, 1024]
llm->compute(n_threads,
input_ids,
nullptr,
{},
{},
&hidden_states,
work_ctx);
{
auto tensor = hidden_states;
float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= qwen_weights[i1];
ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
}
}
}
float new_mean = ggml_ext_tensor_mean(tensor);
if (new_mean != 0.f) {
ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
}
}
ggml_tensor* t5_ids_tensor = nullptr;
ggml_tensor* t5_weight_tensor = nullptr;
if (!t5_tokens.empty()) {
t5_ids_tensor = vector_to_ggml_tensor_i32(work_ctx, t5_tokens);
t5_weight_tensor = vector_to_ggml_tensor(work_ctx, t5_weights);
}
int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
return {hidden_states, t5_weight_tensor, t5_ids_tensor};
}
};
struct LLMEmbedder : public Conditioner { struct LLMEmbedder : public Conditioner {
SDVersion version; SDVersion version;
std::shared_ptr<LLM::BPETokenizer> tokenizer; std::shared_ptr<LLM::BPETokenizer> tokenizer;
@ -1614,9 +1790,9 @@ struct LLMEmbedder : public Conditioner {
bool enable_vision = false) bool enable_vision = false)
: version(version) { : version(version) {
LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL; LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL;
if (sd_version_is_flux2(version)) { if (version == VERSION_FLUX2) {
arch = LLM::LLMArch::MISTRAL_SMALL_3_2; arch = LLM::LLMArch::MISTRAL_SMALL_3_2;
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE) { } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
arch = LLM::LLMArch::QWEN3; arch = LLM::LLMArch::QWEN3;
} }
if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2) { if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2) {
@ -1632,7 +1808,7 @@ struct LLMEmbedder : public Conditioner {
enable_vision); enable_vision);
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
llm->get_param_tensors(tensors, "text_encoders.llm"); llm->get_param_tensors(tensors, "text_encoders.llm");
} }
@ -1650,6 +1826,10 @@ struct LLMEmbedder : public Conditioner {
return buffer_size; return buffer_size;
} }
void set_flash_attention_enabled(bool enabled) override {
llm->set_flash_attention_enabled(enabled);
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override { void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
if (llm) { if (llm) {
llm->set_weight_adapter(adapter); llm->set_weight_adapter(adapter);
@ -1657,18 +1837,23 @@ struct LLMEmbedder : public Conditioner {
} }
std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text, std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
std::pair<int, int> attn_range, const std::pair<int, int>& attn_range,
size_t max_length = 0, size_t max_length = 0,
bool padding = false) { bool padding = false) {
std::vector<std::pair<std::string, float>> parsed_attention; std::vector<std::pair<std::string, float>> parsed_attention;
parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f); if (attn_range.first >= 0 && attn_range.second > 0) {
if (attn_range.second - attn_range.first > 0) { parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first)); if (attn_range.second - attn_range.first > 0) {
parsed_attention.insert(parsed_attention.end(), auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
new_parsed_attention.begin(), parsed_attention.insert(parsed_attention.end(),
new_parsed_attention.end()); new_parsed_attention.begin(),
new_parsed_attention.end());
}
parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
} else {
parsed_attention.emplace_back(text, 1.f);
} }
parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
{ {
std::stringstream ss; std::stringstream ss;
ss << "["; ss << "[";
@ -1699,145 +1884,47 @@ struct LLMEmbedder : public Conditioner {
return {tokens, weights}; return {tokens, weights};
} }
SDCondition get_learned_condition(ggml_context* work_ctx, ggml_tensor* encode_prompt(ggml_context* work_ctx,
int n_threads, int n_threads,
const ConditionerParams& conditioner_params) override { const std::string prompt,
std::string prompt; const std::pair<int, int>& prompt_attn_range,
std::vector<std::pair<int, ggml_tensor*>> image_embeds; int max_length,
std::pair<int, int> prompt_attn_range; int min_length,
int prompt_template_encode_start_idx = 34; std::vector<std::pair<int, ggml_tensor*>> image_embeds,
int max_length = 0; const std::set<int>& out_layers,
std::set<int> out_layers; int prompt_template_encode_start_idx) {
if (llm->enable_vision && conditioner_params.ref_images.size() > 0) { auto tokens_and_weights = tokenize(prompt, prompt_attn_range);
LOG_INFO("QwenImageEditPlusPipeline");
prompt_template_encode_start_idx = 64;
int image_embed_idx = 64 + 6;
int min_pixels = 384 * 384;
int max_pixels = 560 * 560;
std::string placeholder = "<|image_pad|>";
std::string img_prompt;
for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);
double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
int height = image.height;
int width = image.width;
int h_bar = static_cast<int>(std::round(height / factor)) * factor;
int w_bar = static_cast<int>(std::round(width / factor)) * factor;
if (static_cast<double>(h_bar) * w_bar > max_pixels) {
double beta = std::sqrt((height * width) / static_cast<double>(max_pixels));
h_bar = std::max(static_cast<int>(factor),
static_cast<int>(std::floor(height / beta / factor)) * static_cast<int>(factor));
w_bar = std::max(static_cast<int>(factor),
static_cast<int>(std::floor(width / beta / factor)) * static_cast<int>(factor));
} else if (static_cast<double>(h_bar) * w_bar < min_pixels) {
double beta = std::sqrt(static_cast<double>(min_pixels) / (height * width));
h_bar = static_cast<int>(std::ceil(height * beta / factor)) * static_cast<int>(factor);
w_bar = static_cast<int>(std::ceil(width * beta / factor)) * static_cast<int>(factor);
}
LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, image.height, image.width, h_bar, w_bar);
sd_image_f32_t resized_image = clip_preprocess(image, w_bar, h_bar);
free(image.data);
image.data = nullptr;
ggml_tensor* image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
sd_image_f32_to_ggml_tensor(resized_image, image_tensor, false);
free(resized_image.data);
resized_image.data = nullptr;
ggml_tensor* image_embed = nullptr;
llm->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
image_embeds.emplace_back(image_embed_idx, image_embed);
image_embed_idx += 1 + image_embed->ne[1] + 6;
img_prompt += "Picture " + std::to_string(i + 1) + ": <|vision_start|>"; // [24669, 220, index, 25, 220, 151652]
int64_t num_image_tokens = image_embed->ne[1];
img_prompt.reserve(num_image_tokens * placeholder.size());
for (int j = 0; j < num_image_tokens; j++) {
img_prompt += placeholder;
}
img_prompt += "<|vision_end|>";
}
prompt = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
prompt += img_prompt;
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n";
} else if (sd_version_is_flux2(version)) {
prompt_template_encode_start_idx = 0;
out_layers = {10, 20, 30};
prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "[/INST]";
} else if (sd_version_is_z_image(version)) {
prompt_template_encode_start_idx = 0;
out_layers = {35}; // -2
prompt = "<|im_start|>user\n";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n";
} else if (sd_version_is_flux2(version)) {
prompt_template_encode_start_idx = 0;
out_layers = {10, 20, 30};
prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
prompt_attn_range.first = prompt.size();
prompt += conditioner_params.text;
prompt_attn_range.second = prompt.size();
prompt += "[/INST]";
} else if (version == VERSION_OVIS_IMAGE) {
prompt_template_encode_start_idx = 28;
max_length = prompt_template_encode_start_idx + 256;
prompt = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background:";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += " " + conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
} else {
prompt_template_encode_start_idx = 34;
prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n";
}
auto tokens_and_weights = tokenize(prompt, prompt_attn_range, max_length, max_length > 0);
auto& tokens = std::get<0>(tokens_and_weights); auto& tokens = std::get<0>(tokens_and_weights);
auto& weights = std::get<1>(tokens_and_weights); auto& weights = std::get<1>(tokens_and_weights);
std::vector<float> mask;
int64_t t0 = ggml_time_ms(); if (max_length > 0 && tokens.size() < max_length) {
struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 3584] mask.insert(mask.end(), tokens.size(), 1.f);
mask.insert(mask.end(), max_length - tokens.size(), 0.f);
tokenizer->pad_tokens(tokens, weights, max_length, true);
}
ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size]
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
ggml_tensor* attention_mask = nullptr;
if (!mask.empty()) {
attention_mask = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, mask.size(), mask.size());
ggml_ext_tensor_iter(attention_mask, [&](ggml_tensor* attention_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
float value = 0.f;
if (mask[i0] == 0.f) {
value = -INFINITY;
} else if (i0 > i1) {
value = -INFINITY;
}
ggml_ext_tensor_set_f32(attention_mask, value, i0, i1, i2, i3);
});
}
llm->compute(n_threads, llm->compute(n_threads,
input_ids, input_ids,
attention_mask,
image_embeds, image_embeds,
out_layers, out_layers,
&hidden_states, &hidden_states,
@ -1860,11 +1947,6 @@ struct LLMEmbedder : public Conditioner {
GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx); GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx);
int64_t min_length = 0;
if (sd_version_is_flux2(version)) {
min_length = 512;
}
int64_t zero_pad_len = 0; int64_t zero_pad_len = 0;
if (min_length > 0) { if (min_length > 0) {
if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) { if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) {
@ -1886,11 +1968,186 @@ struct LLMEmbedder : public Conditioner {
ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3); ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
}); });
// print_ggml_tensor(new_hidden_states); return new_hidden_states;
}
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
const ConditionerParams& conditioner_params) override {
std::string prompt;
std::pair<int, int> prompt_attn_range;
std::vector<std::string> extra_prompts;
std::vector<std::pair<int, int>> extra_prompts_attn_range;
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
int prompt_template_encode_start_idx = 34;
int max_length = 0; // pad tokens
int min_length = 0; // zero pad hidden_states
std::set<int> out_layers;
int64_t t0 = ggml_time_ms();
if (sd_version_is_qwen_image(version)) {
if (llm->enable_vision && !conditioner_params.ref_images.empty()) {
LOG_INFO("QwenImageEditPlusPipeline");
prompt_template_encode_start_idx = 64;
int image_embed_idx = 64 + 6;
int min_pixels = 384 * 384;
int max_pixels = 560 * 560;
std::string placeholder = "<|image_pad|>";
std::string img_prompt;
for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);
double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
int height = image.height;
int width = image.width;
int h_bar = static_cast<int>(std::round(height / factor) * factor);
int w_bar = static_cast<int>(std::round(width / factor) * factor);
if (static_cast<double>(h_bar) * w_bar > max_pixels) {
double beta = std::sqrt((height * width) / static_cast<double>(max_pixels));
h_bar = std::max(static_cast<int>(factor),
static_cast<int>(std::floor(height / beta / factor)) * static_cast<int>(factor));
w_bar = std::max(static_cast<int>(factor),
static_cast<int>(std::floor(width / beta / factor)) * static_cast<int>(factor));
} else if (static_cast<double>(h_bar) * w_bar < min_pixels) {
double beta = std::sqrt(static_cast<double>(min_pixels) / (height * width));
h_bar = static_cast<int>(std::ceil(height * beta / factor)) * static_cast<int>(factor);
w_bar = static_cast<int>(std::ceil(width * beta / factor)) * static_cast<int>(factor);
}
LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, image.height, image.width, h_bar, w_bar);
sd_image_f32_t resized_image = clip_preprocess(image, w_bar, h_bar);
free(image.data);
image.data = nullptr;
ggml_tensor* image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
sd_image_f32_to_ggml_tensor(resized_image, image_tensor, false);
free(resized_image.data);
resized_image.data = nullptr;
ggml_tensor* image_embed = nullptr;
llm->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
image_embeds.emplace_back(image_embed_idx, image_embed);
image_embed_idx += 1 + static_cast<int>(image_embed->ne[1]) + 6;
img_prompt += "Picture " + std::to_string(i + 1) + ": <|vision_start|>"; // [24669, 220, index, 25, 220, 151652]
int64_t num_image_tokens = image_embed->ne[1];
img_prompt.reserve(num_image_tokens * placeholder.size());
for (int j = 0; j < num_image_tokens; j++) {
img_prompt += placeholder;
}
img_prompt += "<|vision_end|>";
}
prompt = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
prompt += img_prompt;
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n";
} else {
prompt_template_encode_start_idx = 34;
prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n";
}
} else if (version == VERSION_FLUX2) {
prompt_template_encode_start_idx = 0;
min_length = 512;
out_layers = {10, 20, 30};
prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "[/INST]";
} else if (sd_version_is_z_image(version)) {
prompt_template_encode_start_idx = 0;
out_layers = {35}; // -2
if (!conditioner_params.ref_images.empty()) {
LOG_INFO("ZImageOmniPipeline");
prompt = "<|im_start|>user\n<|vision_start|>";
for (int i = 0; i < conditioner_params.ref_images.size() - 1; i++) {
extra_prompts.push_back("<|vision_end|><|vision_start|>");
}
extra_prompts.push_back("<|vision_end|>" + conditioner_params.text + "<|im_end|>\n<|im_start|>assistant\n<|vision_start|>");
extra_prompts.push_back("<|vision_end|><|im_end|>");
} else {
prompt = "<|im_start|>user\n";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n";
}
} else if (version == VERSION_FLUX2_KLEIN) {
prompt_template_encode_start_idx = 0;
max_length = 512;
out_layers = {9, 18, 27};
prompt = "<|im_start|>user\n";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
} else if (version == VERSION_OVIS_IMAGE) {
prompt_template_encode_start_idx = 28;
max_length = prompt_template_encode_start_idx + 256;
prompt = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background:";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += " " + conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
} else {
GGML_ABORT("unknown version %d", version);
}
auto hidden_states = encode_prompt(work_ctx,
n_threads,
prompt,
prompt_attn_range,
max_length,
min_length,
image_embeds,
out_layers,
prompt_template_encode_start_idx);
std::vector<ggml_tensor*> extra_hidden_states_vec;
for (int i = 0; i < extra_prompts.size(); i++) {
auto extra_hidden_states = encode_prompt(work_ctx,
n_threads,
extra_prompts[i],
extra_prompts_attn_range[i],
max_length,
min_length,
image_embeds,
out_layers,
prompt_template_encode_start_idx);
extra_hidden_states_vec.push_back(extra_hidden_states);
}
int64_t t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
return {new_hidden_states, nullptr, nullptr}; return {hidden_states, nullptr, nullptr, extra_hidden_states_vec};
} }
}; };

View File

@ -1,8 +1,7 @@
#ifndef __CONTROL_HPP__ #ifndef __CONTROL_HPP__
#define __CONTROL_HPP__ #define __CONTROL_HPP__
#include "common.hpp" #include "common_block.hpp"
#include "ggml_extend.hpp"
#include "model.h" #include "model.h"
#define CONTROL_NET_GRAPH_SIZE 1536 #define CONTROL_NET_GRAPH_SIZE 1536
@ -165,26 +164,26 @@ public:
blocks["middle_block_out.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch)); blocks["middle_block_out.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
} }
struct ggml_tensor* resblock_forward(std::string name, ggml_tensor* resblock_forward(std::string name,
GGMLRunnerContext* ctx, GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* emb) { ggml_tensor* emb) {
auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]); auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
return block->forward(ctx, x, emb); return block->forward(ctx, x, emb);
} }
struct ggml_tensor* attention_layer_forward(std::string name, ggml_tensor* attention_layer_forward(std::string name,
GGMLRunnerContext* ctx, GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* context) { ggml_tensor* context) {
auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]); auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
return block->forward(ctx, x, context); return block->forward(ctx, x, context);
} }
struct ggml_tensor* input_hint_block_forward(GGMLRunnerContext* ctx, ggml_tensor* input_hint_block_forward(GGMLRunnerContext* ctx,
struct ggml_tensor* hint, ggml_tensor* hint,
struct ggml_tensor* emb, ggml_tensor* emb,
struct ggml_tensor* context) { ggml_tensor* context) {
int num_input_blocks = 15; int num_input_blocks = 15;
auto h = hint; auto h = hint;
for (int i = 0; i < num_input_blocks; i++) { for (int i = 0; i < num_input_blocks; i++) {
@ -199,13 +198,13 @@ public:
return h; return h;
} }
std::vector<struct ggml_tensor*> forward(GGMLRunnerContext* ctx, std::vector<ggml_tensor*> forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* hint, ggml_tensor* hint,
struct ggml_tensor* guided_hint, ggml_tensor* guided_hint,
struct ggml_tensor* timesteps, ggml_tensor* timesteps,
struct ggml_tensor* context, ggml_tensor* context,
struct ggml_tensor* y = nullptr) { ggml_tensor* y = nullptr) {
// x: [N, in_channels, h, w] or [N, in_channels/2, h, w] // x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
// timesteps: [N,] // timesteps: [N,]
// context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768] // context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
@ -247,7 +246,7 @@ public:
emb = ggml_add(ctx->ggml_ctx, emb, label_emb); // [N, time_embed_dim] emb = ggml_add(ctx->ggml_ctx, emb, label_emb); // [N, time_embed_dim]
} }
std::vector<struct ggml_tensor*> outs; std::vector<ggml_tensor*> outs;
if (guided_hint == nullptr) { if (guided_hint == nullptr) {
guided_hint = input_hint_block_forward(ctx, hint, emb, context); guided_hint = input_hint_block_forward(ctx, hint, emb, context);
@ -313,9 +312,9 @@ struct ControlNet : public GGMLRunner {
ggml_backend_buffer_t control_buffer = nullptr; // keep control output tensors in backend memory ggml_backend_buffer_t control_buffer = nullptr; // keep control output tensors in backend memory
ggml_context* control_ctx = nullptr; ggml_context* control_ctx = nullptr;
std::vector<struct ggml_tensor*> controls; // (12 input block outputs, 1 middle block output) SD 1.5 std::vector<ggml_tensor*> controls; // (12 input block outputs, 1 middle block output) SD 1.5
struct ggml_tensor* guided_hint = nullptr; // guided_hint cache, for faster inference ggml_tensor* guided_hint = nullptr; // guided_hint cache, for faster inference
bool guided_hint_cached = false; bool guided_hint_cached = false;
ControlNet(ggml_backend_t backend, ControlNet(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
@ -329,8 +328,8 @@ struct ControlNet : public GGMLRunner {
free_control_ctx(); free_control_ctx();
} }
void alloc_control_ctx(std::vector<struct ggml_tensor*> outs) { void alloc_control_ctx(std::vector<ggml_tensor*> outs) {
struct ggml_init_params params; ggml_init_params params;
params.mem_size = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024; params.mem_size = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
params.mem_buffer = nullptr; params.mem_buffer = nullptr;
params.no_alloc = true; params.no_alloc = true;
@ -371,16 +370,16 @@ struct ControlNet : public GGMLRunner {
return "control_net"; return "control_net";
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
control_net.get_param_tensors(tensors, prefix); control_net.get_param_tensors(tensors, prefix);
} }
struct ggml_cgraph* build_graph(struct ggml_tensor* x, ggml_cgraph* build_graph(ggml_tensor* x,
struct ggml_tensor* hint, ggml_tensor* hint,
struct ggml_tensor* timesteps, ggml_tensor* timesteps,
struct ggml_tensor* context, ggml_tensor* context,
struct ggml_tensor* y = nullptr) { ggml_tensor* y = nullptr) {
struct ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE); ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE);
x = to_backend(x); x = to_backend(x);
if (guided_hint_cached) { if (guided_hint_cached) {
@ -415,18 +414,18 @@ struct ControlNet : public GGMLRunner {
} }
bool compute(int n_threads, bool compute(int n_threads,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* hint, ggml_tensor* hint,
struct ggml_tensor* timesteps, ggml_tensor* timesteps,
struct ggml_tensor* context, ggml_tensor* context,
struct ggml_tensor* y, ggml_tensor* y,
struct ggml_tensor** output = nullptr, ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) { ggml_context* output_ctx = nullptr) {
// x: [N, in_channels, h, w] // x: [N, in_channels, h, w]
// timesteps: [N, ] // timesteps: [N, ]
// context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size] // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
// y: [N, adm_in_channels] or [1, adm_in_channels] // y: [N, adm_in_channels] or [1, adm_in_channels]
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x, hint, timesteps, context, y); return build_graph(x, hint, timesteps, context, y);
}; };

View File

@ -1,6 +1,8 @@
#ifndef __DENOISER_HPP__ #ifndef __DENOISER_HPP__
#define __DENOISER_HPP__ #define __DENOISER_HPP__
#include <cmath>
#include "ggml_extend.hpp" #include "ggml_extend.hpp"
#include "gits_noise.inl" #include "gits_noise.inl"
@ -245,7 +247,7 @@ struct SGMUniformScheduler : SigmaScheduler {
int t_max = TIMESTEPS - 1; int t_max = TIMESTEPS - 1;
int t_min = 0; int t_min = 0;
std::vector<float> timesteps = linear_space(static_cast<float>(t_max), static_cast<float>(t_min), n + 1); std::vector<float> timesteps = linear_space(static_cast<float>(t_max), static_cast<float>(t_min), n + 1);
for (int i = 0; i < n; i++) { for (uint32_t i = 0; i < n; i++) {
result.push_back(t_to_sigma_func(timesteps[i])); result.push_back(t_to_sigma_func(timesteps[i]));
} }
result.push_back(0.0f); result.push_back(0.0f);
@ -259,11 +261,11 @@ struct LCMScheduler : SigmaScheduler {
result.reserve(n + 1); result.reserve(n + 1);
const int original_steps = 50; const int original_steps = 50;
const int k = TIMESTEPS / original_steps; const int k = TIMESTEPS / original_steps;
for (int i = 0; i < n; i++) { for (uint32_t i = 0; i < n; i++) {
// the rounding ensures we match the training schedule of the LCM model // the rounding ensures we match the training schedule of the LCM model
int index = (i * original_steps) / n; int index = (i * original_steps) / n;
int timestep = (original_steps - index) * k - 1; int timestep = (original_steps - index) * k - 1;
result.push_back(t_to_sigma(timestep)); result.push_back(t_to_sigma(static_cast<float>(timestep)));
} }
result.push_back(0.0f); result.push_back(0.0f);
return result; return result;
@ -276,6 +278,10 @@ struct KarrasScheduler : SigmaScheduler {
// but does anybody ever bother to touch them? // but does anybody ever bother to touch them?
float rho = 7.f; float rho = 7.f;
if (sigma_min <= 1e-6f) {
sigma_min = 1e-6f;
}
std::vector<float> result(n + 1); std::vector<float> result(n + 1);
float min_inv_rho = pow(sigma_min, (1.f / rho)); float min_inv_rho = pow(sigma_min, (1.f / rho));
@ -347,6 +353,130 @@ struct SmoothStepScheduler : SigmaScheduler {
} }
}; };
struct BongTangentScheduler : SigmaScheduler {
static constexpr float kPi = 3.14159265358979323846f;
static std::vector<float> get_bong_tangent_sigmas(int steps, float slope, float pivot, float start, float end) {
std::vector<float> sigmas;
if (steps <= 0) {
return sigmas;
}
float smax = ((2.0f / kPi) * atanf(-slope * (0.0f - pivot)) + 1.0f) * 0.5f;
float smin = ((2.0f / kPi) * atanf(-slope * ((float)(steps - 1) - pivot)) + 1.0f) * 0.5f;
float srange = smax - smin;
float sscale = start - end;
sigmas.reserve(steps);
if (fabsf(srange) < 1e-8f) {
if (steps == 1) {
sigmas.push_back(start);
return sigmas;
}
for (int i = 0; i < steps; ++i) {
float t = (float)i / (float)(steps - 1);
sigmas.push_back(start + (end - start) * t);
}
return sigmas;
}
float inv_srange = 1.0f / srange;
for (int x = 0; x < steps; ++x) {
float v = ((2.0f / kPi) * atanf(-slope * ((float)x - pivot)) + 1.0f) * 0.5f;
float sigma = ((v - smin) * inv_srange) * sscale + end;
sigmas.push_back(sigma);
}
return sigmas;
}
std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t /*t_to_sigma*/) override {
std::vector<float> result;
if (n == 0) {
return result;
}
float start = sigma_max;
float end = sigma_min;
float middle = sigma_min + (sigma_max - sigma_min) * 0.5f;
float pivot_1 = 0.6f;
float pivot_2 = 0.6f;
float slope_1 = 0.2f;
float slope_2 = 0.2f;
int steps = static_cast<int>(n) + 2;
int midpoint = static_cast<int>(((float)steps * pivot_1 + (float)steps * pivot_2) * 0.5f);
int pivot_1_i = static_cast<int>((float)steps * pivot_1);
int pivot_2_i = static_cast<int>((float)steps * pivot_2);
float slope_scale = (float)steps / 40.0f;
slope_1 = slope_1 / slope_scale;
slope_2 = slope_2 / slope_scale;
int stage_2_len = steps - midpoint;
int stage_1_len = steps - stage_2_len;
std::vector<float> sigmas_1 = get_bong_tangent_sigmas(stage_1_len, slope_1, (float)pivot_1_i, start, middle);
std::vector<float> sigmas_2 = get_bong_tangent_sigmas(stage_2_len, slope_2, (float)(pivot_2_i - stage_1_len), middle, end);
if (!sigmas_1.empty()) {
sigmas_1.pop_back();
}
result.reserve(n + 1);
result.insert(result.end(), sigmas_1.begin(), sigmas_1.end());
result.insert(result.end(), sigmas_2.begin(), sigmas_2.end());
if (result.size() < n + 1) {
while (result.size() < n + 1) {
result.push_back(end);
}
} else if (result.size() > n + 1) {
result.resize(n + 1);
}
result[n] = 0.0f;
return result;
}
};
struct KLOptimalScheduler : SigmaScheduler {
std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
std::vector<float> sigmas;
if (n == 0) {
return sigmas;
}
if (n == 1) {
sigmas.push_back(sigma_max);
sigmas.push_back(0.0f);
return sigmas;
}
if (sigma_min <= 1e-6f) {
sigma_min = 1e-6f;
}
sigmas.reserve(n + 1);
float alpha_min = std::atan(sigma_min);
float alpha_max = std::atan(sigma_max);
for (uint32_t i = 0; i < n; ++i) {
float t = static_cast<float>(i) / static_cast<float>(n - 1);
float angle = t * alpha_min + (1.0f - t) * alpha_max;
sigmas.push_back(std::tan(angle));
}
sigmas.push_back(0.0f);
return sigmas;
}
};
struct Denoiser { struct Denoiser {
virtual float sigma_min() = 0; virtual float sigma_min() = 0;
virtual float sigma_max() = 0; virtual float sigma_max() = 0;
@ -392,6 +522,14 @@ struct Denoiser {
LOG_INFO("get_sigmas with SmoothStep scheduler"); LOG_INFO("get_sigmas with SmoothStep scheduler");
scheduler = std::make_shared<SmoothStepScheduler>(); scheduler = std::make_shared<SmoothStepScheduler>();
break; break;
case BONG_TANGENT_SCHEDULER:
LOG_INFO("get_sigmas with bong_tangent scheduler");
scheduler = std::make_shared<BongTangentScheduler>();
break;
case KL_OPTIMAL_SCHEDULER:
LOG_INFO("get_sigmas with KL Optimal scheduler");
scheduler = std::make_shared<KLOptimalScheduler>();
break;
case LCM_SCHEDULER: case LCM_SCHEDULER:
LOG_INFO("get_sigmas with LCM scheduler"); LOG_INFO("get_sigmas with LCM scheduler");
scheduler = std::make_shared<LCMScheduler>(); scheduler = std::make_shared<LCMScheduler>();
@ -482,8 +620,8 @@ struct CompVisVDenoiser : public CompVisDenoiser {
}; };
struct EDMVDenoiser : public CompVisVDenoiser { struct EDMVDenoiser : public CompVisVDenoiser {
float min_sigma = 0.002; float min_sigma = 0.002f;
float max_sigma = 120.0; float max_sigma = 120.0f;
EDMVDenoiser(float min_sigma = 0.002, float max_sigma = 120.0) EDMVDenoiser(float min_sigma = 0.002, float max_sigma = 120.0)
: min_sigma(min_sigma), max_sigma(max_sigma) { : min_sigma(min_sigma), max_sigma(max_sigma) {
@ -494,7 +632,7 @@ struct EDMVDenoiser : public CompVisVDenoiser {
} }
float sigma_to_t(float s) override { float sigma_to_t(float s) override {
return 0.25 * std::log(s); return 0.25f * std::log(s);
} }
float sigma_min() override { float sigma_min() override {
@ -519,17 +657,21 @@ struct DiscreteFlowDenoiser : public Denoiser {
float sigma_data = 1.0f; float sigma_data = 1.0f;
DiscreteFlowDenoiser(float shift = 3.0f) DiscreteFlowDenoiser(float shift = 3.0f) {
: shift(shift) { set_shift(shift);
set_parameters();
} }
void set_parameters() { void set_parameters() {
for (int i = 1; i < TIMESTEPS + 1; i++) { for (int i = 1; i < TIMESTEPS + 1; i++) {
sigmas[i - 1] = t_to_sigma(i); sigmas[i - 1] = t_to_sigma(static_cast<float>(i));
} }
} }
void set_shift(float shift) {
this->shift = shift;
set_parameters();
}
float sigma_min() override { float sigma_min() override {
return sigmas[0]; return sigmas[0];
} }
@ -569,37 +711,11 @@ struct DiscreteFlowDenoiser : public Denoiser {
}; };
float flux_time_shift(float mu, float sigma, float t) { float flux_time_shift(float mu, float sigma, float t) {
return std::exp(mu) / (std::exp(mu) + std::pow((1.0 / t - 1.0), sigma)); return ::expf(mu) / (::expf(mu) + ::powf((1.0f / t - 1.0f), sigma));
} }
struct FluxFlowDenoiser : public Denoiser { struct FluxFlowDenoiser : public DiscreteFlowDenoiser {
float sigmas[TIMESTEPS]; FluxFlowDenoiser() = default;
float shift = 1.15f;
float sigma_data = 1.0f;
FluxFlowDenoiser(float shift = 1.15f) {
set_parameters(shift);
}
void set_shift(float shift) {
this->shift = shift;
}
void set_parameters(float shift) {
set_shift(shift);
for (int i = 0; i < TIMESTEPS; i++) {
sigmas[i] = t_to_sigma(i);
}
}
float sigma_min() override {
return sigmas[0];
}
float sigma_max() override {
return sigmas[TIMESTEPS - 1];
}
float sigma_to_t(float sigma) override { float sigma_to_t(float sigma) override {
return sigma; return sigma;
@ -609,26 +725,6 @@ struct FluxFlowDenoiser : public Denoiser {
t = t + 1; t = t + 1;
return flux_time_shift(shift, 1.0f, t / TIMESTEPS); return flux_time_shift(shift, 1.0f, t / TIMESTEPS);
} }
std::vector<float> get_scalings(float sigma) override {
float c_skip = 1.0f;
float c_out = -sigma;
float c_in = 1.0f;
return {c_skip, c_out, c_in};
}
// this function will modify noise/latent
ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override {
ggml_ext_tensor_scale_inplace(noise, sigma);
ggml_ext_tensor_scale_inplace(latent, 1.0f - sigma);
ggml_ext_tensor_add_inplace(latent, noise);
return latent;
}
ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override {
ggml_ext_tensor_scale_inplace(latent, 1.0f / (1.0f - sigma));
return latent;
}
}; };
struct Flux2FlowDenoiser : public FluxFlowDenoiser { struct Flux2FlowDenoiser : public FluxFlowDenoiser {
@ -677,8 +773,8 @@ static bool sample_k_diffusion(sample_method_t method,
// sample_euler_ancestral // sample_euler_ancestral
switch (method) { switch (method) {
case EULER_A_SAMPLE_METHOD: { case EULER_A_SAMPLE_METHOD: {
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
float sigma = sigmas[i]; float sigma = sigmas[i];
@ -734,7 +830,7 @@ static bool sample_k_diffusion(sample_method_t method,
} break; } break;
case EULER_SAMPLE_METHOD: // Implemented without any sigma churn case EULER_SAMPLE_METHOD: // Implemented without any sigma churn
{ {
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
float sigma = sigmas[i]; float sigma = sigmas[i];
@ -769,8 +865,8 @@ static bool sample_k_diffusion(sample_method_t method,
} }
} break; } break;
case HEUN_SAMPLE_METHOD: { case HEUN_SAMPLE_METHOD: {
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
// denoise // denoise
@ -825,12 +921,12 @@ static bool sample_k_diffusion(sample_method_t method,
} }
} break; } break;
case DPM2_SAMPLE_METHOD: { case DPM2_SAMPLE_METHOD: {
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
// denoise // denoise
ggml_tensor* denoised = model(x, sigmas[i], i + 1); ggml_tensor* denoised = model(x, sigmas[i], -(i + 1));
if (denoised == nullptr) { if (denoised == nullptr) {
return false; return false;
} }
@ -883,12 +979,12 @@ static bool sample_k_diffusion(sample_method_t method,
} break; } break;
case DPMPP2S_A_SAMPLE_METHOD: { case DPMPP2S_A_SAMPLE_METHOD: {
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
// denoise // denoise
ggml_tensor* denoised = model(x, sigmas[i], i + 1); ggml_tensor* denoised = model(x, sigmas[i], -(i + 1));
if (denoised == nullptr) { if (denoised == nullptr) {
return false; return false;
} }
@ -954,7 +1050,7 @@ static bool sample_k_diffusion(sample_method_t method,
} break; } break;
case DPMPP2M_SAMPLE_METHOD: // DPM++ (2M) from Karras et al (2022) case DPMPP2M_SAMPLE_METHOD: // DPM++ (2M) from Karras et al (2022)
{ {
struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);
auto t_fn = [](float sigma) -> float { return -log(sigma); }; auto t_fn = [](float sigma) -> float { return -log(sigma); };
@ -996,7 +1092,7 @@ static bool sample_k_diffusion(sample_method_t method,
} break; } break;
case DPMPP2Mv2_SAMPLE_METHOD: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457 case DPMPP2Mv2_SAMPLE_METHOD: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457
{ {
struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);
auto t_fn = [](float sigma) -> float { return -log(sigma); }; auto t_fn = [](float sigma) -> float { return -log(sigma); };
@ -1061,8 +1157,8 @@ static bool sample_k_diffusion(sample_method_t method,
} }
float* vec_denoised = (float*)denoised->data; float* vec_denoised = (float*)denoised->data;
// d_cur = (x_cur - denoised) / sigma // d_cur = (x_cur - denoised) / sigma
struct ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x_cur); ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x_cur);
float* vec_d_cur = (float*)d_cur->data; float* vec_d_cur = (float*)d_cur->data;
for (int j = 0; j < ggml_nelements(d_cur); j++) { for (int j = 0; j < ggml_nelements(d_cur); j++) {
vec_d_cur[j] = (vec_x_cur[j] - vec_denoised[j]) / sigma; vec_d_cur[j] = (vec_x_cur[j] - vec_denoised[j]) / sigma;
@ -1129,11 +1225,11 @@ static bool sample_k_diffusion(sample_method_t method,
float t_next = sigmas[i + 1]; float t_next = sigmas[i + 1];
// Denoising step // Denoising step
ggml_tensor* denoised = model(x, sigma, i + 1); ggml_tensor* denoised = model(x, sigma, i + 1);
float* vec_denoised = (float*)denoised->data; float* vec_denoised = (float*)denoised->data;
struct ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x); ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x);
float* vec_d_cur = (float*)d_cur->data; float* vec_d_cur = (float*)d_cur->data;
float* vec_x = (float*)x->data; float* vec_x = (float*)x->data;
// d_cur = (x - denoised) / sigma // d_cur = (x - denoised) / sigma
for (int j = 0; j < ggml_nelements(d_cur); j++) { for (int j = 0; j < ggml_nelements(d_cur); j++) {
@ -1194,8 +1290,8 @@ static bool sample_k_diffusion(sample_method_t method,
} break; } break;
case LCM_SAMPLE_METHOD: // Latent Consistency Models case LCM_SAMPLE_METHOD: // Latent Consistency Models
{ {
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
float sigma = sigmas[i]; float sigma = sigmas[i];
@ -1262,9 +1358,9 @@ static bool sample_k_diffusion(sample_method_t method,
alphas_cumprod[i]); alphas_cumprod[i]);
} }
struct ggml_tensor* pred_original_sample = ggml_tensor* pred_original_sample =
ggml_dup_tensor(work_ctx, x); ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* variance_noise = ggml_tensor* variance_noise =
ggml_dup_tensor(work_ctx, x); ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
@ -1284,15 +1380,12 @@ static bool sample_k_diffusion(sample_method_t method,
// - pred_sample_direction -> "direction pointing to // - pred_sample_direction -> "direction pointing to
// x_t" // x_t"
// - pred_prev_sample -> "x_t-1" // - pred_prev_sample -> "x_t-1"
int timestep = int timestep = static_cast<int>(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1;
roundf(TIMESTEPS -
i * ((float)TIMESTEPS / steps)) -
1;
// 1. get previous step value (=t-1) // 1. get previous step value (=t-1)
int prev_timestep = timestep - TIMESTEPS / steps; int prev_timestep = timestep - TIMESTEPS / static_cast<int>(steps);
// The sigma here is chosen to cause the // The sigma here is chosen to cause the
// CompVisDenoiser to produce t = timestep // CompVisDenoiser to produce t = timestep
float sigma = compvis_sigmas[timestep]; float sigma = static_cast<float>(compvis_sigmas[timestep]);
if (i == 0) { if (i == 0) {
// The function add_noise intializes x to // The function add_noise intializes x to
// Diffusers' latents * sigma (as in Diffusers' // Diffusers' latents * sigma (as in Diffusers'
@ -1329,7 +1422,7 @@ static bool sample_k_diffusion(sample_method_t method,
// model_output = model() is the D(x, sigma) as // model_output = model() is the D(x, sigma) as
// defined in Karras et al. (2022), p. 3, Table 1 and // defined in Karras et al. (2022), p. 3, Table 1 and
// p. 8 (7), compare also p. 38 (226) therein. // p. 8 (7), compare also p. 38 (226) therein.
struct ggml_tensor* model_output = ggml_tensor* model_output =
model(x, sigma, i + 1); model(x, sigma, i + 1);
// Here model_output is still the k-diffusion denoiser // Here model_output is still the k-diffusion denoiser
// output, not the U-net output F_theta(c_in(sigma) x; // output, not the U-net output F_theta(c_in(sigma) x;
@ -1349,10 +1442,10 @@ static bool sample_k_diffusion(sample_method_t method,
} }
} }
// 2. compute alphas, betas // 2. compute alphas, betas
float alpha_prod_t = alphas_cumprod[timestep]; float alpha_prod_t = static_cast<float>(alphas_cumprod[timestep]);
// Note final_alpha_cumprod = alphas_cumprod[0] due to // Note final_alpha_cumprod = alphas_cumprod[0] due to
// trailing timestep spacing // trailing timestep spacing
float alpha_prod_t_prev = prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]; float alpha_prod_t_prev = static_cast<float>(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]);
float beta_prod_t = 1 - alpha_prod_t; float beta_prod_t = 1 - alpha_prod_t;
// 3. compute predicted original sample from predicted // 3. compute predicted original sample from predicted
// noise also called "predicted x_0" of formula (12) // noise also called "predicted x_0" of formula (12)
@ -1399,8 +1492,8 @@ static bool sample_k_diffusion(sample_method_t method,
// Two step inner loop without an explicit // Two step inner loop without an explicit
// tensor // tensor
float pred_sample_direction = float pred_sample_direction =
std::sqrt(1 - alpha_prod_t_prev - ::sqrtf(1 - alpha_prod_t_prev -
std::pow(std_dev_t, 2)) * ::powf(std_dev_t, 2)) *
vec_model_output[j]; vec_model_output[j];
vec_x[j] = std::sqrt(alpha_prod_t_prev) * vec_x[j] = std::sqrt(alpha_prod_t_prev) *
vec_pred_original_sample[j] + vec_pred_original_sample[j] +
@ -1452,9 +1545,9 @@ static bool sample_k_diffusion(sample_method_t method,
} }
int original_steps = 50; int original_steps = 50;
struct ggml_tensor* pred_original_sample = ggml_tensor* pred_original_sample =
ggml_dup_tensor(work_ctx, x); ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* noise = ggml_tensor* noise =
ggml_dup_tensor(work_ctx, x); ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
@ -1475,7 +1568,7 @@ static bool sample_k_diffusion(sample_method_t method,
// Begin k-diffusion specific workaround for // Begin k-diffusion specific workaround for
// evaluating F_theta(x; ...) from D(x, sigma), same // evaluating F_theta(x; ...) from D(x, sigma), same
// as in DDIM (and see there for detailed comments) // as in DDIM (and see there for detailed comments)
float sigma = compvis_sigmas[timestep]; float sigma = static_cast<float>(compvis_sigmas[timestep]);
if (i == 0) { if (i == 0) {
float* vec_x = (float*)x->data; float* vec_x = (float*)x->data;
for (int j = 0; j < ggml_nelements(x); j++) { for (int j = 0; j < ggml_nelements(x); j++) {
@ -1488,7 +1581,7 @@ static bool sample_k_diffusion(sample_method_t method,
vec_x[j] *= std::sqrt(sigma * sigma + 1); vec_x[j] *= std::sqrt(sigma * sigma + 1);
} }
} }
struct ggml_tensor* model_output = ggml_tensor* model_output =
model(x, sigma, i + 1); model(x, sigma, i + 1);
{ {
float* vec_x = (float*)x->data; float* vec_x = (float*)x->data;
@ -1514,14 +1607,14 @@ static bool sample_k_diffusion(sample_method_t method,
// is different from the notation alpha_t in // is different from the notation alpha_t in
// DPM-Solver. In fact, we have alpha_{t_n} = // DPM-Solver. In fact, we have alpha_{t_n} =
// \sqrt{\hat{alpha_n}}, [...]" // \sqrt{\hat{alpha_n}}, [...]"
float alpha_prod_t = alphas_cumprod[timestep]; float alpha_prod_t = static_cast<float>(alphas_cumprod[timestep]);
float beta_prod_t = 1 - alpha_prod_t; float beta_prod_t = 1 - alpha_prod_t;
// Note final_alpha_cumprod = alphas_cumprod[0] since // Note final_alpha_cumprod = alphas_cumprod[0] since
// TCD is always "trailing" // TCD is always "trailing"
float alpha_prod_t_prev = prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]; float alpha_prod_t_prev = static_cast<float>(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]);
// The subscript _s are the only portion in this // The subscript _s are the only portion in this
// section (2) unique to TCD // section (2) unique to TCD
float alpha_prod_s = alphas_cumprod[timestep_s]; float alpha_prod_s = static_cast<float>(alphas_cumprod[timestep_s]);
float beta_prod_s = 1 - alpha_prod_s; float beta_prod_s = 1 - alpha_prod_s;
// 3. Compute the predicted noised sample x_s based on // 3. Compute the predicted noised sample x_s based on
// the model parameterization // the model parameterization
@ -1594,6 +1687,216 @@ static bool sample_k_diffusion(sample_method_t method,
} }
} }
} break; } break;
case RES_MULTISTEP_SAMPLE_METHOD: // Res Multistep sampler
{
ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);
bool have_old_sigma = false;
float old_sigma_down = 0.0f;
auto t_fn = [](float sigma) -> float { return -logf(sigma); };
auto sigma_fn = [](float t) -> float { return expf(-t); };
auto phi1_fn = [](float t) -> float {
if (fabsf(t) < 1e-6f) {
return 1.0f + t * 0.5f + (t * t) / 6.0f;
}
return (expf(t) - 1.0f) / t;
};
auto phi2_fn = [&](float t) -> float {
if (fabsf(t) < 1e-6f) {
return 0.5f + t / 6.0f + (t * t) / 24.0f;
}
float phi1_val = phi1_fn(t);
return (phi1_val - 1.0f) / t;
};
for (int i = 0; i < steps; i++) {
ggml_tensor* denoised = model(x, sigmas[i], i + 1);
if (denoised == nullptr) {
return false;
}
float sigma_from = sigmas[i];
float sigma_to = sigmas[i + 1];
float sigma_up = 0.0f;
float sigma_down = sigma_to;
if (eta > 0.0f) {
float sigma_from_sq = sigma_from * sigma_from;
float sigma_to_sq = sigma_to * sigma_to;
if (sigma_from_sq > 0.0f) {
float term = sigma_to_sq * (sigma_from_sq - sigma_to_sq) / sigma_from_sq;
if (term > 0.0f) {
sigma_up = eta * std::sqrt(term);
}
}
sigma_up = std::min(sigma_up, sigma_to);
float sigma_down_sq = sigma_to_sq - sigma_up * sigma_up;
sigma_down = sigma_down_sq > 0.0f ? std::sqrt(sigma_down_sq) : 0.0f;
}
if (sigma_down == 0.0f || !have_old_sigma) {
float dt = sigma_down - sigma_from;
float* vec_x = (float*)x->data;
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(x); j++) {
float d = (vec_x[j] - vec_denoised[j]) / sigma_from;
vec_x[j] = vec_x[j] + d * dt;
}
} else {
float t = t_fn(sigma_from);
float t_old = t_fn(old_sigma_down);
float t_next = t_fn(sigma_down);
float t_prev = t_fn(sigmas[i - 1]);
float h = t_next - t;
float c2 = (t_prev - t_old) / h;
float phi1_val = phi1_fn(-h);
float phi2_val = phi2_fn(-h);
float b1 = phi1_val - phi2_val / c2;
float b2 = phi2_val / c2;
if (!std::isfinite(b1)) {
b1 = 0.0f;
}
if (!std::isfinite(b2)) {
b2 = 0.0f;
}
float sigma_h = sigma_fn(h);
float* vec_x = (float*)x->data;
float* vec_denoised = (float*)denoised->data;
float* vec_old_denoised = (float*)old_denoised->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = sigma_h * vec_x[j] + h * (b1 * vec_denoised[j] + b2 * vec_old_denoised[j]);
}
}
if (sigmas[i + 1] > 0 && sigma_up > 0.0f) {
ggml_ext_im_set_randn_f32(noise, rng);
float* vec_x = (float*)x->data;
float* vec_noise = (float*)noise->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = vec_x[j] + vec_noise[j] * sigma_up;
}
}
float* vec_old_denoised = (float*)old_denoised->data;
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_old_denoised[j] = vec_denoised[j];
}
old_sigma_down = sigma_down;
have_old_sigma = true;
}
} break;
case RES_2S_SAMPLE_METHOD: // Res 2s sampler
{
ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
ggml_tensor* x0 = ggml_dup_tensor(work_ctx, x);
ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
const float c2 = 0.5f;
auto t_fn = [](float sigma) -> float { return -logf(sigma); };
auto phi1_fn = [](float t) -> float {
if (fabsf(t) < 1e-6f) {
return 1.0f + t * 0.5f + (t * t) / 6.0f;
}
return (expf(t) - 1.0f) / t;
};
auto phi2_fn = [&](float t) -> float {
if (fabsf(t) < 1e-6f) {
return 0.5f + t / 6.0f + (t * t) / 24.0f;
}
float phi1_val = phi1_fn(t);
return (phi1_val - 1.0f) / t;
};
for (int i = 0; i < steps; i++) {
float sigma_from = sigmas[i];
float sigma_to = sigmas[i + 1];
ggml_tensor* denoised = model(x, sigma_from, -(i + 1));
if (denoised == nullptr) {
return false;
}
float sigma_up = 0.0f;
float sigma_down = sigma_to;
if (eta > 0.0f) {
float sigma_from_sq = sigma_from * sigma_from;
float sigma_to_sq = sigma_to * sigma_to;
if (sigma_from_sq > 0.0f) {
float term = sigma_to_sq * (sigma_from_sq - sigma_to_sq) / sigma_from_sq;
if (term > 0.0f) {
sigma_up = eta * std::sqrt(term);
}
}
sigma_up = std::min(sigma_up, sigma_to);
float sigma_down_sq = sigma_to_sq - sigma_up * sigma_up;
sigma_down = sigma_down_sq > 0.0f ? std::sqrt(sigma_down_sq) : 0.0f;
}
float* vec_x = (float*)x->data;
float* vec_x0 = (float*)x0->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x0[j] = vec_x[j];
}
if (sigma_down == 0.0f || sigma_from == 0.0f) {
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = vec_denoised[j];
}
} else {
float t = t_fn(sigma_from);
float t_next = t_fn(sigma_down);
float h = t_next - t;
float a21 = c2 * phi1_fn(-h * c2);
float phi1_val = phi1_fn(-h);
float phi2_val = phi2_fn(-h);
float b2 = phi2_val / c2;
float b1 = phi1_val - b2;
float sigma_c2 = expf(-(t + h * c2));
float* vec_denoised = (float*)denoised->data;
float* vec_x2 = (float*)x2->data;
for (int j = 0; j < ggml_nelements(x); j++) {
float eps1 = vec_denoised[j] - vec_x0[j];
vec_x2[j] = vec_x0[j] + h * a21 * eps1;
}
ggml_tensor* denoised2 = model(x2, sigma_c2, i + 1);
if (denoised2 == nullptr) {
return false;
}
float* vec_denoised2 = (float*)denoised2->data;
for (int j = 0; j < ggml_nelements(x); j++) {
float eps1 = vec_denoised[j] - vec_x0[j];
float eps2 = vec_denoised2[j] - vec_x0[j];
vec_x[j] = vec_x0[j] + h * (b1 * eps1 + b2 * eps2);
}
}
if (sigmas[i + 1] > 0 && sigma_up > 0.0f) {
ggml_ext_im_set_randn_f32(noise, rng);
float* vec_x = (float*)x->data;
float* vec_noise = (float*)noise->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = vec_x[j] + vec_noise[j] * sigma_up;
}
}
}
} break;
default: default:
LOG_ERROR("Attempting to sample with nonexisting sample method %i", method); LOG_ERROR("Attempting to sample with nonexisting sample method %i", method);

View File

@ -1,6 +1,7 @@
#ifndef __DIFFUSION_MODEL_H__ #ifndef __DIFFUSION_MODEL_H__
#define __DIFFUSION_MODEL_H__ #define __DIFFUSION_MODEL_H__
#include "anima.hpp"
#include "flux.hpp" #include "flux.hpp"
#include "mmdit.hpp" #include "mmdit.hpp"
#include "qwen_image.hpp" #include "qwen_image.hpp"
@ -9,36 +10,37 @@
#include "z_image.hpp" #include "z_image.hpp"
struct DiffusionParams { struct DiffusionParams {
struct ggml_tensor* x = nullptr; ggml_tensor* x = nullptr;
struct ggml_tensor* timesteps = nullptr; ggml_tensor* timesteps = nullptr;
struct ggml_tensor* context = nullptr; ggml_tensor* context = nullptr;
struct ggml_tensor* c_concat = nullptr; ggml_tensor* c_concat = nullptr;
struct ggml_tensor* y = nullptr; ggml_tensor* y = nullptr;
struct ggml_tensor* guidance = nullptr; ggml_tensor* guidance = nullptr;
std::vector<ggml_tensor*> ref_latents = {}; std::vector<ggml_tensor*> ref_latents = {};
bool increase_ref_index = false; bool increase_ref_index = false;
int num_video_frames = -1; int num_video_frames = -1;
std::vector<struct ggml_tensor*> controls = {}; std::vector<ggml_tensor*> controls = {};
float control_strength = 0.f; float control_strength = 0.f;
struct ggml_tensor* vace_context = nullptr; ggml_tensor* vace_context = nullptr;
float vace_strength = 1.f; float vace_strength = 1.f;
std::vector<int> skip_layers = {}; std::vector<int> skip_layers = {};
}; };
struct DiffusionModel { struct DiffusionModel {
virtual std::string get_desc() = 0; virtual std::string get_desc() = 0;
virtual bool compute(int n_threads, virtual bool compute(int n_threads,
DiffusionParams diffusion_params, DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr, ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) = 0; ggml_context* output_ctx = nullptr) = 0;
virtual void alloc_params_buffer() = 0; virtual void alloc_params_buffer() = 0;
virtual void free_params_buffer() = 0; virtual void free_params_buffer() = 0;
virtual void free_compute_buffer() = 0; virtual void free_compute_buffer() = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0; virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) = 0;
virtual size_t get_params_buffer_size() = 0; virtual size_t get_params_buffer_size() = 0;
virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter){}; virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter){};
virtual int64_t get_adm_in_channels() = 0; virtual int64_t get_adm_in_channels() = 0;
virtual void set_flash_attn_enabled(bool enabled) = 0; virtual void set_flash_attention_enabled(bool enabled) = 0;
virtual void set_circular_axes(bool circular_x, bool circular_y) = 0;
}; };
struct UNetModel : public DiffusionModel { struct UNetModel : public DiffusionModel {
@ -67,7 +69,7 @@ struct UNetModel : public DiffusionModel {
unet.free_compute_buffer(); unet.free_compute_buffer();
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
unet.get_param_tensors(tensors, "model.diffusion_model"); unet.get_param_tensors(tensors, "model.diffusion_model");
} }
@ -83,14 +85,18 @@ struct UNetModel : public DiffusionModel {
return unet.unet.adm_in_channels; return unet.unet.adm_in_channels;
} }
void set_flash_attn_enabled(bool enabled) { void set_flash_attention_enabled(bool enabled) {
unet.set_flash_attention_enabled(enabled); unet.set_flash_attention_enabled(enabled);
} }
void set_circular_axes(bool circular_x, bool circular_y) override {
unet.set_circular_axes(circular_x, circular_y);
}
bool compute(int n_threads, bool compute(int n_threads,
DiffusionParams diffusion_params, DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr, ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override { ggml_context* output_ctx = nullptr) override {
return unet.compute(n_threads, return unet.compute(n_threads,
diffusion_params.x, diffusion_params.x,
diffusion_params.timesteps, diffusion_params.timesteps,
@ -128,7 +134,7 @@ struct MMDiTModel : public DiffusionModel {
mmdit.free_compute_buffer(); mmdit.free_compute_buffer();
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
mmdit.get_param_tensors(tensors, "model.diffusion_model"); mmdit.get_param_tensors(tensors, "model.diffusion_model");
} }
@ -144,14 +150,18 @@ struct MMDiTModel : public DiffusionModel {
return 768 + 1280; return 768 + 1280;
} }
void set_flash_attn_enabled(bool enabled) { void set_flash_attention_enabled(bool enabled) {
mmdit.set_flash_attention_enabled(enabled); mmdit.set_flash_attention_enabled(enabled);
} }
void set_circular_axes(bool circular_x, bool circular_y) override {
mmdit.set_circular_axes(circular_x, circular_y);
}
bool compute(int n_threads, bool compute(int n_threads,
DiffusionParams diffusion_params, DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr, ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override { ggml_context* output_ctx = nullptr) override {
return mmdit.compute(n_threads, return mmdit.compute(n_threads,
diffusion_params.x, diffusion_params.x,
diffusion_params.timesteps, diffusion_params.timesteps,
@ -190,7 +200,7 @@ struct FluxModel : public DiffusionModel {
flux.free_compute_buffer(); flux.free_compute_buffer();
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
flux.get_param_tensors(tensors, "model.diffusion_model"); flux.get_param_tensors(tensors, "model.diffusion_model");
} }
@ -206,14 +216,18 @@ struct FluxModel : public DiffusionModel {
return 768; return 768;
} }
void set_flash_attn_enabled(bool enabled) { void set_flash_attention_enabled(bool enabled) {
flux.set_flash_attention_enabled(enabled); flux.set_flash_attention_enabled(enabled);
} }
void set_circular_axes(bool circular_x, bool circular_y) override {
flux.set_circular_axes(circular_x, circular_y);
}
bool compute(int n_threads, bool compute(int n_threads,
DiffusionParams diffusion_params, DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr, ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override { ggml_context* output_ctx = nullptr) override {
return flux.compute(n_threads, return flux.compute(n_threads,
diffusion_params.x, diffusion_params.x,
diffusion_params.timesteps, diffusion_params.timesteps,
@ -229,6 +243,72 @@ struct FluxModel : public DiffusionModel {
} }
}; };
struct AnimaModel : public DiffusionModel {
std::string prefix;
Anima::AnimaRunner anima;
AnimaModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
: prefix(prefix), anima(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
}
std::string get_desc() override {
return anima.get_desc();
}
void alloc_params_buffer() override {
anima.alloc_params_buffer();
}
void free_params_buffer() override {
anima.free_params_buffer();
}
void free_compute_buffer() override {
anima.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
anima.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return anima.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
anima.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attention_enabled(bool enabled) {
anima.set_flash_attention_enabled(enabled);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
anima.set_circular_axes(circular_x, circular_y);
}
bool compute(int n_threads,
DiffusionParams diffusion_params,
ggml_tensor** output = nullptr,
ggml_context* output_ctx = nullptr) override {
return anima.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.c_concat,
diffusion_params.y,
output,
output_ctx);
}
};
struct WanModel : public DiffusionModel { struct WanModel : public DiffusionModel {
std::string prefix; std::string prefix;
WAN::WanRunner wan; WAN::WanRunner wan;
@ -257,7 +337,7 @@ struct WanModel : public DiffusionModel {
wan.free_compute_buffer(); wan.free_compute_buffer();
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
wan.get_param_tensors(tensors, prefix); wan.get_param_tensors(tensors, prefix);
} }
@ -273,14 +353,18 @@ struct WanModel : public DiffusionModel {
return 768; return 768;
} }
void set_flash_attn_enabled(bool enabled) { void set_flash_attention_enabled(bool enabled) {
wan.set_flash_attention_enabled(enabled); wan.set_flash_attention_enabled(enabled);
} }
void set_circular_axes(bool circular_x, bool circular_y) override {
wan.set_circular_axes(circular_x, circular_y);
}
bool compute(int n_threads, bool compute(int n_threads,
DiffusionParams diffusion_params, DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr, ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override { ggml_context* output_ctx = nullptr) override {
return wan.compute(n_threads, return wan.compute(n_threads,
diffusion_params.x, diffusion_params.x,
diffusion_params.timesteps, diffusion_params.timesteps,
@ -303,8 +387,9 @@ struct QwenImageModel : public DiffusionModel {
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {}, const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model", const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_QWEN_IMAGE) SDVersion version = VERSION_QWEN_IMAGE,
: prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) { bool zero_cond_t = false)
: prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version, zero_cond_t) {
} }
std::string get_desc() override { std::string get_desc() override {
@ -323,7 +408,7 @@ struct QwenImageModel : public DiffusionModel {
qwen_image.free_compute_buffer(); qwen_image.free_compute_buffer();
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
qwen_image.get_param_tensors(tensors, prefix); qwen_image.get_param_tensors(tensors, prefix);
} }
@ -339,14 +424,18 @@ struct QwenImageModel : public DiffusionModel {
return 768; return 768;
} }
void set_flash_attn_enabled(bool enabled) { void set_flash_attention_enabled(bool enabled) {
qwen_image.set_flash_attention_enabled(enabled); qwen_image.set_flash_attention_enabled(enabled);
} }
void set_circular_axes(bool circular_x, bool circular_y) override {
qwen_image.set_circular_axes(circular_x, circular_y);
}
bool compute(int n_threads, bool compute(int n_threads,
DiffusionParams diffusion_params, DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr, ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override { ggml_context* output_ctx = nullptr) override {
return qwen_image.compute(n_threads, return qwen_image.compute(n_threads,
diffusion_params.x, diffusion_params.x,
diffusion_params.timesteps, diffusion_params.timesteps,
@ -386,7 +475,7 @@ struct ZImageModel : public DiffusionModel {
z_image.free_compute_buffer(); z_image.free_compute_buffer();
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
z_image.get_param_tensors(tensors, prefix); z_image.get_param_tensors(tensors, prefix);
} }
@ -402,14 +491,18 @@ struct ZImageModel : public DiffusionModel {
return 768; return 768;
} }
void set_flash_attn_enabled(bool enabled) { void set_flash_attention_enabled(bool enabled) {
z_image.set_flash_attention_enabled(enabled); z_image.set_flash_attention_enabled(enabled);
} }
void set_circular_axes(bool circular_x, bool circular_y) override {
z_image.set_circular_axes(circular_x, circular_y);
}
bool compute(int n_threads, bool compute(int n_threads,
DiffusionParams diffusion_params, DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr, ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override { ggml_context* output_ctx = nullptr) override {
return z_image.compute(n_threads, return z_image.compute(n_threads,
diffusion_params.x, diffusion_params.x,
diffusion_params.timesteps, diffusion_params.timesteps,

View File

@ -27,11 +27,11 @@ public:
blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1})); blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
} }
struct ggml_tensor* lrelu(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* lrelu(GGMLRunnerContext* ctx, ggml_tensor* x) {
return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true); return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true);
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [n, num_feat, h, w] // x: [n, num_feat, h, w]
// return: [n, num_feat, h, w] // return: [n, num_feat, h, w]
@ -51,7 +51,7 @@ public:
x_cat = ggml_concat(ctx->ggml_ctx, x_cat, x4, 2); x_cat = ggml_concat(ctx->ggml_ctx, x_cat, x4, 2);
auto x5 = conv5->forward(ctx, x_cat); auto x5 = conv5->forward(ctx, x_cat);
x5 = ggml_add(ctx->ggml_ctx, ggml_scale(ctx->ggml_ctx, x5, 0.2f), x); x5 = ggml_add(ctx->ggml_ctx, ggml_ext_scale(ctx->ggml_ctx, x5, 0.2f), x);
return x5; return x5;
} }
}; };
@ -64,7 +64,7 @@ public:
blocks["rdb3"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch)); blocks["rdb3"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [n, num_feat, h, w] // x: [n, num_feat, h, w]
// return: [n, num_feat, h, w] // return: [n, num_feat, h, w]
@ -76,7 +76,7 @@ public:
out = rdb2->forward(ctx, out); out = rdb2->forward(ctx, out);
out = rdb3->forward(ctx, out); out = rdb3->forward(ctx, out);
out = ggml_add(ctx->ggml_ctx, ggml_scale(ctx->ggml_ctx, out, 0.2f), x); out = ggml_add(ctx->ggml_ctx, ggml_ext_scale(ctx->ggml_ctx, out, 0.2f), x);
return out; return out;
} }
}; };
@ -112,11 +112,11 @@ public:
int get_scale() { return scale; } int get_scale() { return scale; }
int get_num_block() { return num_block; } int get_num_block() { return num_block; }
struct ggml_tensor* lrelu(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* lrelu(GGMLRunnerContext* ctx, ggml_tensor* x) {
return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true); return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true);
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [n, num_in_ch, h, w] // x: [n, num_in_ch, h, w]
// return: [n, num_out_ch, h*scale, w*scale] // return: [n, num_out_ch, h*scale, w*scale]
auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]); auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]);
@ -341,24 +341,24 @@ struct ESRGAN : public GGMLRunner {
return success; return success;
} }
struct ggml_cgraph* build_graph(struct ggml_tensor* x) { ggml_cgraph* build_graph(ggml_tensor* x) {
if (!rrdb_net) if (!rrdb_net)
return nullptr; return nullptr;
constexpr int kGraphNodes = 1 << 16; // 65k constexpr int kGraphNodes = 1 << 16; // 65k
struct ggml_cgraph* gf = new_graph_custom(kGraphNodes); ggml_cgraph* gf = new_graph_custom(kGraphNodes);
x = to_backend(x); x = to_backend(x);
auto runner_ctx = get_context(); auto runner_ctx = get_context();
struct ggml_tensor* out = rrdb_net->forward(&runner_ctx, x); ggml_tensor* out = rrdb_net->forward(&runner_ctx, x);
ggml_build_forward_expand(gf, out); ggml_build_forward_expand(gf, out);
return gf; return gf;
} }
bool compute(const int n_threads, bool compute(const int n_threads,
struct ggml_tensor* x, ggml_tensor* x,
ggml_tensor** output, ggml_tensor** output,
ggml_context* output_ctx = nullptr) { ggml_context* output_ctx = nullptr) {
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x); return build_graph(x);
}; };
return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -151,7 +151,7 @@ private:
} }
if (n_dims > GGML_MAX_DIMS) { if (n_dims > GGML_MAX_DIMS) {
for (int i = GGML_MAX_DIMS; i < n_dims; i++) { for (uint32_t i = GGML_MAX_DIMS; i < n_dims; i++) {
info.shape[GGML_MAX_DIMS - 1] *= info.shape[i]; // stack to last dim; info.shape[GGML_MAX_DIMS - 1] *= info.shape[i]; // stack to last dim;
} }
info.shape.resize(GGML_MAX_DIMS); info.shape.resize(GGML_MAX_DIMS);

View File

@ -1,234 +1,234 @@
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
#include "ggml.h" #include "ggml.h"
const float wan_21_latent_rgb_proj[16][3] = { const float wan_21_latent_rgb_proj[16][3] = {
{0.015123f, -0.148418f, 0.479828f}, {0.015123f, -0.148418f, 0.479828f},
{0.003652f, -0.010680f, -0.037142f}, {0.003652f, -0.010680f, -0.037142f},
{0.212264f, 0.063033f, 0.016779f}, {0.212264f, 0.063033f, 0.016779f},
{0.232999f, 0.406476f, 0.220125f}, {0.232999f, 0.406476f, 0.220125f},
{-0.051864f, -0.082384f, -0.069396f}, {-0.051864f, -0.082384f, -0.069396f},
{0.085005f, -0.161492f, 0.010689f}, {0.085005f, -0.161492f, 0.010689f},
{-0.245369f, -0.506846f, -0.117010f}, {-0.245369f, -0.506846f, -0.117010f},
{-0.151145f, 0.017721f, 0.007207f}, {-0.151145f, 0.017721f, 0.007207f},
{-0.293239f, -0.207936f, -0.421135f}, {-0.293239f, -0.207936f, -0.421135f},
{-0.187721f, 0.050783f, 0.177649f}, {-0.187721f, 0.050783f, 0.177649f},
{-0.013067f, 0.265964f, 0.166578f}, {-0.013067f, 0.265964f, 0.166578f},
{0.028327f, 0.109329f, 0.108642f}, {0.028327f, 0.109329f, 0.108642f},
{-0.205343f, 0.043991f, 0.148914f}, {-0.205343f, 0.043991f, 0.148914f},
{0.014307f, -0.048647f, -0.007219f}, {0.014307f, -0.048647f, -0.007219f},
{0.217150f, 0.053074f, 0.319923f}, {0.217150f, 0.053074f, 0.319923f},
{0.155357f, 0.083156f, 0.064780f}}; {0.155357f, 0.083156f, 0.064780f}};
float wan_21_latent_rgb_bias[3] = {-0.270270f, -0.234976f, -0.456853f}; float wan_21_latent_rgb_bias[3] = {-0.270270f, -0.234976f, -0.456853f};
const float wan_22_latent_rgb_proj[48][3] = { const float wan_22_latent_rgb_proj[48][3] = {
{0.017126f, -0.027230f, -0.019257f}, {0.017126f, -0.027230f, -0.019257f},
{-0.113739f, -0.028715f, -0.022885f}, {-0.113739f, -0.028715f, -0.022885f},
{-0.000106f, 0.021494f, 0.004629f}, {-0.000106f, 0.021494f, 0.004629f},
{-0.013273f, -0.107137f, -0.033638f}, {-0.013273f, -0.107137f, -0.033638f},
{-0.000381f, 0.000279f, 0.025877f}, {-0.000381f, 0.000279f, 0.025877f},
{-0.014216f, -0.003975f, 0.040528f}, {-0.014216f, -0.003975f, 0.040528f},
{0.001638f, -0.000748f, 0.011022f}, {0.001638f, -0.000748f, 0.011022f},
{0.029238f, -0.006697f, 0.035933f}, {0.029238f, -0.006697f, 0.035933f},
{0.021641f, -0.015874f, 0.040531f}, {0.021641f, -0.015874f, 0.040531f},
{-0.101984f, -0.070160f, -0.028855f}, {-0.101984f, -0.070160f, -0.028855f},
{0.033207f, -0.021068f, 0.002663f}, {0.033207f, -0.021068f, 0.002663f},
{-0.104711f, 0.121673f, 0.102981f}, {-0.104711f, 0.121673f, 0.102981f},
{0.082647f, -0.004991f, 0.057237f}, {0.082647f, -0.004991f, 0.057237f},
{-0.027375f, 0.031581f, 0.006868f}, {-0.027375f, 0.031581f, 0.006868f},
{-0.045434f, 0.029444f, 0.019287f}, {-0.045434f, 0.029444f, 0.019287f},
{-0.046572f, -0.012537f, 0.006675f}, {-0.046572f, -0.012537f, 0.006675f},
{0.074709f, 0.033690f, 0.025289f}, {0.074709f, 0.033690f, 0.025289f},
{-0.008251f, -0.002745f, -0.006999f}, {-0.008251f, -0.002745f, -0.006999f},
{0.012685f, -0.061856f, -0.048658f}, {0.012685f, -0.061856f, -0.048658f},
{0.042304f, -0.007039f, 0.000295f}, {0.042304f, -0.007039f, 0.000295f},
{-0.007644f, -0.060843f, -0.033142f}, {-0.007644f, -0.060843f, -0.033142f},
{0.159909f, 0.045628f, 0.367541f}, {0.159909f, 0.045628f, 0.367541f},
{0.095171f, 0.086438f, 0.010271f}, {0.095171f, 0.086438f, 0.010271f},
{0.006812f, 0.019643f, 0.029637f}, {0.006812f, 0.019643f, 0.029637f},
{0.003467f, -0.010705f, 0.014252f}, {0.003467f, -0.010705f, 0.014252f},
{-0.099681f, -0.066272f, -0.006243f}, {-0.099681f, -0.066272f, -0.006243f},
{0.047357f, 0.037040f, 0.000185f}, {0.047357f, 0.037040f, 0.000185f},
{-0.041797f, -0.089225f, -0.032257f}, {-0.041797f, -0.089225f, -0.032257f},
{0.008928f, 0.017028f, 0.018684f}, {0.008928f, 0.017028f, 0.018684f},
{-0.042255f, 0.016045f, 0.006849f}, {-0.042255f, 0.016045f, 0.006849f},
{0.011268f, 0.036462f, 0.037387f}, {0.011268f, 0.036462f, 0.037387f},
{0.011553f, -0.016375f, -0.048589f}, {0.011553f, -0.016375f, -0.048589f},
{0.046266f, -0.027189f, 0.056979f}, {0.046266f, -0.027189f, 0.056979f},
{0.009640f, -0.017576f, 0.030324f}, {0.009640f, -0.017576f, 0.030324f},
{-0.045794f, -0.036083f, -0.010616f}, {-0.045794f, -0.036083f, -0.010616f},
{0.022418f, 0.039783f, -0.032939f}, {0.022418f, 0.039783f, -0.032939f},
{-0.052714f, -0.015525f, 0.007438f}, {-0.052714f, -0.015525f, 0.007438f},
{0.193004f, 0.223541f, 0.264175f}, {0.193004f, 0.223541f, 0.264175f},
{-0.059406f, -0.008188f, 0.022867f}, {-0.059406f, -0.008188f, 0.022867f},
{-0.156742f, -0.263791f, -0.007385f}, {-0.156742f, -0.263791f, -0.007385f},
{-0.015717f, 0.016570f, 0.033969f}, {-0.015717f, 0.016570f, 0.033969f},
{0.037969f, 0.109835f, 0.200449f}, {0.037969f, 0.109835f, 0.200449f},
{-0.000782f, -0.009566f, -0.008058f}, {-0.000782f, -0.009566f, -0.008058f},
{0.010709f, 0.052960f, -0.044195f}, {0.010709f, 0.052960f, -0.044195f},
{0.017271f, 0.045839f, 0.034569f}, {0.017271f, 0.045839f, 0.034569f},
{0.009424f, 0.013088f, -0.001714f}, {0.009424f, 0.013088f, -0.001714f},
{-0.024805f, -0.059378f, -0.033756f}, {-0.024805f, -0.059378f, -0.033756f},
{-0.078293f, 0.029070f, 0.026129f}}; {-0.078293f, 0.029070f, 0.026129f}};
float wan_22_latent_rgb_bias[3] = {0.013160f, -0.096492f, -0.071323f}; float wan_22_latent_rgb_bias[3] = {0.013160f, -0.096492f, -0.071323f};
const float flux_latent_rgb_proj[16][3] = { const float flux_latent_rgb_proj[16][3] = {
{-0.041168f, 0.019917f, 0.097253f}, {-0.041168f, 0.019917f, 0.097253f},
{0.028096f, 0.026730f, 0.129576f}, {0.028096f, 0.026730f, 0.129576f},
{0.065618f, -0.067950f, -0.014651f}, {0.065618f, -0.067950f, -0.014651f},
{-0.012998f, -0.014762f, 0.081251f}, {-0.012998f, -0.014762f, 0.081251f},
{0.078567f, 0.059296f, -0.024687f}, {0.078567f, 0.059296f, -0.024687f},
{-0.015987f, -0.003697f, 0.005012f}, {-0.015987f, -0.003697f, 0.005012f},
{0.033605f, 0.138999f, 0.068517f}, {0.033605f, 0.138999f, 0.068517f},
{-0.024450f, -0.063567f, -0.030101f}, {-0.024450f, -0.063567f, -0.030101f},
{-0.040194f, -0.016710f, 0.127185f}, {-0.040194f, -0.016710f, 0.127185f},
{0.112681f, 0.088764f, -0.041940f}, {0.112681f, 0.088764f, -0.041940f},
{-0.023498f, 0.093664f, 0.025543f}, {-0.023498f, 0.093664f, 0.025543f},
{0.082899f, 0.048320f, 0.007491f}, {0.082899f, 0.048320f, 0.007491f},
{0.075712f, 0.074139f, 0.081965f}, {0.075712f, 0.074139f, 0.081965f},
{-0.143501f, 0.018263f, -0.136138f}, {-0.143501f, 0.018263f, -0.136138f},
{-0.025767f, -0.082035f, -0.040023f}, {-0.025767f, -0.082035f, -0.040023f},
{-0.111849f, -0.055589f, -0.032361f}}; {-0.111849f, -0.055589f, -0.032361f}};
float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f}; float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};
const float flux2_latent_rgb_proj[32][3] = { const float flux2_latent_rgb_proj[32][3] = {
{0.000736f, -0.008385f, -0.019710f}, {0.000736f, -0.008385f, -0.019710f},
{-0.001352f, -0.016392f, 0.020693f}, {-0.001352f, -0.016392f, 0.020693f},
{-0.006376f, 0.002428f, 0.036736f}, {-0.006376f, 0.002428f, 0.036736f},
{0.039384f, 0.074167f, 0.119789f}, {0.039384f, 0.074167f, 0.119789f},
{0.007464f, -0.005705f, -0.004734f}, {0.007464f, -0.005705f, -0.004734f},
{-0.004086f, 0.005287f, -0.000409f}, {-0.004086f, 0.005287f, -0.000409f},
{-0.032835f, 0.050802f, -0.028120f}, {-0.032835f, 0.050802f, -0.028120f},
{-0.003158f, -0.000835f, 0.000406f}, {-0.003158f, -0.000835f, 0.000406f},
{-0.112840f, -0.084337f, -0.023083f}, {-0.112840f, -0.084337f, -0.023083f},
{0.001462f, -0.006656f, 0.000549f}, {0.001462f, -0.006656f, 0.000549f},
{-0.009980f, -0.007480f, 0.009702f}, {-0.009980f, -0.007480f, 0.009702f},
{0.032540f, 0.000214f, -0.061388f}, {0.032540f, 0.000214f, -0.061388f},
{0.011023f, 0.000694f, 0.007143f}, {0.011023f, 0.000694f, 0.007143f},
{-0.001468f, -0.006723f, -0.001678f}, {-0.001468f, -0.006723f, -0.001678f},
{-0.005921f, -0.010320f, -0.003907f}, {-0.005921f, -0.010320f, -0.003907f},
{-0.028434f, 0.027584f, 0.018457f}, {-0.028434f, 0.027584f, 0.018457f},
{0.014349f, 0.011523f, 0.000441f}, {0.014349f, 0.011523f, 0.000441f},
{0.009874f, 0.003081f, 0.001507f}, {0.009874f, 0.003081f, 0.001507f},
{0.002218f, 0.005712f, 0.001563f}, {0.002218f, 0.005712f, 0.001563f},
{0.053010f, -0.019844f, 0.008683f}, {0.053010f, -0.019844f, 0.008683f},
{-0.002507f, 0.005384f, 0.000938f}, {-0.002507f, 0.005384f, 0.000938f},
{-0.002177f, -0.011366f, 0.003559f}, {-0.002177f, -0.011366f, 0.003559f},
{-0.000261f, 0.015121f, -0.003240f}, {-0.000261f, 0.015121f, -0.003240f},
{-0.003944f, -0.002083f, 0.005043f}, {-0.003944f, -0.002083f, 0.005043f},
{-0.009138f, 0.011336f, 0.003781f}, {-0.009138f, 0.011336f, 0.003781f},
{0.011429f, 0.003985f, -0.003855f}, {0.011429f, 0.003985f, -0.003855f},
{0.010518f, -0.005586f, 0.010131f}, {0.010518f, -0.005586f, 0.010131f},
{0.007883f, 0.002912f, -0.001473f}, {0.007883f, 0.002912f, -0.001473f},
{-0.003318f, -0.003160f, 0.003684f}, {-0.003318f, -0.003160f, 0.003684f},
{-0.034560f, -0.008740f, 0.012996f}, {-0.034560f, -0.008740f, 0.012996f},
{0.000166f, 0.001079f, -0.012153f}, {0.000166f, 0.001079f, -0.012153f},
{0.017772f, 0.000937f, -0.011953f}}; {0.017772f, 0.000937f, -0.011953f}};
float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f}; float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f};
// This one was taken straight from // This one was taken straight from
// https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303 // https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
// (MiT Licence) // (MiT Licence)
const float sd3_latent_rgb_proj[16][3] = { const float sd3_latent_rgb_proj[16][3] = {
{-0.0645f, 0.0177f, 0.1052f}, {-0.0645f, 0.0177f, 0.1052f},
{0.0028f, 0.0312f, 0.0650f}, {0.0028f, 0.0312f, 0.0650f},
{0.1848f, 0.0762f, 0.0360f}, {0.1848f, 0.0762f, 0.0360f},
{0.0944f, 0.0360f, 0.0889f}, {0.0944f, 0.0360f, 0.0889f},
{0.0897f, 0.0506f, -0.0364f}, {0.0897f, 0.0506f, -0.0364f},
{-0.0020f, 0.1203f, 0.0284f}, {-0.0020f, 0.1203f, 0.0284f},
{0.0855f, 0.0118f, 0.0283f}, {0.0855f, 0.0118f, 0.0283f},
{-0.0539f, 0.0658f, 0.1047f}, {-0.0539f, 0.0658f, 0.1047f},
{-0.0057f, 0.0116f, 0.0700f}, {-0.0057f, 0.0116f, 0.0700f},
{-0.0412f, 0.0281f, -0.0039f}, {-0.0412f, 0.0281f, -0.0039f},
{0.1106f, 0.1171f, 0.1220f}, {0.1106f, 0.1171f, 0.1220f},
{-0.0248f, 0.0682f, -0.0481f}, {-0.0248f, 0.0682f, -0.0481f},
{0.0815f, 0.0846f, 0.1207f}, {0.0815f, 0.0846f, 0.1207f},
{-0.0120f, -0.0055f, -0.0867f}, {-0.0120f, -0.0055f, -0.0867f},
{-0.0749f, -0.0634f, -0.0456f}, {-0.0749f, -0.0634f, -0.0456f},
{-0.1418f, -0.1457f, -0.1259f}, {-0.1418f, -0.1457f, -0.1259f},
}; };
float sd3_latent_rgb_bias[3] = {0, 0, 0}; float sd3_latent_rgb_bias[3] = {0, 0, 0};
const float sdxl_latent_rgb_proj[4][3] = { const float sdxl_latent_rgb_proj[4][3] = {
{0.258303f, 0.277640f, 0.329699f}, {0.258303f, 0.277640f, 0.329699f},
{-0.299701f, 0.105446f, 0.014194f}, {-0.299701f, 0.105446f, 0.014194f},
{0.050522f, 0.186163f, -0.143257f}, {0.050522f, 0.186163f, -0.143257f},
{-0.211938f, -0.149892f, -0.080036f}}; {-0.211938f, -0.149892f, -0.080036f}};
float sdxl_latent_rgb_bias[3] = {0.144381f, -0.033313f, 0.007061f}; float sdxl_latent_rgb_bias[3] = {0.144381f, -0.033313f, 0.007061f};
const float sd_latent_rgb_proj[4][3] = { const float sd_latent_rgb_proj[4][3] = {
{0.337366f, 0.216344f, 0.257386f}, {0.337366f, 0.216344f, 0.257386f},
{0.165636f, 0.386828f, 0.046994f}, {0.165636f, 0.386828f, 0.046994f},
{-0.267803f, 0.237036f, 0.223517f}, {-0.267803f, 0.237036f, 0.223517f},
{-0.178022f, -0.200862f, -0.678514f}}; {-0.178022f, -0.200862f, -0.678514f}};
float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f}; float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};
void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) { void preview_latent_video(uint8_t* buffer, ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) {
size_t buffer_head = 0; size_t buffer_head = 0;
uint32_t latent_width = latents->ne[0]; uint32_t latent_width = static_cast<uint32_t>(latents->ne[0]);
uint32_t latent_height = latents->ne[1]; uint32_t latent_height = static_cast<uint32_t>(latents->ne[1]);
uint32_t dim = latents->ne[ggml_n_dims(latents) - 1]; uint32_t dim = static_cast<uint32_t>(latents->ne[ggml_n_dims(latents) - 1]);
uint32_t frames = 1; uint32_t frames = 1;
if (ggml_n_dims(latents) == 4) { if (ggml_n_dims(latents) == 4) {
frames = latents->ne[2]; frames = static_cast<uint32_t>(latents->ne[2]);
} }
uint32_t rgb_width = latent_width * patch_size; uint32_t rgb_width = latent_width * patch_size;
uint32_t rgb_height = latent_height * patch_size; uint32_t rgb_height = latent_height * patch_size;
uint32_t unpatched_dim = dim / (patch_size * patch_size); uint32_t unpatched_dim = dim / (patch_size * patch_size);
for (int k = 0; k < frames; k++) { for (uint32_t k = 0; k < frames; k++) {
for (int rgb_x = 0; rgb_x < rgb_width; rgb_x++) { for (uint32_t rgb_x = 0; rgb_x < rgb_width; rgb_x++) {
for (int rgb_y = 0; rgb_y < rgb_height; rgb_y++) { for (uint32_t rgb_y = 0; rgb_y < rgb_height; rgb_y++) {
int latent_x = rgb_x / patch_size; int latent_x = rgb_x / patch_size;
int latent_y = rgb_y / patch_size; int latent_y = rgb_y / patch_size;
int channel_offset = 0; int channel_offset = 0;
if (patch_size > 1) { if (patch_size > 1) {
channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size)); channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size));
} }
size_t latent_id = (latent_x * latents->nb[0] + latent_y * latents->nb[1] + k * latents->nb[2]); size_t latent_id = (latent_x * latents->nb[0] + latent_y * latents->nb[1] + k * latents->nb[2]);
// should be incremented by 1 for each pixel // should be incremented by 1 for each pixel
size_t pixel_id = k * rgb_width * rgb_height + rgb_y * rgb_width + rgb_x; size_t pixel_id = k * rgb_width * rgb_height + rgb_y * rgb_width + rgb_x;
float r = 0, g = 0, b = 0; float r = 0, g = 0, b = 0;
if (latent_rgb_proj != nullptr) { if (latent_rgb_proj != nullptr) {
for (int d = 0; d < unpatched_dim; d++) { for (uint32_t d = 0; d < unpatched_dim; d++) {
float value = *(float*)((char*)latents->data + latent_id + (d * patch_size * patch_size + channel_offset) * latents->nb[ggml_n_dims(latents) - 1]); float value = *(float*)((char*)latents->data + latent_id + (d * patch_size * patch_size + channel_offset) * latents->nb[ggml_n_dims(latents) - 1]);
r += value * latent_rgb_proj[d][0]; r += value * latent_rgb_proj[d][0];
g += value * latent_rgb_proj[d][1]; g += value * latent_rgb_proj[d][1];
b += value * latent_rgb_proj[d][2]; b += value * latent_rgb_proj[d][2];
} }
} else { } else {
// interpret first 3 channels as RGB // interpret first 3 channels as RGB
r = *(float*)((char*)latents->data + latent_id + 0 * latents->nb[ggml_n_dims(latents) - 1]); r = *(float*)((char*)latents->data + latent_id + 0 * latents->nb[ggml_n_dims(latents) - 1]);
g = *(float*)((char*)latents->data + latent_id + 1 * latents->nb[ggml_n_dims(latents) - 1]); g = *(float*)((char*)latents->data + latent_id + 1 * latents->nb[ggml_n_dims(latents) - 1]);
b = *(float*)((char*)latents->data + latent_id + 2 * latents->nb[ggml_n_dims(latents) - 1]); b = *(float*)((char*)latents->data + latent_id + 2 * latents->nb[ggml_n_dims(latents) - 1]);
} }
if (latent_rgb_bias != nullptr) { if (latent_rgb_bias != nullptr) {
// bias // bias
r += latent_rgb_bias[0]; r += latent_rgb_bias[0];
g += latent_rgb_bias[1]; g += latent_rgb_bias[1];
b += latent_rgb_bias[2]; b += latent_rgb_bias[2];
} }
// change range // change range
r = r * .5f + .5f; r = r * .5f + .5f;
g = g * .5f + .5f; g = g * .5f + .5f;
b = b * .5f + .5f; b = b * .5f + .5f;
// clamp rgb values to [0,1] range // clamp rgb values to [0,1] range
r = r >= 0 ? r <= 1 ? r : 1 : 0; r = r >= 0 ? r <= 1 ? r : 1 : 0;
g = g >= 0 ? g <= 1 ? g : 1 : 0; g = g >= 0 ? g <= 1 ? g : 1 : 0;
b = b >= 0 ? b <= 1 ? b : 1 : 0; b = b >= 0 ? b <= 1 ? b : 1 : 0;
buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255); buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255);
buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255); buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255);
buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255); buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255);
} }
} }
} }
} }

View File

@ -19,6 +19,7 @@
#include "json.hpp" #include "json.hpp"
#include "rope.hpp" #include "rope.hpp"
#include "tokenize_util.h" #include "tokenize_util.h"
#include "vocab/vocab.h"
namespace LLM { namespace LLM {
constexpr int LLM_GRAPH_SIZE = 10240; constexpr int LLM_GRAPH_SIZE = 10240;
@ -195,14 +196,14 @@ namespace LLM {
tokens.insert(tokens.begin(), BOS_TOKEN_ID); tokens.insert(tokens.begin(), BOS_TOKEN_ID);
} }
if (max_length > 0 && padding) { if (max_length > 0 && padding) {
size_t n = std::ceil(tokens.size() * 1.0 / max_length); size_t n = static_cast<size_t>(std::ceil(tokens.size() * 1.f / max_length));
if (n == 0) { if (n == 0) {
n = 1; n = 1;
} }
size_t length = max_length * n; size_t length = max_length * n;
LOG_DEBUG("token length: %llu", length); LOG_DEBUG("token length: %llu", length);
tokens.insert(tokens.end(), length - tokens.size(), PAD_TOKEN_ID); tokens.insert(tokens.end(), length - tokens.size(), PAD_TOKEN_ID);
weights.insert(weights.end(), length - weights.size(), 1.0); weights.insert(weights.end(), length - weights.size(), 1.f);
} }
} }
@ -365,7 +366,7 @@ namespace LLM {
if (merges_utf8_str.size() > 0) { if (merges_utf8_str.size() > 0) {
load_from_merges(merges_utf8_str); load_from_merges(merges_utf8_str);
} else { } else {
load_from_merges(ModelLoader::load_qwen2_merges()); load_from_merges(load_qwen2_merges());
} }
} }
}; };
@ -377,7 +378,7 @@ namespace LLM {
try { try {
vocab = nlohmann::json::parse(vocab_utf8_str); vocab = nlohmann::json::parse(vocab_utf8_str);
} catch (const nlohmann::json::parse_error& e) { } catch (const nlohmann::json::parse_error&) {
GGML_ABORT("invalid vocab json str"); GGML_ABORT("invalid vocab json str");
} }
for (const auto& [key, value] : vocab.items()) { for (const auto& [key, value] : vocab.items()) {
@ -386,7 +387,7 @@ namespace LLM {
encoder[token] = i; encoder[token] = i;
decoder[i] = token; decoder[i] = token;
} }
encoder_len = vocab.size(); encoder_len = static_cast<int>(vocab.size());
LOG_DEBUG("vocab size: %d", encoder_len); LOG_DEBUG("vocab size: %d", encoder_len);
auto byte_unicode_pairs = bytes_to_unicode(); auto byte_unicode_pairs = bytes_to_unicode();
@ -466,7 +467,7 @@ namespace LLM {
if (merges_utf8_str.size() > 0 && vocab_utf8_str.size() > 0) { if (merges_utf8_str.size() > 0 && vocab_utf8_str.size() > 0) {
load_from_merges(merges_utf8_str, vocab_utf8_str); load_from_merges(merges_utf8_str, vocab_utf8_str);
} else { } else {
load_from_merges(ModelLoader::load_mistral_merges(), ModelLoader::load_mistral_vocab_json()); load_from_merges(load_mistral_merges(), load_mistral_vocab_json());
} }
} }
}; };
@ -485,16 +486,16 @@ namespace LLM {
}; };
struct LLMVisionParams { struct LLMVisionParams {
int64_t num_layers = 32; int num_layers = 32;
int64_t hidden_size = 1280; int64_t hidden_size = 1280;
int64_t intermediate_size = 3420; int64_t intermediate_size = 3420;
int64_t num_heads = 16; int num_heads = 16;
int64_t in_channels = 3; int64_t in_channels = 3;
int64_t out_hidden_size = 3584; int64_t out_hidden_size = 3584;
int64_t temporal_patch_size = 2; int temporal_patch_size = 2;
int64_t patch_size = 14; int patch_size = 14;
int64_t spatial_merge_size = 2; int spatial_merge_size = 2;
int64_t window_size = 112; int window_size = 112;
std::set<int> fullatt_block_indexes = {7, 15, 23, 31}; std::set<int> fullatt_block_indexes = {7, 15, 23, 31};
}; };
@ -503,9 +504,9 @@ namespace LLM {
int64_t num_layers = 28; int64_t num_layers = 28;
int64_t hidden_size = 3584; int64_t hidden_size = 3584;
int64_t intermediate_size = 18944; int64_t intermediate_size = 18944;
int64_t num_heads = 28; int num_heads = 28;
int64_t num_kv_heads = 4; int num_kv_heads = 4;
int64_t head_dim = 128; int head_dim = 128;
bool qkv_bias = true; bool qkv_bias = true;
bool qk_norm = false; bool qk_norm = false;
int64_t vocab_size = 152064; int64_t vocab_size = 152064;
@ -521,7 +522,7 @@ namespace LLM {
blocks["down_proj"] = std::shared_ptr<GGMLBlock>(new Linear(intermediate_size, hidden_size, bias)); blocks["down_proj"] = std::shared_ptr<GGMLBlock>(new Linear(intermediate_size, hidden_size, bias));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["gate_proj"]); auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["gate_proj"]);
auto up_proj = std::dynamic_pointer_cast<Linear>(blocks["up_proj"]); auto up_proj = std::dynamic_pointer_cast<Linear>(blocks["up_proj"]);
@ -581,7 +582,7 @@ namespace LLM {
} }
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N*grid_t*grid_h*grid_w, in_channels, temporal_patch_size*patch_size*patch_size] // x: [N*grid_t*grid_h*grid_w, in_channels, temporal_patch_size*patch_size*patch_size]
// return: [N*grid_t*grid_h*grid_w, embed_dim] // return: [N*grid_t*grid_h*grid_w, embed_dim]
x = ggml_reshape_4d(ctx->ggml_ctx, x = ggml_reshape_4d(ctx->ggml_ctx,
@ -630,7 +631,7 @@ namespace LLM {
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, dim)); blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, dim));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto ln_q = std::dynamic_pointer_cast<RMSNorm>(blocks["ln_q"]); auto ln_q = std::dynamic_pointer_cast<RMSNorm>(blocks["ln_q"]);
auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]); auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]); auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
@ -638,7 +639,7 @@ namespace LLM {
x = ln_q->forward(ctx, x); x = ln_q->forward(ctx, x);
x = ggml_reshape_2d(ctx->ggml_ctx, x, hidden_size, ggml_nelements(x) / hidden_size); x = ggml_reshape_2d(ctx->ggml_ctx, x, hidden_size, ggml_nelements(x) / hidden_size);
x = mlp_0->forward(ctx, x); x = mlp_0->forward(ctx, x);
x = ggml_gelu(ctx->ggml_ctx, x); x = ggml_ext_gelu(ctx->ggml_ctx, x);
x = mlp_2->forward(ctx, x); x = mlp_2->forward(ctx, x);
return x; return x;
} }
@ -647,15 +648,15 @@ namespace LLM {
struct VisionAttention : public GGMLBlock { struct VisionAttention : public GGMLBlock {
protected: protected:
bool llama_cpp_style; bool llama_cpp_style;
int64_t head_dim; int head_dim;
int64_t num_heads; int num_heads;
public: public:
VisionAttention(bool llama_cpp_style, VisionAttention(bool llama_cpp_style,
int64_t hidden_size, int64_t hidden_size,
int64_t num_heads) int num_heads)
: llama_cpp_style(llama_cpp_style), num_heads(num_heads) { : llama_cpp_style(llama_cpp_style), num_heads(num_heads) {
head_dim = hidden_size / num_heads; head_dim = static_cast<int>(hidden_size / num_heads);
GGML_ASSERT(num_heads * head_dim == hidden_size); GGML_ASSERT(num_heads * head_dim == hidden_size);
if (llama_cpp_style) { if (llama_cpp_style) {
blocks["q_proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size)); blocks["q_proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
@ -667,10 +668,10 @@ namespace LLM {
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size)); blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* pe, ggml_tensor* pe,
struct ggml_tensor* mask = nullptr) { ggml_tensor* mask = nullptr) {
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
int64_t n_token = x->ne[1]; int64_t n_token = x->ne[1];
int64_t N = x->ne[2]; int64_t N = x->ne[2];
@ -709,7 +710,7 @@ namespace LLM {
VisionBlock(bool llama_cpp_style, VisionBlock(bool llama_cpp_style,
int64_t hidden_size, int64_t hidden_size,
int64_t intermediate_size, int64_t intermediate_size,
int64_t num_heads, int num_heads,
float eps = 1e-6f) { float eps = 1e-6f) {
blocks["attn"] = std::shared_ptr<GGMLBlock>(new VisionAttention(llama_cpp_style, hidden_size, num_heads)); blocks["attn"] = std::shared_ptr<GGMLBlock>(new VisionAttention(llama_cpp_style, hidden_size, num_heads));
blocks["mlp"] = std::shared_ptr<GGMLBlock>(new MLP(hidden_size, intermediate_size, true)); blocks["mlp"] = std::shared_ptr<GGMLBlock>(new MLP(hidden_size, intermediate_size, true));
@ -717,10 +718,10 @@ namespace LLM {
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new RMSNorm(hidden_size, eps)); blocks["norm2"] = std::shared_ptr<GGMLBlock>(new RMSNorm(hidden_size, eps));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* pe, ggml_tensor* pe,
struct ggml_tensor* mask = nullptr) { ggml_tensor* mask = nullptr) {
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
auto attn = std::dynamic_pointer_cast<VisionAttention>(blocks["attn"]); auto attn = std::dynamic_pointer_cast<VisionAttention>(blocks["attn"]);
auto mlp = std::dynamic_pointer_cast<MLP>(blocks["mlp"]); auto mlp = std::dynamic_pointer_cast<MLP>(blocks["mlp"]);
@ -743,22 +744,22 @@ namespace LLM {
struct VisionModel : public GGMLBlock { struct VisionModel : public GGMLBlock {
protected: protected:
int64_t num_layers; int num_layers;
int64_t spatial_merge_size; int spatial_merge_size;
std::set<int> fullatt_block_indexes; std::set<int> fullatt_block_indexes;
public: public:
VisionModel(bool llama_cpp_style, VisionModel(bool llama_cpp_style,
int64_t num_layers, int num_layers,
int64_t in_channels, int64_t in_channels,
int64_t hidden_size, int64_t hidden_size,
int64_t out_hidden_size, int64_t out_hidden_size,
int64_t intermediate_size, int64_t intermediate_size,
int64_t num_heads, int num_heads,
int64_t spatial_merge_size, int spatial_merge_size,
int64_t patch_size, int patch_size,
int64_t temporal_patch_size, int temporal_patch_size,
int64_t window_size, int window_size,
std::set<int> fullatt_block_indexes = {7, 15, 23, 31}, std::set<int> fullatt_block_indexes = {7, 15, 23, 31},
float eps = 1e-6f) float eps = 1e-6f)
: num_layers(num_layers), fullatt_block_indexes(std::move(fullatt_block_indexes)), spatial_merge_size(spatial_merge_size) { : num_layers(num_layers), fullatt_block_indexes(std::move(fullatt_block_indexes)), spatial_merge_size(spatial_merge_size) {
@ -777,12 +778,12 @@ namespace LLM {
blocks["merger"] = std::shared_ptr<GGMLBlock>(new PatchMerger(out_hidden_size, hidden_size, spatial_merge_size)); blocks["merger"] = std::shared_ptr<GGMLBlock>(new PatchMerger(out_hidden_size, hidden_size, spatial_merge_size));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* pixel_values, ggml_tensor* pixel_values,
struct ggml_tensor* pe, ggml_tensor* pe,
struct ggml_tensor* window_index, ggml_tensor* window_index,
struct ggml_tensor* window_inverse_index, ggml_tensor* window_inverse_index,
struct ggml_tensor* window_mask) { ggml_tensor* window_mask) {
// pixel_values: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw] // pixel_values: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw]
// window_index: [grid_t*(H/mh/ph)*(W/mw/pw)] // window_index: [grid_t*(H/mh/ph)*(W/mw/pw)]
// window_inverse_index: [grid_t*(H/mh/ph)*(W/mw/pw)] // window_inverse_index: [grid_t*(H/mh/ph)*(W/mw/pw)]
@ -817,7 +818,7 @@ namespace LLM {
struct Attention : public GGMLBlock { struct Attention : public GGMLBlock {
protected: protected:
LLMArch arch; LLMArch arch;
int64_t head_dim; int head_dim;
int64_t num_heads; int64_t num_heads;
int64_t num_kv_heads; int64_t num_kv_heads;
bool qk_norm; bool qk_norm;
@ -835,9 +836,10 @@ namespace LLM {
} }
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* input_pos) { ggml_tensor* input_pos,
ggml_tensor* attention_mask = nullptr) {
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
int64_t n_token = x->ne[1]; int64_t n_token = x->ne[1];
int64_t N = x->ne[2]; int64_t N = x->ne[2];
@ -880,7 +882,7 @@ namespace LLM {
k = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, k, 0, 2, 1, 3)); // [N, num_kv_heads, n_token, head_dim] k = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, k, 0, 2, 1, 3)); // [N, num_kv_heads, n_token, head_dim]
k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]); // [N*num_kv_heads, n_token, head_dim] k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]); // [N*num_kv_heads, n_token, head_dim]
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, nullptr, true, true, false); // [N, n_token, hidden_size] x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, false); // [N, n_token, hidden_size]
x = out_proj->forward(ctx, x); // [N, n_token, hidden_size] x = out_proj->forward(ctx, x); // [N, n_token, hidden_size]
return x; return x;
@ -896,9 +898,10 @@ namespace LLM {
blocks["post_attention_layernorm"] = std::make_shared<RMSNorm>(params.hidden_size, params.rms_norm_eps); blocks["post_attention_layernorm"] = std::make_shared<RMSNorm>(params.hidden_size, params.rms_norm_eps);
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* input_pos) { ggml_tensor* input_pos,
ggml_tensor* attention_mask = nullptr) {
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
auto self_attn = std::dynamic_pointer_cast<Attention>(blocks["self_attn"]); auto self_attn = std::dynamic_pointer_cast<Attention>(blocks["self_attn"]);
auto mlp = std::dynamic_pointer_cast<MLP>(blocks["mlp"]); auto mlp = std::dynamic_pointer_cast<MLP>(blocks["mlp"]);
@ -907,7 +910,7 @@ namespace LLM {
auto residual = x; auto residual = x;
x = input_layernorm->forward(ctx, x); x = input_layernorm->forward(ctx, x);
x = self_attn->forward(ctx, x, input_pos); x = self_attn->forward(ctx, x, input_pos, attention_mask);
x = ggml_add_inplace(ctx->ggml_ctx, x, residual); x = ggml_add_inplace(ctx->ggml_ctx, x, residual);
residual = x; residual = x;
@ -933,11 +936,12 @@ namespace LLM {
blocks["norm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(params.hidden_size, params.rms_norm_eps)); blocks["norm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(params.hidden_size, params.rms_norm_eps));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids, ggml_tensor* input_ids,
struct ggml_tensor* input_pos, ggml_tensor* input_pos,
std::vector<std::pair<int, ggml_tensor*>> image_embeds, ggml_tensor* attention_mask,
std::set<int> out_layers) { std::vector<std::pair<int, ggml_tensor*>> image_embeds,
std::set<int> out_layers) {
// input_ids: [N, n_token] // input_ids: [N, n_token]
// return: [N, n_token, hidden_size] // return: [N, n_token, hidden_size]
@ -990,7 +994,7 @@ namespace LLM {
for (int i = 0; i < num_layers; i++) { for (int i = 0; i < num_layers; i++) {
auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["layers." + std::to_string(i)]); auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["layers." + std::to_string(i)]);
x = block->forward(ctx, x, input_pos); x = block->forward(ctx, x, input_pos, attention_mask);
if (out_layers.find(i + 1) != out_layers.end()) { if (out_layers.find(i + 1) != out_layers.end()) {
intermediate_outputs.push_back(x); intermediate_outputs.push_back(x);
} }
@ -1033,24 +1037,25 @@ namespace LLM {
} }
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids, ggml_tensor* input_ids,
struct ggml_tensor* input_pos, ggml_tensor* input_pos,
std::vector<std::pair<int, ggml_tensor*>> image_embeds, ggml_tensor* attention_mask,
std::set<int> out_layers) { std::vector<std::pair<int, ggml_tensor*>> image_embeds,
std::set<int> out_layers) {
// input_ids: [N, n_token] // input_ids: [N, n_token]
auto model = std::dynamic_pointer_cast<TextModel>(blocks["model"]); auto model = std::dynamic_pointer_cast<TextModel>(blocks["model"]);
auto x = model->forward(ctx, input_ids, input_pos, image_embeds, out_layers); auto x = model->forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);
return x; return x;
} }
struct ggml_tensor* vision_forward(GGMLRunnerContext* ctx, ggml_tensor* vision_forward(GGMLRunnerContext* ctx,
struct ggml_tensor* pixel_values, ggml_tensor* pixel_values,
struct ggml_tensor* pe, ggml_tensor* pe,
struct ggml_tensor* window_index, ggml_tensor* window_index,
struct ggml_tensor* window_inverse_index, ggml_tensor* window_inverse_index,
struct ggml_tensor* window_mask) { ggml_tensor* window_mask) {
GGML_ASSERT(enable_vision); GGML_ASSERT(enable_vision);
auto vision_model = std::dynamic_pointer_cast<VisionModel>(blocks["visual"]); auto vision_model = std::dynamic_pointer_cast<VisionModel>(blocks["visual"]);
return vision_model->forward(ctx, pixel_values, pe, window_index, window_inverse_index, window_mask); return vision_model->forward(ctx, pixel_values, pe, window_index, window_inverse_index, window_mask);
@ -1063,6 +1068,7 @@ namespace LLM {
LLM model; LLM model;
std::vector<int> input_pos_vec; std::vector<int> input_pos_vec;
std::vector<float> attention_mask_vec;
std::vector<float> window_mask_vec; std::vector<float> window_mask_vec;
std::vector<int> window_index_vec; std::vector<int> window_index_vec;
std::vector<int> window_inverse_index_vec; std::vector<int> window_inverse_index_vec;
@ -1150,33 +1156,35 @@ namespace LLM {
return llm_arch_to_str[static_cast<int>(params.arch)]; return llm_arch_to_str[static_cast<int>(params.arch)];
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
model.get_param_tensors(tensors, prefix); model.get_param_tensors(tensors, prefix);
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids, ggml_tensor* input_ids,
struct ggml_tensor* input_pos, ggml_tensor* input_pos,
std::vector<std::pair<int, ggml_tensor*>> image_embeds, ggml_tensor* attention_mask,
std::set<int> out_layers) { std::vector<std::pair<int, ggml_tensor*>> image_embeds,
auto hidden_states = model.forward(ctx, input_ids, input_pos, image_embeds, out_layers); // [N, n_token, hidden_size] std::set<int> out_layers) {
auto hidden_states = model.forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers); // [N, n_token, hidden_size]
return hidden_states; return hidden_states;
} }
struct ggml_tensor* vision_forward(GGMLRunnerContext* ctx, ggml_tensor* vision_forward(GGMLRunnerContext* ctx,
struct ggml_tensor* pixel_values, ggml_tensor* pixel_values,
struct ggml_tensor* input_pos, ggml_tensor* input_pos,
struct ggml_tensor* window_index, ggml_tensor* window_index,
struct ggml_tensor* window_inverse_index, ggml_tensor* window_inverse_index,
struct ggml_tensor* window_mask) { ggml_tensor* window_mask) {
auto hidden_states = model.vision_forward(ctx, pixel_values, input_pos, window_index, window_inverse_index, window_mask); auto hidden_states = model.vision_forward(ctx, pixel_values, input_pos, window_index, window_inverse_index, window_mask);
return hidden_states; return hidden_states;
} }
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, ggml_cgraph* build_graph(ggml_tensor* input_ids,
std::vector<std::pair<int, ggml_tensor*>> image_embeds, ggml_tensor* attention_mask,
std::set<int> out_layers) { std::vector<std::pair<int, ggml_tensor*>> image_embeds,
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); std::set<int> out_layers) {
ggml_cgraph* gf = ggml_new_graph(compute_ctx);
input_ids = to_backend(input_ids); input_ids = to_backend(input_ids);
@ -1205,9 +1213,26 @@ namespace LLM {
input_pos_vec.size()); input_pos_vec.size());
set_backend_tensor_data(input_pos, input_pos_vec.data()); set_backend_tensor_data(input_pos, input_pos_vec.data());
if (attention_mask != nullptr) {
attention_mask = to_backend(attention_mask);
} else {
attention_mask_vec.resize(n_tokens * n_tokens);
for (int i0 = 0; i0 < n_tokens; i0++) {
for (int i1 = 0; i1 < n_tokens; i1++) {
float value = 0.f;
if (i0 > i1) {
value = -INFINITY;
}
attention_mask_vec[i1 * n_tokens + i0] = value;
}
}
attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, n_tokens, n_tokens);
set_backend_tensor_data(attention_mask, attention_mask_vec.data());
}
auto runner_ctx = get_context(); auto runner_ctx = get_context();
struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, image_embeds, out_layers); ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers);
ggml_build_forward_expand(gf, hidden_states); ggml_build_forward_expand(gf, hidden_states);
@ -1215,27 +1240,28 @@ namespace LLM {
} }
bool compute(const int n_threads, bool compute(const int n_threads,
struct ggml_tensor* input_ids, ggml_tensor* input_ids,
ggml_tensor* attention_mask,
std::vector<std::pair<int, ggml_tensor*>> image_embeds, std::vector<std::pair<int, ggml_tensor*>> image_embeds,
std::set<int> out_layers, std::set<int> out_layers,
ggml_tensor** output, ggml_tensor** output,
ggml_context* output_ctx = nullptr) { ggml_context* output_ctx = nullptr) {
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(input_ids, image_embeds, out_layers); return build_graph(input_ids, attention_mask, image_embeds, out_layers);
}; };
return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
} }
int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) { int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) {
int grid_t = 1; int64_t grid_t = 1;
int grid_h = h / params.vision.patch_size; int64_t grid_h = h / params.vision.patch_size;
int grid_w = w / params.vision.patch_size; int64_t grid_w = w / params.vision.patch_size;
int llm_grid_h = grid_h / params.vision.spatial_merge_size; int64_t llm_grid_h = grid_h / params.vision.spatial_merge_size;
int llm_grid_w = grid_w / params.vision.spatial_merge_size; int64_t llm_grid_w = grid_w / params.vision.spatial_merge_size;
return grid_t * grid_h * grid_w; return grid_t * grid_h * grid_w;
} }
struct ggml_tensor* process_image(struct ggml_context* ctx, struct ggml_tensor* image) { ggml_tensor* process_image(ggml_context* ctx, ggml_tensor* image) {
// image: [C, H, W] // image: [C, H, W]
// return: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw], grid_t == 1 // return: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw], grid_t == 1
int64_t C = image->ne[2]; int64_t C = image->ne[2];
@ -1262,15 +1288,15 @@ namespace LLM {
return image; return image;
} }
struct ggml_cgraph* build_encode_image_graph(struct ggml_tensor* image) { ggml_cgraph* build_encode_image_graph(ggml_tensor* image) {
struct ggml_cgraph* gf = new_graph_custom(LLM_GRAPH_SIZE); ggml_cgraph* gf = new_graph_custom(LLM_GRAPH_SIZE);
GGML_ASSERT(image->ne[1] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); GGML_ASSERT(image->ne[1] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0);
GGML_ASSERT(image->ne[0] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); GGML_ASSERT(image->ne[0] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0);
int grid_t = 1; int grid_t = 1;
int grid_h = image->ne[1] / params.vision.patch_size; int grid_h = static_cast<int>(image->ne[1]) / params.vision.patch_size;
int grid_w = image->ne[0] / params.vision.patch_size; int grid_w = static_cast<int>(image->ne[0]) / params.vision.patch_size;
int llm_grid_h = grid_h / params.vision.spatial_merge_size; int llm_grid_h = grid_h / params.vision.spatial_merge_size;
int llm_grid_w = grid_w / params.vision.spatial_merge_size; int llm_grid_w = grid_w / params.vision.spatial_merge_size;
int vit_merger_window_size = params.vision.window_size / params.vision.patch_size / params.vision.spatial_merge_size; int vit_merger_window_size = params.vision.window_size / params.vision.patch_size / params.vision.spatial_merge_size;
@ -1358,14 +1384,14 @@ namespace LLM {
set_backend_tensor_data(window_mask, window_mask_vec.data()); set_backend_tensor_data(window_mask, window_mask_vec.data());
// pe // pe
int head_dim = params.vision.hidden_size / params.vision.num_heads; int head_dim = static_cast<int>(params.vision.hidden_size / params.vision.num_heads);
pe_vec = Rope::gen_qwen2vl_pe(grid_h, pe_vec = Rope::gen_qwen2vl_pe(grid_h,
grid_w, grid_w,
params.vision.spatial_merge_size, params.vision.spatial_merge_size,
window_inverse_index_vec, window_inverse_index_vec,
10000.f, 10000,
{head_dim / 2, head_dim / 2}); {head_dim / 2, head_dim / 2});
int pos_len = pe_vec.size() / head_dim / 2; int pos_len = static_cast<int>(pe_vec.size() / head_dim / 2);
// LOG_DEBUG("pos_len %d", pos_len); // LOG_DEBUG("pos_len %d", pos_len);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len); auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len);
// pe->data = pe_vec.data(); // pe->data = pe_vec.data();
@ -1373,23 +1399,23 @@ namespace LLM {
// pe->data = nullptr; // pe->data = nullptr;
set_backend_tensor_data(pe, pe_vec.data()); set_backend_tensor_data(pe, pe_vec.data());
auto runnter_ctx = get_context(); auto runnter_ctx = get_context();
struct ggml_tensor* hidden_states = vision_forward(&runnter_ctx, ggml_tensor* hidden_states = vision_forward(&runnter_ctx,
pixel_values, pixel_values,
pe, pe,
window_index, window_index,
window_inverse_index, window_inverse_index,
window_mask); window_mask);
ggml_build_forward_expand(gf, hidden_states); ggml_build_forward_expand(gf, hidden_states);
return gf; return gf;
} }
void encode_image(const int n_threads, void encode_image(const int n_threads,
struct ggml_tensor* image, ggml_tensor* image,
ggml_tensor** output, ggml_tensor** output,
ggml_context* output_ctx = nullptr) { ggml_context* output_ctx = nullptr) {
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> ggml_cgraph* {
return build_encode_image_graph(image); return build_encode_image_graph(image);
}; };
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
@ -1414,7 +1440,7 @@ namespace LLM {
} }
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
model.get_param_tensors(tensors, prefix); model.get_param_tensors(tensors, prefix);
} }
@ -1466,12 +1492,12 @@ namespace LLM {
} }
void test() { void test() {
struct ggml_init_params params; ggml_init_params params;
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB
params.mem_buffer = nullptr; params.mem_buffer = nullptr;
params.no_alloc = false; params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params); ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != nullptr); GGML_ASSERT(work_ctx != nullptr);
bool test_mistral = false; bool test_mistral = false;
bool test_qwen3 = true; bool test_qwen3 = true;
@ -1483,15 +1509,15 @@ namespace LLM {
{ {
auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin"); auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin");
print_ggml_tensor(image, false, "image"); print_ggml_tensor(image, false, "image");
struct ggml_tensor* out = nullptr; ggml_tensor* out = nullptr;
int t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
model.encode_image(8, image, &out, work_ctx); model.encode_image(8, image, &out, work_ctx);
int t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
print_ggml_tensor(out, false, "image_embed"); print_ggml_tensor(out, false, "image_embed");
image_embed = out; image_embed = out;
LOG_DEBUG("llm encode_image test done in %dms", t1 - t0); LOG_DEBUG("llm encode_image test done in %lldms", t1 - t0);
} }
std::string placeholder = "<|image_pad|>"; std::string placeholder = "<|image_pad|>";
@ -1521,32 +1547,32 @@ namespace LLM {
printf("%d ", token); printf("%d ", token);
} }
printf("\n"); printf("\n");
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
struct ggml_tensor* out = nullptr; ggml_tensor* out = nullptr;
int t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
model.compute(8, input_ids, image_embeds, {}, &out, work_ctx); model.compute(8, input_ids, nullptr, image_embeds, {}, &out, work_ctx);
int t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
print_ggml_tensor(out); print_ggml_tensor(out);
LOG_DEBUG("llm test done in %dms", t1 - t0); LOG_DEBUG("llm test done in %lldms", t1 - t0);
} else if (test_vit) { } else if (test_vit) {
// auto image = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 280, 280, 3); // auto image = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 280, 280, 3);
// ggml_set_f32(image, 0.f); // ggml_set_f32(image, 0.f);
auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin"); auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin");
print_ggml_tensor(image, false, "image"); print_ggml_tensor(image, false, "image");
struct ggml_tensor* out = nullptr; ggml_tensor* out = nullptr;
int t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
model.encode_image(8, image, &out, work_ctx); model.encode_image(8, image, &out, work_ctx);
int t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
print_ggml_tensor(out, false, "out"); print_ggml_tensor(out, false, "out");
// auto ref_out = load_tensor_from_file(work_ctx, "qwen2vl.bin"); // auto ref_out = load_tensor_from_file(work_ctx, "qwen2vl.bin");
// ggml_ext_tensor_diff(ref_out, out, 0.01f); // ggml_ext_tensor_diff(ref_out, out, 0.01f);
LOG_DEBUG("llm test done in %dms", t1 - t0); LOG_DEBUG("llm test done in %lldms", t1 - t0);
} else if (test_mistral) { } else if (test_mistral) {
std::pair<int, int> prompt_attn_range; std::pair<int, int> prompt_attn_range;
std::string text = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]"; std::string text = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
@ -1561,15 +1587,15 @@ namespace LLM {
printf("%d ", token); printf("%d ", token);
} }
printf("\n"); printf("\n");
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
struct ggml_tensor* out = nullptr; ggml_tensor* out = nullptr;
int t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
model.compute(8, input_ids, {}, {10, 20, 30}, &out, work_ctx); model.compute(8, input_ids, nullptr, {}, {10, 20, 30}, &out, work_ctx);
int t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
print_ggml_tensor(out); print_ggml_tensor(out);
LOG_DEBUG("llm test done in %dms", t1 - t0); LOG_DEBUG("llm test done in %lldms", t1 - t0);
} else if (test_qwen3) { } else if (test_qwen3) {
std::pair<int, int> prompt_attn_range; std::pair<int, int> prompt_attn_range;
std::string text = "<|im_start|>user\n"; std::string text = "<|im_start|>user\n";
@ -1584,15 +1610,15 @@ namespace LLM {
printf("%d ", token); printf("%d ", token);
} }
printf("\n"); printf("\n");
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
struct ggml_tensor* out = nullptr; ggml_tensor* out = nullptr;
int t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
model.compute(8, input_ids, {}, {35}, &out, work_ctx); model.compute(8, input_ids, nullptr, {}, {35}, &out, work_ctx);
int t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
print_ggml_tensor(out); print_ggml_tensor(out);
LOG_DEBUG("llm test done in %dms", t1 - t0); LOG_DEBUG("llm test done in %lldms", t1 - t0);
} else { } else {
std::pair<int, int> prompt_attn_range; std::pair<int, int> prompt_attn_range;
std::string text = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n"; std::string text = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
@ -1607,15 +1633,15 @@ namespace LLM {
printf("%d ", token); printf("%d ", token);
} }
printf("\n"); printf("\n");
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
struct ggml_tensor* out = nullptr; ggml_tensor* out = nullptr;
int t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
model.compute(8, input_ids, {}, {}, &out, work_ctx); model.compute(8, input_ids, nullptr, {}, {}, &out, work_ctx);
int t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
print_ggml_tensor(out); print_ggml_tensor(out);
LOG_DEBUG("llm test done in %dms", t1 - t0); LOG_DEBUG("llm test done in %lldms", t1 - t0);
} }
} }

View File

@ -9,7 +9,7 @@
struct LoraModel : public GGMLRunner { struct LoraModel : public GGMLRunner {
std::string lora_id; std::string lora_id;
float multiplier = 1.0f; float multiplier = 1.0f;
std::unordered_map<std::string, struct ggml_tensor*> lora_tensors; std::unordered_map<std::string, ggml_tensor*> lora_tensors;
std::map<ggml_tensor*, ggml_tensor*> original_tensor_to_final_tensor; std::map<ggml_tensor*, ggml_tensor*> original_tensor_to_final_tensor;
std::set<std::string> applied_lora_tensors; std::set<std::string> applied_lora_tensors;
std::string file_path; std::string file_path;
@ -76,13 +76,13 @@ struct LoraModel : public GGMLRunner {
} }
for (const auto& pair : tensors_to_create) { for (const auto& pair : tensors_to_create) {
const auto& name = pair.first; const auto& name = pair.first;
const auto& ts = pair.second; const auto& ts = pair.second;
struct ggml_tensor* real = ggml_new_tensor(params_ctx, ggml_tensor* real = ggml_new_tensor(params_ctx,
ts.type, ts.type,
ts.n_dims, ts.n_dims,
ts.ne); ts.ne);
lora_tensors[name] = real; lora_tensors[name] = real;
} }
alloc_params_buffer(); alloc_params_buffer();
@ -195,7 +195,7 @@ struct LoraModel : public GGMLRunner {
scale_value *= multiplier; scale_value *= multiplier;
auto curr_updown = ggml_ext_merge_lora(ctx, lora_down, lora_up, lora_mid); auto curr_updown = ggml_ext_merge_lora(ctx, lora_down, lora_up, lora_mid);
curr_updown = ggml_scale_inplace(ctx, curr_updown, scale_value); curr_updown = ggml_ext_scale(ctx, curr_updown, scale_value, true);
if (updown == nullptr) { if (updown == nullptr) {
updown = curr_updown; updown = curr_updown;
@ -235,7 +235,7 @@ struct LoraModel : public GGMLRunner {
float scale_value = 1.0f; float scale_value = 1.0f;
scale_value *= multiplier; scale_value *= multiplier;
curr_updown = ggml_scale_inplace(ctx, curr_updown, scale_value); curr_updown = ggml_ext_scale(ctx, curr_updown, scale_value, true);
if (updown == nullptr) { if (updown == nullptr) {
updown = curr_updown; updown = curr_updown;
@ -337,10 +337,10 @@ struct LoraModel : public GGMLRunner {
} }
scale_value *= multiplier; scale_value *= multiplier;
struct ggml_tensor* updown_1 = ggml_ext_merge_lora(ctx, hada_1_down, hada_1_up, hada_1_mid); ggml_tensor* updown_1 = ggml_ext_merge_lora(ctx, hada_1_down, hada_1_up, hada_1_mid);
struct ggml_tensor* updown_2 = ggml_ext_merge_lora(ctx, hada_2_down, hada_2_up, hada_2_mid); ggml_tensor* updown_2 = ggml_ext_merge_lora(ctx, hada_2_down, hada_2_up, hada_2_mid);
auto curr_updown = ggml_mul_inplace(ctx, updown_1, updown_2); auto curr_updown = ggml_mul_inplace(ctx, updown_1, updown_2);
curr_updown = ggml_scale_inplace(ctx, curr_updown, scale_value); curr_updown = ggml_ext_scale(ctx, curr_updown, scale_value, true);
if (updown == nullptr) { if (updown == nullptr) {
updown = curr_updown; updown = curr_updown;
} else { } else {
@ -456,7 +456,7 @@ struct LoraModel : public GGMLRunner {
scale_value *= multiplier; scale_value *= multiplier;
auto curr_updown = ggml_ext_kronecker(ctx, lokr_w1, lokr_w2); auto curr_updown = ggml_ext_kronecker(ctx, lokr_w1, lokr_w2);
curr_updown = ggml_scale_inplace(ctx, curr_updown, scale_value); curr_updown = ggml_ext_scale(ctx, curr_updown, scale_value, true);
if (updown == nullptr) { if (updown == nullptr) {
updown = curr_updown; updown = curr_updown;
@ -468,10 +468,10 @@ struct LoraModel : public GGMLRunner {
return updown; return updown;
} }
ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora = true) { ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora_and_lokr = true) {
// lora // lora
ggml_tensor* diff = nullptr; ggml_tensor* diff = nullptr;
if (with_lora) { if (with_lora_and_lokr) {
diff = get_lora_weight_diff(model_tensor_name, ctx); diff = get_lora_weight_diff(model_tensor_name, ctx);
} }
// diff // diff
@ -483,7 +483,7 @@ struct LoraModel : public GGMLRunner {
diff = get_loha_weight_diff(model_tensor_name, ctx); diff = get_loha_weight_diff(model_tensor_name, ctx);
} }
// lokr // lokr
if (diff == nullptr) { if (diff == nullptr && with_lora_and_lokr) {
diff = get_lokr_weight_diff(model_tensor_name, ctx); diff = get_lokr_weight_diff(model_tensor_name, ctx);
} }
if (diff != nullptr) { if (diff != nullptr) {
@ -514,6 +514,108 @@ struct LoraModel : public GGMLRunner {
} else { } else {
key = model_tensor_name + "." + std::to_string(index); key = model_tensor_name + "." + std::to_string(index);
} }
bool is_conv2d = forward_params.op_type == WeightAdapter::ForwardParams::op_type_t::OP_CONV2D;
std::string lokr_w1_name = "lora." + key + ".lokr_w1";
std::string lokr_w1_a_name = "lora." + key + ".lokr_w1_a";
// if either of these is found, then we have a lokr lora
auto iter = lora_tensors.find(lokr_w1_name);
auto iter_a = lora_tensors.find(lokr_w1_a_name);
if (iter != lora_tensors.end() || iter_a != lora_tensors.end()) {
std::string lokr_w1_b_name = "lora." + key + ".lokr_w1_b";
std::string lokr_w2_name = "lora." + key + ".lokr_w2";
std::string lokr_w2_a_name = "lora." + key + ".lokr_w2_a";
std::string lokr_w2_b_name = "lora." + key + ".lokr_w2_b";
std::string alpha_name = "lora." + key + ".alpha";
ggml_tensor* lokr_w1 = nullptr;
ggml_tensor* lokr_w1_a = nullptr;
ggml_tensor* lokr_w1_b = nullptr;
ggml_tensor* lokr_w2 = nullptr;
ggml_tensor* lokr_w2_a = nullptr;
ggml_tensor* lokr_w2_b = nullptr;
if (iter != lora_tensors.end()) {
lokr_w1 = iter->second;
}
iter = iter_a;
if (iter != lora_tensors.end()) {
lokr_w1_a = iter->second;
}
iter = lora_tensors.find(lokr_w1_b_name);
if (iter != lora_tensors.end()) {
lokr_w1_b = iter->second;
}
iter = lora_tensors.find(lokr_w2_name);
if (iter != lora_tensors.end()) {
lokr_w2 = iter->second;
if (is_conv2d && lokr_w2->type != GGML_TYPE_F16) {
lokr_w2 = ggml_cast(ctx, lokr_w2, GGML_TYPE_F16);
}
}
iter = lora_tensors.find(lokr_w2_a_name);
if (iter != lora_tensors.end()) {
lokr_w2_a = iter->second;
if (is_conv2d && lokr_w2_a->type != GGML_TYPE_F16) {
lokr_w2_a = ggml_cast(ctx, lokr_w2_a, GGML_TYPE_F16);
}
}
iter = lora_tensors.find(lokr_w2_b_name);
if (iter != lora_tensors.end()) {
lokr_w2_b = iter->second;
if (is_conv2d && lokr_w2_b->type != GGML_TYPE_F16) {
lokr_w2_b = ggml_cast(ctx, lokr_w2_b, GGML_TYPE_F16);
}
}
int rank = 1;
if (lokr_w1_b) {
rank = (int)lokr_w1_b->ne[ggml_n_dims(lokr_w1_b) - 1];
}
if (lokr_w2_b) {
rank = (int)lokr_w2_b->ne[ggml_n_dims(lokr_w2_b) - 1];
}
float scale_value = 1.0f;
iter = lora_tensors.find(alpha_name);
if (iter != lora_tensors.end()) {
float alpha = ggml_ext_backend_tensor_get_f32(iter->second);
scale_value = alpha / rank;
applied_lora_tensors.insert(alpha_name);
}
if (rank == 1) {
scale_value = 1.0f;
}
scale_value *= multiplier;
auto curr_out_diff = ggml_ext_lokr_forward(ctx, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value);
if (out_diff == nullptr) {
out_diff = curr_out_diff;
} else {
out_diff = ggml_concat(ctx, out_diff, curr_out_diff, 0);
}
if (lokr_w1)
applied_lora_tensors.insert(lokr_w1_name);
if (lokr_w1_a)
applied_lora_tensors.insert(lokr_w1_a_name);
if (lokr_w1_b)
applied_lora_tensors.insert(lokr_w1_b_name);
if (lokr_w2)
applied_lora_tensors.insert(lokr_w2_name);
if (lokr_w2_a)
applied_lora_tensors.insert(lokr_w2_name);
if (lokr_w2_b)
applied_lora_tensors.insert(lokr_w2_b_name);
applied_lora_tensors.insert(alpha_name);
index++;
continue;
}
// not a lokr, normal lora path
std::string lora_down_name = "lora." + key + ".lora_down"; std::string lora_down_name = "lora." + key + ".lora_down";
std::string lora_up_name = "lora." + key + ".lora_up"; std::string lora_up_name = "lora." + key + ".lora_up";
@ -525,9 +627,7 @@ struct LoraModel : public GGMLRunner {
ggml_tensor* lora_mid = nullptr; ggml_tensor* lora_mid = nullptr;
ggml_tensor* lora_down = nullptr; ggml_tensor* lora_down = nullptr;
bool is_conv2d = forward_params.op_type == WeightAdapter::ForwardParams::op_type_t::OP_CONV2D; iter = lora_tensors.find(lora_up_name);
auto iter = lora_tensors.find(lora_up_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
lora_up = iter->second; lora_up = iter->second;
if (is_conv2d && lora_up->type != GGML_TYPE_F16) { if (is_conv2d && lora_up->type != GGML_TYPE_F16) {
@ -599,6 +699,8 @@ struct LoraModel : public GGMLRunner {
forward_params.conv2d.d0, forward_params.conv2d.d0,
forward_params.conv2d.d1, forward_params.conv2d.d1,
forward_params.conv2d.direct, forward_params.conv2d.direct,
forward_params.conv2d.circular_x,
forward_params.conv2d.circular_y,
forward_params.conv2d.scale); forward_params.conv2d.scale);
if (lora_mid) { if (lora_mid) {
lx = ggml_ext_conv_2d(ctx, lx = ggml_ext_conv_2d(ctx,
@ -612,6 +714,8 @@ struct LoraModel : public GGMLRunner {
1, 1,
1, 1,
forward_params.conv2d.direct, forward_params.conv2d.direct,
forward_params.conv2d.circular_x,
forward_params.conv2d.circular_y,
forward_params.conv2d.scale); forward_params.conv2d.scale);
} }
lx = ggml_ext_conv_2d(ctx, lx = ggml_ext_conv_2d(ctx,
@ -625,10 +729,12 @@ struct LoraModel : public GGMLRunner {
1, 1,
1, 1,
forward_params.conv2d.direct, forward_params.conv2d.direct,
forward_params.conv2d.circular_x,
forward_params.conv2d.circular_y,
forward_params.conv2d.scale); forward_params.conv2d.scale);
} }
auto curr_out_diff = ggml_scale_inplace(ctx, lx, scale_value); auto curr_out_diff = ggml_ext_scale(ctx, lx, scale_value, true);
if (out_diff == nullptr) { if (out_diff == nullptr) {
out_diff = curr_out_diff; out_diff = curr_out_diff;
@ -641,9 +747,9 @@ struct LoraModel : public GGMLRunner {
return out_diff; return out_diff;
} }
struct ggml_cgraph* build_lora_graph(const std::map<std::string, ggml_tensor*>& model_tensors, SDVersion version) { ggml_cgraph* build_lora_graph(const std::map<std::string, ggml_tensor*>& model_tensors, SDVersion version) {
size_t lora_graph_size = LORA_GRAPH_BASE_SIZE + lora_tensors.size() * 10; size_t lora_graph_size = LORA_GRAPH_BASE_SIZE + lora_tensors.size() * 10;
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, lora_graph_size, false); ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, lora_graph_size, false);
preprocess_lora_tensors(model_tensors); preprocess_lora_tensors(model_tensors);
@ -682,8 +788,8 @@ struct LoraModel : public GGMLRunner {
return gf; return gf;
} }
void apply(std::map<std::string, struct ggml_tensor*> model_tensors, SDVersion version, int n_threads) { void apply(std::map<std::string, ggml_tensor*> model_tensors, SDVersion version, int n_threads) {
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> ggml_cgraph* {
return build_lora_graph(model_tensors, version); return build_lora_graph(model_tensors, version);
}; };
GGMLRunner::compute(get_graph, n_threads, false); GGMLRunner::compute(get_graph, n_threads, false);
@ -735,9 +841,9 @@ public:
: lora_models(lora_models) { : lora_models(lora_models) {
} }
ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name, bool with_lora) { ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name, bool with_lora_and_lokr) {
for (auto& lora_model : lora_models) { for (auto& lora_model : lora_models) {
ggml_tensor* diff = lora_model->get_weight_diff(weight_name, ctx, weight, with_lora); ggml_tensor* diff = lora_model->get_weight_diff(weight_name, ctx, weight, with_lora_and_lokr);
if (diff == nullptr) { if (diff == nullptr) {
continue; continue;
} }
@ -779,6 +885,8 @@ public:
forward_params.conv2d.d0, forward_params.conv2d.d0,
forward_params.conv2d.d1, forward_params.conv2d.d1,
forward_params.conv2d.direct, forward_params.conv2d.direct,
forward_params.conv2d.circular_x,
forward_params.conv2d.circular_y,
forward_params.conv2d.scale); forward_params.conv2d.scale);
} }
for (auto& lora_model : lora_models) { for (auto& lora_model : lora_models) {

View File

@ -1,8 +1,7 @@
#ifndef __LTXV_HPP__ #ifndef __LTXV_HPP__
#define __LTXV_HPP__ #define __LTXV_HPP__
#include "common.hpp" #include "common_block.hpp"
#include "ggml_extend.hpp"
namespace LTXV { namespace LTXV {
@ -27,9 +26,9 @@ namespace LTXV {
bias)); bias));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
bool causal = true) { bool causal = true) {
// x: [N*IC, ID, IH, IW] // x: [N*IC, ID, IH, IW]
// result: [N*OC, OD, OH, OW] // result: [N*OC, OD, OH, OW]
auto conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv"]); auto conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv"]);

View File

@ -27,13 +27,13 @@ public:
blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias)); blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, n_token, in_features] // x: [N, n_token, in_features]
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]); auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]); auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
x = fc1->forward(ctx, x); x = fc1->forward(ctx, x);
x = ggml_gelu_inplace(ctx->ggml_ctx, x); x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
x = fc2->forward(ctx, x); x = fc2->forward(ctx, x);
return x; return x;
} }
@ -72,7 +72,7 @@ public:
bias)); bias));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, C, H, W] // x: [N, C, H, W]
// return: [N, H*W, embed_dim] // return: [N, H*W, embed_dim]
auto proj = std::dynamic_pointer_cast<Conv2d>(blocks["proj"]); auto proj = std::dynamic_pointer_cast<Conv2d>(blocks["proj"]);
@ -97,12 +97,12 @@ public:
struct TimestepEmbedder : public GGMLBlock { struct TimestepEmbedder : public GGMLBlock {
// Embeds scalar timesteps into vector representations. // Embeds scalar timesteps into vector representations.
protected: protected:
int64_t frequency_embedding_size; int frequency_embedding_size;
public: public:
TimestepEmbedder(int64_t hidden_size, TimestepEmbedder(int64_t hidden_size,
int64_t frequency_embedding_size = 256, int frequency_embedding_size = 256,
int64_t out_channels = 0) int64_t out_channels = 0)
: frequency_embedding_size(frequency_embedding_size) { : frequency_embedding_size(frequency_embedding_size) {
if (out_channels <= 0) { if (out_channels <= 0) {
out_channels = hidden_size; out_channels = hidden_size;
@ -111,7 +111,7 @@ public:
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, out_channels, true, true)); blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, out_channels, true, true));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* t) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* t) {
// t: [N, ] // t: [N, ]
// return: [N, hidden_size] // return: [N, hidden_size]
auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]); auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
@ -135,7 +135,7 @@ public:
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true)); blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, input_dim] // x: [N, input_dim]
// return: [N, hidden_size] // return: [N, hidden_size]
auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]); auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
@ -167,15 +167,15 @@ public:
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim)); blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim));
} }
if (qk_norm == "rms") { if (qk_norm == "rms") {
blocks["ln_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(d_head, 1.0e-6)); blocks["ln_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(d_head, 1.0e-6f));
blocks["ln_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(d_head, 1.0e-6)); blocks["ln_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(d_head, 1.0e-6f));
} else if (qk_norm == "ln") { } else if (qk_norm == "ln") {
blocks["ln_q"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_head, 1.0e-6)); blocks["ln_q"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_head, 1.0e-6f));
blocks["ln_k"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_head, 1.0e-6)); blocks["ln_k"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_head, 1.0e-6f));
} }
} }
std::vector<struct ggml_tensor*> pre_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) { std::vector<ggml_tensor*> pre_attention(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]); auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
auto qkv = qkv_proj->forward(ctx, x); auto qkv = qkv_proj->forward(ctx, x);
@ -198,7 +198,7 @@ public:
return {q, k, v}; return {q, k, v};
} }
struct ggml_tensor* post_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* post_attention(GGMLRunnerContext* ctx, ggml_tensor* x) {
GGML_ASSERT(!pre_only); GGML_ASSERT(!pre_only);
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]); auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
@ -208,19 +208,19 @@ public:
} }
// x: [N, n_token, dim] // x: [N, n_token, dim]
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x) { ggml_tensor* x) {
auto qkv = pre_attention(ctx, x); auto qkv = pre_attention(ctx, x);
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, dim] x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, dim]
x = post_attention(ctx, x); // [N, n_token, dim] x = post_attention(ctx, x); // [N, n_token, dim]
return x; return x;
} }
}; };
__STATIC_INLINE__ struct ggml_tensor* modulate(struct ggml_context* ctx, __STATIC_INLINE__ ggml_tensor* modulate(ggml_context* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* shift, ggml_tensor* shift,
struct ggml_tensor* scale) { ggml_tensor* scale) {
// x: [N, L, C] // x: [N, L, C]
// scale: [N, C] // scale: [N, C]
// shift: [N, C] // shift: [N, C]
@ -274,8 +274,8 @@ public:
} }
std::tuple<std::vector<ggml_tensor*>, std::vector<ggml_tensor*>, std::vector<ggml_tensor*>> pre_attention_x(GGMLRunnerContext* ctx, std::tuple<std::vector<ggml_tensor*>, std::vector<ggml_tensor*>, std::vector<ggml_tensor*>> pre_attention_x(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* c) { ggml_tensor* c) {
GGML_ASSERT(self_attn); GGML_ASSERT(self_attn);
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
// c: [N, hidden_size] // c: [N, hidden_size]
@ -284,23 +284,19 @@ public:
auto attn2 = std::dynamic_pointer_cast<SelfAttention>(blocks["attn2"]); auto attn2 = std::dynamic_pointer_cast<SelfAttention>(blocks["attn2"]);
auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]); auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
int64_t n_mods = 9; int n_mods = 9;
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, n_mods * hidden_size] auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, n_mods * hidden_size]
m = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], n_mods, c->ne[1]); // [N, n_mods, hidden_size] auto m_vec = ggml_ext_chunk(ctx->ggml_ctx, m, n_mods, 0);
m = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3)); // [n_mods, N, hidden_size]
int64_t offset = m->nb[1] * m->ne[1]; auto shift_msa = m_vec[0]; // [N, hidden_size]
auto shift_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size] auto scale_msa = m_vec[1]; // [N, hidden_size]
auto scale_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size] auto gate_msa = m_vec[2]; // [N, hidden_size]
auto gate_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2); // [N, hidden_size] auto shift_mlp = m_vec[3]; // [N, hidden_size]
auto scale_mlp = m_vec[4]; // [N, hidden_size]
auto shift_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3); // [N, hidden_size] auto gate_mlp = m_vec[5]; // [N, hidden_size]
auto scale_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4); // [N, hidden_size] auto shift_msa2 = m_vec[6]; // [N, hidden_size]
auto gate_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5); // [N, hidden_size] auto scale_msa2 = m_vec[7]; // [N, hidden_size]
auto gate_msa2 = m_vec[8]; // [N, hidden_size]
auto shift_msa2 = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 6); // [N, hidden_size]
auto scale_msa2 = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 7); // [N, hidden_size]
auto gate_msa2 = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 8); // [N, hidden_size]
auto x_norm = norm1->forward(ctx, x); auto x_norm = norm1->forward(ctx, x);
@ -313,31 +309,29 @@ public:
return {qkv, qkv2, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2}}; return {qkv, qkv2, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2}};
} }
std::pair<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention(GGMLRunnerContext* ctx, std::pair<std::vector<ggml_tensor*>, std::vector<ggml_tensor*>> pre_attention(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* c) { ggml_tensor* c) {
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
// c: [N, hidden_size] // c: [N, hidden_size]
auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]); auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
auto attn = std::dynamic_pointer_cast<SelfAttention>(blocks["attn"]); auto attn = std::dynamic_pointer_cast<SelfAttention>(blocks["attn"]);
auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]); auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
int64_t n_mods = 6; int n_mods = 6;
if (pre_only) { if (pre_only) {
n_mods = 2; n_mods = 2;
} }
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, n_mods * hidden_size] auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, n_mods * hidden_size]
m = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], n_mods, c->ne[1]); // [N, n_mods, hidden_size] auto m_vec = ggml_ext_chunk(ctx->ggml_ctx, m, n_mods, 0);
m = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3)); // [n_mods, N, hidden_size]
int64_t offset = m->nb[1] * m->ne[1]; auto shift_msa = m_vec[0]; // [N, hidden_size]
auto shift_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size] auto scale_msa = m_vec[1]; // [N, hidden_size]
auto scale_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
if (!pre_only) { if (!pre_only) {
auto gate_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2); // [N, hidden_size] auto gate_msa = m_vec[2]; // [N, hidden_size]
auto shift_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3); // [N, hidden_size] auto shift_mlp = m_vec[3]; // [N, hidden_size]
auto scale_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4); // [N, hidden_size] auto scale_mlp = m_vec[4]; // [N, hidden_size]
auto gate_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5); // [N, hidden_size] auto gate_mlp = m_vec[5]; // [N, hidden_size]
auto attn_in = modulate(ctx->ggml_ctx, norm1->forward(ctx, x), shift_msa, scale_msa); auto attn_in = modulate(ctx->ggml_ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
@ -352,15 +346,15 @@ public:
} }
} }
struct ggml_tensor* post_attention_x(GGMLRunnerContext* ctx, ggml_tensor* post_attention_x(GGMLRunnerContext* ctx,
struct ggml_tensor* attn_out, ggml_tensor* attn_out,
struct ggml_tensor* attn2_out, ggml_tensor* attn2_out,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* gate_msa, ggml_tensor* gate_msa,
struct ggml_tensor* shift_mlp, ggml_tensor* shift_mlp,
struct ggml_tensor* scale_mlp, ggml_tensor* scale_mlp,
struct ggml_tensor* gate_mlp, ggml_tensor* gate_mlp,
struct ggml_tensor* gate_msa2) { ggml_tensor* gate_msa2) {
// attn_out: [N, n_token, hidden_size] // attn_out: [N, n_token, hidden_size]
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
// gate_msa: [N, hidden_size] // gate_msa: [N, hidden_size]
@ -390,13 +384,13 @@ public:
return x; return x;
} }
struct ggml_tensor* post_attention(GGMLRunnerContext* ctx, ggml_tensor* post_attention(GGMLRunnerContext* ctx,
struct ggml_tensor* attn_out, ggml_tensor* attn_out,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* gate_msa, ggml_tensor* gate_msa,
struct ggml_tensor* shift_mlp, ggml_tensor* shift_mlp,
struct ggml_tensor* scale_mlp, ggml_tensor* scale_mlp,
struct ggml_tensor* gate_mlp) { ggml_tensor* gate_mlp) {
// attn_out: [N, n_token, hidden_size] // attn_out: [N, n_token, hidden_size]
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
// gate_msa: [N, hidden_size] // gate_msa: [N, hidden_size]
@ -422,9 +416,9 @@ public:
return x; return x;
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* c) { ggml_tensor* c) {
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
// c: [N, hidden_size] // c: [N, hidden_size]
// return: [N, n_token, hidden_size] // return: [N, n_token, hidden_size]
@ -439,8 +433,8 @@ public:
auto qkv2 = std::get<1>(qkv_intermediates); auto qkv2 = std::get<1>(qkv_intermediates);
auto intermediates = std::get<2>(qkv_intermediates); auto intermediates = std::get<2>(qkv_intermediates);
auto attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, dim] auto attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, dim]
auto attn2_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv2[0], qkv2[1], qkv2[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, dim] auto attn2_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv2[0], qkv2[1], qkv2[2], num_heads, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, dim]
x = post_attention_x(ctx, x = post_attention_x(ctx,
attn_out, attn_out,
attn2_out, attn2_out,
@ -456,7 +450,7 @@ public:
auto qkv = qkv_intermediates.first; auto qkv = qkv_intermediates.first;
auto intermediates = qkv_intermediates.second; auto intermediates = qkv_intermediates.second;
auto attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, dim] auto attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, dim]
x = post_attention(ctx, x = post_attention(ctx,
attn_out, attn_out,
intermediates[0], intermediates[0],
@ -469,11 +463,11 @@ public:
} }
}; };
__STATIC_INLINE__ std::pair<struct ggml_tensor*, struct ggml_tensor*> __STATIC_INLINE__ std::pair<ggml_tensor*, ggml_tensor*>
block_mixing(GGMLRunnerContext* ctx, block_mixing(GGMLRunnerContext* ctx,
struct ggml_tensor* context, ggml_tensor* context,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* c, ggml_tensor* c,
std::shared_ptr<DismantledBlock> context_block, std::shared_ptr<DismantledBlock> context_block,
std::shared_ptr<DismantledBlock> x_block) { std::shared_ptr<DismantledBlock> x_block) {
// context: [N, n_context, hidden_size] // context: [N, n_context, hidden_size]
@ -495,31 +489,29 @@ block_mixing(GGMLRunnerContext* ctx,
x_qkv = x_qkv_intermediates.first; x_qkv = x_qkv_intermediates.first;
x_intermediates = x_qkv_intermediates.second; x_intermediates = x_qkv_intermediates.second;
} }
std::vector<struct ggml_tensor*> qkv; std::vector<ggml_tensor*> qkv;
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
qkv.push_back(ggml_concat(ctx->ggml_ctx, context_qkv[i], x_qkv[i], 1)); qkv.push_back(ggml_concat(ctx->ggml_ctx, context_qkv[i], x_qkv[i], 1));
} }
auto attn = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_context + n_token, hidden_size] auto attn = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, nullptr, false, ctx->flash_attn_enabled); // [N, n_context + n_token, hidden_size]
attn = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, attn, 0, 2, 1, 3)); // [n_context + n_token, N, hidden_size]
auto context_attn = ggml_view_3d(ctx->ggml_ctx, auto context_attn = ggml_view_3d(ctx->ggml_ctx,
attn, attn,
attn->ne[0], attn->ne[0],
attn->ne[1],
context->ne[1], context->ne[1],
attn->ne[2],
attn->nb[1], attn->nb[1],
attn->nb[2], attn->nb[2],
0); // [n_context, N, hidden_size] 0); // [N, n_context, hidden_size]
context_attn = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, context_attn, 0, 2, 1, 3)); // [N, n_context, hidden_size]
auto x_attn = ggml_view_3d(ctx->ggml_ctx, auto x_attn = ggml_view_3d(ctx->ggml_ctx,
attn, attn,
attn->ne[0], attn->ne[0],
attn->ne[1],
x->ne[1], x->ne[1],
attn->ne[2],
attn->nb[1], attn->nb[1],
attn->nb[2], attn->nb[2],
attn->nb[2] * context->ne[1]); // [n_token, N, hidden_size] context->ne[1] * attn->nb[1]); // [N, n_token, hidden_size]
x_attn = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x_attn, 0, 2, 1, 3)); // [N, n_token, hidden_size]
if (!context_block->pre_only) { if (!context_block->pre_only) {
context = context_block->post_attention(ctx, context = context_block->post_attention(ctx,
@ -534,7 +526,7 @@ block_mixing(GGMLRunnerContext* ctx,
} }
if (x_block->self_attn) { if (x_block->self_attn) {
auto attn2 = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, x_qkv2[0], x_qkv2[1], x_qkv2[2], x_block->num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, hidden_size] auto attn2 = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, x_qkv2[0], x_qkv2[1], x_qkv2[2], x_block->num_heads, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, hidden_size]
x = x_block->post_attention_x(ctx, x = x_block->post_attention_x(ctx,
x_attn, x_attn,
@ -571,10 +563,10 @@ public:
blocks["x_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x)); blocks["x_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x));
} }
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx, std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
struct ggml_tensor* context, ggml_tensor* context,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* c) { ggml_tensor* c) {
auto context_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["context_block"]); auto context_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["context_block"]);
auto x_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["x_block"]); auto x_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["x_block"]);
@ -594,9 +586,9 @@ public:
blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size)); blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* c) { ggml_tensor* c) {
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
// c: [N, hidden_size] // c: [N, hidden_size]
// return: [N, n_token, patch_size * patch_size * out_channels] // return: [N, n_token, patch_size * patch_size * out_channels]
@ -604,13 +596,10 @@ public:
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]); auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]); auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, 2 * hidden_size] auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, 2 * hidden_size]
m = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], 2, c->ne[1]); // [N, 2, hidden_size] auto m_vec = ggml_ext_chunk(ctx->ggml_ctx, m, 2, 0);
m = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3)); // [2, N, hidden_size] auto shift = m_vec[0]; // [N, hidden_size]
auto scale = m_vec[1]; // [N, hidden_size]
int64_t offset = m->nb[1] * m->ne[1];
auto shift = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
auto scale = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
x = modulate(ctx->ggml_ctx, norm_final->forward(ctx, x), shift, scale); x = modulate(ctx->ggml_ctx, norm_final->forward(ctx, x), shift, scale);
x = linear->forward(ctx, x); x = linear->forward(ctx, x);
@ -623,7 +612,7 @@ struct MMDiT : public GGMLBlock {
// Diffusion model with a Transformer backbone. // Diffusion model with a Transformer backbone.
protected: protected:
int64_t input_size = -1; int64_t input_size = -1;
int64_t patch_size = 2; int patch_size = 2;
int64_t in_channels = 16; int64_t in_channels = 16;
int64_t d_self = -1; // >=0 for MMdiT-X int64_t d_self = -1; // >=0 for MMdiT-X
int64_t depth = 24; int64_t depth = 24;
@ -637,7 +626,7 @@ protected:
int64_t hidden_size; int64_t hidden_size;
std::string qk_norm; std::string qk_norm;
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override { void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
enum ggml_type wtype = GGML_TYPE_F32; enum ggml_type wtype = GGML_TYPE_F32;
params["pos_embed"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1); params["pos_embed"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1);
} }
@ -716,8 +705,8 @@ public:
blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(hidden_size, patch_size, out_channels)); blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(hidden_size, patch_size, out_channels));
} }
struct ggml_tensor* ggml_tensor*
cropped_pos_embed(struct ggml_context* ctx, cropped_pos_embed(ggml_context* ctx,
int64_t h, int64_t h,
int64_t w) { int64_t w) {
auto pos_embed = params["pos_embed"]; auto pos_embed = params["pos_embed"];
@ -756,33 +745,11 @@ public:
return spatial_pos_embed; return spatial_pos_embed;
} }
struct ggml_tensor* unpatchify(struct ggml_context* ctx, ggml_tensor* forward_core_with_concat(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
int64_t h, ggml_tensor* c_mod,
int64_t w) { ggml_tensor* context,
// x: [N, H*W, patch_size * patch_size * C] std::vector<int> skip_layers = std::vector<int>()) {
// return: [N, C, H, W]
int64_t n = x->ne[2];
int64_t c = out_channels;
int64_t p = patch_size;
h = (h + 1) / p;
w = (w + 1) / p;
GGML_ASSERT(h * w == x->ne[1]);
x = ggml_reshape_4d(ctx, x, c, p * p, w * h, n); // [N, H*W, P*P, C]
x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3)); // [N, C, H*W, P*P]
x = ggml_reshape_4d(ctx, x, p, p, w, h * c * n); // [N*C*H, W, P, P]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*H, P, W, P]
x = ggml_reshape_4d(ctx, x, p * w, p * h, c, n); // [N, C, H*P, W*P]
return x;
}
struct ggml_tensor* forward_core_with_concat(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* c_mod,
struct ggml_tensor* context,
std::vector<int> skip_layers = std::vector<int>()) {
// x: [N, H*W, hidden_size] // x: [N, H*W, hidden_size]
// context: [N, n_context, d_context] // context: [N, n_context, d_context]
// c: [N, hidden_size] // c: [N, hidden_size]
@ -807,12 +774,12 @@ public:
return x; return x;
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* t, ggml_tensor* t,
struct ggml_tensor* y = nullptr, ggml_tensor* y = nullptr,
struct ggml_tensor* context = nullptr, ggml_tensor* context = nullptr,
std::vector<int> skip_layers = std::vector<int>()) { std::vector<int> skip_layers = std::vector<int>()) {
// Forward pass of DiT. // Forward pass of DiT.
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
// t: (N,) tensor of diffusion timesteps // t: (N,) tensor of diffusion timesteps
@ -822,11 +789,11 @@ public:
auto x_embedder = std::dynamic_pointer_cast<PatchEmbed>(blocks["x_embedder"]); auto x_embedder = std::dynamic_pointer_cast<PatchEmbed>(blocks["x_embedder"]);
auto t_embedder = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]); auto t_embedder = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);
int64_t w = x->ne[0]; int64_t W = x->ne[0];
int64_t h = x->ne[1]; int64_t H = x->ne[1];
auto patch_embed = x_embedder->forward(ctx, x); // [N, H*W, hidden_size] auto patch_embed = x_embedder->forward(ctx, x); // [N, H*W, hidden_size]
auto pos_embed = cropped_pos_embed(ctx->ggml_ctx, h, w); // [1, H*W, hidden_size] auto pos_embed = cropped_pos_embed(ctx->ggml_ctx, H, W); // [1, H*W, hidden_size]
x = ggml_add(ctx->ggml_ctx, patch_embed, pos_embed); // [N, H*W, hidden_size] x = ggml_add(ctx->ggml_ctx, patch_embed, pos_embed); // [N, H*W, hidden_size]
auto c = t_embedder->forward(ctx, t); // [N, hidden_size] auto c = t_embedder->forward(ctx, t); // [N, hidden_size]
@ -845,7 +812,7 @@ public:
x = forward_core_with_concat(ctx, x, c, context, skip_layers); // (N, H*W, patch_size ** 2 * out_channels) x = forward_core_with_concat(ctx, x, c, context, skip_layers); // (N, H*W, patch_size ** 2 * out_channels)
x = unpatchify(ctx->ggml_ctx, x, h, w); // [N, C, H, W] x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, /*patch_last*/ false); // [N, C, H, W]
return x; return x;
} }
@ -865,29 +832,29 @@ struct MMDiTRunner : public GGMLRunner {
return "mmdit"; return "mmdit";
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
mmdit.get_param_tensors(tensors, prefix); mmdit.get_param_tensors(tensors, prefix);
} }
struct ggml_cgraph* build_graph(struct ggml_tensor* x, ggml_cgraph* build_graph(ggml_tensor* x,
struct ggml_tensor* timesteps, ggml_tensor* timesteps,
struct ggml_tensor* context, ggml_tensor* context,
struct ggml_tensor* y, ggml_tensor* y,
std::vector<int> skip_layers = std::vector<int>()) { std::vector<int> skip_layers = std::vector<int>()) {
struct ggml_cgraph* gf = new_graph_custom(MMDIT_GRAPH_SIZE); ggml_cgraph* gf = new_graph_custom(MMDIT_GRAPH_SIZE);
x = to_backend(x); x = to_backend(x);
context = to_backend(context); context = to_backend(context);
y = to_backend(y); y = to_backend(y);
timesteps = to_backend(timesteps); timesteps = to_backend(timesteps);
auto runner_ctx = get_context(); auto runner_ctx = get_context();
struct ggml_tensor* out = mmdit.forward(&runner_ctx, ggml_tensor* out = mmdit.forward(&runner_ctx,
x, x,
timesteps, timesteps,
y, y,
context, context,
skip_layers); skip_layers);
ggml_build_forward_expand(gf, out); ggml_build_forward_expand(gf, out);
@ -895,18 +862,18 @@ struct MMDiTRunner : public GGMLRunner {
} }
bool compute(int n_threads, bool compute(int n_threads,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* timesteps, ggml_tensor* timesteps,
struct ggml_tensor* context, ggml_tensor* context,
struct ggml_tensor* y, ggml_tensor* y,
struct ggml_tensor** output = nullptr, ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr, ggml_context* output_ctx = nullptr,
std::vector<int> skip_layers = std::vector<int>()) { std::vector<int> skip_layers = std::vector<int>()) {
// x: [N, in_channels, h, w] // x: [N, in_channels, h, w]
// timesteps: [N, ] // timesteps: [N, ]
// context: [N, max_position, hidden_size]([N, 154, 4096]) or [1, max_position, hidden_size] // context: [N, max_position, hidden_size]([N, 154, 4096]) or [1, max_position, hidden_size]
// y: [N, adm_in_channels] or [1, adm_in_channels] // y: [N, adm_in_channels] or [1, adm_in_channels]
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x, timesteps, context, y, skip_layers); return build_graph(x, timesteps, context, y, skip_layers);
}; };
@ -914,12 +881,12 @@ struct MMDiTRunner : public GGMLRunner {
} }
void test() { void test() {
struct ggml_init_params params; ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
params.mem_buffer = nullptr; params.mem_buffer = nullptr;
params.no_alloc = false; params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params); ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != nullptr); GGML_ASSERT(work_ctx != nullptr);
{ {
@ -941,14 +908,14 @@ struct MMDiTRunner : public GGMLRunner {
ggml_set_f32(y, 0.01f); ggml_set_f32(y, 0.01f);
// print_ggml_tensor(y); // print_ggml_tensor(y);
struct ggml_tensor* out = nullptr; ggml_tensor* out = nullptr;
int t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
compute(8, x, timesteps, context, y, &out, work_ctx); compute(8, x, timesteps, context, y, &out, work_ctx);
int t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
print_ggml_tensor(out); print_ggml_tensor(out);
LOG_DEBUG("mmdit test done in %dms", t1 - t0); LOG_DEBUG("mmdit test done in %lldms", t1 - t0);
} }
} }
@ -983,4 +950,4 @@ struct MMDiTRunner : public GGMLRunner {
} }
}; };
#endif #endif

View File

@ -16,10 +16,6 @@
#include "model.h" #include "model.h"
#include "stable-diffusion.h" #include "stable-diffusion.h"
#include "util.h" #include "util.h"
#include "vocab.hpp"
#include "vocab_mistral.hpp"
#include "vocab_qwen.hpp"
#include "vocab_umt5.hpp"
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h" #include "ggml-backend.h"
@ -291,7 +287,7 @@ void ModelLoader::add_tensor_storage(const TensorStorage& tensor_storage) {
} }
bool is_zip_file(const std::string& file_path) { bool is_zip_file(const std::string& file_path) {
struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); zip_t* zip = zip_open(file_path.c_str(), 0, 'r');
if (zip == nullptr) { if (zip == nullptr) {
return false; return false;
} }
@ -376,7 +372,11 @@ bool ModelLoader::init_from_file(const std::string& file_path, const std::string
LOG_INFO("load %s using checkpoint format", file_path.c_str()); LOG_INFO("load %s using checkpoint format", file_path.c_str());
return init_from_ckpt_file(file_path, prefix); return init_from_ckpt_file(file_path, prefix);
} else { } else {
LOG_WARN("unknown format %s", file_path.c_str()); if (file_exists(file_path)) {
LOG_WARN("unknown format %s", file_path.c_str());
} else {
LOG_WARN("file %s not found", file_path.c_str());
}
return false; return false;
} }
} }
@ -436,7 +436,7 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s
name, name,
gguf_tensor_info.type, gguf_tensor_info.type,
gguf_tensor_info.shape.data(), gguf_tensor_info.shape.data(),
gguf_tensor_info.shape.size(), static_cast<int>(gguf_tensor_info.shape.size()),
file_index, file_index,
data_offset + gguf_tensor_info.offset); data_offset + gguf_tensor_info.offset);
@ -448,14 +448,14 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s
return true; return true;
} }
int n_tensors = gguf_get_n_tensors(ctx_gguf_); int n_tensors = static_cast<int>(gguf_get_n_tensors(ctx_gguf_));
size_t total_size = 0; size_t total_size = 0;
size_t data_offset = gguf_get_data_offset(ctx_gguf_); size_t data_offset = gguf_get_data_offset(ctx_gguf_);
for (int i = 0; i < n_tensors; i++) { for (int i = 0; i < n_tensors; i++) {
std::string name = gguf_get_tensor_name(ctx_gguf_, i); std::string name = gguf_get_tensor_name(ctx_gguf_, i);
struct ggml_tensor* dummy = ggml_get_tensor(ctx_meta_, name.c_str()); ggml_tensor* dummy = ggml_get_tensor(ctx_meta_, name.c_str());
size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf_, i); size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf_, i);
// LOG_DEBUG("%s", name.c_str()); // LOG_DEBUG("%s", name.c_str());
@ -812,7 +812,7 @@ struct PickleTensorReader {
} }
} }
void read_string(const std::string& str, struct zip_t* zip, std::string dir) { void read_string(const std::string& str, zip_t* zip, std::string dir) {
if (str == "storage") { if (str == "storage") {
read_global_type = true; read_global_type = true;
} else if (str != "state_dict") { } else if (str != "state_dict") {
@ -995,7 +995,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
file_paths_.push_back(file_path); file_paths_.push_back(file_path);
size_t file_index = file_paths_.size() - 1; size_t file_index = file_paths_.size() - 1;
struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); zip_t* zip = zip_open(file_path.c_str(), 0, 'r');
if (zip == nullptr) { if (zip == nullptr) {
LOG_ERROR("failed to open '%s'", file_path.c_str()); LOG_ERROR("failed to open '%s'", file_path.c_str());
return false; return false;
@ -1034,10 +1034,14 @@ SDVersion ModelLoader::get_sd_version() {
bool is_xl = false; bool is_xl = false;
bool is_flux = false; bool is_flux = false;
bool is_flux2 = false;
bool has_single_block_47 = false;
bool is_wan = false; bool is_wan = false;
int64_t patch_embedding_channels = 0; int64_t patch_embedding_channels = 0;
bool has_img_emb = false; bool has_img_emb = false;
bool has_middle_block_1 = false; bool has_middle_block_1 = false;
bool has_output_block_311 = false;
bool has_output_block_71 = false;
for (auto& [name, tensor_storage] : tensor_storage_map) { for (auto& [name, tensor_storage] : tensor_storage_map) {
if (!(is_xl)) { if (!(is_xl)) {
@ -1053,8 +1057,14 @@ SDVersion ModelLoader::get_sd_version() {
if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) { if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) {
return VERSION_QWEN_IMAGE; return VERSION_QWEN_IMAGE;
} }
if (tensor_storage.name.find("llm_adapter.blocks.0.cross_attn.q_proj.weight") != std::string::npos) {
return VERSION_ANIMA;
}
if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) { if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) {
return VERSION_FLUX2; is_flux2 = true;
}
if (tensor_storage.name.find("single_blocks.47.linear1.weight") != std::string::npos) {
has_single_block_47 = true;
} }
if (tensor_storage.name.find("model.diffusion_model.double_blocks.0.img_mlp.gate_proj.weight") != std::string::npos) { if (tensor_storage.name.find("model.diffusion_model.double_blocks.0.img_mlp.gate_proj.weight") != std::string::npos) {
return VERSION_OVIS_IMAGE; return VERSION_OVIS_IMAGE;
@ -1094,6 +1104,14 @@ SDVersion ModelLoader::get_sd_version() {
tensor_storage.name.find("unet.mid_block.resnets.1.") != std::string::npos) { tensor_storage.name.find("unet.mid_block.resnets.1.") != std::string::npos) {
has_middle_block_1 = true; has_middle_block_1 = true;
} }
if (tensor_storage.name.find("model.diffusion_model.output_blocks.3.1.transformer_blocks.1") != std::string::npos ||
tensor_storage.name.find("unet.up_blocks.1.attentions.0.transformer_blocks.1") != std::string::npos) {
has_output_block_311 = true;
}
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos ||
tensor_storage.name.find("unet.up_blocks.2.attentions.1") != std::string::npos) {
has_output_block_71 = true;
}
if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" || if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
tensor_storage.name == "cond_stage_model.model.token_embedding.weight" || tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
tensor_storage.name == "text_model.embeddings.token_embedding.weight" || tensor_storage.name == "text_model.embeddings.token_embedding.weight" ||
@ -1129,12 +1147,15 @@ SDVersion ModelLoader::get_sd_version() {
return VERSION_SDXL_PIX2PIX; return VERSION_SDXL_PIX2PIX;
} }
if (!has_middle_block_1) { if (!has_middle_block_1) {
if (!has_output_block_311) {
return VERSION_SDXL_VEGA;
}
return VERSION_SDXL_SSD1B; return VERSION_SDXL_SSD1B;
} }
return VERSION_SDXL; return VERSION_SDXL;
} }
if (is_flux) { if (is_flux && !is_flux2) {
if (input_block_weight.ne[0] == 384) { if (input_block_weight.ne[0] == 384) {
return VERSION_FLUX_FILL; return VERSION_FLUX_FILL;
} }
@ -1147,6 +1168,13 @@ SDVersion ModelLoader::get_sd_version() {
return VERSION_FLUX; return VERSION_FLUX;
} }
if (is_flux2) {
if (has_single_block_47) {
return VERSION_FLUX2;
}
return VERSION_FLUX2_KLEIN;
}
if (token_embedding_weight.ne[0] == 768) { if (token_embedding_weight.ne[0] == 768) {
if (is_inpaint) { if (is_inpaint) {
return VERSION_SD1_INPAINT; return VERSION_SD1_INPAINT;
@ -1155,6 +1183,9 @@ SDVersion ModelLoader::get_sd_version() {
return VERSION_SD1_PIX2PIX; return VERSION_SD1_PIX2PIX;
} }
if (!has_middle_block_1) { if (!has_middle_block_1) {
if (!has_output_block_71) {
return VERSION_SDXS;
}
return VERSION_SD1_TINY_UNET; return VERSION_SD1_TINY_UNET;
} }
return VERSION_SD1; return VERSION_SD1;
@ -1310,37 +1341,7 @@ void ModelLoader::set_wtype_override(ggml_type wtype, std::string tensor_type_ru
} }
} }
std::string ModelLoader::load_merges() { bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
std::string merges_utf8_str(reinterpret_cast<const char*>(merges_utf8_c_str), sizeof(merges_utf8_c_str));
return merges_utf8_str;
}
std::string ModelLoader::load_qwen2_merges() {
std::string merges_utf8_str(reinterpret_cast<const char*>(qwen2_merges_utf8_c_str), sizeof(qwen2_merges_utf8_c_str));
return merges_utf8_str;
}
std::string ModelLoader::load_mistral_merges() {
std::string merges_utf8_str(reinterpret_cast<const char*>(mistral_merges_utf8_c_str), sizeof(mistral_merges_utf8_c_str));
return merges_utf8_str;
}
std::string ModelLoader::load_mistral_vocab_json() {
std::string json_str(reinterpret_cast<const char*>(mistral_vocab_json_utf8_c_str), sizeof(mistral_vocab_json_utf8_c_str));
return json_str;
}
std::string ModelLoader::load_t5_tokenizer_json() {
std::string json_str(reinterpret_cast<const char*>(t5_tokenizer_json_str), sizeof(t5_tokenizer_json_str));
return json_str;
}
std::string ModelLoader::load_umt5_tokenizer_json() {
std::string json_str(reinterpret_cast<const char*>(umt5_tokenizer_json_str), sizeof(umt5_tokenizer_json_str));
return json_str;
}
bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) {
int64_t process_time_ms = 0; int64_t process_time_ms = 0;
std::atomic<int64_t> read_time_ms(0); std::atomic<int64_t> read_time_ms(0);
std::atomic<int64_t> memcpy_time_ms(0); std::atomic<int64_t> memcpy_time_ms(0);
@ -1390,6 +1391,15 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
} }
} }
std::unique_ptr<MmapWrapper> mmapped;
if (enable_mmap && !is_zip) {
LOG_DEBUG("using mmap for I/O");
mmapped = MmapWrapper::create(file_path);
if (!mmapped) {
LOG_WARN("failed to memory-map '%s'", file_path.c_str());
}
}
int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size()); int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size());
if (n_threads < 1) { if (n_threads < 1) {
n_threads = 1; n_threads = 1;
@ -1403,7 +1413,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
for (int i = 0; i < n_threads; ++i) { for (int i = 0; i < n_threads; ++i) {
workers.emplace_back([&, file_path, is_zip]() { workers.emplace_back([&, file_path, is_zip]() {
std::ifstream file; std::ifstream file;
struct zip_t* zip = nullptr; zip_t* zip = nullptr;
if (is_zip) { if (is_zip) {
zip = zip_open(file_path.c_str(), 0, 'r'); zip = zip_open(file_path.c_str(), 0, 'r');
if (zip == nullptr) { if (zip == nullptr) {
@ -1411,7 +1421,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
failed = true; failed = true;
return; return;
} }
} else { } else if (!mmapped) {
file.open(file_path, std::ios::binary); file.open(file_path, std::ios::binary);
if (!file.is_open()) { if (!file.is_open()) {
LOG_ERROR("failed to open '%s'", file_path.c_str()); LOG_ERROR("failed to open '%s'", file_path.c_str());
@ -1464,6 +1474,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
zip_entry_noallocread(zip, (void*)buf, n); zip_entry_noallocread(zip, (void*)buf, n);
} }
zip_entry_close(zip); zip_entry_close(zip);
} else if (mmapped) {
if (!mmapped->copy_data(buf, n, tensor_storage.offset)) {
LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
failed = true;
}
} else { } else {
file.seekg(tensor_storage.offset); file.seekg(tensor_storage.offset);
file.read(buf, n); file.read(buf, n);
@ -1520,6 +1535,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
i64_to_i32_vec((int64_t*)read_buf, (int32_t*)target_buf, tensor_storage.nelements()); i64_to_i32_vec((int64_t*)read_buf, (int32_t*)target_buf, tensor_storage.nelements());
} }
if (tensor_storage.type != dst_tensor->type) { if (tensor_storage.type != dst_tensor->type) {
if (convert_buf == nullptr) {
LOG_ERROR("read tensor data failed: too less memory for conversion");
failed = true;
return;
}
convert_tensor((void*)target_buf, convert_tensor((void*)target_buf,
tensor_storage.type, tensor_storage.type,
convert_buf, convert_buf,
@ -1551,7 +1571,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
break; break;
} }
size_t curr_num = total_tensors_processed + current_idx; size_t curr_num = total_tensors_processed + current_idx;
pretty_progress(curr_num, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f / (curr_num + 1e-6f)); pretty_progress(static_cast<int>(curr_num), static_cast<int>(total_tensors_to_process), (ggml_time_ms() - t_start) / 1000.0f / (curr_num + 1e-6f));
std::this_thread::sleep_for(std::chrono::milliseconds(200)); std::this_thread::sleep_for(std::chrono::milliseconds(200));
} }
@ -1564,7 +1584,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
break; break;
} }
total_tensors_processed += file_tensors.size(); total_tensors_processed += file_tensors.size();
pretty_progress(total_tensors_processed, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f / (total_tensors_processed + 1e-6f)); pretty_progress(static_cast<int>(total_tensors_processed), static_cast<int>(total_tensors_to_process), (ggml_time_ms() - t_start) / 1000.0f / (total_tensors_processed + 1e-6f));
if (total_tensors_processed < total_tensors_to_process) { if (total_tensors_processed < total_tensors_to_process) {
printf("\n"); printf("\n");
} }
@ -1581,9 +1601,10 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
return success; return success;
} }
bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tensors, bool ModelLoader::load_tensors(std::map<std::string, ggml_tensor*>& tensors,
std::set<std::string> ignore_tensors, std::set<std::string> ignore_tensors,
int n_threads) { int n_threads,
bool enable_mmap) {
std::set<std::string> tensor_names_in_file; std::set<std::string> tensor_names_in_file;
std::mutex tensor_names_mutex; std::mutex tensor_names_mutex;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
@ -1594,7 +1615,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
tensor_names_in_file.insert(name); tensor_names_in_file.insert(name);
} }
struct ggml_tensor* real; ggml_tensor* real;
if (tensors.find(name) != tensors.end()) { if (tensors.find(name) != tensors.end()) {
real = tensors[name]; real = tensors[name];
} else { } else {
@ -1626,7 +1647,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
return true; return true;
}; };
bool success = load_tensors(on_new_tensor_cb, n_threads); bool success = load_tensors(on_new_tensor_cb, n_threads, enable_mmap);
if (!success) { if (!success) {
LOG_ERROR("load tensors from file failed"); LOG_ERROR("load tensors from file failed");
return false; return false;
@ -1732,6 +1753,13 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
// tensor_storage.ne[0], tensor_storage.ne[1], tensor_storage.ne[2], tensor_storage.ne[3], // tensor_storage.ne[0], tensor_storage.ne[1], tensor_storage.ne[2], tensor_storage.ne[3],
// tensor->n_dims, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); // tensor->n_dims, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
if (!tensor->data) {
GGML_ASSERT(ggml_nelements(tensor) == 0);
// avoid crashing the gguf writer by setting a dummy pointer for zero-sized tensors
LOG_DEBUG("setting dummy pointer for zero-sized tensor %s", name.c_str());
tensor->data = ggml_get_mem_buffer(ggml_ctx);
}
*dst_tensor = tensor; *dst_tensor = tensor;
gguf_add_tensor(gguf_ctx, tensor); gguf_add_tensor(gguf_ctx, tensor);
@ -1771,7 +1799,12 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
return mem_size; return mem_size;
} }
bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type, const char* tensor_type_rules) { bool convert(const char* input_path,
const char* vae_path,
const char* output_path,
sd_type_t output_type,
const char* tensor_type_rules,
bool convert_name) {
ModelLoader model_loader; ModelLoader model_loader;
if (!model_loader.init_from_file(input_path)) { if (!model_loader.init_from_file(input_path)) {
@ -1785,7 +1818,9 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa
return false; return false;
} }
} }
model_loader.convert_tensors_name(); if (convert_name) {
model_loader.convert_tensors_name();
}
bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, tensor_type_rules); bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, tensor_type_rules);
return success; return success;
} }

View File

@ -28,9 +28,11 @@ enum SDVersion {
VERSION_SD2, VERSION_SD2,
VERSION_SD2_INPAINT, VERSION_SD2_INPAINT,
VERSION_SD2_TINY_UNET, VERSION_SD2_TINY_UNET,
VERSION_SDXS,
VERSION_SDXL, VERSION_SDXL,
VERSION_SDXL_INPAINT, VERSION_SDXL_INPAINT,
VERSION_SDXL_PIX2PIX, VERSION_SDXL_PIX2PIX,
VERSION_SDXL_VEGA,
VERSION_SDXL_SSD1B, VERSION_SDXL_SSD1B,
VERSION_SVD, VERSION_SVD,
VERSION_SD3, VERSION_SD3,
@ -43,14 +45,16 @@ enum SDVersion {
VERSION_WAN2_2_I2V, VERSION_WAN2_2_I2V,
VERSION_WAN2_2_TI2V, VERSION_WAN2_2_TI2V,
VERSION_QWEN_IMAGE, VERSION_QWEN_IMAGE,
VERSION_ANIMA,
VERSION_FLUX2, VERSION_FLUX2,
VERSION_FLUX2_KLEIN,
VERSION_Z_IMAGE, VERSION_Z_IMAGE,
VERSION_OVIS_IMAGE, VERSION_OVIS_IMAGE,
VERSION_COUNT, VERSION_COUNT,
}; };
static inline bool sd_version_is_sd1(SDVersion version) { static inline bool sd_version_is_sd1(SDVersion version) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET) { if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) {
return true; return true;
} }
return false; return false;
@ -64,7 +68,7 @@ static inline bool sd_version_is_sd2(SDVersion version) {
} }
static inline bool sd_version_is_sdxl(SDVersion version) { static inline bool sd_version_is_sdxl(SDVersion version) {
if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX || version == VERSION_SDXL_SSD1B) { if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX || version == VERSION_SDXL_SSD1B || version == VERSION_SDXL_VEGA) {
return true; return true;
} }
return false; return false;
@ -99,7 +103,7 @@ static inline bool sd_version_is_flux(SDVersion version) {
} }
static inline bool sd_version_is_flux2(SDVersion version) { static inline bool sd_version_is_flux2(SDVersion version) {
if (version == VERSION_FLUX2) { if (version == VERSION_FLUX2 || version == VERSION_FLUX2_KLEIN) {
return true; return true;
} }
return false; return false;
@ -119,6 +123,13 @@ static inline bool sd_version_is_qwen_image(SDVersion version) {
return false; return false;
} }
static inline bool sd_version_is_anima(SDVersion version) {
if (version == VERSION_ANIMA) {
return true;
}
return false;
}
static inline bool sd_version_is_z_image(SDVersion version) { static inline bool sd_version_is_z_image(SDVersion version) {
if (version == VERSION_Z_IMAGE) { if (version == VERSION_Z_IMAGE) {
return true; return true;
@ -143,6 +154,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
sd_version_is_sd3(version) || sd_version_is_sd3(version) ||
sd_version_is_wan(version) || sd_version_is_wan(version) ||
sd_version_is_qwen_image(version) || sd_version_is_qwen_image(version) ||
sd_version_is_anima(version) ||
sd_version_is_z_image(version)) { sd_version_is_z_image(version)) {
return true; return true;
} }
@ -310,10 +322,11 @@ public:
std::map<ggml_type, uint32_t> get_vae_wtype_stat(); std::map<ggml_type, uint32_t> get_vae_wtype_stat();
String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; } String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = ""); void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0); bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false);
bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors, bool load_tensors(std::map<std::string, ggml_tensor*>& tensors,
std::set<std::string> ignore_tensors = {}, std::set<std::string> ignore_tensors = {},
int n_threads = 0); int n_threads = 0,
bool use_mmap = false);
std::vector<std::string> get_tensor_names() const { std::vector<std::string> get_tensor_names() const {
std::vector<std::string> names; std::vector<std::string> names;
@ -327,13 +340,6 @@ public:
bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type); bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT); int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
~ModelLoader() = default; ~ModelLoader() = default;
static std::string load_merges();
static std::string load_qwen2_merges();
static std::string load_mistral_merges();
static std::string load_mistral_vocab_json();
static std::string load_t5_tokenizer_json();
static std::string load_umt5_tokenizer_json();
}; };
#endif // __MODEL_H__ #endif // __MODEL_H__

View File

@ -653,6 +653,14 @@ std::string convert_diffusers_dit_to_original_lumina2(std::string name) {
return name; return name;
} }
std::string convert_other_dit_to_original_anima(std::string name) {
static const std::string anima_net_prefix = "net.";
if (!starts_with(name, anima_net_prefix)) {
name = anima_net_prefix + name;
}
return name;
}
std::string convert_diffusion_model_name(std::string name, std::string prefix, SDVersion version) { std::string convert_diffusion_model_name(std::string name, std::string prefix, SDVersion version) {
if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) { if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
name = convert_diffusers_unet_to_original_sd1(name); name = convert_diffusers_unet_to_original_sd1(name);
@ -664,6 +672,8 @@ std::string convert_diffusion_model_name(std::string name, std::string prefix, S
name = convert_diffusers_dit_to_original_flux(name); name = convert_diffusers_dit_to_original_flux(name);
} else if (sd_version_is_z_image(version)) { } else if (sd_version_is_z_image(version)) {
name = convert_diffusers_dit_to_original_lumina2(name); name = convert_diffusers_dit_to_original_lumina2(name);
} else if (sd_version_is_anima(version)) {
name = convert_other_dit_to_original_anima(name);
} }
return name; return name;
} }
@ -835,12 +845,14 @@ std::string convert_sep_to_dot(std::string name) {
"proj_out", "proj_out",
"transformer_blocks", "transformer_blocks",
"single_transformer_blocks", "single_transformer_blocks",
"single_blocks",
"diffusion_model", "diffusion_model",
"cond_stage_model", "cond_stage_model",
"first_stage_model", "first_stage_model",
"conv_in", "conv_in",
"conv_out", "conv_out",
"lora_down", "lora_down",
"lora_mid",
"lora_up", "lora_up",
"diff_b", "diff_b",
"hada_w1_a", "hada_w1_a",
@ -876,7 +888,18 @@ std::string convert_sep_to_dot(std::string name) {
"ff_context", "ff_context",
"norm_added_q", "norm_added_q",
"norm_added_v", "norm_added_v",
"to_add_out"}; "to_add_out",
"txt_mod",
"img_mod",
"txt_mlp",
"img_mlp",
"proj_mlp",
"wi_0",
"wi_1",
"norm1_context",
"ff_context",
"x_embedder",
};
// record the positions of underscores that should NOT be replaced // record the positions of underscores that should NOT be replaced
std::unordered_set<size_t> protected_positions; std::unordered_set<size_t> protected_positions;
@ -948,6 +971,7 @@ bool is_first_stage_model_name(const std::string& name) {
std::string convert_tensor_name(std::string name, SDVersion version) { std::string convert_tensor_name(std::string name, SDVersion version) {
bool is_lora = false; bool is_lora = false;
bool is_lycoris_underline = false; bool is_lycoris_underline = false;
bool is_underline = false;
std::vector<std::string> lora_prefix_vec = { std::vector<std::string> lora_prefix_vec = {
"lora.lora.", "lora.lora.",
"lora.lora_", "lora.lora_",
@ -955,12 +979,27 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
"lora.lycoris.", "lora.lycoris.",
"lora.", "lora.",
}; };
std::vector<std::string> underline_lora_prefix_vec = {
"unet_",
"te_",
"te1_",
"te2_",
"te3_",
"vae_",
};
for (const auto& prefix : lora_prefix_vec) { for (const auto& prefix : lora_prefix_vec) {
if (starts_with(name, prefix)) { if (starts_with(name, prefix)) {
is_lora = true; is_lora = true;
name = name.substr(prefix.size()); name = name.substr(prefix.size());
if (contains(prefix, "lycoris_")) { if (contains(prefix, "lycoris_")) {
is_lycoris_underline = true; is_lycoris_underline = true;
} else {
for (const auto& underline_lora_prefix : underline_lora_prefix_vec) {
if (starts_with(name, underline_lora_prefix)) {
is_underline = true;
break;
}
}
} }
break; break;
} }
@ -969,10 +1008,13 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
if (is_lora) { if (is_lora) {
std::map<std::string, std::string> lora_suffix_map = { std::map<std::string, std::string> lora_suffix_map = {
{".lora_down.weight", ".weight.lora_down"}, {".lora_down.weight", ".weight.lora_down"},
{".lora_mid.weight", ".weight.lora_mid"},
{".lora_up.weight", ".weight.lora_up"}, {".lora_up.weight", ".weight.lora_up"},
{".lora.down.weight", ".weight.lora_down"}, {".lora.down.weight", ".weight.lora_down"},
{".lora.mid.weight", ".weight.lora_mid"},
{".lora.up.weight", ".weight.lora_up"}, {".lora.up.weight", ".weight.lora_up"},
{"_lora.down.weight", ".weight.lora_down"}, {"_lora.down.weight", ".weight.lora_down"},
{"_lora.mid.weight", ".weight.lora_mid"},
{"_lora.up.weight", ".weight.lora_up"}, {"_lora.up.weight", ".weight.lora_up"},
{".lora_A.weight", ".weight.lora_down"}, {".lora_A.weight", ".weight.lora_down"},
{".lora_B.weight", ".weight.lora_up"}, {".lora_B.weight", ".weight.lora_up"},
@ -1020,12 +1062,14 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
} }
} }
if (sd_version_is_unet(version) || is_lycoris_underline) { // LOG_DEBUG("name %s %d", name.c_str(), version);
if (sd_version_is_unet(version) || is_underline || is_lycoris_underline) {
name = convert_sep_to_dot(name); name = convert_sep_to_dot(name);
} }
} }
std::vector<std::pair<std::string, std::string>> prefix_map = { std::unordered_map<std::string, std::string> prefix_map = {
{"diffusion_model.", "model.diffusion_model."}, {"diffusion_model.", "model.diffusion_model."},
{"unet.", "model.diffusion_model."}, {"unet.", "model.diffusion_model."},
{"transformer.", "model.diffusion_model."}, // dit {"transformer.", "model.diffusion_model."}, // dit
@ -1040,8 +1084,13 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
// {"te2.text_model.encoder.layers.", "cond_stage_model.1.model.transformer.resblocks."}, // {"te2.text_model.encoder.layers.", "cond_stage_model.1.model.transformer.resblocks."},
{"te2.", "cond_stage_model.1.transformer."}, {"te2.", "cond_stage_model.1.transformer."},
{"te1.", "cond_stage_model.transformer."}, {"te1.", "cond_stage_model.transformer."},
{"te3.", "text_encoders.t5xxl.transformer."},
}; };
if (sd_version_is_flux(version)) {
prefix_map["te1."] = "text_encoders.clip_l.transformer.";
}
replace_with_prefix_map(name, prefix_map); replace_with_prefix_map(name, prefix_map);
// diffusion model // diffusion model
@ -1071,7 +1120,11 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
for (const auto& prefix : first_stage_model_prefix_vec) { for (const auto& prefix : first_stage_model_prefix_vec) {
if (starts_with(name, prefix)) { if (starts_with(name, prefix)) {
name = convert_first_stage_model_name(name.substr(prefix.size()), prefix); name = convert_first_stage_model_name(name.substr(prefix.size()), prefix);
name = prefix + name; if (version == VERSION_SDXS) {
name = "tae." + name;
} else {
name = prefix + name;
}
break; break;
} }
} }

View File

@ -21,19 +21,19 @@ public:
blocks["layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(in_dim)); blocks["layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(in_dim));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, channels, h, w] // x: [N, channels, h, w]
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]); auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]); auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layernorm"]); auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layernorm"]);
struct ggml_tensor* r = x; ggml_tensor* r = x;
// x = ggml_ext_layer_norm(ctx, x, ln_w, ln_b); // x = ggml_ext_layer_norm(ctx, x, ln_w, ln_b);
x = layer_norm->forward(ctx, x); x = layer_norm->forward(ctx, x);
// x = ggml_add(ctx, ggml_mul_mat(ctx, fc1_w, x), fc1_b); // x = ggml_add(ctx, ggml_mul_mat(ctx, fc1_w, x), fc1_b);
x = fc1->forward(ctx, x); x = fc1->forward(ctx, x);
x = ggml_gelu_inplace(ctx->ggml_ctx, x); x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
x = fc2->forward(ctx, x); x = fc2->forward(ctx, x);
// x = ggml_add(ctx, ggml_mul_mat(ctx, fc2_w, x), fc2_b); // x = ggml_add(ctx, ggml_mul_mat(ctx, fc2_w, x), fc2_b);
if (use_residue) if (use_residue)
@ -54,8 +54,8 @@ public:
blocks["1"] = std::shared_ptr<GGMLBlock>(new Mlp(dim, inner_dim, dim, false)); blocks["1"] = std::shared_ptr<GGMLBlock>(new Mlp(dim, inner_dim, dim, false));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x) { ggml_tensor* x) {
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["0"]); auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["0"]);
auto ff = std::dynamic_pointer_cast<Mlp>(blocks["1"]); auto ff = std::dynamic_pointer_cast<Mlp>(blocks["1"]);
@ -72,7 +72,7 @@ struct PerceiverAttention : public GGMLBlock {
int heads; // = heads int heads; // = heads
public: public:
PerceiverAttention(int dim, int dim_h = 64, int h = 8) PerceiverAttention(int dim, int dim_h = 64, int h = 8)
: scale(powf(dim_h, -0.5)), dim_head(dim_h), heads(h) { : scale(powf(static_cast<float>(dim_h), -0.5f)), dim_head(dim_h), heads(h) {
int inner_dim = dim_head * heads; int inner_dim = dim_head * heads;
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim)); blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim)); blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
@ -81,9 +81,9 @@ public:
blocks["to_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim, false)); blocks["to_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim, false));
} }
struct ggml_tensor* reshape_tensor(struct ggml_context* ctx, ggml_tensor* reshape_tensor(ggml_context* ctx,
struct ggml_tensor* x, ggml_tensor* x,
int heads) { int heads) {
int64_t ne[4]; int64_t ne[4];
for (int i = 0; i < 4; ++i) for (int i = 0; i < 4; ++i)
ne[i] = x->ne[i]; ne[i] = x->ne[i];
@ -92,17 +92,17 @@ public:
return x; return x;
} }
std::vector<struct ggml_tensor*> chunk_half(struct ggml_context* ctx, std::vector<ggml_tensor*> chunk_half(ggml_context* ctx,
struct ggml_tensor* x) { ggml_tensor* x) {
auto tlo = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], 0); auto tlo = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], 0);
auto tli = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], x->nb[0] * x->ne[0] / 2); auto tli = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], x->nb[0] * x->ne[0] / 2);
return {ggml_cont(ctx, tlo), return {ggml_cont(ctx, tlo),
ggml_cont(ctx, tli)}; ggml_cont(ctx, tli)};
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* latents) { ggml_tensor* latents) {
// x (torch.Tensor): image features // x (torch.Tensor): image features
// shape (b, n1, D) // shape (b, n1, D)
// latent (torch.Tensor): latent features // latent (torch.Tensor): latent features
@ -129,8 +129,8 @@ public:
k = reshape_tensor(ctx->ggml_ctx, k, heads); k = reshape_tensor(ctx->ggml_ctx, k, heads);
v = reshape_tensor(ctx->ggml_ctx, v, heads); v = reshape_tensor(ctx->ggml_ctx, v, heads);
scale = 1.f / sqrt(sqrt((float)dim_head)); scale = 1.f / sqrt(sqrt((float)dim_head));
k = ggml_scale_inplace(ctx->ggml_ctx, k, scale); k = ggml_ext_scale(ctx->ggml_ctx, k, scale, true);
q = ggml_scale_inplace(ctx->ggml_ctx, q, scale); q = ggml_ext_scale(ctx->ggml_ctx, q, scale, true);
// auto weight = ggml_mul_mat(ctx, q, k); // auto weight = ggml_mul_mat(ctx, q, k);
auto weight = ggml_mul_mat(ctx->ggml_ctx, k, q); // NOTE order of mul is opposite to pytorch auto weight = ggml_mul_mat(ctx->ggml_ctx, k, q); // NOTE order of mul is opposite to pytorch
@ -176,9 +176,9 @@ public:
} }
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* latents, ggml_tensor* latents,
struct ggml_tensor* x) { ggml_tensor* x) {
// x: [N, channels, h, w] // x: [N, channels, h, w]
auto proj_in = std::dynamic_pointer_cast<Linear>(blocks["proj_in"]); auto proj_in = std::dynamic_pointer_cast<Linear>(blocks["proj_in"]);
auto proj_out = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]); auto proj_out = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
@ -225,19 +225,19 @@ public:
4)); 4));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* last_hidden_state) { ggml_tensor* last_hidden_state) {
// x: [N, channels, h, w] // x: [N, channels, h, w]
auto token_proj = std::dynamic_pointer_cast<Mlp>(blocks["token_proj"]); auto token_proj = std::dynamic_pointer_cast<Mlp>(blocks["token_proj"]);
auto token_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["token_norm"]); auto token_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["token_norm"]);
auto perceiver_resampler = std::dynamic_pointer_cast<FacePerceiverResampler>(blocks["perceiver_resampler"]); auto perceiver_resampler = std::dynamic_pointer_cast<FacePerceiverResampler>(blocks["perceiver_resampler"]);
x = token_proj->forward(ctx, x); x = token_proj->forward(ctx, x);
int64_t nel = ggml_nelements(x); int64_t nel = ggml_nelements(x);
x = ggml_reshape_3d(ctx->ggml_ctx, x, cross_attention_dim, num_tokens, nel / (cross_attention_dim * num_tokens)); x = ggml_reshape_3d(ctx->ggml_ctx, x, cross_attention_dim, num_tokens, nel / (cross_attention_dim * num_tokens));
x = token_norm->forward(ctx, x); x = token_norm->forward(ctx, x);
struct ggml_tensor* out = perceiver_resampler->forward(ctx, x, last_hidden_state); ggml_tensor* out = perceiver_resampler->forward(ctx, x, last_hidden_state);
if (use_residul) if (use_residul)
out = ggml_add(ctx->ggml_ctx, x, out); out = ggml_add(ctx->ggml_ctx, x, out);
return out; return out;
@ -256,9 +256,9 @@ public:
blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(embed_dim)); blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(embed_dim));
} }
struct ggml_tensor* fuse_fn(GGMLRunnerContext* ctx, ggml_tensor* fuse_fn(GGMLRunnerContext* ctx,
struct ggml_tensor* prompt_embeds, ggml_tensor* prompt_embeds,
struct ggml_tensor* id_embeds) { ggml_tensor* id_embeds) {
auto mlp1 = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp1"]); auto mlp1 = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp1"]);
auto mlp2 = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp2"]); auto mlp2 = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp2"]);
auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm"]); auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm"]);
@ -273,24 +273,24 @@ public:
return stacked_id_embeds; return stacked_id_embeds;
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* prompt_embeds, ggml_tensor* prompt_embeds,
struct ggml_tensor* id_embeds, ggml_tensor* id_embeds,
struct ggml_tensor* class_tokens_mask, ggml_tensor* class_tokens_mask,
struct ggml_tensor* class_tokens_mask_pos, ggml_tensor* class_tokens_mask_pos,
struct ggml_tensor* left, ggml_tensor* left,
struct ggml_tensor* right) { ggml_tensor* right) {
// x: [N, channels, h, w] // x: [N, channels, h, w]
struct ggml_tensor* valid_id_embeds = id_embeds; ggml_tensor* valid_id_embeds = id_embeds;
// # slice out the image token embeddings // # slice out the image token embeddings
ggml_set_name(class_tokens_mask_pos, "class_tokens_mask_pos"); ggml_set_name(class_tokens_mask_pos, "class_tokens_mask_pos");
ggml_set_name(prompt_embeds, "prompt_embeds"); ggml_set_name(prompt_embeds, "prompt_embeds");
struct ggml_tensor* image_token_embeds = ggml_get_rows(ctx->ggml_ctx, prompt_embeds, class_tokens_mask_pos); ggml_tensor* image_token_embeds = ggml_get_rows(ctx->ggml_ctx, prompt_embeds, class_tokens_mask_pos);
ggml_set_name(image_token_embeds, "image_token_embeds"); ggml_set_name(image_token_embeds, "image_token_embeds");
valid_id_embeds = ggml_reshape_2d(ctx->ggml_ctx, valid_id_embeds, valid_id_embeds->ne[0], valid_id_embeds = ggml_reshape_2d(ctx->ggml_ctx, valid_id_embeds, valid_id_embeds->ne[0],
ggml_nelements(valid_id_embeds) / valid_id_embeds->ne[0]); ggml_nelements(valid_id_embeds) / valid_id_embeds->ne[0]);
struct ggml_tensor* stacked_id_embeds = fuse_fn(ctx, image_token_embeds, valid_id_embeds); ggml_tensor* stacked_id_embeds = fuse_fn(ctx, image_token_embeds, valid_id_embeds);
if (left && right) { if (left && right) {
stacked_id_embeds = ggml_concat(ctx->ggml_ctx, left, stacked_id_embeds, 1); stacked_id_embeds = ggml_concat(ctx->ggml_ctx, left, stacked_id_embeds, 1);
@ -301,10 +301,10 @@ public:
stacked_id_embeds = ggml_concat(ctx->ggml_ctx, stacked_id_embeds, right, 1); stacked_id_embeds = ggml_concat(ctx->ggml_ctx, stacked_id_embeds, right, 1);
} }
class_tokens_mask = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, class_tokens_mask)); class_tokens_mask = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, class_tokens_mask));
class_tokens_mask = ggml_repeat(ctx->ggml_ctx, class_tokens_mask, prompt_embeds); class_tokens_mask = ggml_repeat(ctx->ggml_ctx, class_tokens_mask, prompt_embeds);
prompt_embeds = ggml_mul(ctx->ggml_ctx, prompt_embeds, class_tokens_mask); prompt_embeds = ggml_mul(ctx->ggml_ctx, prompt_embeds, class_tokens_mask);
struct ggml_tensor* updated_prompt_embeds = ggml_add(ctx->ggml_ctx, prompt_embeds, stacked_id_embeds); ggml_tensor* updated_prompt_embeds = ggml_add(ctx->ggml_ctx, prompt_embeds, stacked_id_embeds);
ggml_set_name(updated_prompt_embeds, "updated_prompt_embeds"); ggml_set_name(updated_prompt_embeds, "updated_prompt_embeds");
return updated_prompt_embeds; return updated_prompt_embeds;
} }
@ -317,22 +317,22 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
blocks["fuse_module"] = std::shared_ptr<GGMLBlock>(new FuseModule(2048)); blocks["fuse_module"] = std::shared_ptr<GGMLBlock>(new FuseModule(2048));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* id_pixel_values, ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds, ggml_tensor* prompt_embeds,
struct ggml_tensor* class_tokens_mask, ggml_tensor* class_tokens_mask,
struct ggml_tensor* class_tokens_mask_pos, ggml_tensor* class_tokens_mask_pos,
struct ggml_tensor* left, ggml_tensor* left,
struct ggml_tensor* right) { ggml_tensor* right) {
// x: [N, channels, h, w] // x: [N, channels, h, w]
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]); auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]); auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
auto visual_projection_2 = std::dynamic_pointer_cast<Linear>(blocks["visual_projection_2"]); auto visual_projection_2 = std::dynamic_pointer_cast<Linear>(blocks["visual_projection_2"]);
auto fuse_module = std::dynamic_pointer_cast<FuseModule>(blocks["fuse_module"]); auto fuse_module = std::dynamic_pointer_cast<FuseModule>(blocks["fuse_module"]);
struct ggml_tensor* shared_id_embeds = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size] ggml_tensor* shared_id_embeds = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size]
struct ggml_tensor* id_embeds = visual_projection->forward(ctx, shared_id_embeds); // [N, proj_dim(768)] ggml_tensor* id_embeds = visual_projection->forward(ctx, shared_id_embeds); // [N, proj_dim(768)]
struct ggml_tensor* id_embeds_2 = visual_projection_2->forward(ctx, shared_id_embeds); // [N, 1280] ggml_tensor* id_embeds_2 = visual_projection_2->forward(ctx, shared_id_embeds); // [N, 1280]
id_embeds = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 2, 0, 1, 3)); id_embeds = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 2, 0, 1, 3));
id_embeds_2 = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds_2, 2, 0, 1, 3)); id_embeds_2 = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds_2, 2, 0, 1, 3));
@ -340,12 +340,12 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
id_embeds = ggml_concat(ctx->ggml_ctx, id_embeds, id_embeds_2, 2); // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right id_embeds = ggml_concat(ctx->ggml_ctx, id_embeds, id_embeds_2, 2); // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
id_embeds = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 1, 2, 0, 3)); id_embeds = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 1, 2, 0, 3));
struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx, ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
prompt_embeds, prompt_embeds,
id_embeds, id_embeds,
class_tokens_mask, class_tokens_mask,
class_tokens_mask_pos, class_tokens_mask_pos,
left, right); left, right);
return updated_prompt_embeds; return updated_prompt_embeds;
} }
}; };
@ -365,29 +365,29 @@ struct PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock : public CLIPVisionMo
num_tokens)); num_tokens));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* id_pixel_values, ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds, ggml_tensor* prompt_embeds,
struct ggml_tensor* class_tokens_mask, ggml_tensor* class_tokens_mask,
struct ggml_tensor* class_tokens_mask_pos, ggml_tensor* class_tokens_mask_pos,
struct ggml_tensor* id_embeds, ggml_tensor* id_embeds,
struct ggml_tensor* left, ggml_tensor* left,
struct ggml_tensor* right) { ggml_tensor* right) {
// x: [N, channels, h, w] // x: [N, channels, h, w]
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]); auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
auto fuse_module = std::dynamic_pointer_cast<FuseModule>(blocks["fuse_module"]); auto fuse_module = std::dynamic_pointer_cast<FuseModule>(blocks["fuse_module"]);
auto qformer_perceiver = std::dynamic_pointer_cast<QFormerPerceiver>(blocks["qformer_perceiver"]); auto qformer_perceiver = std::dynamic_pointer_cast<QFormerPerceiver>(blocks["qformer_perceiver"]);
// struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size] // ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size]
struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values, false); // [N, hidden_size] ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values, false); // [N, hidden_size]
id_embeds = qformer_perceiver->forward(ctx, id_embeds, last_hidden_state); id_embeds = qformer_perceiver->forward(ctx, id_embeds, last_hidden_state);
struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx, ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
prompt_embeds, prompt_embeds,
id_embeds, id_embeds,
class_tokens_mask, class_tokens_mask,
class_tokens_mask_pos, class_tokens_mask_pos,
left, right); left, right);
return updated_prompt_embeds; return updated_prompt_embeds;
} }
}; };
@ -436,18 +436,18 @@ public:
return pm_version; return pm_version;
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
if (pm_version == PM_VERSION_1) if (pm_version == PM_VERSION_1)
id_encoder.get_param_tensors(tensors, prefix); id_encoder.get_param_tensors(tensors, prefix);
else if (pm_version == PM_VERSION_2) else if (pm_version == PM_VERSION_2)
id_encoder2.get_param_tensors(tensors, prefix); id_encoder2.get_param_tensors(tensors, prefix);
} }
struct ggml_cgraph* build_graph( // struct ggml_allocr* allocr, ggml_cgraph* build_graph( // ggml_allocr* allocr,
struct ggml_tensor* id_pixel_values, ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds, ggml_tensor* prompt_embeds,
std::vector<bool>& class_tokens_mask, std::vector<bool>& class_tokens_mask,
struct ggml_tensor* id_embeds) { ggml_tensor* id_embeds) {
ctm.clear(); ctm.clear();
ctmf16.clear(); ctmf16.clear();
ctmpos.clear(); ctmpos.clear();
@ -458,20 +458,20 @@ public:
auto runner_ctx = get_context(); auto runner_ctx = get_context();
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); ggml_cgraph* gf = ggml_new_graph(compute_ctx);
int64_t hidden_size = prompt_embeds->ne[0]; int64_t hidden_size = prompt_embeds->ne[0];
int64_t seq_length = prompt_embeds->ne[1]; int64_t seq_length = prompt_embeds->ne[1];
ggml_type type = GGML_TYPE_F32; ggml_type type = GGML_TYPE_F32;
struct ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(runner_ctx.ggml_ctx, type, class_tokens_mask.size()); ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(runner_ctx.ggml_ctx, type, class_tokens_mask.size());
struct ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values); ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values);
struct ggml_tensor* prompt_embeds_d = to_backend(prompt_embeds); ggml_tensor* prompt_embeds_d = to_backend(prompt_embeds);
struct ggml_tensor* id_embeds_d = to_backend(id_embeds); ggml_tensor* id_embeds_d = to_backend(id_embeds);
struct ggml_tensor* left = nullptr; ggml_tensor* left = nullptr;
struct ggml_tensor* right = nullptr; ggml_tensor* right = nullptr;
for (int i = 0; i < class_tokens_mask.size(); i++) { for (int i = 0; i < class_tokens_mask.size(); i++) {
if (class_tokens_mask[i]) { if (class_tokens_mask[i]) {
// printf(" 1,"); // printf(" 1,");
@ -495,7 +495,7 @@ public:
right = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type, right = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type,
hidden_size, seq_length - ctmpos[ctmpos.size() - 1] - 1, 1); hidden_size, seq_length - ctmpos[ctmpos.size() - 1] - 1, 1);
} }
struct ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(runner_ctx.ggml_ctx, GGML_TYPE_I32, ctmpos.size()); ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(runner_ctx.ggml_ctx, GGML_TYPE_I32, ctmpos.size());
{ {
if (type == GGML_TYPE_F16) if (type == GGML_TYPE_F16)
@ -526,7 +526,7 @@ public:
} }
} }
} }
struct ggml_tensor* updated_prompt_embeds = nullptr; ggml_tensor* updated_prompt_embeds = nullptr;
if (pm_version == PM_VERSION_1) if (pm_version == PM_VERSION_1)
updated_prompt_embeds = id_encoder.forward(&runner_ctx, updated_prompt_embeds = id_encoder.forward(&runner_ctx,
id_pixel_values_d, id_pixel_values_d,
@ -549,13 +549,13 @@ public:
} }
bool compute(const int n_threads, bool compute(const int n_threads,
struct ggml_tensor* id_pixel_values, ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds, ggml_tensor* prompt_embeds,
struct ggml_tensor* id_embeds, ggml_tensor* id_embeds,
std::vector<bool>& class_tokens_mask, std::vector<bool>& class_tokens_mask,
struct ggml_tensor** updated_prompt_embeds, ggml_tensor** updated_prompt_embeds,
ggml_context* output_ctx) { ggml_context* output_ctx) {
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> ggml_cgraph* {
// return build_graph(compute_allocr, id_pixel_values, prompt_embeds, class_tokens_mask); // return build_graph(compute_allocr, id_pixel_values, prompt_embeds, class_tokens_mask);
return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds); return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds);
}; };
@ -566,7 +566,7 @@ public:
}; };
struct PhotoMakerIDEmbed : public GGMLRunner { struct PhotoMakerIDEmbed : public GGMLRunner {
std::map<std::string, struct ggml_tensor*> tensors; std::map<std::string, ggml_tensor*> tensors;
std::string file_path; std::string file_path;
ModelLoader* model_loader; ModelLoader* model_loader;
bool load_failed = false; bool load_failed = false;
@ -606,11 +606,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
} }
if (dry_run) { if (dry_run) {
std::lock_guard<std::mutex> lock(tensor_mutex); std::lock_guard<std::mutex> lock(tensor_mutex);
struct ggml_tensor* real = ggml_new_tensor(params_ctx, ggml_tensor* real = ggml_new_tensor(params_ctx,
tensor_storage.type, tensor_storage.type,
tensor_storage.n_dims, tensor_storage.n_dims,
tensor_storage.ne); tensor_storage.ne);
tensors[name] = real; tensors[name] = real;
} else { } else {
auto real = tensors[name]; auto real = tensors[name];
*dst_tensor = real; *dst_tensor = real;
@ -629,8 +629,8 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
return true; return true;
} }
struct ggml_tensor* get() { ggml_tensor* get() {
std::map<std::string, struct ggml_tensor*>::iterator pos; std::map<std::string, ggml_tensor*>::iterator pos;
pos = tensors.find("pmid.id_embeds"); pos = tensors.find("pmid.id_embeds");
if (pos != tensors.end()) if (pos != tensors.end())
return pos->second; return pos->second;

View File

@ -2,15 +2,15 @@
#define __PREPROCESSING_HPP__ #define __PREPROCESSING_HPP__
#include "ggml_extend.hpp" #include "ggml_extend.hpp"
#define M_PI_ 3.14159265358979323846 #define M_PI_ 3.14159265358979323846f
void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml_tensor* kernel, int padding) { void convolve(ggml_tensor* input, ggml_tensor* output, ggml_tensor* kernel, int padding) {
struct ggml_init_params params; ggml_init_params params;
params.mem_size = 80 * input->ne[0] * input->ne[1]; // 20M for 512x512 params.mem_size = 80 * input->ne[0] * input->ne[1]; // 20M for 512x512
params.mem_buffer = nullptr; params.mem_buffer = nullptr;
params.no_alloc = false; params.no_alloc = false;
struct ggml_context* ctx0 = ggml_init(params); ggml_context* ctx0 = ggml_init(params);
struct ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1); ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1);
ggml_fp32_to_fp16_row((float*)kernel->data, (ggml_fp16_t*)kernel_fp16->data, ggml_nelements(kernel)); ggml_fp32_to_fp16_row((float*)kernel->data, (ggml_fp16_t*)kernel_fp16->data, ggml_nelements(kernel));
ggml_tensor* h = ggml_conv_2d(ctx0, kernel_fp16, input, 1, 1, padding, padding, 1, 1); ggml_tensor* h = ggml_conv_2d(ctx0, kernel_fp16, input, 1, 1, padding, padding, 1, 1);
ggml_cgraph* gf = ggml_new_graph(ctx0); ggml_cgraph* gf = ggml_new_graph(ctx0);
@ -19,21 +19,21 @@ void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml
ggml_free(ctx0); ggml_free(ctx0);
} }
void gaussian_kernel(struct ggml_tensor* kernel) { void gaussian_kernel(ggml_tensor* kernel) {
int ks_mid = kernel->ne[0] / 2; int ks_mid = static_cast<int>(kernel->ne[0] / 2);
float sigma = 1.4f; float sigma = 1.4f;
float normal = 1.f / (2.0f * M_PI_ * powf(sigma, 2.0f)); float normal = 1.f / (2.0f * M_PI_ * powf(sigma, 2.0f));
for (int y = 0; y < kernel->ne[0]; y++) { for (int y = 0; y < kernel->ne[0]; y++) {
float gx = -ks_mid + y; float gx = static_cast<float>(-ks_mid + y);
for (int x = 0; x < kernel->ne[1]; x++) { for (int x = 0; x < kernel->ne[1]; x++) {
float gy = -ks_mid + x; float gy = static_cast<float>(-ks_mid + x);
float k_ = expf(-((gx * gx + gy * gy) / (2.0f * powf(sigma, 2.0f)))) * normal; float k_ = expf(-((gx * gx + gy * gy) / (2.0f * powf(sigma, 2.0f)))) * normal;
ggml_ext_tensor_set_f32(kernel, k_, x, y); ggml_ext_tensor_set_f32(kernel, k_, x, y);
} }
} }
} }
void grayscale(struct ggml_tensor* rgb_img, struct ggml_tensor* grayscale) { void grayscale(ggml_tensor* rgb_img, ggml_tensor* grayscale) {
for (int iy = 0; iy < rgb_img->ne[1]; iy++) { for (int iy = 0; iy < rgb_img->ne[1]; iy++) {
for (int ix = 0; ix < rgb_img->ne[0]; ix++) { for (int ix = 0; ix < rgb_img->ne[0]; ix++) {
float r = ggml_ext_tensor_get_f32(rgb_img, ix, iy); float r = ggml_ext_tensor_get_f32(rgb_img, ix, iy);
@ -45,8 +45,8 @@ void grayscale(struct ggml_tensor* rgb_img, struct ggml_tensor* grayscale) {
} }
} }
void prop_hypot(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) { void prop_hypot(ggml_tensor* x, ggml_tensor* y, ggml_tensor* h) {
int n_elements = ggml_nelements(h); int n_elements = static_cast<int>(ggml_nelements(h));
float* dx = (float*)x->data; float* dx = (float*)x->data;
float* dy = (float*)y->data; float* dy = (float*)y->data;
float* dh = (float*)h->data; float* dh = (float*)h->data;
@ -55,8 +55,8 @@ void prop_hypot(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor
} }
} }
void prop_arctan2(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) { void prop_arctan2(ggml_tensor* x, ggml_tensor* y, ggml_tensor* h) {
int n_elements = ggml_nelements(h); int n_elements = static_cast<int>(ggml_nelements(h));
float* dx = (float*)x->data; float* dx = (float*)x->data;
float* dy = (float*)y->data; float* dy = (float*)y->data;
float* dh = (float*)h->data; float* dh = (float*)h->data;
@ -65,8 +65,8 @@ void prop_arctan2(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tens
} }
} }
void normalize_tensor(struct ggml_tensor* g) { void normalize_tensor(ggml_tensor* g) {
int n_elements = ggml_nelements(g); int n_elements = static_cast<int>(ggml_nelements(g));
float* dg = (float*)g->data; float* dg = (float*)g->data;
float max = -INFINITY; float max = -INFINITY;
for (int i = 0; i < n_elements; i++) { for (int i = 0; i < n_elements; i++) {
@ -78,7 +78,7 @@ void normalize_tensor(struct ggml_tensor* g) {
} }
} }
void non_max_supression(struct ggml_tensor* result, struct ggml_tensor* G, struct ggml_tensor* D) { void non_max_supression(ggml_tensor* result, ggml_tensor* G, ggml_tensor* D) {
for (int iy = 1; iy < result->ne[1] - 1; iy++) { for (int iy = 1; iy < result->ne[1] - 1; iy++) {
for (int ix = 1; ix < result->ne[0] - 1; ix++) { for (int ix = 1; ix < result->ne[0] - 1; ix++) {
float angle = ggml_ext_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_; float angle = ggml_ext_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_;
@ -117,8 +117,8 @@ void non_max_supression(struct ggml_tensor* result, struct ggml_tensor* G, struc
} }
} }
void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float low_threshold, float weak, float strong) { void threshold_hystersis(ggml_tensor* img, float high_threshold, float low_threshold, float weak, float strong) {
int n_elements = ggml_nelements(img); int n_elements = static_cast<int>(ggml_nelements(img));
float* imd = (float*)img->data; float* imd = (float*)img->data;
float max = -INFINITY; float max = -INFINITY;
for (int i = 0; i < n_elements; i++) { for (int i = 0; i < n_elements; i++) {
@ -163,11 +163,11 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo
} }
bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) { bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
struct ggml_init_params params; ggml_init_params params;
params.mem_size = static_cast<size_t>(40 * img.width * img.height); // 10MB for 512x512 params.mem_size = static_cast<size_t>(40 * img.width * img.height); // 10MB for 512x512
params.mem_buffer = nullptr; params.mem_buffer = nullptr;
params.no_alloc = false; params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params); ggml_context* work_ctx = ggml_init(params);
if (!work_ctx) { if (!work_ctx) {
LOG_ERROR("ggml_init() failed"); LOG_ERROR("ggml_init() failed");
@ -185,19 +185,19 @@ bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold,
-1, -2, -1}; -1, -2, -1};
// generate kernel // generate kernel
int kernel_size = 5; int kernel_size = 5;
struct ggml_tensor* gkernel = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kernel_size, kernel_size, 1, 1); ggml_tensor* gkernel = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kernel_size, kernel_size, 1, 1);
struct ggml_tensor* sf_kx = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1); ggml_tensor* sf_kx = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
memcpy(sf_kx->data, kX, ggml_nbytes(sf_kx)); memcpy(sf_kx->data, kX, ggml_nbytes(sf_kx));
struct ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1); ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky)); memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky));
gaussian_kernel(gkernel); gaussian_kernel(gkernel);
struct ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 3, 1); ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 3, 1);
struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 1, 1); ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 1, 1);
struct ggml_tensor* iX = ggml_dup_tensor(work_ctx, image_gray); ggml_tensor* iX = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray); ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray); ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* tetha = ggml_dup_tensor(work_ctx, image_gray); ggml_tensor* tetha = ggml_dup_tensor(work_ctx, image_gray);
sd_image_to_ggml_tensor(img, image); sd_image_to_ggml_tensor(img, image);
grayscale(image, image_gray); grayscale(image, image_gray);
convolve(image_gray, image_gray, gkernel, 2); convolve(image_gray, image_gray, gkernel, 2);
@ -209,8 +209,8 @@ bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold,
non_max_supression(image_gray, G, tetha); non_max_supression(image_gray, G, tetha);
threshold_hystersis(image_gray, high_threshold, low_threshold, weak, strong); threshold_hystersis(image_gray, high_threshold, low_threshold, weak, strong);
// to RGB channels // to RGB channels
for (int iy = 0; iy < img.height; iy++) { for (uint32_t iy = 0; iy < img.height; iy++) {
for (int ix = 0; ix < img.width; ix++) { for (uint32_t ix = 0; ix < img.width; ix++) {
float gray = ggml_ext_tensor_get_f32(image_gray, ix, iy); float gray = ggml_ext_tensor_get_f32(image_gray, ix, iy);
gray = inverse ? 1.0f - gray : gray; gray = inverse ? 1.0f - gray : gray;
ggml_ext_tensor_set_f32(image, gray, ix, iy); ggml_ext_tensor_set_f32(image, gray, ix, iy);

View File

@ -3,9 +3,8 @@
#include <memory> #include <memory>
#include "common.hpp" #include "common_block.hpp"
#include "flux.hpp" #include "flux.hpp"
#include "ggml_extend.hpp"
namespace Qwen { namespace Qwen {
constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480; constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480;
@ -27,9 +26,9 @@ namespace Qwen {
blocks["linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, out_dim, sample_proj_bias)); blocks["linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, out_dim, sample_proj_bias));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* sample, ggml_tensor* sample,
struct ggml_tensor* condition = nullptr) { ggml_tensor* condition = nullptr) {
if (condition != nullptr) { if (condition != nullptr) {
auto cond_proj = std::dynamic_pointer_cast<Linear>(blocks["cond_proj"]); auto cond_proj = std::dynamic_pointer_cast<Linear>(blocks["cond_proj"]);
sample = ggml_add(ctx->ggml_ctx, sample, cond_proj->forward(ctx, condition)); sample = ggml_add(ctx->ggml_ctx, sample, cond_proj->forward(ctx, condition));
@ -50,8 +49,8 @@ namespace Qwen {
blocks["timestep_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedding(256, embedding_dim)); blocks["timestep_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedding(256, embedding_dim));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* timesteps) { ggml_tensor* timesteps) {
// timesteps: [N,] // timesteps: [N,]
// return: [N, embedding_dim] // return: [N, embedding_dim]
auto timestep_embedder = std::dynamic_pointer_cast<TimestepEmbedding>(blocks["timestep_embedder"]); auto timestep_embedder = std::dynamic_pointer_cast<TimestepEmbedding>(blocks["timestep_embedder"]);
@ -108,10 +107,10 @@ namespace Qwen {
} }
std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx, std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
struct ggml_tensor* img, ggml_tensor* img,
struct ggml_tensor* txt, ggml_tensor* txt,
struct ggml_tensor* pe, ggml_tensor* pe,
struct ggml_tensor* mask = nullptr) { ggml_tensor* mask = nullptr) {
// img: [N, n_img_token, hidden_size] // img: [N, n_img_token, hidden_size]
// txt: [N, n_txt_token, hidden_size] // txt: [N, n_txt_token, hidden_size]
// pe: [n_img_token + n_txt_token, d_head/2, 2, 2] // pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
@ -162,26 +161,25 @@ namespace Qwen {
auto k = ggml_concat(ctx->ggml_ctx, txt_k, img_k, 2); // [N, n_txt_token + n_img_token, n_head, d_head] auto k = ggml_concat(ctx->ggml_ctx, txt_k, img_k, 2); // [N, n_txt_token + n_img_token, n_head, d_head]
auto v = ggml_concat(ctx->ggml_ctx, txt_v, img_v, 2); // [N, n_txt_token + n_img_token, n_head, d_head] auto v = ggml_concat(ctx->ggml_ctx, txt_v, img_v, 2); // [N, n_txt_token + n_img_token, n_head, d_head]
auto attn = Rope::attention(ctx, q, k, v, pe, mask, (1.0f / 128.f)); // [N, n_txt_token + n_img_token, n_head*d_head] auto attn = Rope::attention(ctx, q, k, v, pe, mask, (1.0f / 128.f)); // [N, n_txt_token + n_img_token, n_head*d_head]
attn = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, attn, 0, 2, 1, 3)); // [n_txt_token + n_img_token, N, hidden_size]
auto txt_attn_out = ggml_view_3d(ctx->ggml_ctx, auto txt_attn_out = ggml_view_3d(ctx->ggml_ctx,
attn, attn,
attn->ne[0], attn->ne[0],
attn->ne[1],
txt->ne[1], txt->ne[1],
attn->ne[2],
attn->nb[1], attn->nb[1],
attn->nb[2], attn->nb[2],
0); // [n_txt_token, N, hidden_size] 0); // [N, n_txt_token, n_head*d_head]
txt_attn_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, txt_attn_out, 0, 2, 1, 3)); // [N, n_txt_token, hidden_size]
auto img_attn_out = ggml_view_3d(ctx->ggml_ctx, auto img_attn_out = ggml_view_3d(ctx->ggml_ctx,
attn, attn,
attn->ne[0], attn->ne[0],
attn->ne[1],
img->ne[1], img->ne[1],
attn->ne[2],
attn->nb[1], attn->nb[1],
attn->nb[2], attn->nb[2],
attn->nb[2] * txt->ne[1]); // [n_img_token, N, hidden_size] txt->ne[1] * attn->nb[1]); // [N, n_img_token, n_head*d_head]
img_attn_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, img_attn_out, 0, 2, 1, 3)); // [N, n_img_token, hidden_size] img_attn_out = ggml_cont(ctx->ggml_ctx, img_attn_out);
txt_attn_out = ggml_cont(ctx->ggml_ctx, txt_attn_out);
img_attn_out = to_out_0->forward(ctx, img_attn_out); img_attn_out = to_out_0->forward(ctx, img_attn_out);
txt_attn_out = to_add_out->forward(ctx, txt_attn_out); txt_attn_out = to_add_out->forward(ctx, txt_attn_out);
@ -191,11 +189,16 @@ namespace Qwen {
}; };
class QwenImageTransformerBlock : public GGMLBlock { class QwenImageTransformerBlock : public GGMLBlock {
protected:
bool zero_cond_t;
public: public:
QwenImageTransformerBlock(int64_t dim, QwenImageTransformerBlock(int64_t dim,
int64_t num_attention_heads, int64_t num_attention_heads,
int64_t attention_head_dim, int64_t attention_head_dim,
float eps = 1e-6) { float eps = 1e-6,
bool zero_cond_t = false)
: zero_cond_t(zero_cond_t) {
// img_mod.0 is nn.SiLU() // img_mod.0 is nn.SiLU()
blocks["img_mod.1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, 6 * dim, true)); blocks["img_mod.1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, 6 * dim, true));
@ -208,7 +211,7 @@ namespace Qwen {
blocks["txt_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false)); blocks["txt_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
blocks["txt_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false)); blocks["txt_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
blocks["txt_mlp"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim, 4, FeedForward::Activation::GELU)); blocks["txt_mlp"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim, 4, FeedForward::Activation::GELU, true));
blocks["attn"] = std::shared_ptr<GGMLBlock>(new QwenImageAttention(dim, blocks["attn"] = std::shared_ptr<GGMLBlock>(new QwenImageAttention(dim,
attention_head_dim, attention_head_dim,
@ -220,11 +223,37 @@ namespace Qwen {
eps)); eps));
} }
std::vector<ggml_tensor*> get_mod_params_vec(ggml_context* ctx, ggml_tensor* mod_params, ggml_tensor* index = nullptr) {
// index: [N, n_img_token]
// mod_params: [N, hidden_size * 12]
if (index == nullptr) {
return ggml_ext_chunk(ctx, mod_params, 6, 0);
}
mod_params = ggml_reshape_1d(ctx, mod_params, ggml_nelements(mod_params));
auto mod_params_vec = ggml_ext_chunk(ctx, mod_params, 12, 0);
index = ggml_reshape_3d(ctx, index, 1, index->ne[0], index->ne[1]); // [N, n_img_token, 1]
index = ggml_repeat_4d(ctx, index, mod_params_vec[0]->ne[0], index->ne[1], index->ne[2], index->ne[3]); // [N, n_img_token, hidden_size]
std::vector<ggml_tensor*> mod_results;
for (int i = 0; i < 6; i++) {
auto mod_0 = mod_params_vec[i];
auto mod_1 = mod_params_vec[i + 6];
// mod_result = torch.where(index == 0, mod_0, mod_1)
// mod_result = (1 - index)*mod_0 + index*mod_1
mod_0 = ggml_sub(ctx, ggml_repeat(ctx, mod_0, index), ggml_mul(ctx, index, mod_0)); // [N, n_img_token, hidden_size]
mod_1 = ggml_mul(ctx, index, mod_1); // [N, n_img_token, hidden_size]
auto mod_result = ggml_add(ctx, mod_0, mod_1);
mod_results.push_back(mod_result);
}
return mod_results;
}
virtual std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx, virtual std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
struct ggml_tensor* img, ggml_tensor* img,
struct ggml_tensor* txt, ggml_tensor* txt,
struct ggml_tensor* t_emb, ggml_tensor* t_emb,
struct ggml_tensor* pe) { ggml_tensor* pe,
ggml_tensor* modulate_index = nullptr) {
// img: [N, n_img_token, hidden_size] // img: [N, n_img_token, hidden_size]
// txt: [N, n_txt_token, hidden_size] // txt: [N, n_txt_token, hidden_size]
// pe: [n_img_token + n_txt_token, d_head/2, 2, 2] // pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
@ -244,14 +273,18 @@ namespace Qwen {
auto img_mod_params = ggml_silu(ctx->ggml_ctx, t_emb); auto img_mod_params = ggml_silu(ctx->ggml_ctx, t_emb);
img_mod_params = img_mod_1->forward(ctx, img_mod_params); img_mod_params = img_mod_1->forward(ctx, img_mod_params);
auto img_mod_param_vec = ggml_ext_chunk(ctx->ggml_ctx, img_mod_params, 6, 0); auto img_mod_param_vec = get_mod_params_vec(ctx->ggml_ctx, img_mod_params, modulate_index);
if (zero_cond_t) {
t_emb = ggml_ext_chunk(ctx->ggml_ctx, t_emb, 2, 1)[0];
}
auto txt_mod_params = ggml_silu(ctx->ggml_ctx, t_emb); auto txt_mod_params = ggml_silu(ctx->ggml_ctx, t_emb);
txt_mod_params = txt_mod_1->forward(ctx, txt_mod_params); txt_mod_params = txt_mod_1->forward(ctx, txt_mod_params);
auto txt_mod_param_vec = ggml_ext_chunk(ctx->ggml_ctx, txt_mod_params, 6, 0); auto txt_mod_param_vec = get_mod_params_vec(ctx->ggml_ctx, txt_mod_params);
auto img_normed = img_norm1->forward(ctx, img); auto img_normed = img_norm1->forward(ctx, img);
auto img_modulated = Flux::modulate(ctx->ggml_ctx, img_normed, img_mod_param_vec[0], img_mod_param_vec[1]); auto img_modulated = Flux::modulate(ctx->ggml_ctx, img_normed, img_mod_param_vec[0], img_mod_param_vec[1], modulate_index != nullptr);
auto img_gate1 = img_mod_param_vec[2]; auto img_gate1 = img_mod_param_vec[2];
auto txt_normed = txt_norm1->forward(ctx, txt); auto txt_normed = txt_norm1->forward(ctx, txt);
@ -264,7 +297,7 @@ namespace Qwen {
txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn_output, txt_gate1)); txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn_output, txt_gate1));
auto img_normed2 = img_norm2->forward(ctx, img); auto img_normed2 = img_norm2->forward(ctx, img);
auto img_modulated2 = Flux::modulate(ctx->ggml_ctx, img_normed2, img_mod_param_vec[3], img_mod_param_vec[4]); auto img_modulated2 = Flux::modulate(ctx->ggml_ctx, img_normed2, img_mod_param_vec[3], img_mod_param_vec[4], modulate_index != nullptr);
auto img_gate2 = img_mod_param_vec[5]; auto img_gate2 = img_mod_param_vec[5];
auto txt_normed2 = txt_norm2->forward(ctx, txt); auto txt_normed2 = txt_norm2->forward(ctx, txt);
@ -292,9 +325,9 @@ namespace Qwen {
blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(conditioning_embedding_dim, embedding_dim * 2, bias)); blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(conditioning_embedding_dim, embedding_dim * 2, bias));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* c) { ggml_tensor* c) {
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
// c: [N, hidden_size] // c: [N, hidden_size]
// return: [N, n_token, patch_size * patch_size * out_channels] // return: [N, n_token, patch_size * patch_size * out_channels]
@ -315,16 +348,17 @@ namespace Qwen {
}; };
struct QwenImageParams { struct QwenImageParams {
int64_t patch_size = 2; int patch_size = 2;
int64_t in_channels = 64; int64_t in_channels = 64;
int64_t out_channels = 16; int64_t out_channels = 16;
int64_t num_layers = 60; int num_layers = 60;
int64_t attention_head_dim = 128; int64_t attention_head_dim = 128;
int64_t num_attention_heads = 24; int64_t num_attention_heads = 24;
int64_t joint_attention_dim = 3584; int64_t joint_attention_dim = 3584;
float theta = 10000; int theta = 10000;
std::vector<int> axes_dim = {16, 56, 56}; std::vector<int> axes_dim = {16, 56, 56};
int64_t axes_dim_sum = 128; int axes_dim_sum = 128;
bool zero_cond_t = false;
}; };
class QwenImageModel : public GGMLBlock { class QwenImageModel : public GGMLBlock {
@ -346,7 +380,8 @@ namespace Qwen {
auto block = std::shared_ptr<GGMLBlock>(new QwenImageTransformerBlock(inner_dim, auto block = std::shared_ptr<GGMLBlock>(new QwenImageTransformerBlock(inner_dim,
params.num_attention_heads, params.num_attention_heads,
params.attention_head_dim, params.attention_head_dim,
1e-6f)); 1e-6f,
params.zero_cond_t));
blocks["transformer_blocks." + std::to_string(i)] = block; blocks["transformer_blocks." + std::to_string(i)] = block;
} }
@ -354,74 +389,12 @@ namespace Qwen {
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels)); blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels));
} }
struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx, ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
struct ggml_tensor* x) { ggml_tensor* x,
int64_t W = x->ne[0]; ggml_tensor* timestep,
int64_t H = x->ne[1]; ggml_tensor* context,
ggml_tensor* pe,
int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; ggml_tensor* modulate_index = nullptr) {
int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w]
return x;
}
struct ggml_tensor* patchify(struct ggml_context* ctx,
struct ggml_tensor* x) {
// x: [N, C, H, W]
// return: [N, h*w, C * patch_size * patch_size]
int64_t N = x->ne[3];
int64_t C = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
int64_t p = params.patch_size;
int64_t h = H / params.patch_size;
int64_t w = W / params.patch_size;
GGML_ASSERT(h * p == H && w * p == W);
x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N); // [N*C*h, p, w, p]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, w, p, p]
x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N); // [N, C, h*w, p*p]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, h*w, C, p*p]
x = ggml_reshape_3d(ctx, x, p * p * C, w * h, N); // [N, h*w, C*p*p]
return x;
}
struct ggml_tensor* process_img(struct ggml_context* ctx,
struct ggml_tensor* x) {
x = pad_to_patch_size(ctx, x);
x = patchify(ctx, x);
return x;
}
struct ggml_tensor* unpatchify(struct ggml_context* ctx,
struct ggml_tensor* x,
int64_t h,
int64_t w) {
// x: [N, h*w, C*patch_size*patch_size]
// return: [N, C, H, W]
int64_t N = x->ne[2];
int64_t C = x->ne[0] / params.patch_size / params.patch_size;
int64_t H = h * params.patch_size;
int64_t W = w * params.patch_size;
int64_t p = params.patch_size;
GGML_ASSERT(C * p * p == x->ne[0]);
x = ggml_reshape_4d(ctx, x, p * p, C, w * h, N); // [N, h*w, C, p*p]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, C, h*w, p*p]
x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N); // [N*C*h, w, p, p]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, p, w, p]
x = ggml_reshape_4d(ctx, x, W, H, C, N); // [N, C, h*p, w*p]
return x;
}
struct ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* timestep,
struct ggml_tensor* context,
struct ggml_tensor* pe) {
auto time_text_embed = std::dynamic_pointer_cast<QwenTimestepProjEmbeddings>(blocks["time_text_embed"]); auto time_text_embed = std::dynamic_pointer_cast<QwenTimestepProjEmbeddings>(blocks["time_text_embed"]);
auto txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm"]); auto txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm"]);
auto img_in = std::dynamic_pointer_cast<Linear>(blocks["img_in"]); auto img_in = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
@ -430,30 +403,39 @@ namespace Qwen {
auto proj_out = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]); auto proj_out = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
auto t_emb = time_text_embed->forward(ctx, timestep); auto t_emb = time_text_embed->forward(ctx, timestep);
auto img = img_in->forward(ctx, x); if (params.zero_cond_t) {
auto txt = txt_norm->forward(ctx, context); auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros_like(ctx->ggml_ctx, timestep));
txt = txt_in->forward(ctx, txt); t_emb = ggml_concat(ctx->ggml_ctx, t_emb, t_emb_0, 1);
}
auto img = img_in->forward(ctx, x);
auto txt = txt_norm->forward(ctx, context);
txt = txt_in->forward(ctx, txt);
for (int i = 0; i < params.num_layers; i++) { for (int i = 0; i < params.num_layers; i++) {
auto block = std::dynamic_pointer_cast<QwenImageTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]); auto block = std::dynamic_pointer_cast<QwenImageTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
auto result = block->forward(ctx, img, txt, t_emb, pe); auto result = block->forward(ctx, img, txt, t_emb, pe, modulate_index);
img = result.first; img = result.first;
txt = result.second; txt = result.second;
} }
if (params.zero_cond_t) {
t_emb = ggml_ext_chunk(ctx->ggml_ctx, t_emb, 2, 1)[0];
}
img = norm_out->forward(ctx, img, t_emb); img = norm_out->forward(ctx, img, t_emb);
img = proj_out->forward(ctx, img); img = proj_out->forward(ctx, img);
return img; return img;
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* timestep, ggml_tensor* timestep,
struct ggml_tensor* context, ggml_tensor* context,
struct ggml_tensor* pe, ggml_tensor* pe,
std::vector<ggml_tensor*> ref_latents = {}) { std::vector<ggml_tensor*> ref_latents = {},
ggml_tensor* modulate_index = nullptr) {
// Forward pass of DiT. // Forward pass of DiT.
// x: [N, C, H, W] // x: [N, C, H, W]
// timestep: [N,] // timestep: [N,]
@ -466,20 +448,17 @@ namespace Qwen {
int64_t C = x->ne[2]; int64_t C = x->ne[2];
int64_t N = x->ne[3]; int64_t N = x->ne[3];
auto img = process_img(ctx->ggml_ctx, x); auto img = DiT::pad_and_patchify(ctx, x, params.patch_size, params.patch_size);
uint64_t img_tokens = img->ne[1]; int64_t img_tokens = img->ne[1];
if (ref_latents.size() > 0) { if (ref_latents.size() > 0) {
for (ggml_tensor* ref : ref_latents) { for (ggml_tensor* ref : ref_latents) {
ref = process_img(ctx->ggml_ctx, ref); ref = DiT::pad_and_patchify(ctx, ref, params.patch_size, params.patch_size);
img = ggml_concat(ctx->ggml_ctx, img, ref, 1); img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
} }
} }
int64_t h_len = ((H + (params.patch_size / 2)) / params.patch_size); auto out = forward_orig(ctx, img, timestep, context, pe, modulate_index); // [N, h_len*w_len, ph*pw*C]
int64_t w_len = ((W + (params.patch_size / 2)) / params.patch_size);
auto out = forward_orig(ctx, img, timestep, context, pe); // [N, h_len*w_len, ph*pw*C]
if (out->ne[1] > img_tokens) { if (out->ne[1] > img_tokens) {
out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size] out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size]
@ -487,11 +466,7 @@ namespace Qwen {
out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size] out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size]
} }
out = unpatchify(ctx->ggml_ctx, out, h_len, w_len); // [N, C, H + pad_h, W + pad_w] out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, params.patch_size, params.patch_size); // [N, C, H, W]
// slice
out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H); // [N, C, H, W + pad_w]
out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W); // [N, C, H, W]
return out; return out;
} }
@ -502,19 +477,25 @@ namespace Qwen {
QwenImageParams qwen_image_params; QwenImageParams qwen_image_params;
QwenImageModel qwen_image; QwenImageModel qwen_image;
std::vector<float> pe_vec; std::vector<float> pe_vec;
std::vector<float> modulate_index_vec;
SDVersion version; SDVersion version;
QwenImageRunner(ggml_backend_t backend, QwenImageRunner(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {}, const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "", const std::string prefix = "",
SDVersion version = VERSION_QWEN_IMAGE) SDVersion version = VERSION_QWEN_IMAGE,
bool zero_cond_t = false)
: GGMLRunner(backend, offload_params_to_cpu) { : GGMLRunner(backend, offload_params_to_cpu) {
qwen_image_params.num_layers = 0; qwen_image_params.num_layers = 0;
qwen_image_params.zero_cond_t = zero_cond_t;
for (auto pair : tensor_storage_map) { for (auto pair : tensor_storage_map) {
std::string tensor_name = pair.first; std::string tensor_name = pair.first;
if (tensor_name.find(prefix) == std::string::npos) if (tensor_name.find(prefix) == std::string::npos)
continue; continue;
if (tensor_name.find("__index_timestep_zero__") != std::string::npos) {
qwen_image_params.zero_cond_t = true;
}
size_t pos = tensor_name.find("transformer_blocks."); size_t pos = tensor_name.find("transformer_blocks.");
if (pos != std::string::npos) { if (pos != std::string::npos) {
tensor_name = tensor_name.substr(pos); // remove prefix tensor_name = tensor_name.substr(pos); // remove prefix
@ -529,6 +510,9 @@ namespace Qwen {
} }
} }
LOG_INFO("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers); LOG_INFO("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
if (qwen_image_params.zero_cond_t) {
LOG_INFO("use zero_cond_t");
}
qwen_image = QwenImageModel(qwen_image_params); qwen_image = QwenImageModel(qwen_image_params);
qwen_image.init(params_ctx, tensor_storage_map, prefix); qwen_image.init(params_ctx, tensor_storage_map, prefix);
} }
@ -537,17 +521,17 @@ namespace Qwen {
return "qwen_image"; return "qwen_image";
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
qwen_image.get_param_tensors(tensors, prefix); qwen_image.get_param_tensors(tensors, prefix);
} }
struct ggml_cgraph* build_graph(struct ggml_tensor* x, ggml_cgraph* build_graph(ggml_tensor* x,
struct ggml_tensor* timesteps, ggml_tensor* timesteps,
struct ggml_tensor* context, ggml_tensor* context,
std::vector<ggml_tensor*> ref_latents = {}, std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false) { bool increase_ref_index = false) {
GGML_ASSERT(x->ne[3] == 1); GGML_ASSERT(x->ne[3] == 1);
struct ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE); ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE);
x = to_backend(x); x = to_backend(x);
context = to_backend(context); context = to_backend(context);
@ -557,16 +541,18 @@ namespace Qwen {
ref_latents[i] = to_backend(ref_latents[i]); ref_latents[i] = to_backend(ref_latents[i]);
} }
pe_vec = Rope::gen_qwen_image_pe(x->ne[1], pe_vec = Rope::gen_qwen_image_pe(static_cast<int>(x->ne[1]),
x->ne[0], static_cast<int>(x->ne[0]),
qwen_image_params.patch_size, qwen_image_params.patch_size,
x->ne[3], static_cast<int>(x->ne[3]),
context->ne[1], static_cast<int>(context->ne[1]),
ref_latents, ref_latents,
increase_ref_index, increase_ref_index,
qwen_image_params.theta, qwen_image_params.theta,
circular_y_enabled,
circular_x_enabled,
qwen_image_params.axes_dim); qwen_image_params.axes_dim);
int pos_len = pe_vec.size() / qwen_image_params.axes_dim_sum / 2; int pos_len = static_cast<int>(pe_vec.size() / qwen_image_params.axes_dim_sum / 2);
// LOG_DEBUG("pos_len %d", pos_len); // LOG_DEBUG("pos_len %d", pos_len);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, qwen_image_params.axes_dim_sum / 2, pos_len); auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, qwen_image_params.axes_dim_sum / 2, pos_len);
// pe->data = pe_vec.data(); // pe->data = pe_vec.data();
@ -574,14 +560,40 @@ namespace Qwen {
// pe->data = nullptr; // pe->data = nullptr;
set_backend_tensor_data(pe, pe_vec.data()); set_backend_tensor_data(pe, pe_vec.data());
ggml_tensor* modulate_index = nullptr;
if (qwen_image_params.zero_cond_t) {
modulate_index_vec.clear();
int64_t h_len = ((x->ne[1] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
int64_t w_len = ((x->ne[0] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
int64_t num_img_tokens = h_len * w_len;
modulate_index_vec.insert(modulate_index_vec.end(), num_img_tokens, 0.f);
int64_t num_ref_img_tokens = 0;
for (ggml_tensor* ref : ref_latents) {
int64_t h_len = ((ref->ne[1] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
int64_t w_len = ((ref->ne[0] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
num_ref_img_tokens += h_len * w_len;
}
if (num_ref_img_tokens > 0) {
modulate_index_vec.insert(modulate_index_vec.end(), num_ref_img_tokens, 1.f);
}
modulate_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, modulate_index_vec.size());
set_backend_tensor_data(modulate_index, modulate_index_vec.data());
}
auto runner_ctx = get_context(); auto runner_ctx = get_context();
struct ggml_tensor* out = qwen_image.forward(&runner_ctx, ggml_tensor* out = qwen_image.forward(&runner_ctx,
x, x,
timesteps, timesteps,
context, context,
pe, pe,
ref_latents); ref_latents,
modulate_index);
ggml_build_forward_expand(gf, out); ggml_build_forward_expand(gf, out);
@ -589,17 +601,17 @@ namespace Qwen {
} }
bool compute(int n_threads, bool compute(int n_threads,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* timesteps, ggml_tensor* timesteps,
struct ggml_tensor* context, ggml_tensor* context,
std::vector<ggml_tensor*> ref_latents = {}, std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false, bool increase_ref_index = false,
struct ggml_tensor** output = nullptr, ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) { ggml_context* output_ctx = nullptr) {
// x: [N, in_channels, h, w] // x: [N, in_channels, h, w]
// timesteps: [N, ] // timesteps: [N, ]
// context: [N, max_position, hidden_size] // context: [N, max_position, hidden_size]
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x, timesteps, context, ref_latents, increase_ref_index); return build_graph(x, timesteps, context, ref_latents, increase_ref_index);
}; };
@ -607,12 +619,12 @@ namespace Qwen {
} }
void test() { void test() {
struct ggml_init_params params; ggml_init_params params;
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB
params.mem_buffer = nullptr; params.mem_buffer = nullptr;
params.no_alloc = false; params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params); ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != nullptr); GGML_ASSERT(work_ctx != nullptr);
{ {
@ -629,14 +641,14 @@ namespace Qwen {
auto context = load_tensor_from_file(work_ctx, "./qwen_image_context.bin"); auto context = load_tensor_from_file(work_ctx, "./qwen_image_context.bin");
print_ggml_tensor(context); print_ggml_tensor(context);
struct ggml_tensor* out = nullptr; ggml_tensor* out = nullptr;
int t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
compute(8, x, timesteps, context, {}, false, &out, work_ctx); compute(8, x, timesteps, context, {}, false, &out, work_ctx);
int t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
print_ggml_tensor(out); print_ggml_tensor(out);
LOG_DEBUG("qwen_image test done in %dms", t1 - t0); LOG_DEBUG("qwen_image test done in %lldms", t1 - t0);
} }
} }
@ -684,4 +696,4 @@ namespace Qwen {
} // namespace name } // namespace name
#endif // __QWEN_IMAGE_HPP__ #endif // __QWEN_IMAGE_HPP__

View File

@ -90,7 +90,7 @@ class MT19937RNG : public RNG {
float u1 = 1.0f - data[j]; float u1 = 1.0f - data[j];
float u2 = data[j + 8]; float u2 = data[j + 8];
float r = std::sqrt(-2.0f * std::log(u1)); float r = std::sqrt(-2.0f * std::log(u1));
float theta = 2.0f * 3.14159265358979323846 * u2; float theta = 2.0f * 3.14159265358979323846f * u2;
data[j] = r * std::cos(theta) * std + mean; data[j] = r * std::cos(theta) * std + mean;
data[j + 8] = r * std::sin(theta) * std + mean; data[j + 8] = r * std::sin(theta) * std + mean;
} }

View File

@ -1,6 +1,8 @@
#ifndef __ROPE_HPP__ #ifndef __ROPE_HPP__
#define __ROPE_HPP__ #define __ROPE_HPP__
#include <algorithm>
#include <cmath>
#include <vector> #include <vector>
#include "ggml_extend.hpp" #include "ggml_extend.hpp"
@ -20,11 +22,11 @@ namespace Rope {
} }
__STATIC_INLINE__ std::vector<std::vector<float>> transpose(const std::vector<std::vector<float>>& mat) { __STATIC_INLINE__ std::vector<std::vector<float>> transpose(const std::vector<std::vector<float>>& mat) {
int rows = mat.size(); size_t rows = mat.size();
int cols = mat[0].size(); size_t cols = mat[0].size();
std::vector<std::vector<float>> transposed(cols, std::vector<float>(rows)); std::vector<std::vector<float>> transposed(cols, std::vector<float>(rows));
for (int i = 0; i < rows; ++i) { for (size_t i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) { for (size_t j = 0; j < cols; ++j) {
transposed[j][i] = mat[i][j]; transposed[j][i] = mat[i][j];
} }
} }
@ -39,7 +41,10 @@ namespace Rope {
return flat_vec; return flat_vec;
} }
__STATIC_INLINE__ std::vector<std::vector<float>> rope(const std::vector<float>& pos, int dim, int theta) { __STATIC_INLINE__ std::vector<std::vector<float>> rope(const std::vector<float>& pos,
int dim,
float theta,
const std::vector<int>& axis_wrap_dims = {}) {
assert(dim % 2 == 0); assert(dim % 2 == 0);
int half_dim = dim / 2; int half_dim = dim / 2;
@ -47,14 +52,31 @@ namespace Rope {
std::vector<float> omega(half_dim); std::vector<float> omega(half_dim);
for (int i = 0; i < half_dim; ++i) { for (int i = 0; i < half_dim; ++i) {
omega[i] = 1.0 / std::pow(theta, scale[i]); omega[i] = 1.0f / ::powf(1.f * theta, scale[i]);
} }
int pos_size = pos.size(); size_t pos_size = pos.size();
std::vector<std::vector<float>> out(pos_size, std::vector<float>(half_dim)); std::vector<std::vector<float>> out(pos_size, std::vector<float>(half_dim));
for (int i = 0; i < pos_size; ++i) { for (size_t i = 0; i < pos_size; ++i) {
for (int j = 0; j < half_dim; ++j) { for (size_t j = 0; j < half_dim; ++j) {
out[i][j] = pos[i] * omega[j]; float angle = pos[i] * omega[j];
if (!axis_wrap_dims.empty()) {
size_t wrap_size = axis_wrap_dims.size();
// mod batch size since we only store this for one item in the batch
size_t wrap_idx = wrap_size > 0 ? (i % wrap_size) : 0;
int wrap_dim = axis_wrap_dims[wrap_idx];
if (wrap_dim > 0) {
constexpr float TWO_PI = 6.28318530717958647692f;
float cycles = omega[j] * wrap_dim / TWO_PI;
// closest periodic harmonic, necessary to ensure things neatly tile
// without this round, things don't tile at the boundaries and you end up
// with the model knowing what is "center"
float rounded = std::round(cycles);
angle = pos[i] * TWO_PI * rounded / wrap_dim;
}
}
out[i][j] = angle;
} }
} }
@ -77,7 +99,7 @@ namespace Rope {
for (int dim = 0; dim < axes_dim_num; dim++) { for (int dim = 0; dim < axes_dim_num; dim++) {
if (arange_dims.find(dim) != arange_dims.end()) { if (arange_dims.find(dim) != arange_dims.end()) {
for (int i = 0; i < bs * context_len; i++) { for (int i = 0; i < bs * context_len; i++) {
txt_ids[i][dim] = (i % context_len); txt_ids[i][dim] = 1.f * (i % context_len);
} }
} }
} }
@ -89,20 +111,29 @@ namespace Rope {
int patch_size, int patch_size,
int bs, int bs,
int axes_dim_num, int axes_dim_num,
int index = 0, int index = 0,
int h_offset = 0, int h_offset = 0,
int w_offset = 0) { int w_offset = 0,
bool scale_rope = false) {
int h_len = (h + (patch_size / 2)) / patch_size; int h_len = (h + (patch_size / 2)) / patch_size;
int w_len = (w + (patch_size / 2)) / patch_size; int w_len = (w + (patch_size / 2)) / patch_size;
std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(axes_dim_num, 0.0)); std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(axes_dim_num, 0.0));
std::vector<float> row_ids = linspace<float>(h_offset, h_len - 1 + h_offset, h_len); int h_start = h_offset;
std::vector<float> col_ids = linspace<float>(w_offset, w_len - 1 + w_offset, w_len); int w_start = w_offset;
if (scale_rope) {
h_start -= h_len / 2;
w_start -= w_len / 2;
}
std::vector<float> row_ids = linspace<float>(1.f * h_start, 1.f * h_start + h_len - 1, h_len);
std::vector<float> col_ids = linspace<float>(1.f * w_start, 1.f * w_start + w_len - 1, w_len);
for (int i = 0; i < h_len; ++i) { for (int i = 0; i < h_len; ++i) {
for (int j = 0; j < w_len; ++j) { for (int j = 0; j < w_len; ++j) {
img_ids[i * w_len + j][0] = index; img_ids[i * w_len + j][0] = 1.f * index;
img_ids[i * w_len + j][1] = row_ids[i]; img_ids[i * w_len + j][1] = row_ids[i];
img_ids[i * w_len + j][2] = col_ids[j]; img_ids[i * w_len + j][2] = col_ids[j];
} }
@ -136,11 +167,12 @@ namespace Rope {
__STATIC_INLINE__ std::vector<float> embed_nd(const std::vector<std::vector<float>>& ids, __STATIC_INLINE__ std::vector<float> embed_nd(const std::vector<std::vector<float>>& ids,
int bs, int bs,
int theta, const std::vector<float>& axis_thetas,
const std::vector<int>& axes_dim) { const std::vector<int>& axes_dim,
const std::vector<std::vector<int>>& wrap_dims = {}) {
std::vector<std::vector<float>> trans_ids = transpose(ids); std::vector<std::vector<float>> trans_ids = transpose(ids);
size_t pos_len = ids.size() / bs; size_t pos_len = ids.size() / bs;
int num_axes = axes_dim.size(); size_t num_axes = axes_dim.size();
// for (int i = 0; i < pos_len; i++) { // for (int i = 0; i < pos_len; i++) {
// std::cout << trans_ids[0][i] << " " << trans_ids[1][i] << " " << trans_ids[2][i] << std::endl; // std::cout << trans_ids[0][i] << " " << trans_ids[1][i] << " " << trans_ids[2][i] << std::endl;
// } // }
@ -150,9 +182,18 @@ namespace Rope {
emb_dim += d / 2; emb_dim += d / 2;
std::vector<std::vector<float>> emb(bs * pos_len, std::vector<float>(emb_dim * 2 * 2, 0.0)); std::vector<std::vector<float>> emb(bs * pos_len, std::vector<float>(emb_dim * 2 * 2, 0.0));
int offset = 0; size_t offset = 0;
for (int i = 0; i < num_axes; ++i) { for (size_t i = 0; i < num_axes; ++i) {
std::vector<std::vector<float>> rope_emb = rope(trans_ids[i], axes_dim[i], theta); // [bs*pos_len, axes_dim[i]/2 * 2 * 2] std::vector<int> axis_wrap_dims;
if (!wrap_dims.empty() && i < (int)wrap_dims.size()) {
axis_wrap_dims = wrap_dims[i];
}
float axis_theta = 10000.0f;
if (!axis_thetas.empty()) {
axis_theta = axis_thetas[std::min(i, axis_thetas.size() - 1)];
}
std::vector<std::vector<float>> rope_emb =
rope(trans_ids[i], axes_dim[i], axis_theta, axis_wrap_dims); // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
for (int b = 0; b < bs; ++b) { for (int b = 0; b < bs; ++b) {
for (int j = 0; j < pos_len; ++j) { for (int j = 0; j < pos_len; ++j) {
for (int k = 0; k < rope_emb[0].size(); ++k) { for (int k = 0; k < rope_emb[0].size(); ++k) {
@ -166,43 +207,55 @@ namespace Rope {
return flatten(emb); return flatten(emb);
} }
__STATIC_INLINE__ std::vector<float> embed_nd(const std::vector<std::vector<float>>& ids,
int bs,
float theta,
const std::vector<int>& axes_dim,
const std::vector<std::vector<int>>& wrap_dims = {}) {
std::vector<float> axis_thetas(axes_dim.size(), theta);
return embed_nd(ids, bs, axis_thetas, axes_dim, wrap_dims);
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size, __STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size,
int bs, int bs,
int axes_dim_num, int axes_dim_num,
const std::vector<ggml_tensor*>& ref_latents, const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index, bool increase_ref_index,
float ref_index_scale) { float ref_index_scale,
bool scale_rope) {
std::vector<std::vector<float>> ids; std::vector<std::vector<float>> ids;
uint64_t curr_h_offset = 0; int curr_h_offset = 0;
uint64_t curr_w_offset = 0; int curr_w_offset = 0;
int index = 1; int index = 1;
for (ggml_tensor* ref : ref_latents) { for (ggml_tensor* ref : ref_latents) {
uint64_t h_offset = 0; int h_offset = 0;
uint64_t w_offset = 0; int w_offset = 0;
if (!increase_ref_index) { if (!increase_ref_index) {
if (ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) { if (ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) {
w_offset = curr_w_offset; w_offset = curr_w_offset;
} else { } else {
h_offset = curr_h_offset; h_offset = curr_h_offset;
} }
scale_rope = false;
} }
auto ref_ids = gen_flux_img_ids(ref->ne[1], auto ref_ids = gen_flux_img_ids(static_cast<int>(ref->ne[1]),
ref->ne[0], static_cast<int>(ref->ne[0]),
patch_size, patch_size,
bs, bs,
axes_dim_num, axes_dim_num,
static_cast<int>(index * ref_index_scale), static_cast<int>(index * ref_index_scale),
h_offset, h_offset,
w_offset); w_offset,
scale_rope);
ids = concat_ids(ids, ref_ids, bs); ids = concat_ids(ids, ref_ids, bs);
if (increase_ref_index) { if (increase_ref_index) {
index++; index++;
} }
curr_h_offset = std::max(curr_h_offset, ref->ne[1] + h_offset); curr_h_offset = std::max(curr_h_offset, static_cast<int>(ref->ne[1]) + h_offset);
curr_w_offset = std::max(curr_w_offset, ref->ne[0] + w_offset); curr_w_offset = std::max(curr_w_offset, static_cast<int>(ref->ne[0]) + w_offset);
} }
return ids; return ids;
} }
@ -222,7 +275,7 @@ namespace Rope {
auto ids = concat_ids(txt_ids, img_ids, bs); auto ids = concat_ids(txt_ids, img_ids, bs);
if (ref_latents.size() > 0) { if (ref_latents.size() > 0) {
auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, ref_index_scale); auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, ref_index_scale, false);
ids = concat_ids(ids, refs_ids, bs); ids = concat_ids(ids, refs_ids, bs);
} }
return ids; return ids;
@ -239,6 +292,8 @@ namespace Rope {
bool increase_ref_index, bool increase_ref_index,
float ref_index_scale, float ref_index_scale,
int theta, int theta,
bool circular_h,
bool circular_w,
const std::vector<int>& axes_dim) { const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_flux_ids(h, std::vector<std::vector<float>> ids = gen_flux_ids(h,
w, w,
@ -250,7 +305,47 @@ namespace Rope {
ref_latents, ref_latents,
increase_ref_index, increase_ref_index,
ref_index_scale); ref_index_scale);
return embed_nd(ids, bs, theta, axes_dim); std::vector<std::vector<int>> wrap_dims;
if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) {
int h_len = (h + (patch_size / 2)) / patch_size;
int w_len = (w + (patch_size / 2)) / patch_size;
if (h_len > 0 && w_len > 0) {
size_t pos_len = ids.size() / bs;
wrap_dims.assign(axes_dim.size(), std::vector<int>(pos_len, 0));
size_t cursor = context_len; // text first
const size_t img_tokens = static_cast<size_t>(h_len) * static_cast<size_t>(w_len);
for (size_t token_i = 0; token_i < img_tokens; ++token_i) {
if (circular_h) {
wrap_dims[1][cursor + token_i] = h_len;
}
if (circular_w) {
wrap_dims[2][cursor + token_i] = w_len;
}
}
cursor += img_tokens;
// reference latents
for (ggml_tensor* ref : ref_latents) {
if (ref == nullptr) {
continue;
}
int ref_h = static_cast<int>(ref->ne[1]);
int ref_w = static_cast<int>(ref->ne[0]);
int ref_h_l = (ref_h + (patch_size / 2)) / patch_size;
int ref_w_l = (ref_w + (patch_size / 2)) / patch_size;
size_t ref_tokens = static_cast<size_t>(ref_h_l) * static_cast<size_t>(ref_w_l);
for (size_t token_i = 0; token_i < ref_tokens; ++token_i) {
if (circular_h) {
wrap_dims[1][cursor + token_i] = ref_h_l;
}
if (circular_w) {
wrap_dims[2][cursor + token_i] = ref_w_l;
}
}
cursor += ref_tokens;
}
}
}
return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
} }
__STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen_image_ids(int h, __STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen_image_ids(int h,
@ -263,7 +358,7 @@ namespace Rope {
int h_len = (h + (patch_size / 2)) / patch_size; int h_len = (h + (patch_size / 2)) / patch_size;
int w_len = (w + (patch_size / 2)) / patch_size; int w_len = (w + (patch_size / 2)) / patch_size;
int txt_id_start = std::max(h_len, w_len); int txt_id_start = std::max(h_len, w_len);
auto txt_ids = linspace<float>(txt_id_start, context_len + txt_id_start, context_len); auto txt_ids = linspace<float>(1.f * txt_id_start, 1.f * context_len + txt_id_start, context_len);
std::vector<std::vector<float>> txt_ids_repeated(bs * context_len, std::vector<float>(3)); std::vector<std::vector<float>> txt_ids_repeated(bs * context_len, std::vector<float>(3));
for (int i = 0; i < bs; ++i) { for (int i = 0; i < bs; ++i) {
for (int j = 0; j < txt_ids.size(); ++j) { for (int j = 0; j < txt_ids.size(); ++j) {
@ -271,10 +366,10 @@ namespace Rope {
} }
} }
int axes_dim_num = 3; int axes_dim_num = 3;
auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num); auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, 0, 0, 0, true);
auto ids = concat_ids(txt_ids_repeated, img_ids, bs); auto ids = concat_ids(txt_ids_repeated, img_ids, bs);
if (ref_latents.size() > 0) { if (ref_latents.size() > 0) {
auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, 1.f); auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, 1.f, true);
ids = concat_ids(ids, refs_ids, bs); ids = concat_ids(ids, refs_ids, bs);
} }
return ids; return ids;
@ -289,9 +384,57 @@ namespace Rope {
const std::vector<ggml_tensor*>& ref_latents, const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index, bool increase_ref_index,
int theta, int theta,
bool circular_h,
bool circular_w,
const std::vector<int>& axes_dim) { const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index); std::vector<std::vector<float>> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index);
return embed_nd(ids, bs, theta, axes_dim); std::vector<std::vector<int>> wrap_dims;
// This logic simply stores the (pad and patch_adjusted) sizes of images so we can make sure rope correctly tiles
if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) {
int pad_h = (patch_size - (h % patch_size)) % patch_size;
int pad_w = (patch_size - (w % patch_size)) % patch_size;
int h_len = (h + pad_h) / patch_size;
int w_len = (w + pad_w) / patch_size;
if (h_len > 0 && w_len > 0) {
const size_t total_tokens = ids.size();
// Track per-token wrap lengths for the row/column axes so only spatial tokens become periodic.
wrap_dims.assign(axes_dim.size(), std::vector<int>(total_tokens / bs, 0));
size_t cursor = context_len; // ignore text tokens
const size_t img_tokens = static_cast<size_t>(h_len) * static_cast<size_t>(w_len);
for (size_t token_i = 0; token_i < img_tokens; ++token_i) {
if (circular_h) {
wrap_dims[1][cursor + token_i] = h_len;
}
if (circular_w) {
wrap_dims[2][cursor + token_i] = w_len;
}
}
cursor += img_tokens;
// For each reference image, store wrap sizes as well
for (ggml_tensor* ref : ref_latents) {
if (ref == nullptr) {
continue;
}
int ref_h = static_cast<int>(ref->ne[1]);
int ref_w = static_cast<int>(ref->ne[0]);
int ref_pad_h = (patch_size - (ref_h % patch_size)) % patch_size;
int ref_pad_w = (patch_size - (ref_w % patch_size)) % patch_size;
int ref_h_len = (ref_h + ref_pad_h) / patch_size;
int ref_w_len = (ref_w + ref_pad_w) / patch_size;
size_t ref_n_tokens = static_cast<size_t>(ref_h_len) * static_cast<size_t>(ref_w_len);
for (size_t token_i = 0; token_i < ref_n_tokens; ++token_i) {
if (circular_h) {
wrap_dims[1][cursor + token_i] = ref_h_len;
}
if (circular_w) {
wrap_dims[2][cursor + token_i] = ref_w_len;
}
}
cursor += ref_n_tokens;
}
}
}
return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
} }
__STATIC_INLINE__ std::vector<std::vector<float>> gen_vid_ids(int t, __STATIC_INLINE__ std::vector<std::vector<float>> gen_vid_ids(int t,
@ -310,9 +453,9 @@ namespace Rope {
std::vector<std::vector<float>> vid_ids(t_len * h_len * w_len, std::vector<float>(3, 0.0)); std::vector<std::vector<float>> vid_ids(t_len * h_len * w_len, std::vector<float>(3, 0.0));
std::vector<float> t_ids = linspace<float>(t_offset, t_len - 1 + t_offset, t_len); std::vector<float> t_ids = linspace<float>(1.f * t_offset, 1.f * t_len - 1 + t_offset, t_len);
std::vector<float> h_ids = linspace<float>(h_offset, h_len - 1 + h_offset, h_len); std::vector<float> h_ids = linspace<float>(1.f * h_offset, 1.f * h_len - 1 + h_offset, h_len);
std::vector<float> w_ids = linspace<float>(w_offset, w_len - 1 + w_offset, w_len); std::vector<float> w_ids = linspace<float>(1.f * w_offset, 1.f * w_len - 1 + w_offset, w_len);
for (int i = 0; i < t_len; ++i) { for (int i = 0; i < t_len; ++i) {
for (int j = 0; j < h_len; ++j) { for (int j = 0; j < h_len; ++j) {
@ -345,7 +488,7 @@ namespace Rope {
int theta, int theta,
const std::vector<int>& axes_dim) { const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_vid_ids(t, h, w, pt, ph, pw, bs); std::vector<std::vector<float>> ids = gen_vid_ids(t, h, w, pt, ph, pw, bs);
return embed_nd(ids, bs, theta, axes_dim); return embed_nd(ids, bs, static_cast<float>(theta), axes_dim);
} }
__STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen2vl_ids(int grid_h, __STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen2vl_ids(int grid_h,
@ -363,8 +506,8 @@ namespace Rope {
GGML_ASSERT(i < grid_h * grid_w); GGML_ASSERT(i < grid_h * grid_w);
ids[i][0] = ih + iy; ids[i][0] = static_cast<float>(ih + iy);
ids[i][1] = iw + ix; ids[i][1] = static_cast<float>(iw + ix);
index++; index++;
} }
} }
@ -381,7 +524,7 @@ namespace Rope {
int theta, int theta,
const std::vector<int>& axes_dim) { const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_qwen2vl_ids(grid_h, grid_w, merge_size, window_index); std::vector<std::vector<float>> ids = gen_qwen2vl_ids(grid_h, grid_w, merge_size, window_index);
return embed_nd(ids, 1, theta, axes_dim); return embed_nd(ids, 1, static_cast<float>(theta), axes_dim);
} }
__STATIC_INLINE__ int bound_mod(int a, int m) { __STATIC_INLINE__ int bound_mod(int a, int m) {
@ -428,15 +571,39 @@ namespace Rope {
const std::vector<ggml_tensor*>& ref_latents, const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index, bool increase_ref_index,
int theta, int theta,
bool circular_h,
bool circular_w,
const std::vector<int>& axes_dim) { const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_z_image_ids(h, w, patch_size, bs, context_len, seq_multi_of, ref_latents, increase_ref_index); std::vector<std::vector<float>> ids = gen_z_image_ids(h, w, patch_size, bs, context_len, seq_multi_of, ref_latents, increase_ref_index);
return embed_nd(ids, bs, theta, axes_dim); std::vector<std::vector<int>> wrap_dims;
if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) {
int pad_h = (patch_size - (h % patch_size)) % patch_size;
int pad_w = (patch_size - (w % patch_size)) % patch_size;
int h_len = (h + pad_h) / patch_size;
int w_len = (w + pad_w) / patch_size;
if (h_len > 0 && w_len > 0) {
size_t pos_len = ids.size() / bs;
wrap_dims.assign(axes_dim.size(), std::vector<int>(pos_len, 0));
size_t cursor = context_len + bound_mod(context_len, seq_multi_of); // skip text (and its padding)
size_t img_tokens = static_cast<size_t>(h_len) * static_cast<size_t>(w_len);
for (size_t token_i = 0; token_i < img_tokens; ++token_i) {
if (circular_h) {
wrap_dims[1][cursor + token_i] = h_len;
}
if (circular_w) {
wrap_dims[2][cursor + token_i] = w_len;
}
}
}
}
return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
} }
__STATIC_INLINE__ struct ggml_tensor* apply_rope(struct ggml_context* ctx, __STATIC_INLINE__ ggml_tensor* apply_rope(ggml_context* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* pe, ggml_tensor* pe,
bool rope_interleaved = true) { bool rope_interleaved = true) {
// x: [N, L, n_head, d_head] // x: [N, L, n_head, d_head]
// pe: [L, d_head/2, 2, 2], [[cos, -sin], [sin, cos]] // pe: [L, d_head/2, 2, 2], [[cos, -sin], [sin, cos]]
int64_t d_head = x->ne[0]; int64_t d_head = x->ne[0];
@ -474,21 +641,21 @@ namespace Rope {
return x_out; return x_out;
} }
__STATIC_INLINE__ struct ggml_tensor* attention(GGMLRunnerContext* ctx, __STATIC_INLINE__ ggml_tensor* attention(GGMLRunnerContext* ctx,
struct ggml_tensor* q, ggml_tensor* q,
struct ggml_tensor* k, ggml_tensor* k,
struct ggml_tensor* v, ggml_tensor* v,
struct ggml_tensor* pe, ggml_tensor* pe,
struct ggml_tensor* mask, ggml_tensor* mask,
float kv_scale = 1.0f, float kv_scale = 1.0f,
bool rope_interleaved = true) { bool rope_interleaved = true) {
// q,k,v: [N, L, n_head, d_head] // q,k,v: [N, L, n_head, d_head]
// pe: [L, d_head/2, 2, 2] // pe: [L, d_head/2, 2, 2]
// return: [N, L, n_head*d_head] // return: [N, L, n_head*d_head]
q = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved); // [N*n_head, L, d_head] q = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved); // [N*n_head, L, d_head]
k = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved); // [N*n_head, L, d_head] k = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved); // [N*n_head, L, d_head]
auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, v->ne[1], mask, false, true, ctx->flash_attn_enabled, kv_scale); // [N, L, n_head*d_head] auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, v->ne[1], mask, true, ctx->flash_attn_enabled, kv_scale); // [N, L, n_head*d_head]
return x; return x;
} }
}; // namespace Rope }; // namespace Rope

195
src/spectrum.hpp Normal file
View File

@ -0,0 +1,195 @@
#ifndef __SPECTRUM_HPP__
#define __SPECTRUM_HPP__
#include <cmath>
#include <cstring>
#include <vector>
#include "ggml_extend.hpp"
struct SpectrumConfig {
float w = 0.40f;
int m = 3;
float lam = 1.0f;
int window_size = 2;
float flex_window = 0.50f;
int warmup_steps = 4;
float stop_percent = 0.9f;
};
struct SpectrumState {
SpectrumConfig config;
int cnt = 0;
int num_cached = 0;
float curr_ws = 2.0f;
int K = 6;
int stop_step = 0;
int total_steps_skipped = 0;
std::vector<std::vector<float>> H_buf;
std::vector<float> T_buf;
void init(const SpectrumConfig& cfg, size_t total_steps) {
config = cfg;
cnt = 0;
num_cached = 0;
curr_ws = (float)cfg.window_size;
K = std::max(cfg.m + 1, 6);
stop_step = (int)(cfg.stop_percent * (float)total_steps);
total_steps_skipped = 0;
H_buf.clear();
T_buf.clear();
}
float taus(int step_cnt) const {
return (step_cnt / 50.0f) * 2.0f - 1.0f;
}
bool should_predict() {
if (cnt < config.warmup_steps)
return false;
if (stop_step > 0 && cnt >= stop_step)
return false;
if ((int)H_buf.size() < 2)
return false;
int ws = std::max(1, (int)std::floor(curr_ws));
return (num_cached + 1) % ws != 0;
}
void update(const ggml_tensor* denoised) {
int64_t ne = ggml_nelements(denoised);
const float* data = (const float*)denoised->data;
H_buf.emplace_back(data, data + ne);
T_buf.push_back(taus(cnt));
while ((int)H_buf.size() > K) {
H_buf.erase(H_buf.begin());
T_buf.erase(T_buf.begin());
}
if (cnt >= config.warmup_steps)
curr_ws += config.flex_window;
num_cached = 0;
cnt++;
}
void predict(ggml_tensor* denoised) {
int64_t F = (int64_t)H_buf[0].size();
int K_curr = (int)H_buf.size();
int M1 = config.m + 1;
float tau_at = taus(cnt);
// Design matrix X: K_curr x M1 (Chebyshev basis)
std::vector<float> X(K_curr * M1);
for (int i = 0; i < K_curr; i++) {
X[i * M1] = 1.0f;
if (M1 > 1)
X[i * M1 + 1] = T_buf[i];
for (int j = 2; j < M1; j++)
X[i * M1 + j] = 2.0f * T_buf[i] * X[i * M1 + j - 1] - X[i * M1 + j - 2];
}
// x_star: Chebyshev basis at current tau
std::vector<float> x_star(M1);
x_star[0] = 1.0f;
if (M1 > 1)
x_star[1] = tau_at;
for (int j = 2; j < M1; j++)
x_star[j] = 2.0f * tau_at * x_star[j - 1] - x_star[j - 2];
// XtX = X^T X + lambda I
std::vector<float> XtX(M1 * M1, 0.0f);
for (int i = 0; i < M1; i++) {
for (int j = 0; j < M1; j++) {
float sum = 0.0f;
for (int k = 0; k < K_curr; k++)
sum += X[k * M1 + i] * X[k * M1 + j];
XtX[i * M1 + j] = sum + (i == j ? config.lam : 0.0f);
}
}
// Cholesky decomposition
std::vector<float> L(M1 * M1, 0.0f);
if (!cholesky_decompose(XtX.data(), L.data(), M1)) {
float trace = 0.0f;
for (int i = 0; i < M1; i++)
trace += XtX[i * M1 + i];
for (int i = 0; i < M1; i++)
XtX[i * M1 + i] += 1e-4f * trace / M1;
cholesky_decompose(XtX.data(), L.data(), M1);
}
// Solve XtX v = x_star
std::vector<float> v(M1);
cholesky_solve(L.data(), x_star.data(), v.data(), M1);
// Prediction weights per history entry
std::vector<float> weights(K_curr, 0.0f);
for (int k = 0; k < K_curr; k++)
for (int j = 0; j < M1; j++)
weights[k] += X[k * M1 + j] * v[j];
// Blend Chebyshev and Taylor predictions
float* out = (float*)denoised->data;
float w_cheb = config.w;
float w_taylor = 1.0f - w_cheb;
const float* h_last = H_buf.back().data();
const float* h_prev = H_buf[H_buf.size() - 2].data();
for (int64_t f = 0; f < F; f++) {
float pred_cheb = 0.0f;
for (int k = 0; k < K_curr; k++)
pred_cheb += weights[k] * H_buf[k][f];
float pred_taylor = h_last[f] + 0.5f * (h_last[f] - h_prev[f]);
out[f] = w_taylor * pred_taylor + w_cheb * pred_cheb;
}
num_cached++;
total_steps_skipped++;
cnt++;
}
private:
static bool cholesky_decompose(const float* A, float* L, int n) {
std::memset(L, 0, n * n * sizeof(float));
for (int i = 0; i < n; i++) {
for (int j = 0; j <= i; j++) {
float sum = 0.0f;
for (int k = 0; k < j; k++)
sum += L[i * n + k] * L[j * n + k];
if (i == j) {
float diag = A[i * n + i] - sum;
if (diag <= 0.0f)
return false;
L[i * n + j] = std::sqrt(diag);
} else {
L[i * n + j] = (A[i * n + j] - sum) / L[j * n + j];
}
}
}
return true;
}
static void cholesky_solve(const float* L, const float* b, float* x, int n) {
std::vector<float> y(n);
for (int i = 0; i < n; i++) {
float sum = 0.0f;
for (int j = 0; j < i; j++)
sum += L[i * n + j] * y[j];
y[i] = (b[i] - sum) / L[i * n + i];
}
for (int i = n - 1; i >= 0; i--) {
float sum = 0.0f;
for (int j = i + 1; j < n; j++)
sum += L[j * n + i] * x[j];
x[i] = (y[i] - sum) / L[i * n + i];
}
}
};
#endif // __SPECTRUM_HPP__

File diff suppressed because it is too large Load Diff

View File

@ -14,6 +14,7 @@
#include "ggml_extend.hpp" #include "ggml_extend.hpp"
#include "json.hpp" #include "json.hpp"
#include "model.h" #include "model.h"
#include "vocab/vocab.h"
// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h // Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h
// and https://github.com/google/sentencepiece/blob/master/src/unigram_model.h. // and https://github.com/google/sentencepiece/blob/master/src/unigram_model.h.
@ -96,7 +97,7 @@ protected:
try { try {
data = nlohmann::json::parse(json_str); data = nlohmann::json::parse(json_str);
} catch (const nlohmann::json::parse_error& e) { } catch (const nlohmann::json::parse_error&) {
status_ = INVLIAD_JSON; status_ = INVLIAD_JSON;
return; return;
} }
@ -168,9 +169,9 @@ protected:
kMaxTrieResultsSize); kMaxTrieResultsSize);
trie_results_size_ = 0; trie_results_size_ = 0;
for (const auto& p : *pieces) { for (const auto& p : *pieces) {
const int num_nodes = trie_->commonPrefixSearch( const size_t num_nodes = trie_->commonPrefixSearch(
p.first.data(), results.data(), results.size(), p.first.size()); p.first.data(), results.data(), results.size(), p.first.size());
trie_results_size_ = std::max(trie_results_size_, num_nodes); trie_results_size_ = std::max(trie_results_size_, static_cast<int>(num_nodes));
} }
if (trie_results_size_ == 0) if (trie_results_size_ == 0)
@ -210,9 +211,9 @@ protected:
// implementation. It's based on the following three ideas: // implementation. It's based on the following three ideas:
// //
// 1. Because it uses the *unigram* model: // 1. Because it uses the *unigram* model:
// best_score(x1, x2, …, xt) = best_score(x1, x2, …, x{t-1}) + score(xt) // best_score(x1, x2, <EFBFBD>? xt) = best_score(x1, x2, <20>? x{t-1}) + score(xt)
// Deciding the best path (and score) can be decoupled into two isolated // Deciding the best path (and score) can be decoupled into two isolated
// terms: (a) the best path ended before the last token `best_score(x1, x2, …, // terms: (a) the best path ended before the last token `best_score(x1, x2, <EFBFBD>?
// x{t-1})`, and (b) the last token and its `score(xt)`. The two terms are // x{t-1})`, and (b) the last token and its `score(xt)`. The two terms are
// not related to each other at all. // not related to each other at all.
// //
@ -268,7 +269,7 @@ protected:
-1; // The starting position (in utf-8) of this node. The entire best -1; // The starting position (in utf-8) of this node. The entire best
// path can be constructed by backtracking along this link. // path can be constructed by backtracking along this link.
}; };
const int size = normalized.size(); const int size = static_cast<int>(normalized.size());
const float unk_score = min_score() - kUnkPenalty; const float unk_score = min_score() - kUnkPenalty;
// The ends are exclusive. // The ends are exclusive.
std::vector<BestPathNode> best_path_ends_at(size + 1); std::vector<BestPathNode> best_path_ends_at(size + 1);
@ -281,7 +282,7 @@ protected:
best_path_ends_at[starts_at].best_path_score; best_path_ends_at[starts_at].best_path_score;
bool has_single_node = false; bool has_single_node = false;
const int mblen = const int mblen =
std::min<int>(OneCharLen(normalized.data() + starts_at), std::min<int>(static_cast<int>(OneCharLen(normalized.data() + starts_at)),
size - starts_at); size - starts_at);
while (key_pos < size) { while (key_pos < size) {
const int ret = const int ret =
@ -302,7 +303,7 @@ protected:
score + best_path_score_till_here; score + best_path_score_till_here;
if (target_node.starts_at == -1 || if (target_node.starts_at == -1 ||
candidate_best_path_score > target_node.best_path_score) { candidate_best_path_score > target_node.best_path_score) {
target_node.best_path_score = candidate_best_path_score; target_node.best_path_score = static_cast<float>(candidate_best_path_score);
target_node.starts_at = starts_at; target_node.starts_at = starts_at;
target_node.id = ret; target_node.id = ret;
} }
@ -341,9 +342,9 @@ protected:
public: public:
explicit T5UniGramTokenizer(bool is_umt5 = false) { explicit T5UniGramTokenizer(bool is_umt5 = false) {
if (is_umt5) { if (is_umt5) {
InitializePieces(ModelLoader::load_umt5_tokenizer_json()); InitializePieces(load_umt5_tokenizer_json());
} else { } else {
InitializePieces(ModelLoader::load_t5_tokenizer_json()); InitializePieces(load_t5_tokenizer_json());
} }
min_score_ = FLT_MAX; min_score_ = FLT_MAX;
@ -394,7 +395,7 @@ public:
bool padding = false) { bool padding = false) {
if (max_length > 0 && padding) { if (max_length > 0 && padding) {
size_t orig_token_num = tokens.size() - 1; size_t orig_token_num = tokens.size() - 1;
size_t n = std::ceil(orig_token_num * 1.0 / (max_length - 1)); size_t n = static_cast<size_t>(std::ceil(orig_token_num * 1.0 / (max_length - 1)));
if (n == 0) { if (n == 0) {
n = 1; n = 1;
} }
@ -461,7 +462,7 @@ protected:
int64_t hidden_size; int64_t hidden_size;
float eps; float eps;
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type wtype = GGML_TYPE_F32; enum ggml_type wtype = GGML_TYPE_F32;
params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size);
} }
@ -472,10 +473,10 @@ public:
: hidden_size(hidden_size), : hidden_size(hidden_size),
eps(eps) {} eps(eps) {}
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
struct ggml_tensor* w = params["weight"]; ggml_tensor* w = params["weight"];
x = ggml_rms_norm(ctx->ggml_ctx, x, eps); x = ggml_rms_norm(ctx->ggml_ctx, x, eps);
x = ggml_mul(ctx->ggml_ctx, x, w); x = ggml_mul(ctx->ggml_ctx, x, w);
return x; return x;
} }
}; };
@ -487,7 +488,7 @@ public:
blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false)); blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [N, n_token, model_dim] // x: [N, n_token, model_dim]
auto wi = std::dynamic_pointer_cast<Linear>(blocks["wi"]); auto wi = std::dynamic_pointer_cast<Linear>(blocks["wi"]);
auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]); auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
@ -509,13 +510,13 @@ public:
blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false, false, false, scale)); blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false, false, false, scale));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [N, n_token, model_dim] // x: [N, n_token, model_dim]
auto wi_0 = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]); auto wi_0 = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]);
auto wi_1 = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]); auto wi_1 = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]);
auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]); auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
auto hidden_gelu = ggml_gelu_inplace(ctx->ggml_ctx, wi_0->forward(ctx, x)); auto hidden_gelu = ggml_ext_gelu(ctx->ggml_ctx, wi_0->forward(ctx, x), true);
auto hidden_linear = wi_1->forward(ctx, x); auto hidden_linear = wi_1->forward(ctx, x);
x = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear); x = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear);
x = wo->forward(ctx, x); x = wo->forward(ctx, x);
@ -530,7 +531,7 @@ public:
blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim)); blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [N, n_token, model_dim] // x: [N, n_token, model_dim]
auto DenseReluDense = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]); auto DenseReluDense = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]);
auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]); auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
@ -569,8 +570,8 @@ public:
} }
} }
struct ggml_tensor* compute_bias(GGMLRunnerContext* ctx, ggml_tensor* compute_bias(GGMLRunnerContext* ctx,
struct ggml_tensor* relative_position_bucket) { ggml_tensor* relative_position_bucket) {
auto relative_attention_bias = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]); auto relative_attention_bias = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]);
auto values = relative_attention_bias->forward(ctx, relative_position_bucket); // shape (query_length, key_length, num_heads) auto values = relative_attention_bias->forward(ctx, relative_position_bucket); // shape (query_length, key_length, num_heads)
@ -579,11 +580,11 @@ public:
} }
// x: [N, n_token, model_dim] // x: [N, n_token, model_dim]
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx, std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* past_bias = nullptr, ggml_tensor* past_bias = nullptr,
struct ggml_tensor* mask = nullptr, ggml_tensor* mask = nullptr,
struct ggml_tensor* relative_position_bucket = nullptr) { ggml_tensor* relative_position_bucket = nullptr) {
auto q_proj = std::dynamic_pointer_cast<Linear>(blocks["q"]); auto q_proj = std::dynamic_pointer_cast<Linear>(blocks["q"]);
auto k_proj = std::dynamic_pointer_cast<Linear>(blocks["k"]); auto k_proj = std::dynamic_pointer_cast<Linear>(blocks["k"]);
auto v_proj = std::dynamic_pointer_cast<Linear>(blocks["v"]); auto v_proj = std::dynamic_pointer_cast<Linear>(blocks["v"]);
@ -608,7 +609,7 @@ public:
} }
} }
k = ggml_scale_inplace(ctx->ggml_ctx, k, sqrt(d_head)); k = ggml_ext_scale(ctx->ggml_ctx, k, ::sqrtf(static_cast<float>(d_head)), true);
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask); // [N, n_token, d_head * n_head] x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask); // [N, n_token, d_head * n_head]
@ -628,11 +629,11 @@ public:
blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim)); blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
} }
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx, std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* past_bias = nullptr, ggml_tensor* past_bias = nullptr,
struct ggml_tensor* mask = nullptr, ggml_tensor* mask = nullptr,
struct ggml_tensor* relative_position_bucket = nullptr) { ggml_tensor* relative_position_bucket = nullptr) {
// x: [N, n_token, model_dim] // x: [N, n_token, model_dim]
auto SelfAttention = std::dynamic_pointer_cast<T5Attention>(blocks["SelfAttention"]); auto SelfAttention = std::dynamic_pointer_cast<T5Attention>(blocks["SelfAttention"]);
auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]); auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
@ -654,11 +655,11 @@ public:
blocks["layer.1"] = std::shared_ptr<GGMLBlock>(new T5LayerFF(model_dim, ff_dim)); blocks["layer.1"] = std::shared_ptr<GGMLBlock>(new T5LayerFF(model_dim, ff_dim));
} }
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx, std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* past_bias = nullptr, ggml_tensor* past_bias = nullptr,
struct ggml_tensor* mask = nullptr, ggml_tensor* mask = nullptr,
struct ggml_tensor* relative_position_bucket = nullptr) { ggml_tensor* relative_position_bucket = nullptr) {
// x: [N, n_token, model_dim] // x: [N, n_token, model_dim]
auto layer_0 = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]); auto layer_0 = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]);
auto layer_1 = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]); auto layer_1 = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]);
@ -689,11 +690,11 @@ public:
blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim)); blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* past_bias = nullptr, ggml_tensor* past_bias = nullptr,
struct ggml_tensor* attention_mask = nullptr, ggml_tensor* attention_mask = nullptr,
struct ggml_tensor* relative_position_bucket = nullptr) { ggml_tensor* relative_position_bucket = nullptr) {
// x: [N, n_token, model_dim] // x: [N, n_token, model_dim]
for (int i = 0; i < num_layers; i++) { for (int i = 0; i < num_layers; i++) {
auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]); auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]);
@ -736,11 +737,11 @@ public:
params.model_dim)); params.model_dim));
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids, ggml_tensor* input_ids,
struct ggml_tensor* past_bias = nullptr, ggml_tensor* past_bias = nullptr,
struct ggml_tensor* attention_mask = nullptr, ggml_tensor* attention_mask = nullptr,
struct ggml_tensor* relative_position_bucket = nullptr) { ggml_tensor* relative_position_bucket = nullptr) {
// input_ids: [N, n_token] // input_ids: [N, n_token]
auto shared = std::dynamic_pointer_cast<Embedding>(blocks["shared"]); auto shared = std::dynamic_pointer_cast<Embedding>(blocks["shared"]);
@ -775,14 +776,14 @@ struct T5Runner : public GGMLRunner {
return "t5"; return "t5";
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
model.get_param_tensors(tensors, prefix); model.get_param_tensors(tensors, prefix);
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids, ggml_tensor* input_ids,
struct ggml_tensor* relative_position_bucket, ggml_tensor* relative_position_bucket,
struct ggml_tensor* attention_mask = nullptr) { ggml_tensor* attention_mask = nullptr) {
size_t N = input_ids->ne[1]; size_t N = input_ids->ne[1];
size_t n_token = input_ids->ne[0]; size_t n_token = input_ids->ne[0];
@ -790,14 +791,14 @@ struct T5Runner : public GGMLRunner {
return hidden_states; return hidden_states;
} }
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, ggml_cgraph* build_graph(ggml_tensor* input_ids,
struct ggml_tensor* attention_mask = nullptr) { ggml_tensor* attention_mask = nullptr) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); ggml_cgraph* gf = ggml_new_graph(compute_ctx);
input_ids = to_backend(input_ids); input_ids = to_backend(input_ids);
attention_mask = to_backend(attention_mask); attention_mask = to_backend(attention_mask);
relative_position_bucket_vec = compute_relative_position_bucket(input_ids->ne[0], input_ids->ne[0]); relative_position_bucket_vec = compute_relative_position_bucket(static_cast<int>(input_ids->ne[0]), static_cast<int>(input_ids->ne[0]));
// for (int i = 0; i < relative_position_bucket_vec.size(); i++) { // for (int i = 0; i < relative_position_bucket_vec.size(); i++) {
// if (i % 77 == 0) { // if (i % 77 == 0) {
@ -812,8 +813,8 @@ struct T5Runner : public GGMLRunner {
input_ids->ne[0]); input_ids->ne[0]);
set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data()); set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data());
auto runner_ctx = get_context(); auto runner_ctx = get_context();
struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask); ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask);
ggml_build_forward_expand(gf, hidden_states); ggml_build_forward_expand(gf, hidden_states);
@ -821,11 +822,11 @@ struct T5Runner : public GGMLRunner {
} }
bool compute(const int n_threads, bool compute(const int n_threads,
struct ggml_tensor* input_ids, ggml_tensor* input_ids,
struct ggml_tensor* attention_mask, ggml_tensor* attention_mask,
ggml_tensor** output, ggml_tensor** output,
ggml_context* output_ctx = nullptr) { ggml_context* output_ctx = nullptr) {
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(input_ids, attention_mask); return build_graph(input_ids, attention_mask);
}; };
return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
@ -911,7 +912,7 @@ struct T5Embedder {
: model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) { : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) {
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
model.get_param_tensors(tensors, prefix); model.get_param_tensors(tensors, prefix);
} }
@ -961,17 +962,17 @@ struct T5Embedder {
} }
void test() { void test() {
struct ggml_init_params params; ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
params.mem_buffer = nullptr; params.mem_buffer = nullptr;
params.no_alloc = false; params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params); ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != nullptr); GGML_ASSERT(work_ctx != nullptr);
{ {
std::string text("a lovely cat"); std::string text("a lovely cat");
// std::string text("一只可爱的猫"); // umt5 chinease test // std::string text("一只可爱的<EFBFBD>?); // umt5 chinease test
auto tokens_and_weights = tokenize(text, 512, true); auto tokens_and_weights = tokenize(text, 512, true);
std::vector<int>& tokens = std::get<0>(tokens_and_weights); std::vector<int>& tokens = std::get<0>(tokens_and_weights);
std::vector<float>& weights = std::get<1>(tokens_and_weights); std::vector<float>& weights = std::get<1>(tokens_and_weights);
@ -980,16 +981,16 @@ struct T5Embedder {
printf("%d ", token); printf("%d ", token);
} }
printf("\n"); printf("\n");
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
auto attention_mask = vector_to_ggml_tensor(work_ctx, masks); auto attention_mask = vector_to_ggml_tensor(work_ctx, masks);
struct ggml_tensor* out = nullptr; ggml_tensor* out = nullptr;
int t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
model.compute(8, input_ids, attention_mask, &out, work_ctx); model.compute(8, input_ids, attention_mask, &out, work_ctx);
int t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
print_ggml_tensor(out); print_ggml_tensor(out);
LOG_DEBUG("t5 test done in %dms", t1 - t0); LOG_DEBUG("t5 test done in %lldms", t1 - t0);
} }
} }

666
src/tae.hpp Normal file
View File

@ -0,0 +1,666 @@
#ifndef __TAE_HPP__
#define __TAE_HPP__
#include "ggml_extend.hpp"
#include "model.h"
/*
=================================== TinyAutoEncoder ===================================
References:
https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoders/vae.py
https://github.com/madebyollin/taesd/blob/main/taesd.py
*/
class TAEBlock : public UnaryBlock {
protected:
int n_in;
int n_out;
bool use_midblock_gn;
public:
TAEBlock(int n_in, int n_out, bool use_midblock_gn = false)
: n_in(n_in), n_out(n_out), use_midblock_gn(use_midblock_gn) {
blocks["conv.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_in, n_out, {3, 3}, {1, 1}, {1, 1}));
blocks["conv.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}));
blocks["conv.4"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}));
if (n_in != n_out) {
blocks["skip"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_in, n_out, {1, 1}, {1, 1}, {1, 1}, {1, 1}, false));
}
if (use_midblock_gn) {
int n_gn = n_in * 4;
blocks["pool.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_in, n_gn, {1, 1}, {1, 1}, {0, 0}, {1, 1}, false));
blocks["pool.1"] = std::shared_ptr<GGMLBlock>(new GroupNorm(4, n_gn));
// pool.2 is ReLU, handled in forward
blocks["pool.3"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_gn, n_in, {1, 1}, {1, 1}, {0, 0}, {1, 1}, false));
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [n, n_in, h, w]
// return: [n, n_out, h, w]
if (use_midblock_gn) {
auto pool_0 = std::dynamic_pointer_cast<Conv2d>(blocks["pool.0"]);
auto pool_1 = std::dynamic_pointer_cast<GroupNorm>(blocks["pool.1"]);
auto pool_3 = std::dynamic_pointer_cast<Conv2d>(blocks["pool.3"]);
auto p = pool_0->forward(ctx, x);
p = pool_1->forward(ctx, p);
p = ggml_relu_inplace(ctx->ggml_ctx, p);
p = pool_3->forward(ctx, p);
x = ggml_add(ctx->ggml_ctx, x, p);
}
auto conv_0 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.0"]);
auto conv_2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.2"]);
auto conv_4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.4"]);
auto h = conv_0->forward(ctx, x);
h = ggml_relu_inplace(ctx->ggml_ctx, h);
h = conv_2->forward(ctx, h);
h = ggml_relu_inplace(ctx->ggml_ctx, h);
h = conv_4->forward(ctx, h);
if (n_in != n_out) {
auto skip = std::dynamic_pointer_cast<Conv2d>(blocks["skip"]);
LOG_DEBUG("skip");
x = skip->forward(ctx, x);
}
h = ggml_add(ctx->ggml_ctx, h, x);
h = ggml_relu_inplace(ctx->ggml_ctx, h);
return h;
}
};
class TinyEncoder : public UnaryBlock {
int in_channels = 3;
int channels = 64;
int z_channels = 4;
int num_blocks = 3;
public:
TinyEncoder(int z_channels = 4, bool use_midblock_gn = false)
: z_channels(z_channels) {
int index = 0;
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, channels, {3, 3}, {1, 1}, {1, 1}));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels, use_midblock_gn));
}
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1}));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [n, in_channels, h, w]
// return: [n, z_channels, h/8, w/8]
for (int i = 0; i < num_blocks * 3 + 6; i++) {
auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(i)]);
x = block->forward(ctx, x);
}
return x;
}
};
class TinyDecoder : public UnaryBlock {
int z_channels = 4;
int channels = 64;
int out_channels = 3;
int num_blocks = 3;
public:
TinyDecoder(int z_channels = 4, bool use_midblock_gn = false)
: z_channels(z_channels) {
int index = 0;
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, channels, {3, 3}, {1, 1}, {1, 1}));
index++; // nn.ReLU()
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels, use_midblock_gn));
}
index++; // nn.Upsample()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
index++; // nn.Upsample()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
index++; // nn.Upsample()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* z) override {
// z: [n, z_channels, h, w]
// return: [n, out_channels, h*8, w*8]
auto h = ggml_ext_scale(ctx->ggml_ctx, z, 1.0f / 3.0f);
h = ggml_tanh_inplace(ctx->ggml_ctx, h);
h = ggml_ext_scale(ctx->ggml_ctx, h, 3.0f);
for (int i = 0; i < num_blocks * 3 + 10; i++) {
if (blocks.find(std::to_string(i)) == blocks.end()) {
if (i == 1) {
h = ggml_relu_inplace(ctx->ggml_ctx, h);
} else {
h = ggml_upscale(ctx->ggml_ctx, h, 2, GGML_SCALE_MODE_NEAREST);
}
continue;
}
auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(i)]);
h = block->forward(ctx, h);
}
return h;
}
};
class TPool : public UnaryBlock {
int stride;
public:
TPool(int channels, int stride)
: stride(stride) {
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels * stride, channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, false));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
auto conv = std::dynamic_pointer_cast<UnaryBlock>(blocks["conv"]);
auto h = x;
if (stride != 1) {
h = ggml_reshape_4d(ctx->ggml_ctx, h, h->ne[0], h->ne[1], h->ne[2] * stride, h->ne[3] / stride);
}
h = conv->forward(ctx, h);
return h;
}
};
class TGrow : public UnaryBlock {
int stride;
public:
TGrow(int channels, int stride)
: stride(stride) {
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels * stride, {1, 1}, {1, 1}, {0, 0}, {1, 1}, false));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
auto conv = std::dynamic_pointer_cast<UnaryBlock>(blocks["conv"]);
auto h = conv->forward(ctx, x);
if (stride != 1) {
h = ggml_reshape_4d(ctx->ggml_ctx, h, h->ne[0], h->ne[1], h->ne[2] / stride, h->ne[3] * stride);
}
return h;
}
};
class MemBlock : public GGMLBlock {
bool has_skip_conv = false;
public:
MemBlock(int channels, int out_channels)
: has_skip_conv(channels != out_channels) {
blocks["conv.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels * 2, out_channels, {3, 3}, {1, 1}, {1, 1}));
blocks["conv.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
blocks["conv.4"] = std::shared_ptr<GGMLBlock>(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
if (has_skip_conv) {
blocks["skip"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, false));
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* past) {
// x: [n, channels, h, w]
auto conv0 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.0"]);
auto conv1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.2"]);
auto conv2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.4"]);
auto h = ggml_concat(ctx->ggml_ctx, x, past, 2);
h = conv0->forward(ctx, h);
h = ggml_relu_inplace(ctx->ggml_ctx, h);
h = conv1->forward(ctx, h);
h = ggml_relu_inplace(ctx->ggml_ctx, h);
h = conv2->forward(ctx, h);
auto skip = x;
if (has_skip_conv) {
auto skip_conv = std::dynamic_pointer_cast<Conv2d>(blocks["skip"]);
skip = skip_conv->forward(ctx, x);
}
h = ggml_add_inplace(ctx->ggml_ctx, h, skip);
h = ggml_relu_inplace(ctx->ggml_ctx, h);
return h;
}
};
ggml_tensor* patchify(ggml_context* ctx,
ggml_tensor* x,
int64_t patch_size,
int64_t b = 1) {
// x: [f, b*c, h*q, w*r]
// return: [f, b*c*r*q, h, w]
if (patch_size == 1) {
return x;
}
int64_t r = patch_size;
int64_t q = patch_size;
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int64_t C = x->ne[2];
int64_t f = x->ne[3];
int64_t w = W / r;
int64_t h = H / q;
x = ggml_reshape_4d(ctx, x, W, q, h, C * f); // [W, q, h, C*f]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [W, h, q, C*f]
x = ggml_reshape_4d(ctx, x, r, w, h, q * C * f); // [r, w, h, q*C*f]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3)); // [w, h, r, q*C*f]
x = ggml_reshape_4d(ctx, x, w, h, r * q * C, f); // [f, b*c*r*q, h, w]
return x;
}
ggml_tensor* unpatchify(ggml_context* ctx,
ggml_tensor* x,
int64_t patch_size,
int64_t b = 1) {
// x: [f, b*c*r*q, h, w]
// return: [f, b*c, h*q, w*r]
if (patch_size == 1) {
return x;
}
int64_t r = patch_size;
int64_t q = patch_size;
int64_t c = x->ne[2] / b / q / r;
int64_t f = x->ne[3];
int64_t h = x->ne[1];
int64_t w = x->ne[0];
x = ggml_reshape_4d(ctx, x, w, h, r, q * c * b * f); // [q*c*b*f, r, h, w]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3)); // [r, w, h, q*c*b*f]
x = ggml_reshape_4d(ctx, x, r * w, h, q, c * b * f); // [c*b*f, q, h, r*w]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [r*w, q, h, c*b*f]
x = ggml_reshape_4d(ctx, x, r * w, q * h, c * b, f);
return x;
}
class TinyVideoEncoder : public UnaryBlock {
int in_channels = 3;
int hidden = 64;
int z_channels = 4;
int num_blocks = 3;
int num_layers = 3;
int patch_size = 1;
public:
TinyVideoEncoder(int z_channels = 4, int patch_size = 1)
: z_channels(z_channels), patch_size(patch_size) {
int index = 0;
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels * patch_size * patch_size, hidden, {3, 3}, {1, 1}, {1, 1}));
index++; // nn.ReLU()
for (int i = 0; i < num_layers; i++) {
int stride = i == num_layers - 1 ? 1 : 2;
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TPool(hidden, stride));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(hidden, hidden, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
for (int j = 0; j < num_blocks; j++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new MemBlock(hidden, hidden));
}
}
blocks[std::to_string(index)] = std::shared_ptr<GGMLBlock>(new Conv2d(hidden, z_channels, {3, 3}, {1, 1}, {1, 1}));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* z) override {
auto first_conv = std::dynamic_pointer_cast<Conv2d>(blocks["0"]);
if (patch_size > 1) {
z = patchify(ctx->ggml_ctx, z, patch_size, 1);
}
auto h = first_conv->forward(ctx, z);
h = ggml_relu_inplace(ctx->ggml_ctx, h);
int index = 2;
for (int i = 0; i < num_layers; i++) {
auto pool = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(index++)]);
auto conv = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(index++)]);
h = pool->forward(ctx, h);
h = conv->forward(ctx, h);
for (int j = 0; j < num_blocks; j++) {
auto block = std::dynamic_pointer_cast<MemBlock>(blocks[std::to_string(index++)]);
auto mem = ggml_pad_ext(ctx->ggml_ctx, h, 0, 0, 0, 0, 0, 0, 1, 0);
mem = ggml_view_4d(ctx->ggml_ctx, mem, h->ne[0], h->ne[1], h->ne[2], h->ne[3], h->nb[1], h->nb[2], h->nb[3], 0);
h = block->forward(ctx, h, mem);
}
}
auto last_conv = std::dynamic_pointer_cast<Conv2d>(blocks[std::to_string(index)]);
h = last_conv->forward(ctx, h);
return h;
}
};
class TinyVideoDecoder : public UnaryBlock {
int z_channels = 4;
int out_channels = 3;
int num_blocks = 3;
static const int num_layers = 3;
int channels[num_layers + 1] = {256, 128, 64, 64};
int patch_size = 1;
public:
TinyVideoDecoder(int z_channels = 4, int patch_size = 1)
: z_channels(z_channels), patch_size(patch_size) {
int index = 1; // Clamp()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, channels[0], {3, 3}, {1, 1}, {1, 1}));
index++; // nn.ReLU()
for (int i = 0; i < num_layers; i++) {
int stride = i == 0 ? 1 : 2;
for (int j = 0; j < num_blocks; j++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new MemBlock(channels[i], channels[i]));
}
index++; // nn.Upsample()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TGrow(channels[i], stride));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels[i], channels[i + 1], {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
}
index++; // nn.ReLU()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels[num_layers], out_channels * patch_size * patch_size, {3, 3}, {1, 1}, {1, 1}));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* z) override {
auto first_conv = std::dynamic_pointer_cast<Conv2d>(blocks["1"]);
// Clamp()
auto h = ggml_ext_scale(ctx->ggml_ctx,
ggml_tanh_inplace(ctx->ggml_ctx,
ggml_ext_scale(ctx->ggml_ctx, z, 1.0f / 3.0f)),
3.0f,
true);
h = first_conv->forward(ctx, h);
h = ggml_relu_inplace(ctx->ggml_ctx, h);
int index = 3;
for (int i = 0; i < num_layers; i++) {
for (int j = 0; j < num_blocks; j++) {
auto block = std::dynamic_pointer_cast<MemBlock>(blocks[std::to_string(index++)]);
auto mem = ggml_pad_ext(ctx->ggml_ctx, h, 0, 0, 0, 0, 0, 0, 1, 0);
mem = ggml_view_4d(ctx->ggml_ctx, mem, h->ne[0], h->ne[1], h->ne[2], h->ne[3], h->nb[1], h->nb[2], h->nb[3], 0);
h = block->forward(ctx, h, mem);
}
// upsample
index++;
h = ggml_upscale(ctx->ggml_ctx, h, 2, GGML_SCALE_MODE_NEAREST);
auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(index++)]);
h = block->forward(ctx, h);
block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(index++)]);
h = block->forward(ctx, h);
}
h = ggml_relu_inplace(ctx->ggml_ctx, h);
auto last_conv = std::dynamic_pointer_cast<Conv2d>(blocks[std::to_string(++index)]);
h = last_conv->forward(ctx, h);
if (patch_size > 1) {
h = unpatchify(ctx->ggml_ctx, h, patch_size, 1);
}
// shape(W, H, 3, 3 + T) => shape(W, H, 3, T)
h = ggml_view_4d(ctx->ggml_ctx, h, h->ne[0], h->ne[1], h->ne[2], h->ne[3] - 3, h->nb[1], h->nb[2], h->nb[3], 3 * h->nb[3]);
return h;
}
};
class TAEHV : public GGMLBlock {
protected:
bool decode_only;
SDVersion version;
public:
int z_channels = 16;
public:
TAEHV(bool decode_only = true, SDVersion version = VERSION_WAN2)
: decode_only(decode_only), version(version) {
int patch = 1;
if (version == VERSION_WAN2_2_TI2V) {
z_channels = 48;
patch = 2;
}
blocks["decoder"] = std::shared_ptr<GGMLBlock>(new TinyVideoDecoder(z_channels, patch));
if (!decode_only) {
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new TinyVideoEncoder(z_channels, patch));
}
}
ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) {
auto decoder = std::dynamic_pointer_cast<TinyVideoDecoder>(blocks["decoder"]);
if (sd_version_is_wan(version)) {
// (W, H, C, T) -> (W, H, T, C)
z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 0, 1, 3, 2));
}
auto result = decoder->forward(ctx, z);
if (sd_version_is_wan(version)) {
// (W, H, C, T) -> (W, H, T, C)
result = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, result, 0, 1, 3, 2));
}
return result;
}
ggml_tensor* encode(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto encoder = std::dynamic_pointer_cast<TinyVideoEncoder>(blocks["encoder"]);
// (W, H, T, C) -> (W, H, C, T)
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 1, 3, 2));
int64_t num_frames = x->ne[3];
if (num_frames % 4) {
// pad to multiple of 4 at the end
auto last_frame = ggml_view_4d(ctx->ggml_ctx, x, x->ne[0], x->ne[1], x->ne[2], 1, x->nb[1], x->nb[2], x->nb[3], (num_frames - 1) * x->nb[3]);
for (int i = 0; i < 4 - num_frames % 4; i++) {
x = ggml_concat(ctx->ggml_ctx, x, last_frame, 3);
}
}
x = encoder->forward(ctx, x);
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 1, 3, 2));
return x;
}
};
class TAESD : public GGMLBlock {
protected:
bool decode_only;
bool taef2 = false;
public:
int z_channels = 4;
public:
TAESD(bool decode_only = true, SDVersion version = VERSION_SD1)
: decode_only(decode_only) {
bool use_midblock_gn = false;
taef2 = sd_version_is_flux2(version);
if (sd_version_is_dit(version)) {
z_channels = 16;
}
if (taef2) {
z_channels = 32;
use_midblock_gn = true;
}
blocks["decoder.layers"] = std::shared_ptr<GGMLBlock>(new TinyDecoder(z_channels, use_midblock_gn));
if (!decode_only) {
blocks["encoder.layers"] = std::shared_ptr<GGMLBlock>(new TinyEncoder(z_channels, use_midblock_gn));
}
}
ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) {
auto decoder = std::dynamic_pointer_cast<TinyDecoder>(blocks["decoder.layers"]);
if (taef2) {
z = unpatchify(ctx->ggml_ctx, z, 2);
}
return decoder->forward(ctx, z);
}
ggml_tensor* encode(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto encoder = std::dynamic_pointer_cast<TinyEncoder>(blocks["encoder.layers"]);
auto z = encoder->forward(ctx, x);
if (taef2) {
z = patchify(ctx->ggml_ctx, z, 2);
}
return z;
}
};
struct TinyImageAutoEncoder : public VAE {
TAESD taesd;
bool decode_only = false;
TinyImageAutoEncoder(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
bool decoder_only = true,
SDVersion version = VERSION_SD1)
: decode_only(decoder_only),
taesd(decoder_only, version),
VAE(version, backend, offload_params_to_cpu) {
scale_input = false;
taesd.init(params_ctx, tensor_storage_map, prefix);
}
std::string get_desc() override {
return "taesd";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
taesd.get_param_tensors(tensors, prefix);
}
ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr<RNG> rng) {
return vae_output;
}
ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) {
return ggml_ext_dup_and_cpy_tensor(work_ctx, latents);
}
ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) {
return ggml_ext_dup_and_cpy_tensor(work_ctx, latents);
}
int get_encoder_output_channels(int input_channels) {
return taesd.z_channels;
}
ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) {
ggml_cgraph* gf = ggml_new_graph(compute_ctx);
z = to_backend(z);
auto runner_ctx = get_context();
ggml_tensor* out = decode_graph ? taesd.decode(&runner_ctx, z) : taesd.encode(&runner_ctx, z);
ggml_build_forward_expand(gf, out);
return gf;
}
bool _compute(const int n_threads,
ggml_tensor* z,
bool decode_graph,
ggml_tensor** output,
ggml_context* output_ctx = nullptr) {
auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(z, decode_graph);
};
return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
};
struct TinyVideoAutoEncoder : public VAE {
TAEHV taehv;
bool decode_only = false;
TinyVideoAutoEncoder(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
bool decoder_only = true,
SDVersion version = VERSION_WAN2)
: decode_only(decoder_only),
taehv(decoder_only, version),
VAE(version, backend, offload_params_to_cpu) {
scale_input = false;
taehv.init(params_ctx, tensor_storage_map, prefix);
}
std::string get_desc() override {
return "taehv";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
taehv.get_param_tensors(tensors, prefix);
}
ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr<RNG> rng) {
return vae_output;
}
ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) {
return ggml_ext_dup_and_cpy_tensor(work_ctx, latents);
}
ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) {
return ggml_ext_dup_and_cpy_tensor(work_ctx, latents);
}
int get_encoder_output_channels(int input_channels) {
return taehv.z_channels;
}
ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) {
ggml_cgraph* gf = ggml_new_graph(compute_ctx);
z = to_backend(z);
auto runner_ctx = get_context();
ggml_tensor* out = decode_graph ? taehv.decode(&runner_ctx, z) : taehv.encode(&runner_ctx, z);
ggml_build_forward_expand(gf, out);
return gf;
}
bool _compute(const int n_threads,
ggml_tensor* z,
bool decode_graph,
ggml_tensor** output,
ggml_context* output_ctx = nullptr) {
auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(z, decode_graph);
};
return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
};
#endif // __TAE_HPP__

View File

@ -919,15 +919,21 @@ std::vector<std::string> token_split(const std::string& text) {
// `\s*[\r\n]+|\s+(?!\S)|\s+` // `\s*[\r\n]+|\s+(?!\S)|\s+`
if (is_space(cp)) { if (is_space(cp)) {
std::string token = codepoint_to_utf8(cp); std::string token;
++i; bool saw_new_line = false;
while (i < cps.size() && is_space(cps[i])) { while (i < cps.size() && is_space(cps[i])) {
token += codepoint_to_utf8(cps[i]); token += codepoint_to_utf8(cps[i]);
++i;
if (cps[i] == U'\r' || cps[i] == U'\n') { if (cps[i] == U'\r' || cps[i] == U'\n') {
break; saw_new_line = true;
} else {
if (saw_new_line) {
break;
}
} }
++i;
} }
tokens.push_back(token); tokens.push_back(token);

434
src/ucache.hpp Normal file
View File

@ -0,0 +1,434 @@
#ifndef __UCACHE_HPP__
#define __UCACHE_HPP__
#include <cmath>
#include <limits>
#include <unordered_map>
#include <vector>
#include "denoiser.hpp"
#include "ggml_extend.hpp"
struct UCacheConfig {
bool enabled = false;
float reuse_threshold = 1.0f;
float start_percent = 0.15f;
float end_percent = 0.95f;
float error_decay_rate = 1.0f;
bool use_relative_threshold = true;
bool adaptive_threshold = true;
float early_step_multiplier = 0.5f;
float late_step_multiplier = 1.5f;
float relative_norm_gain = 1.6f;
bool reset_error_on_compute = true;
};
struct UCacheCacheEntry {
std::vector<float> diff;
};
struct UCacheState {
UCacheConfig config;
Denoiser* denoiser = nullptr;
float start_sigma = std::numeric_limits<float>::max();
float end_sigma = 0.0f;
bool initialized = false;
bool initial_step = true;
bool skip_current_step = false;
bool step_active = false;
const SDCondition* anchor_condition = nullptr;
std::unordered_map<const SDCondition*, UCacheCacheEntry> cache_diffs;
std::vector<float> prev_input;
std::vector<float> prev_output;
float output_prev_norm = 0.0f;
bool has_prev_input = false;
bool has_prev_output = false;
bool has_output_prev_norm = false;
bool has_relative_transformation_rate = false;
float relative_transformation_rate = 0.0f;
float last_input_change = 0.0f;
bool has_last_input_change = false;
float output_change_ema = 0.0f;
bool has_output_change_ema = false;
int total_steps_skipped = 0;
int current_step_index = -1;
int steps_computed_since_active = 0;
int expected_total_steps = 0;
int consecutive_skipped_steps = 0;
float accumulated_error = 0.0f;
struct BlockMetrics {
float sum_transformation_rate = 0.0f;
float sum_output_norm = 0.0f;
int sample_count = 0;
float min_change_rate = std::numeric_limits<float>::max();
float max_change_rate = 0.0f;
void reset() {
sum_transformation_rate = 0.0f;
sum_output_norm = 0.0f;
sample_count = 0;
min_change_rate = std::numeric_limits<float>::max();
max_change_rate = 0.0f;
}
void record(float change_rate, float output_norm) {
if (std::isfinite(change_rate) && change_rate > 0.0f) {
sum_transformation_rate += change_rate;
sum_output_norm += output_norm;
sample_count++;
if (change_rate < min_change_rate)
min_change_rate = change_rate;
if (change_rate > max_change_rate)
max_change_rate = change_rate;
}
}
float avg_transformation_rate() const {
return (sample_count > 0) ? (sum_transformation_rate / sample_count) : 0.0f;
}
float avg_output_norm() const {
return (sample_count > 0) ? (sum_output_norm / sample_count) : 0.0f;
}
};
BlockMetrics block_metrics;
int total_active_steps = 0;
void reset_runtime() {
initial_step = true;
skip_current_step = false;
step_active = false;
anchor_condition = nullptr;
cache_diffs.clear();
prev_input.clear();
prev_output.clear();
output_prev_norm = 0.0f;
has_prev_input = false;
has_prev_output = false;
has_output_prev_norm = false;
has_relative_transformation_rate = false;
relative_transformation_rate = 0.0f;
last_input_change = 0.0f;
has_last_input_change = false;
output_change_ema = 0.0f;
has_output_change_ema = false;
total_steps_skipped = 0;
current_step_index = -1;
steps_computed_since_active = 0;
expected_total_steps = 0;
consecutive_skipped_steps = 0;
accumulated_error = 0.0f;
block_metrics.reset();
total_active_steps = 0;
}
void init(const UCacheConfig& cfg, Denoiser* d) {
config = cfg;
denoiser = d;
initialized = cfg.enabled && d != nullptr;
reset_runtime();
if (initialized) {
start_sigma = percent_to_sigma(config.start_percent);
end_sigma = percent_to_sigma(config.end_percent);
}
}
void set_sigmas(const std::vector<float>& sigmas) {
if (!initialized || sigmas.size() < 2) {
return;
}
size_t n_steps = sigmas.size() - 1;
expected_total_steps = static_cast<int>(n_steps);
size_t start_step = static_cast<size_t>(config.start_percent * n_steps);
size_t end_step = static_cast<size_t>(config.end_percent * n_steps);
if (start_step >= n_steps)
start_step = n_steps - 1;
if (end_step >= n_steps)
end_step = n_steps - 1;
start_sigma = sigmas[start_step];
end_sigma = sigmas[end_step];
if (start_sigma < end_sigma) {
std::swap(start_sigma, end_sigma);
}
}
bool enabled() const {
return initialized && config.enabled;
}
float percent_to_sigma(float percent) const {
if (!denoiser) {
return 0.0f;
}
if (percent <= 0.0f) {
return std::numeric_limits<float>::max();
}
if (percent >= 1.0f) {
return 0.0f;
}
float t = (1.0f - percent) * (TIMESTEPS - 1);
return denoiser->t_to_sigma(t);
}
void begin_step(int step_index, float sigma) {
if (!enabled()) {
return;
}
if (step_index == current_step_index) {
return;
}
current_step_index = step_index;
skip_current_step = false;
has_last_input_change = false;
step_active = false;
if (sigma > start_sigma) {
return;
}
if (!(sigma > end_sigma)) {
return;
}
step_active = true;
total_active_steps++;
}
bool step_is_active() const {
return enabled() && step_active;
}
bool is_step_skipped() const {
return enabled() && step_active && skip_current_step;
}
float get_adaptive_threshold(int estimated_total_steps = 0) const {
float base_threshold = config.reuse_threshold;
if (!config.adaptive_threshold) {
return base_threshold;
}
int effective_total = estimated_total_steps;
if (effective_total <= 0) {
effective_total = expected_total_steps;
}
if (effective_total <= 0) {
effective_total = std::max(20, steps_computed_since_active * 2);
}
float progress = (effective_total > 0) ? (static_cast<float>(steps_computed_since_active) / effective_total) : 0.0f;
progress = std::max(0.0f, std::min(1.0f, progress));
float multiplier = 1.0f;
if (progress < 0.2f) {
multiplier = config.early_step_multiplier;
} else if (progress > 0.8f) {
multiplier = config.late_step_multiplier;
}
return base_threshold * multiplier;
}
bool has_cache(const SDCondition* cond) const {
auto it = cache_diffs.find(cond);
return it != cache_diffs.end() && !it->second.diff.empty();
}
void update_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
UCacheCacheEntry& entry = cache_diffs[cond];
size_t ne = static_cast<size_t>(ggml_nelements(output));
entry.diff.resize(ne);
float* out_data = (float*)output->data;
float* in_data = (float*)input->data;
for (size_t i = 0; i < ne; ++i) {
entry.diff[i] = out_data[i] - in_data[i];
}
}
void apply_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
auto it = cache_diffs.find(cond);
if (it == cache_diffs.end() || it->second.diff.empty()) {
return;
}
copy_ggml_tensor(output, input);
float* out_data = (float*)output->data;
const std::vector<float>& diff = it->second.diff;
for (size_t i = 0; i < diff.size(); ++i) {
out_data[i] += diff[i];
}
}
bool before_condition(const SDCondition* cond,
ggml_tensor* input,
ggml_tensor* output,
float sigma,
int step_index) {
if (!enabled() || step_index < 0) {
return false;
}
if (step_index != current_step_index) {
begin_step(step_index, sigma);
}
if (!step_active) {
return false;
}
if (initial_step) {
anchor_condition = cond;
initial_step = false;
}
bool is_anchor = (cond == anchor_condition);
if (skip_current_step) {
if (has_cache(cond)) {
apply_cache(cond, input, output);
return true;
}
return false;
}
if (!is_anchor) {
return false;
}
if (!has_prev_input || !has_prev_output || !has_cache(cond)) {
return false;
}
size_t ne = static_cast<size_t>(ggml_nelements(input));
if (prev_input.size() != ne) {
return false;
}
float* input_data = (float*)input->data;
last_input_change = 0.0f;
for (size_t i = 0; i < ne; ++i) {
last_input_change += std::fabs(input_data[i] - prev_input[i]);
}
if (ne > 0) {
last_input_change /= static_cast<float>(ne);
}
has_last_input_change = true;
if (has_output_prev_norm && has_relative_transformation_rate &&
last_input_change > 0.0f && output_prev_norm > 0.0f) {
float approx_output_change = relative_transformation_rate * last_input_change;
float approx_output_change_rate;
if (config.use_relative_threshold) {
float base_scale = std::max(output_prev_norm, 1e-6f);
float dyn_scale = has_output_change_ema
? std::max(output_change_ema * std::max(1.0f, config.relative_norm_gain), 1e-6f)
: base_scale;
float scale = std::sqrt(base_scale * dyn_scale);
approx_output_change_rate = approx_output_change / scale;
} else {
approx_output_change_rate = approx_output_change;
}
// Increase estimated error with skip horizon to avoid long extrapolation streaks
approx_output_change_rate *= (1.0f + 0.50f * consecutive_skipped_steps);
accumulated_error = accumulated_error * config.error_decay_rate + approx_output_change_rate;
float effective_threshold = get_adaptive_threshold();
if (!config.use_relative_threshold && output_prev_norm > 0.0f) {
effective_threshold = effective_threshold * output_prev_norm;
}
if (accumulated_error < effective_threshold) {
skip_current_step = true;
total_steps_skipped++;
consecutive_skipped_steps++;
apply_cache(cond, input, output);
return true;
} else if (config.reset_error_on_compute) {
accumulated_error = 0.0f;
}
}
return false;
}
void after_condition(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
if (!step_is_active()) {
return;
}
update_cache(cond, input, output);
if (cond != anchor_condition) {
return;
}
steps_computed_since_active++;
consecutive_skipped_steps = 0;
size_t ne = static_cast<size_t>(ggml_nelements(input));
float* in_data = (float*)input->data;
prev_input.resize(ne);
for (size_t i = 0; i < ne; ++i) {
prev_input[i] = in_data[i];
}
has_prev_input = true;
float* out_data = (float*)output->data;
float output_change = 0.0f;
if (has_prev_output && prev_output.size() == ne) {
for (size_t i = 0; i < ne; ++i) {
output_change += std::fabs(out_data[i] - prev_output[i]);
}
if (ne > 0) {
output_change /= static_cast<float>(ne);
}
}
if (std::isfinite(output_change) && output_change > 0.0f) {
if (!has_output_change_ema) {
output_change_ema = output_change;
has_output_change_ema = true;
} else {
output_change_ema = 0.8f * output_change_ema + 0.2f * output_change;
}
}
prev_output.resize(ne);
for (size_t i = 0; i < ne; ++i) {
prev_output[i] = out_data[i];
}
has_prev_output = true;
float mean_abs = 0.0f;
for (size_t i = 0; i < ne; ++i) {
mean_abs += std::fabs(out_data[i]);
}
output_prev_norm = (ne > 0) ? (mean_abs / static_cast<float>(ne)) : 0.0f;
has_output_prev_norm = output_prev_norm > 0.0f;
if (has_last_input_change && last_input_change > 0.0f && output_change > 0.0f) {
float rate = output_change / last_input_change;
if (std::isfinite(rate)) {
relative_transformation_rate = rate;
has_relative_transformation_rate = true;
block_metrics.record(rate, output_prev_norm);
}
}
has_last_input_change = false;
}
void log_block_metrics() const {
if (block_metrics.sample_count > 0) {
LOG_INFO("UCacheBlockMetrics: samples=%d, avg_rate=%.4f, min=%.4f, max=%.4f, avg_norm=%.4f",
block_metrics.sample_count,
block_metrics.avg_transformation_rate(),
block_metrics.min_change_rate,
block_metrics.max_change_rate,
block_metrics.avg_output_norm());
}
}
};
#endif // __UCACHE_HPP__

View File

@ -1,8 +1,7 @@
#ifndef __UNET_HPP__ #ifndef __UNET_HPP__
#define __UNET_HPP__ #define __UNET_HPP__
#include "common.hpp" #include "common_block.hpp"
#include "ggml_extend.hpp"
#include "model.h" #include "model.h"
/*==================================================== UnetModel =====================================================*/ /*==================================================== UnetModel =====================================================*/
@ -12,7 +11,7 @@
class SpatialVideoTransformer : public SpatialTransformer { class SpatialVideoTransformer : public SpatialTransformer {
protected: protected:
int64_t time_depth; int64_t time_depth;
int64_t max_time_embed_period; int max_time_embed_period;
public: public:
SpatialVideoTransformer(int64_t in_channels, SpatialVideoTransformer(int64_t in_channels,
@ -21,8 +20,8 @@ public:
int64_t depth, int64_t depth,
int64_t context_dim, int64_t context_dim,
bool use_linear, bool use_linear,
int64_t time_depth = 1, int64_t time_depth = 1,
int64_t max_time_embed_period = 10000) int max_time_embed_period = 10000)
: SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear), : SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear),
max_time_embed_period(max_time_embed_period) { max_time_embed_period(max_time_embed_period) {
// We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False // We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
@ -61,10 +60,10 @@ public:
blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender()); blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* context, ggml_tensor* context,
int timesteps) { int timesteps) {
// x: [N, in_channels, h, w] aka [b*t, in_channels, h, w], t == timesteps // x: [N, in_channels, h, w] aka [b*t, in_channels, h, w], t == timesteps
// context: [N, max_position(aka n_context), hidden_size(aka context_dim)] aka [b*t, n_context, context_dim], t == timesteps // context: [N, max_position(aka n_context), hidden_size(aka context_dim)] aka [b*t, n_context, context_dim], t == timesteps
// t_emb: [N, in_channels] aka [b*t, in_channels] // t_emb: [N, in_channels] aka [b*t, in_channels]
@ -112,9 +111,9 @@ public:
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim] x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim] x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
auto num_frames = ggml_arange(ctx->ggml_ctx, 0, timesteps, 1); auto num_frames = ggml_arange(ctx->ggml_ctx, 0.f, static_cast<float>(timesteps), 1.f);
// since b is 1, no need to do repeat // since b is 1, no need to do repeat
auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, num_frames, in_channels, max_time_embed_period); // [N, in_channels] auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, num_frames, static_cast<int>(in_channels), max_time_embed_period); // [N, in_channels]
auto emb = time_pos_embed_0->forward(ctx, t_emb); auto emb = time_pos_embed_0->forward(ctx, t_emb);
emb = ggml_silu_inplace(ctx->ggml_ctx, emb); emb = ggml_silu_inplace(ctx->ggml_ctx, emb);
@ -201,6 +200,9 @@ public:
num_head_channels = 64; num_head_channels = 64;
num_heads = -1; num_heads = -1;
use_linear_projection = true; use_linear_projection = true;
if (version == VERSION_SDXL_VEGA) {
transformer_depth = {1, 1, 2};
}
} else if (version == VERSION_SVD) { } else if (version == VERSION_SVD) {
in_channels = 8; in_channels = 8;
out_channels = 4; out_channels = 4;
@ -215,10 +217,13 @@ public:
} else if (sd_version_is_unet_edit(version)) { } else if (sd_version_is_unet_edit(version)) {
in_channels = 8; in_channels = 8;
} }
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET) { if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) {
num_res_blocks = 1; num_res_blocks = 1;
channel_mult = {1, 2, 4}; channel_mult = {1, 2, 4};
tiny_unet = true; tiny_unet = true;
if (version == VERSION_SDXS) {
attention_resolutions = {4, 2}; // here just like SDXL
}
} }
// dims is always 2 // dims is always 2
@ -316,7 +321,7 @@ public:
} }
if (!tiny_unet) { if (!tiny_unet) {
blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch)); blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
if (version != VERSION_SDXL_SSD1B) { if (version != VERSION_SDXL_SSD1B && version != VERSION_SDXL_VEGA) {
blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
n_head, n_head,
d_head, d_head,
@ -383,11 +388,11 @@ public:
blocks["out.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); blocks["out.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
} }
struct ggml_tensor* resblock_forward(std::string name, ggml_tensor* resblock_forward(std::string name,
GGMLRunnerContext* ctx, GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* emb, ggml_tensor* emb,
int num_video_frames) { int num_video_frames) {
if (version == VERSION_SVD) { if (version == VERSION_SVD) {
auto block = std::dynamic_pointer_cast<VideoResBlock>(blocks[name]); auto block = std::dynamic_pointer_cast<VideoResBlock>(blocks[name]);
@ -399,11 +404,11 @@ public:
} }
} }
struct ggml_tensor* attention_layer_forward(std::string name, ggml_tensor* attention_layer_forward(std::string name,
GGMLRunnerContext* ctx, GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* context, ggml_tensor* context,
int timesteps) { int timesteps) {
if (version == VERSION_SVD) { if (version == VERSION_SVD) {
auto block = std::dynamic_pointer_cast<SpatialVideoTransformer>(blocks[name]); auto block = std::dynamic_pointer_cast<SpatialVideoTransformer>(blocks[name]);
@ -415,15 +420,15 @@ public:
} }
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* timesteps, ggml_tensor* timesteps,
struct ggml_tensor* context, ggml_tensor* context,
struct ggml_tensor* c_concat = nullptr, ggml_tensor* c_concat = nullptr,
struct ggml_tensor* y = nullptr, ggml_tensor* y = nullptr,
int num_video_frames = -1, int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {}, std::vector<ggml_tensor*> controls = {},
float control_strength = 0.f) { float control_strength = 0.f) {
// x: [N, in_channels, h, w] or [N, in_channels/2, h, w] // x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
// timesteps: [N,] // timesteps: [N,]
// context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768] // context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
@ -475,7 +480,7 @@ public:
} }
// input_blocks // input_blocks
std::vector<struct ggml_tensor*> hs; std::vector<ggml_tensor*> hs;
// input block 0 // input block 0
auto h = input_blocks_0_0->forward(ctx, x); auto h = input_blocks_0_0->forward(ctx, x);
@ -517,16 +522,16 @@ public:
// middle_block // middle_block
if (!tiny_unet) { if (!tiny_unet) {
h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames); // [N, 4*model_channels, h/8, w/8] h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames); // [N, 4*model_channels, h/8, w/8]
if (version != VERSION_SDXL_SSD1B) { if (version != VERSION_SDXL_SSD1B && version != VERSION_SDXL_VEGA) {
h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames); // [N, 4*model_channels, h/8, w/8] h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames); // [N, 4*model_channels, h/8, w/8]
h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames); // [N, 4*model_channels, h/8, w/8] h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames); // [N, 4*model_channels, h/8, w/8]
} }
} }
if (controls.size() > 0) { if (controls.size() > 0) {
auto cs = ggml_scale_inplace(ctx->ggml_ctx, controls[controls.size() - 1], control_strength); auto cs = ggml_ext_scale(ctx->ggml_ctx, controls[controls.size() - 1], control_strength, true);
h = ggml_add(ctx->ggml_ctx, h, cs); // middle control h = ggml_add(ctx->ggml_ctx, h, cs); // middle control
} }
int control_offset = controls.size() - 2; int control_offset = static_cast<int>(controls.size() - 2);
// output_blocks // output_blocks
int output_block_idx = 0; int output_block_idx = 0;
@ -536,7 +541,7 @@ public:
hs.pop_back(); hs.pop_back();
if (controls.size() > 0) { if (controls.size() > 0) {
auto cs = ggml_scale_inplace(ctx->ggml_ctx, controls[control_offset], control_strength); auto cs = ggml_ext_scale(ctx->ggml_ctx, controls[control_offset], control_strength, true);
h_skip = ggml_add(ctx->ggml_ctx, h_skip, cs); // control net condition h_skip = ggml_add(ctx->ggml_ctx, h_skip, cs); // control net condition
control_offset--; control_offset--;
} }
@ -600,22 +605,22 @@ struct UNetModelRunner : public GGMLRunner {
return "unet"; return "unet";
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
unet.get_param_tensors(tensors, prefix); unet.get_param_tensors(tensors, prefix);
} }
struct ggml_cgraph* build_graph(struct ggml_tensor* x, ggml_cgraph* build_graph(ggml_tensor* x,
struct ggml_tensor* timesteps, ggml_tensor* timesteps,
struct ggml_tensor* context, ggml_tensor* context,
struct ggml_tensor* c_concat = nullptr, ggml_tensor* c_concat = nullptr,
struct ggml_tensor* y = nullptr, ggml_tensor* y = nullptr,
int num_video_frames = -1, int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {}, std::vector<ggml_tensor*> controls = {},
float control_strength = 0.f) { float control_strength = 0.f) {
struct ggml_cgraph* gf = new_graph_custom(UNET_GRAPH_SIZE); ggml_cgraph* gf = new_graph_custom(UNET_GRAPH_SIZE);
if (num_video_frames == -1) { if (num_video_frames == -1) {
num_video_frames = x->ne[3]; num_video_frames = static_cast<int>(x->ne[3]);
} }
x = to_backend(x); x = to_backend(x);
@ -630,15 +635,15 @@ struct UNetModelRunner : public GGMLRunner {
auto runner_ctx = get_context(); auto runner_ctx = get_context();
struct ggml_tensor* out = unet.forward(&runner_ctx, ggml_tensor* out = unet.forward(&runner_ctx,
x, x,
timesteps, timesteps,
context, context,
c_concat, c_concat,
y, y,
num_video_frames, num_video_frames,
controls, controls,
control_strength); control_strength);
ggml_build_forward_expand(gf, out); ggml_build_forward_expand(gf, out);
@ -646,22 +651,22 @@ struct UNetModelRunner : public GGMLRunner {
} }
bool compute(int n_threads, bool compute(int n_threads,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* timesteps, ggml_tensor* timesteps,
struct ggml_tensor* context, ggml_tensor* context,
struct ggml_tensor* c_concat, ggml_tensor* c_concat,
struct ggml_tensor* y, ggml_tensor* y,
int num_video_frames = -1, int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {}, std::vector<ggml_tensor*> controls = {},
float control_strength = 0.f, float control_strength = 0.f,
struct ggml_tensor** output = nullptr, ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) { ggml_context* output_ctx = nullptr) {
// x: [N, in_channels, h, w] // x: [N, in_channels, h, w]
// timesteps: [N, ] // timesteps: [N, ]
// context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size] // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
// c_concat: [N, in_channels, h, w] or [1, in_channels, h, w] // c_concat: [N, in_channels, h, w] or [1, in_channels, h, w]
// y: [N, adm_in_channels] or [1, adm_in_channels] // y: [N, adm_in_channels] or [1, adm_in_channels]
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength); return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength);
}; };
@ -669,12 +674,12 @@ struct UNetModelRunner : public GGMLRunner {
} }
void test() { void test() {
struct ggml_init_params params; ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
params.mem_buffer = nullptr; params.mem_buffer = nullptr;
params.no_alloc = false; params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params); ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != nullptr); GGML_ASSERT(work_ctx != nullptr);
{ {
@ -698,14 +703,14 @@ struct UNetModelRunner : public GGMLRunner {
ggml_set_f32(y, 0.5f); ggml_set_f32(y, 0.5f);
// print_ggml_tensor(y); // print_ggml_tensor(y);
struct ggml_tensor* out = nullptr; ggml_tensor* out = nullptr;
int t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
compute(8, x, timesteps, context, nullptr, y, num_video_frames, {}, 0.f, &out, work_ctx); compute(8, x, timesteps, context, nullptr, y, num_video_frames, {}, 0.f, &out, work_ctx);
int t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
print_ggml_tensor(out); print_ggml_tensor(out);
LOG_DEBUG("unet test done in %dms", t1 - t0); LOG_DEBUG("unet test done in %lldms", t1 - t0);
} }
} }
}; };

View File

@ -72,13 +72,13 @@ struct UpscalerGGML {
LOG_INFO("upscaling from (%i x %i) to (%i x %i)", LOG_INFO("upscaling from (%i x %i) to (%i x %i)",
input_image.width, input_image.height, output_width, output_height); input_image.width, input_image.height, output_width, output_height);
struct ggml_init_params params; ggml_init_params params;
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1G params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1G
params.mem_buffer = nullptr; params.mem_buffer = nullptr;
params.no_alloc = false; params.no_alloc = false;
// draft context // draft context
struct ggml_context* upscale_ctx = ggml_init(params); ggml_context* upscale_ctx = ggml_init(params);
if (!upscale_ctx) { if (!upscale_ctx) {
LOG_ERROR("ggml_init() failed"); LOG_ERROR("ggml_init() failed");
return upscaled_image; return upscaled_image;
@ -89,10 +89,11 @@ struct UpscalerGGML {
ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1); ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1);
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
esrgan_upscaler->compute(n_threads, in, &out); return esrgan_upscaler->compute(n_threads, in, &out);
}; };
int64_t t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, on_tiling); // TODO: circular upscaling?
sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, false, false, on_tiling);
esrgan_upscaler->free_compute_buffer(); esrgan_upscaler->free_compute_buffer();
ggml_ext_tensor_clamp_inplace(upscaled, 0.f, 1.f); ggml_ext_tensor_clamp_inplace(upscaled, 0.f, 1.f);
uint8_t* upscaled_data = ggml_tensor_to_sd_image(upscaled); uint8_t* upscaled_data = ggml_tensor_to_sd_image(upscaled);

View File

@ -95,9 +95,71 @@ bool is_directory(const std::string& path) {
return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY)); return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
} }
class MmapWrapperImpl : public MmapWrapper {
public:
MmapWrapperImpl(void* data, size_t size, HANDLE hfile, HANDLE hmapping)
: MmapWrapper(data, size), hfile_(hfile), hmapping_(hmapping) {}
~MmapWrapperImpl() override {
UnmapViewOfFile(data_);
CloseHandle(hmapping_);
CloseHandle(hfile_);
}
private:
HANDLE hfile_;
HANDLE hmapping_;
};
std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
void* mapped_data = nullptr;
size_t file_size = 0;
HANDLE file_handle = CreateFileA(
filename.c_str(),
GENERIC_READ,
FILE_SHARE_READ,
NULL,
OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL,
NULL);
if (file_handle == INVALID_HANDLE_VALUE) {
return nullptr;
}
LARGE_INTEGER size;
if (!GetFileSizeEx(file_handle, &size)) {
CloseHandle(file_handle);
return nullptr;
}
file_size = static_cast<size_t>(size.QuadPart);
HANDLE mapping_handle = CreateFileMapping(file_handle, NULL, PAGE_READONLY, 0, 0, NULL);
if (mapping_handle == NULL) {
CloseHandle(file_handle);
return nullptr;
}
mapped_data = MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, file_size);
if (mapped_data == NULL) {
CloseHandle(mapping_handle);
CloseHandle(file_handle);
return nullptr;
}
return std::make_unique<MmapWrapperImpl>(mapped_data, file_size, file_handle, mapping_handle);
}
#else // Unix #else // Unix
#include <dirent.h> #include <dirent.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <unistd.h>
bool file_exists(const std::string& filename) { bool file_exists(const std::string& filename) {
struct stat buffer; struct stat buffer;
@ -109,8 +171,64 @@ bool is_directory(const std::string& path) {
return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode)); return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
} }
class MmapWrapperImpl : public MmapWrapper {
public:
MmapWrapperImpl(void* data, size_t size)
: MmapWrapper(data, size) {}
~MmapWrapperImpl() override {
munmap(data_, size_);
}
};
std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
int file_descriptor = open(filename.c_str(), O_RDONLY);
if (file_descriptor == -1) {
return nullptr;
}
int mmap_flags = MAP_PRIVATE;
#ifdef __linux__
// performance flags used by llama.cpp
// posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL);
// mmap_flags |= MAP_POPULATE;
#endif #endif
struct stat sb;
if (fstat(file_descriptor, &sb) == -1) {
close(file_descriptor);
return nullptr;
}
size_t file_size = sb.st_size;
void* mapped_data = mmap(NULL, file_size, PROT_READ, mmap_flags, file_descriptor, 0);
close(file_descriptor);
if (mapped_data == MAP_FAILED) {
return nullptr;
}
#ifdef __linux__
// performance flags used by llama.cpp
// posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED);
#endif
return std::make_unique<MmapWrapperImpl>(mapped_data, file_size);
}
#endif
bool MmapWrapper::copy_data(void* buf, size_t n, size_t offset) const {
if (offset >= size_ || n > (size_ - offset)) {
return false;
}
std::memcpy(buf, data() + offset, n);
return true;
}
// get_num_physical_cores is copy from // get_num_physical_cores is copy from
// https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp // https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
// LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE // LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
@ -370,7 +488,7 @@ sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image) {
// Allocate memory for float data // Allocate memory for float data
converted_image.data = (float*)malloc(image.width * image.height * image.channel * sizeof(float)); converted_image.data = (float*)malloc(image.width * image.height * image.channel * sizeof(float));
for (int i = 0; i < image.width * image.height * image.channel; i++) { for (uint32_t i = 0; i < image.width * image.height * image.channel; i++) {
// Convert uint8_t to float // Convert uint8_t to float
converted_image.data[i] = (float)image.data[i]; converted_image.data[i] = (float)image.data[i];
} }
@ -402,7 +520,7 @@ sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int
uint32_t x2 = std::min(x1 + 1, image.width - 1); uint32_t x2 = std::min(x1 + 1, image.width - 1);
uint32_t y2 = std::min(y1 + 1, image.height - 1); uint32_t y2 = std::min(y1 + 1, image.height - 1);
for (int k = 0; k < image.channel; k++) { for (uint32_t k = 0; k < image.channel; k++) {
float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k); float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k);
float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k); float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k);
float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k); float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k);
@ -422,9 +540,9 @@ sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int
} }
void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]) { void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]) {
for (int y = 0; y < image.height; y++) { for (uint32_t y = 0; y < image.height; y++) {
for (int x = 0; x < image.width; x++) { for (uint32_t x = 0; x < image.width; x++) {
for (int k = 0; k < image.channel; k++) { for (uint32_t k = 0; k < image.channel; k++) {
int index = (y * image.width + x) * image.channel + k; int index = (y * image.width + x) * image.channel + k;
image.data[index] = (image.data[index] - means[k]) / stds[k]; image.data[index] = (image.data[index] - means[k]) / stds[k];
} }
@ -433,8 +551,8 @@ void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3
} }
// Constants for means and std // Constants for means and std
float means[3] = {0.48145466, 0.4578275, 0.40821073}; float means[3] = {0.48145466f, 0.4578275f, 0.40821073f};
float stds[3] = {0.26862954, 0.26130258, 0.27577711}; float stds[3] = {0.26862954f, 0.26130258f, 0.27577711f};
// Function to clip and preprocess sd_image_f32_t // Function to clip and preprocess sd_image_f32_t
sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height) { sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height) {
@ -458,7 +576,7 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int targe
uint32_t x2 = std::min(x1 + 1, image.width - 1); uint32_t x2 = std::min(x1 + 1, image.width - 1);
uint32_t y2 = std::min(y1 + 1, image.height - 1); uint32_t y2 = std::min(y1 + 1, image.height - 1);
for (int k = 0; k < image.channel; k++) { for (uint32_t k = 0; k < image.channel; k++) {
float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k); float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k);
float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k); float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k);
float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k); float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k);
@ -484,11 +602,11 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int targe
result.channel = image.channel; result.channel = image.channel;
result.data = (float*)malloc(target_height * target_width * image.channel * sizeof(float)); result.data = (float*)malloc(target_height * target_width * image.channel * sizeof(float));
for (int k = 0; k < image.channel; k++) { for (uint32_t k = 0; k < image.channel; k++) {
for (int i = 0; i < result.height; i++) { for (uint32_t i = 0; i < result.height; i++) {
for (int j = 0; j < result.width; j++) { for (uint32_t j = 0; j < result.width; j++) {
int src_y = std::min(i + h_offset, resized_height - 1); int src_y = std::min(static_cast<int>(i + h_offset), resized_height - 1);
int src_x = std::min(j + w_offset, resized_width - 1); int src_x = std::min(static_cast<int>(j + w_offset), resized_width - 1);
*(result.data + i * result.width * image.channel + j * image.channel + k) = *(result.data + i * result.width * image.channel + j * image.channel + k) =
fmin(fmax(*(resized_data + src_y * resized_width * image.channel + src_x * image.channel + k), 0.0f), 255.0f) / 255.0f; fmin(fmax(*(resized_data + src_y * resized_width * image.channel + src_x * image.channel + k), 0.0f), 255.0f) / 255.0f;
} }
@ -499,9 +617,9 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int targe
free(resized_data); free(resized_data);
// Normalize // Normalize
for (int k = 0; k < image.channel; k++) { for (uint32_t k = 0; k < image.channel; k++) {
for (int i = 0; i < result.height; i++) { for (uint32_t i = 0; i < result.height; i++) {
for (int j = 0; j < result.width; j++) { for (uint32_t j = 0; j < result.width; j++) {
// *(result.data + i * size * image.channel + j * image.channel + k) = 0.5f; // *(result.data + i * size * image.channel + j * image.channel + k) = 0.5f;
int offset = i * result.width * image.channel + j * image.channel + k; int offset = i * result.width * image.channel + j * image.channel + k;
float value = *(result.data + offset); float value = *(result.data + offset);

View File

@ -2,6 +2,7 @@
#define __UTIL_H__ #define __UTIL_H__
#include <cstdint> #include <cstdint>
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
@ -43,6 +44,28 @@ sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int
sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height); sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height);
class MmapWrapper {
public:
static std::unique_ptr<MmapWrapper> create(const std::string& filename);
virtual ~MmapWrapper() = default;
MmapWrapper(const MmapWrapper&) = delete;
MmapWrapper& operator=(const MmapWrapper&) = delete;
MmapWrapper(MmapWrapper&&) = delete;
MmapWrapper& operator=(MmapWrapper&&) = delete;
const uint8_t* data() const { return static_cast<uint8_t*>(data_); }
size_t size() const { return size_; }
bool copy_data(void* buf, size_t n, size_t offset) const;
protected:
MmapWrapper(void* data, size_t size)
: data_(data), size_(size) {}
void* data_ = nullptr;
size_t size_ = 0;
};
std::string path_join(const std::string& p1, const std::string& p2); std::string path_join(const std::string& p1, const std::string& p2);
std::vector<std::string> split_string(const std::string& str, char delimiter); std::vector<std::string> split_string(const std::string& str, char delimiter);
void pretty_progress(int step, int steps, float time); void pretty_progress(int step, int steps, float time);

235
src/vae.hpp Normal file
View File

@ -0,0 +1,235 @@
#ifndef __VAE_HPP__
#define __VAE_HPP__
#include "common_block.hpp"
struct VAE : public GGMLRunner {
protected:
SDVersion version;
bool scale_input = true;
virtual bool _compute(const int n_threads,
ggml_tensor* z,
bool decode_graph,
ggml_tensor** output,
ggml_context* output_ctx) = 0;
public:
VAE(SDVersion version, ggml_backend_t backend, bool offload_params_to_cpu)
: version(version), GGMLRunner(backend, offload_params_to_cpu) {}
int get_scale_factor() {
int scale_factor = 8;
if (version == VERSION_WAN2_2_TI2V) {
scale_factor = 16;
} else if (sd_version_is_flux2(version)) {
scale_factor = 16;
} else if (version == VERSION_CHROMA_RADIANCE) {
scale_factor = 1;
}
return scale_factor;
}
virtual int get_encoder_output_channels(int input_channels) = 0;
void get_tile_sizes(int& tile_size_x,
int& tile_size_y,
float& tile_overlap,
const sd_tiling_params_t& params,
int64_t latent_x,
int64_t latent_y,
float encoding_factor = 1.0f) {
tile_overlap = std::max(std::min(params.target_overlap, 0.5f), 0.0f);
auto get_tile_size = [&](int requested_size, float factor, int64_t latent_size) {
const int default_tile_size = 32;
const int min_tile_dimension = 4;
int tile_size = default_tile_size;
// factor <= 1 means simple fraction of the latent dimension
// factor > 1 means number of tiles across that dimension
if (factor > 0.f) {
if (factor > 1.0)
factor = 1 / (factor - factor * tile_overlap + tile_overlap);
tile_size = static_cast<int>(std::round(latent_size * factor));
} else if (requested_size >= min_tile_dimension) {
tile_size = requested_size;
}
tile_size = static_cast<int>(tile_size * encoding_factor);
return std::max(std::min(tile_size, static_cast<int>(latent_size)), min_tile_dimension);
};
tile_size_x = get_tile_size(params.tile_size_x, params.rel_size_x, latent_x);
tile_size_y = get_tile_size(params.tile_size_y, params.rel_size_y, latent_y);
}
ggml_tensor* encode(int n_threads,
ggml_context* work_ctx,
ggml_tensor* x,
sd_tiling_params_t tiling_params,
bool circular_x = false,
bool circular_y = false) {
int64_t t0 = ggml_time_ms();
ggml_tensor* result = nullptr;
const int scale_factor = get_scale_factor();
int64_t W = x->ne[0] / scale_factor;
int64_t H = x->ne[1] / scale_factor;
int channel_dim = sd_version_is_wan(version) ? 3 : 2;
int64_t C = get_encoder_output_channels(static_cast<int>(x->ne[channel_dim]));
int64_t ne2;
int64_t ne3;
if (sd_version_is_wan(version)) {
int64_t T = x->ne[2];
ne2 = (T - 1) / 4 + 1;
ne3 = C;
} else {
ne2 = C;
ne3 = x->ne[3];
}
result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3);
if (scale_input) {
scale_to_minus1_1(x);
}
if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]);
}
if (tiling_params.enabled) {
float tile_overlap;
int tile_size_x, tile_size_y;
// multiply tile size for encode to keep the compute buffer size consistent
get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, tiling_params, W, H, 1.30539f);
LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
return _compute(n_threads, in, false, &out, work_ctx);
};
sd_tiling_non_square(x, result, scale_factor, tile_size_x, tile_size_y, tile_overlap, circular_x, circular_y, on_tiling);
} else {
_compute(n_threads, x, false, &result, work_ctx);
}
free_compute_buffer();
int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing vae encode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
return result;
}
ggml_tensor* decode(int n_threads,
ggml_context* work_ctx,
ggml_tensor* x,
sd_tiling_params_t tiling_params,
bool decode_video = false,
bool circular_x = false,
bool circular_y = false,
ggml_tensor* result = nullptr,
bool silent = false) {
const int scale_factor = get_scale_factor();
int64_t W = x->ne[0] * scale_factor;
int64_t H = x->ne[1] * scale_factor;
int64_t C = 3;
if (result == nullptr) {
if (decode_video) {
int64_t T = x->ne[2];
if (sd_version_is_wan(version)) {
T = ((T - 1) * 4) + 1;
}
result = ggml_new_tensor_4d(work_ctx,
GGML_TYPE_F32,
W,
H,
T,
3);
} else {
result = ggml_new_tensor_4d(work_ctx,
GGML_TYPE_F32,
W,
H,
C,
x->ne[3]);
}
}
int64_t t0 = ggml_time_ms();
if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]);
}
if (tiling_params.enabled) {
float tile_overlap;
int tile_size_x, tile_size_y;
get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, tiling_params, x->ne[0], x->ne[1]);
if (!silent) {
LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);
}
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
return _compute(n_threads, in, true, &out, nullptr);
};
sd_tiling_non_square(x, result, scale_factor, tile_size_x, tile_size_y, tile_overlap, circular_x, circular_y, on_tiling, silent);
} else {
if (!_compute(n_threads, x, true, &result, work_ctx)) {
LOG_ERROR("Failed to decode latetnts");
free_compute_buffer();
return nullptr;
}
}
free_compute_buffer();
if (scale_input) {
scale_to_0_1(result);
}
int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing vae decode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f);
return result;
}
virtual ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr<RNG> rng) = 0;
virtual ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) = 0;
virtual ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) = 0;
virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) = 0;
virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
};
struct FakeVAE : public VAE {
FakeVAE(SDVersion version, ggml_backend_t backend, bool offload_params_to_cpu)
: VAE(version, backend, offload_params_to_cpu) {}
int get_encoder_output_channels(int input_channels) {
return input_channels;
}
bool _compute(const int n_threads,
ggml_tensor* z,
bool decode_graph,
ggml_tensor** output,
ggml_context* output_ctx) override {
if (*output == nullptr && output_ctx != nullptr) {
*output = ggml_dup_tensor(output_ctx, z);
}
ggml_ext_tensor_iter(z, [&](ggml_tensor* z, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
float value = ggml_ext_tensor_get_f32(z, i0, i1, i2, i3);
ggml_ext_tensor_set_f32(*output, value, i0, i1, i2, i3);
});
return true;
}
ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr<RNG> rng) {
return vae_output;
}
ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) {
return ggml_ext_dup_and_cpy_tensor(work_ctx, latents);
}
ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) {
return ggml_ext_dup_and_cpy_tensor(work_ctx, latents);
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) override {}
std::string get_desc() override {
return "fake_vae";
}
};
#endif // __VAE_HPP__

View File

@ -1,4 +1,4 @@
static unsigned char merges_utf8_c_str[] = { static const unsigned char clip_merges_utf8_c_str[] = {
0x23, 0x23,
0x76, 0x76,
0x65, 0x65,
@ -524620,7 +524620,7 @@ static unsigned char merges_utf8_c_str[] = {
0x0a, 0x0a,
}; };
static unsigned char t5_tokenizer_json_str[] = { static const unsigned char t5_tokenizer_json_str[] = {
0x7b, 0x7b,
0x0a, 0x0a,
0x20, 0x20,

View File

@ -1,4 +1,4 @@
unsigned char mistral_merges_utf8_c_str[] = { static const unsigned char mistral_merges_utf8_c_str[] = {
0xc4, 0xa0, 0x20, 0xc4, 0xa0, 0x0a, 0xc4, 0xa0, 0x20, 0x74, 0x0a, 0x65, 0xc4, 0xa0, 0x20, 0xc4, 0xa0, 0x0a, 0xc4, 0xa0, 0x20, 0x74, 0x0a, 0x65,
0x20, 0x72, 0x0a, 0x69, 0x20, 0x6e, 0x0a, 0xc4, 0xa0, 0x20, 0xc4, 0xa0, 0x20, 0x72, 0x0a, 0x69, 0x20, 0x6e, 0x0a, 0xc4, 0xa0, 0x20, 0xc4, 0xa0,
0xc4, 0xa0, 0xc4, 0xa0, 0x0a, 0xc4, 0xa0, 0xc4, 0xa0, 0x20, 0xc4, 0xa0, 0xc4, 0xa0, 0xc4, 0xa0, 0x0a, 0xc4, 0xa0, 0xc4, 0xa0, 0x20, 0xc4, 0xa0,
@ -260614,7 +260614,7 @@ unsigned char mistral_merges_utf8_c_str[] = {
0xc3, 0xa5, 0xc4, 0xb2, 0xc4, 0xb0, 0x20, 0xc3, 0xa6, 0xc2, 0xb1, 0xc4, 0xc3, 0xa5, 0xc4, 0xb2, 0xc4, 0xb0, 0x20, 0xc3, 0xa6, 0xc2, 0xb1, 0xc4,
0xab, 0xc3, 0xa4, 0xc2, 0xb9, 0xc2, 0xa6, 0x0a, 0xab, 0xc3, 0xa4, 0xc2, 0xb9, 0xc2, 0xa6, 0x0a,
}; };
unsigned char mistral_vocab_json_utf8_c_str[] = { static const unsigned char mistral_vocab_json_utf8_c_str[] = {
0x7b, 0x22, 0x3c, 0x75, 0x6e, 0x6b, 0x3e, 0x22, 0x3a, 0x20, 0x30, 0x2c, 0x7b, 0x22, 0x3c, 0x75, 0x6e, 0x6b, 0x3e, 0x22, 0x3a, 0x20, 0x30, 0x2c,
0x20, 0x22, 0x3c, 0x73, 0x3e, 0x22, 0x3a, 0x20, 0x31, 0x2c, 0x20, 0x22, 0x20, 0x22, 0x3c, 0x73, 0x3e, 0x22, 0x3a, 0x20, 0x31, 0x2c, 0x20, 0x22,
0x3c, 0x2f, 0x73, 0x3e, 0x22, 0x3a, 0x20, 0x32, 0x2c, 0x20, 0x22, 0x5b, 0x3c, 0x2f, 0x73, 0x3e, 0x22, 0x3a, 0x20, 0x32, 0x2c, 0x20, 0x22, 0x5b,

View File

@ -1,4 +1,4 @@
unsigned char qwen2_merges_utf8_c_str[] = { static const unsigned char qwen2_merges_utf8_c_str[] = {
0xc4, 0xa0, 0x20, 0xc4, 0xa0, 0x0a, 0xc4, 0xa0, 0xc4, 0xa0, 0x20, 0xc4, 0xc4, 0xa0, 0x20, 0xc4, 0xa0, 0x0a, 0xc4, 0xa0, 0xc4, 0xa0, 0x20, 0xc4,
0xa0, 0xc4, 0xa0, 0x0a, 0x69, 0x20, 0x6e, 0x0a, 0xc4, 0xa0, 0x20, 0x74, 0xa0, 0xc4, 0xa0, 0x0a, 0x69, 0x20, 0x6e, 0x0a, 0xc4, 0xa0, 0x20, 0x74,
0x0a, 0xc4, 0xa0, 0xc4, 0xa0, 0xc4, 0xa0, 0xc4, 0xa0, 0x20, 0xc4, 0xa0, 0x0a, 0xc4, 0xa0, 0xc4, 0xa0, 0xc4, 0xa0, 0xc4, 0xa0, 0x20, 0xc4, 0xa0,

View File

@ -1,4 +1,4 @@
unsigned char umt5_tokenizer_json_str[] = { static const unsigned char umt5_tokenizer_json_str[] = {
0x7b, 0x22, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x22, 0x3a, 0x20, 0x7b, 0x22, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x22, 0x3a, 0x20,
0x22, 0x31, 0x2e, 0x30, 0x22, 0x2c, 0x20, 0x22, 0x74, 0x72, 0x75, 0x6e, 0x22, 0x31, 0x2e, 0x30, 0x22, 0x2c, 0x20, 0x22, 0x74, 0x72, 0x75, 0x6e,
0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3a, 0x20, 0x6e, 0x75, 0x6c,

35
src/vocab/vocab.cpp Normal file
View File

@ -0,0 +1,35 @@
#include "vocab.h"
#include "clip_t5.hpp"
#include "mistral.hpp"
#include "qwen.hpp"
#include "umt5.hpp"
std::string load_clip_merges() {
std::string merges_utf8_str(reinterpret_cast<const char*>(clip_merges_utf8_c_str), sizeof(clip_merges_utf8_c_str));
return merges_utf8_str;
}
std::string load_qwen2_merges() {
std::string merges_utf8_str(reinterpret_cast<const char*>(qwen2_merges_utf8_c_str), sizeof(qwen2_merges_utf8_c_str));
return merges_utf8_str;
}
std::string load_mistral_merges() {
std::string merges_utf8_str(reinterpret_cast<const char*>(mistral_merges_utf8_c_str), sizeof(mistral_merges_utf8_c_str));
return merges_utf8_str;
}
std::string load_mistral_vocab_json() {
std::string json_str(reinterpret_cast<const char*>(mistral_vocab_json_utf8_c_str), sizeof(mistral_vocab_json_utf8_c_str));
return json_str;
}
std::string load_t5_tokenizer_json() {
std::string json_str(reinterpret_cast<const char*>(t5_tokenizer_json_str), sizeof(t5_tokenizer_json_str));
return json_str;
}
std::string load_umt5_tokenizer_json() {
std::string json_str(reinterpret_cast<const char*>(umt5_tokenizer_json_str), sizeof(umt5_tokenizer_json_str));
return json_str;
}

13
src/vocab/vocab.h Normal file
View File

@ -0,0 +1,13 @@
#ifndef __VOCAB_H__
#define __VOCAB_H__
#include <string>
std::string load_clip_merges();
std::string load_qwen2_merges();
std::string load_mistral_merges();
std::string load_mistral_vocab_json();
std::string load_t5_tokenizer_json();
std::string load_umt5_tokenizer_json();
#endif // __VOCAB_H__

File diff suppressed because it is too large Load Diff

View File

@ -42,10 +42,10 @@ namespace ZImage {
} }
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* pe, ggml_tensor* pe,
struct ggml_tensor* mask = nullptr) { ggml_tensor* mask = nullptr) {
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
int64_t n_token = x->ne[1]; int64_t n_token = x->ne[1];
int64_t N = x->ne[2]; int64_t N = x->ne[2];
@ -54,15 +54,37 @@ namespace ZImage {
auto qkv = qkv_proj->forward(ctx, x); // [N, n_token, (num_heads + num_kv_heads*2)*head_dim] auto qkv = qkv_proj->forward(ctx, x); // [N, n_token, (num_heads + num_kv_heads*2)*head_dim]
qkv = ggml_reshape_4d(ctx->ggml_ctx, qkv, head_dim, num_heads + num_kv_heads * 2, qkv->ne[1], qkv->ne[2]); // [N, n_token, num_heads + num_kv_heads*2, head_dim] qkv = ggml_reshape_4d(ctx->ggml_ctx, qkv, head_dim, num_heads + num_kv_heads * 2, qkv->ne[1], qkv->ne[2]); // [N, n_token, num_heads + num_kv_heads*2, head_dim]
qkv = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, qkv, 0, 2, 3, 1)); // [num_heads + num_kv_heads*2, N, n_token, head_dim]
auto q = ggml_view_4d(ctx->ggml_ctx, qkv, qkv->ne[0], qkv->ne[1], qkv->ne[2], num_heads, qkv->nb[1], qkv->nb[2], qkv->nb[3], 0); // [num_heads, N, n_token, head_dim] auto q = ggml_view_4d(ctx->ggml_ctx,
auto k = ggml_view_4d(ctx->ggml_ctx, qkv, qkv->ne[0], qkv->ne[1], qkv->ne[2], num_kv_heads, qkv->nb[1], qkv->nb[2], qkv->nb[3], qkv->nb[3] * num_heads); // [num_kv_heads, N, n_token, head_dim] qkv,
auto v = ggml_view_4d(ctx->ggml_ctx, qkv, qkv->ne[0], qkv->ne[1], qkv->ne[2], num_kv_heads, qkv->nb[1], qkv->nb[2], qkv->nb[3], qkv->nb[3] * (num_heads + num_kv_heads)); // [num_kv_heads, N, n_token, head_dim] qkv->ne[0],
num_heads,
q = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, q, 0, 3, 1, 2)); // [N, n_token, num_heads, head_dim] qkv->ne[2],
k = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, k, 0, 3, 1, 2)); // [N, n_token, num_kv_heads, head_dim] qkv->ne[3],
v = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, v, 0, 3, 1, 2)); // [N, n_token, num_kv_heads, head_dim] qkv->nb[1],
qkv->nb[2],
qkv->nb[3],
0); // [N, n_token, num_heads, head_dim]
auto k = ggml_view_4d(ctx->ggml_ctx,
qkv,
qkv->ne[0],
num_kv_heads,
qkv->ne[2],
qkv->ne[3],
qkv->nb[1],
qkv->nb[2],
qkv->nb[3],
num_heads * qkv->nb[1]); // [N, n_token, num_kv_heads, head_dim]
auto v = ggml_view_4d(ctx->ggml_ctx,
qkv,
qkv->ne[0],
num_kv_heads,
qkv->ne[2],
qkv->ne[3],
qkv->nb[1],
qkv->nb[2],
qkv->nb[3],
(num_heads + num_kv_heads) * qkv->nb[1]); // [N, n_token, num_kv_heads, head_dim]
if (qk_norm) { if (qk_norm) {
auto q_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]); auto q_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
@ -102,23 +124,23 @@ namespace ZImage {
blocks["w3"] = std::make_shared<Linear>(dim, hidden_dim, false); blocks["w3"] = std::make_shared<Linear>(dim, hidden_dim, false);
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto w1 = std::dynamic_pointer_cast<Linear>(blocks["w1"]); auto w1 = std::dynamic_pointer_cast<Linear>(blocks["w1"]);
auto w2 = std::dynamic_pointer_cast<Linear>(blocks["w2"]); auto w2 = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
auto w3 = std::dynamic_pointer_cast<Linear>(blocks["w3"]); auto w3 = std::dynamic_pointer_cast<Linear>(blocks["w3"]);
auto x1 = w1->forward(ctx, x); auto x1 = w1->forward(ctx, x);
auto x3 = w3->forward(ctx, x); auto x3 = w3->forward(ctx, x);
x = ggml_mul(ctx->ggml_ctx, ggml_silu(ctx->ggml_ctx, x1), x3); x = ggml_swiglu_split(ctx->ggml_ctx, x1, x3);
x = w2->forward(ctx, x); x = w2->forward(ctx, x);
return x; return x;
} }
}; };
__STATIC_INLINE__ struct ggml_tensor* modulate(struct ggml_context* ctx, __STATIC_INLINE__ ggml_tensor* modulate(ggml_context* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* scale) { ggml_tensor* scale) {
// x: [N, L, C] // x: [N, L, C]
// scale: [N, C] // scale: [N, C]
scale = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]); // [N, 1, C] scale = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]); // [N, 1, C]
@ -153,11 +175,11 @@ namespace ZImage {
} }
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* pe, ggml_tensor* pe,
struct ggml_tensor* mask = nullptr, ggml_tensor* mask = nullptr,
struct ggml_tensor* adaln_input = nullptr) { ggml_tensor* adaln_input = nullptr) {
auto attention = std::dynamic_pointer_cast<JointAttention>(blocks["attention"]); auto attention = std::dynamic_pointer_cast<JointAttention>(blocks["attention"]);
auto feed_forward = std::dynamic_pointer_cast<FeedForward>(blocks["feed_forward"]); auto feed_forward = std::dynamic_pointer_cast<FeedForward>(blocks["feed_forward"]);
auto attention_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["attention_norm1"]); auto attention_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["attention_norm1"]);
@ -219,9 +241,9 @@ namespace ZImage {
blocks["adaLN_modulation.1"] = std::make_shared<Linear>(MIN(hidden_size, ADALN_EMBED_DIM), hidden_size); blocks["adaLN_modulation.1"] = std::make_shared<Linear>(MIN(hidden_size, ADALN_EMBED_DIM), hidden_size);
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* c) { ggml_tensor* c) {
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
// c: [N, hidden_size] // c: [N, hidden_size]
// return: [N, n_token, patch_size * patch_size * out_channels] // return: [N, n_token, patch_size * patch_size * out_channels]
@ -239,7 +261,7 @@ namespace ZImage {
}; };
struct ZImageParams { struct ZImageParams {
int64_t patch_size = 2; int patch_size = 2;
int64_t hidden_size = 3840; int64_t hidden_size = 3840;
int64_t in_channels = 16; int64_t in_channels = 16;
int64_t out_channels = 16; int64_t out_channels = 16;
@ -249,11 +271,11 @@ namespace ZImage {
int64_t num_heads = 30; int64_t num_heads = 30;
int64_t num_kv_heads = 30; int64_t num_kv_heads = 30;
int64_t multiple_of = 256; int64_t multiple_of = 256;
float ffn_dim_multiplier = 8.0 / 3.0f; float ffn_dim_multiplier = 8.0f / 3.0f;
float norm_eps = 1e-5f; float norm_eps = 1e-5f;
bool qk_norm = true; bool qk_norm = true;
int64_t cap_feat_dim = 2560; int64_t cap_feat_dim = 2560;
float theta = 256.f; int theta = 256;
std::vector<int> axes_dim = {32, 48, 48}; std::vector<int> axes_dim = {32, 48, 48};
int64_t axes_dim_sum = 128; int64_t axes_dim_sum = 128;
}; };
@ -262,7 +284,7 @@ namespace ZImage {
protected: protected:
ZImageParams z_image_params; ZImageParams z_image_params;
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
params["cap_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_image_params.hidden_size); params["cap_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_image_params.hidden_size);
params["x_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_image_params.hidden_size); params["x_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_image_params.hidden_size);
} }
@ -324,74 +346,11 @@ namespace ZImage {
blocks["final_layer"] = std::make_shared<FinalLayer>(z_image_params.hidden_size, z_image_params.patch_size, z_image_params.out_channels); blocks["final_layer"] = std::make_shared<FinalLayer>(z_image_params.hidden_size, z_image_params.patch_size, z_image_params.out_channels);
} }
struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx, ggml_tensor* forward_core(GGMLRunnerContext* ctx,
struct ggml_tensor* x) { ggml_tensor* x,
int64_t W = x->ne[0]; ggml_tensor* timestep,
int64_t H = x->ne[1]; ggml_tensor* context,
ggml_tensor* pe) {
int pad_h = (z_image_params.patch_size - H % z_image_params.patch_size) % z_image_params.patch_size;
int pad_w = (z_image_params.patch_size - W % z_image_params.patch_size) % z_image_params.patch_size;
x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w]
return x;
}
struct ggml_tensor* patchify(struct ggml_context* ctx,
struct ggml_tensor* x) {
// x: [N, C, H, W]
// return: [N, h*w, patch_size*patch_size*C]
int64_t N = x->ne[3];
int64_t C = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
int64_t p = z_image_params.patch_size;
int64_t h = H / z_image_params.patch_size;
int64_t w = W / z_image_params.patch_size;
GGML_ASSERT(h * p == H && w * p == W);
x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N); // [N*C*h, p, w, p]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, w, p, p]
x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N); // [N, C, h*w, p*p]
x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3)); // [N, h*w, C, p*p]
x = ggml_reshape_3d(ctx, x, C * p * p, w * h, N); // [N, h*w, p*p*C]
return x;
}
struct ggml_tensor* process_img(struct ggml_context* ctx,
struct ggml_tensor* x) {
x = pad_to_patch_size(ctx, x);
x = patchify(ctx, x);
return x;
}
struct ggml_tensor* unpatchify(struct ggml_context* ctx,
struct ggml_tensor* x,
int64_t h,
int64_t w) {
// x: [N, h*w, patch_size*patch_size*C]
// return: [N, C, H, W]
int64_t N = x->ne[2];
int64_t C = x->ne[0] / z_image_params.patch_size / z_image_params.patch_size;
int64_t H = h * z_image_params.patch_size;
int64_t W = w * z_image_params.patch_size;
int64_t p = z_image_params.patch_size;
GGML_ASSERT(C * p * p == x->ne[0]);
x = ggml_reshape_4d(ctx, x, C, p * p, w * h, N); // [N, h*w, p*p, C]
x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3)); // [N, C, h*w, p*p]
x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N); // [N*C*h, w, p, p]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, p, w, p]
x = ggml_reshape_4d(ctx, x, W, H, C, N); // [N, C, h*p, w*p]
return x;
}
struct ggml_tensor* forward_core(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* timestep,
struct ggml_tensor* context,
struct ggml_tensor* pe) {
auto x_embedder = std::dynamic_pointer_cast<Linear>(blocks["x_embedder"]); auto x_embedder = std::dynamic_pointer_cast<Linear>(blocks["x_embedder"]);
auto t_embedder = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]); auto t_embedder = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);
auto cap_embedder_0 = std::dynamic_pointer_cast<RMSNorm>(blocks["cap_embedder.0"]); auto cap_embedder_0 = std::dynamic_pointer_cast<RMSNorm>(blocks["cap_embedder.0"]);
@ -411,13 +370,13 @@ namespace ZImage {
auto txt = cap_embedder_1->forward(ctx, cap_embedder_0->forward(ctx, context)); // [N, n_txt_token, hidden_size] auto txt = cap_embedder_1->forward(ctx, cap_embedder_0->forward(ctx, context)); // [N, n_txt_token, hidden_size]
auto img = x_embedder->forward(ctx, x); // [N, n_img_token, hidden_size] auto img = x_embedder->forward(ctx, x); // [N, n_img_token, hidden_size]
int64_t n_txt_pad_token = Rope::bound_mod(n_txt_token, SEQ_MULTI_OF); int64_t n_txt_pad_token = Rope::bound_mod(static_cast<int>(n_txt_token), SEQ_MULTI_OF);
if (n_txt_pad_token > 0) { if (n_txt_pad_token > 0) {
auto txt_pad_tokens = ggml_repeat_4d(ctx->ggml_ctx, txt_pad_token, txt_pad_token->ne[0], n_txt_pad_token, N, 1); auto txt_pad_tokens = ggml_repeat_4d(ctx->ggml_ctx, txt_pad_token, txt_pad_token->ne[0], n_txt_pad_token, N, 1);
txt = ggml_concat(ctx->ggml_ctx, txt, txt_pad_tokens, 1); // [N, n_txt_token + n_txt_pad_token, hidden_size] txt = ggml_concat(ctx->ggml_ctx, txt, txt_pad_tokens, 1); // [N, n_txt_token + n_txt_pad_token, hidden_size]
} }
int64_t n_img_pad_token = Rope::bound_mod(n_img_token, SEQ_MULTI_OF); int64_t n_img_pad_token = Rope::bound_mod(static_cast<int>(n_img_token), SEQ_MULTI_OF);
if (n_img_pad_token > 0) { if (n_img_pad_token > 0) {
auto img_pad_tokens = ggml_repeat_4d(ctx->ggml_ctx, img_pad_token, img_pad_token->ne[0], n_img_pad_token, N, 1); auto img_pad_tokens = ggml_repeat_4d(ctx->ggml_ctx, img_pad_token, img_pad_token->ne[0], n_img_pad_token, N, 1);
img = ggml_concat(ctx->ggml_ctx, img, img_pad_tokens, 1); // [N, n_img_token + n_img_pad_token, hidden_size] img = ggml_concat(ctx->ggml_ctx, img, img_pad_tokens, 1); // [N, n_img_token + n_img_pad_token, hidden_size]
@ -455,12 +414,12 @@ namespace ZImage {
return img; return img;
} }
struct ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* timestep, ggml_tensor* timestep,
struct ggml_tensor* context, ggml_tensor* context,
struct ggml_tensor* pe, ggml_tensor* pe,
std::vector<ggml_tensor*> ref_latents = {}) { std::vector<ggml_tensor*> ref_latents = {}) {
// Forward pass of DiT. // Forward pass of DiT.
// x: [N, C, H, W] // x: [N, C, H, W]
// timestep: [N,] // timestep: [N,]
@ -473,29 +432,24 @@ namespace ZImage {
int64_t C = x->ne[2]; int64_t C = x->ne[2];
int64_t N = x->ne[3]; int64_t N = x->ne[3];
auto img = process_img(ctx->ggml_ctx, x); int patch_size = z_image_params.patch_size;
auto img = DiT::pad_and_patchify(ctx, x, patch_size, patch_size, false);
uint64_t n_img_token = img->ne[1]; uint64_t n_img_token = img->ne[1];
if (ref_latents.size() > 0) { if (ref_latents.size() > 0) {
for (ggml_tensor* ref : ref_latents) { for (ggml_tensor* ref : ref_latents) {
ref = process_img(ctx->ggml_ctx, ref); ref = DiT::pad_and_patchify(ctx, ref, patch_size, patch_size, false);
img = ggml_concat(ctx->ggml_ctx, img, ref, 1); img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
} }
} }
int64_t h_len = ((H + (z_image_params.patch_size / 2)) / z_image_params.patch_size);
int64_t w_len = ((W + (z_image_params.patch_size / 2)) / z_image_params.patch_size);
auto out = forward_core(ctx, img, timestep, context, pe); auto out = forward_core(ctx, img, timestep, context, pe);
out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, n_img_token); // [N, n_img_token, ph*pw*C] out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, n_img_token); // [N, n_img_token, ph*pw*C]
out = unpatchify(ctx->ggml_ctx, out, h_len, w_len); // [N, C, H + pad_h, W + pad_w] out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, patch_size, patch_size, false); // [N, C, H, W]
// slice out = ggml_ext_scale(ctx->ggml_ctx, out, -1.f);
out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H); // [N, C, H, W + pad_w]
out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W); // [N, C, H, W]
out = ggml_scale(ctx->ggml_ctx, out, -1.f);
return out; return out;
} }
@ -523,17 +477,17 @@ namespace ZImage {
return "z_image"; return "z_image";
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
z_image.get_param_tensors(tensors, prefix); z_image.get_param_tensors(tensors, prefix);
} }
struct ggml_cgraph* build_graph(struct ggml_tensor* x, ggml_cgraph* build_graph(ggml_tensor* x,
struct ggml_tensor* timesteps, ggml_tensor* timesteps,
struct ggml_tensor* context, ggml_tensor* context,
std::vector<ggml_tensor*> ref_latents = {}, std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false) { bool increase_ref_index = false) {
GGML_ASSERT(x->ne[3] == 1); GGML_ASSERT(x->ne[3] == 1);
struct ggml_cgraph* gf = new_graph_custom(Z_IMAGE_GRAPH_SIZE); ggml_cgraph* gf = new_graph_custom(Z_IMAGE_GRAPH_SIZE);
x = to_backend(x); x = to_backend(x);
context = to_backend(context); context = to_backend(context);
@ -543,17 +497,19 @@ namespace ZImage {
ref_latents[i] = to_backend(ref_latents[i]); ref_latents[i] = to_backend(ref_latents[i]);
} }
pe_vec = Rope::gen_z_image_pe(x->ne[1], pe_vec = Rope::gen_z_image_pe(static_cast<int>(x->ne[1]),
x->ne[0], static_cast<int>(x->ne[0]),
z_image_params.patch_size, z_image_params.patch_size,
x->ne[3], static_cast<int>(x->ne[3]),
context->ne[1], static_cast<int>(context->ne[1]),
SEQ_MULTI_OF, SEQ_MULTI_OF,
ref_latents, ref_latents,
increase_ref_index, increase_ref_index,
z_image_params.theta, z_image_params.theta,
circular_y_enabled,
circular_x_enabled,
z_image_params.axes_dim); z_image_params.axes_dim);
int pos_len = pe_vec.size() / z_image_params.axes_dim_sum / 2; int pos_len = static_cast<int>(pe_vec.size() / z_image_params.axes_dim_sum / 2);
// LOG_DEBUG("pos_len %d", pos_len); // LOG_DEBUG("pos_len %d", pos_len);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, z_image_params.axes_dim_sum / 2, pos_len); auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, z_image_params.axes_dim_sum / 2, pos_len);
// pe->data = pe_vec.data(); // pe->data = pe_vec.data();
@ -562,12 +518,12 @@ namespace ZImage {
set_backend_tensor_data(pe, pe_vec.data()); set_backend_tensor_data(pe, pe_vec.data());
auto runner_ctx = get_context(); auto runner_ctx = get_context();
struct ggml_tensor* out = z_image.forward(&runner_ctx, ggml_tensor* out = z_image.forward(&runner_ctx,
x, x,
timesteps, timesteps,
context, context,
pe, pe,
ref_latents); ref_latents);
ggml_build_forward_expand(gf, out); ggml_build_forward_expand(gf, out);
@ -575,17 +531,17 @@ namespace ZImage {
} }
bool compute(int n_threads, bool compute(int n_threads,
struct ggml_tensor* x, ggml_tensor* x,
struct ggml_tensor* timesteps, ggml_tensor* timesteps,
struct ggml_tensor* context, ggml_tensor* context,
std::vector<ggml_tensor*> ref_latents = {}, std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false, bool increase_ref_index = false,
struct ggml_tensor** output = nullptr, ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) { ggml_context* output_ctx = nullptr) {
// x: [N, in_channels, h, w] // x: [N, in_channels, h, w]
// timesteps: [N, ] // timesteps: [N, ]
// context: [N, max_position, hidden_size] // context: [N, max_position, hidden_size]
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x, timesteps, context, ref_latents, increase_ref_index); return build_graph(x, timesteps, context, ref_latents, increase_ref_index);
}; };
@ -593,12 +549,12 @@ namespace ZImage {
} }
void test() { void test() {
struct ggml_init_params params; ggml_init_params params;
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB
params.mem_buffer = nullptr; params.mem_buffer = nullptr;
params.no_alloc = false; params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params); ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != nullptr); GGML_ASSERT(work_ctx != nullptr);
{ {
@ -615,14 +571,14 @@ namespace ZImage {
auto context = load_tensor_from_file(work_ctx, "./z_image_context.bin"); auto context = load_tensor_from_file(work_ctx, "./z_image_context.bin");
print_ggml_tensor(context); print_ggml_tensor(context);
struct ggml_tensor* out = nullptr; ggml_tensor* out = nullptr;
int t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
compute(8, x, timesteps, context, {}, false, &out, work_ctx); compute(8, x, timesteps, context, {}, false, &out, work_ctx);
int t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
print_ggml_tensor(out); print_ggml_tensor(out);
LOG_DEBUG("z_image test done in %dms", t1 - t0); LOG_DEBUG("z_image test done in %lldms", t1 - t0);
} }
} }

263
tae.hpp
View File

@ -1,263 +0,0 @@
#ifndef __TAE_HPP__
#define __TAE_HPP__
#include "ggml_extend.hpp"
#include "model.h"
/*
=================================== TinyAutoEncoder ===================================
References:
https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoders/vae.py
https://github.com/madebyollin/taesd/blob/main/taesd.py
*/
class TAEBlock : public UnaryBlock {
protected:
int n_in;
int n_out;
public:
TAEBlock(int n_in, int n_out)
: n_in(n_in), n_out(n_out) {
blocks["conv.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_in, n_out, {3, 3}, {1, 1}, {1, 1}));
blocks["conv.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}));
blocks["conv.4"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}));
if (n_in != n_out) {
blocks["skip"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_in, n_out, {1, 1}, {1, 1}, {1, 1}, {1, 1}, false));
}
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [n, n_in, h, w]
// return: [n, n_out, h, w]
auto conv_0 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.0"]);
auto conv_2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.2"]);
auto conv_4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.4"]);
auto h = conv_0->forward(ctx, x);
h = ggml_relu_inplace(ctx->ggml_ctx, h);
h = conv_2->forward(ctx, h);
h = ggml_relu_inplace(ctx->ggml_ctx, h);
h = conv_4->forward(ctx, h);
if (n_in != n_out) {
auto skip = std::dynamic_pointer_cast<Conv2d>(blocks["skip"]);
LOG_DEBUG("skip");
x = skip->forward(ctx, x);
}
h = ggml_add(ctx->ggml_ctx, h, x);
h = ggml_relu_inplace(ctx->ggml_ctx, h);
return h;
}
};
class TinyEncoder : public UnaryBlock {
int in_channels = 3;
int channels = 64;
int z_channels = 4;
int num_blocks = 3;
public:
TinyEncoder(int z_channels = 4)
: z_channels(z_channels) {
int index = 0;
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, channels, {3, 3}, {1, 1}, {1, 1}));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1}));
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [n, in_channels, h, w]
// return: [n, z_channels, h/8, w/8]
for (int i = 0; i < num_blocks * 3 + 6; i++) {
auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(i)]);
x = block->forward(ctx, x);
}
return x;
}
};
class TinyDecoder : public UnaryBlock {
int z_channels = 4;
int channels = 64;
int out_channels = 3;
int num_blocks = 3;
public:
TinyDecoder(int z_channels = 4)
: z_channels(z_channels) {
int index = 0;
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, channels, {3, 3}, {1, 1}, {1, 1}));
index++; // nn.ReLU()
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
index++; // nn.Upsample()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
index++; // nn.Upsample()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
index++; // nn.Upsample()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* z) override {
// z: [n, z_channels, h, w]
// return: [n, out_channels, h*8, w*8]
auto h = ggml_scale(ctx->ggml_ctx, z, 1.0f / 3.0f);
h = ggml_tanh_inplace(ctx->ggml_ctx, h);
h = ggml_scale(ctx->ggml_ctx, h, 3.0f);
for (int i = 0; i < num_blocks * 3 + 10; i++) {
if (blocks.find(std::to_string(i)) == blocks.end()) {
if (i == 1) {
h = ggml_relu_inplace(ctx->ggml_ctx, h);
} else {
h = ggml_upscale(ctx->ggml_ctx, h, 2, GGML_SCALE_MODE_NEAREST);
}
continue;
}
auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(i)]);
h = block->forward(ctx, h);
}
return h;
}
};
class TAESD : public GGMLBlock {
protected:
bool decode_only;
public:
TAESD(bool decode_only = true, SDVersion version = VERSION_SD1)
: decode_only(decode_only) {
int z_channels = 4;
if (sd_version_is_dit(version)) {
z_channels = 16;
}
blocks["decoder.layers"] = std::shared_ptr<GGMLBlock>(new TinyDecoder(z_channels));
if (!decode_only) {
blocks["encoder.layers"] = std::shared_ptr<GGMLBlock>(new TinyEncoder(z_channels));
}
}
struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) {
auto decoder = std::dynamic_pointer_cast<TinyDecoder>(blocks["decoder.layers"]);
return decoder->forward(ctx, z);
}
struct ggml_tensor* encode(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
auto encoder = std::dynamic_pointer_cast<TinyEncoder>(blocks["encoder.layers"]);
return encoder->forward(ctx, x);
}
};
struct TinyAutoEncoder : public GGMLRunner {
TAESD taesd;
bool decode_only = false;
TinyAutoEncoder(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
bool decoder_only = true,
SDVersion version = VERSION_SD1)
: decode_only(decoder_only),
taesd(decoder_only, version),
GGMLRunner(backend, offload_params_to_cpu) {
taesd.init(params_ctx, tensor_storage_map, prefix);
}
std::string get_desc() override {
return "taesd";
}
bool load_from_file(const std::string& file_path, int n_threads) {
LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false");
alloc_params_buffer();
std::map<std::string, ggml_tensor*> taesd_tensors;
taesd.get_param_tensors(taesd_tensors);
std::set<std::string> ignore_tensors;
if (decode_only) {
ignore_tensors.insert("encoder.");
}
ModelLoader model_loader;
if (!model_loader.init_from_file_and_convert_name(file_path)) {
LOG_ERROR("init taesd model loader from file failed: '%s'", file_path.c_str());
return false;
}
bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors, n_threads);
if (!success) {
LOG_ERROR("load tae tensors from model loader failed");
return false;
}
LOG_INFO("taesd model loaded");
return success;
}
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
z = to_backend(z);
auto runner_ctx = get_context();
struct ggml_tensor* out = decode_graph ? taesd.decode(&runner_ctx, z) : taesd.encode(&runner_ctx, z);
ggml_build_forward_expand(gf, out);
return gf;
}
bool compute(const int n_threads,
struct ggml_tensor* z,
bool decode_graph,
struct ggml_tensor** output,
struct ggml_context* output_ctx = nullptr) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(z, decode_graph);
};
return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
};
#endif // __TAE_HPP__

18
thirdparty/darts.h vendored
View File

@ -845,7 +845,7 @@ inline void BitVector::build() {
num_ones_ = 0; num_ones_ = 0;
for (std::size_t i = 0; i < units_.size(); ++i) { for (std::size_t i = 0; i < units_.size(); ++i) {
ranks_[i] = num_ones_; ranks_[i] = static_cast<id_type>(num_ones_);
num_ones_ += pop_count(units_[i]); num_ones_ += pop_count(units_[i]);
} }
} }
@ -1769,7 +1769,7 @@ id_type DoubleArrayBuilder::arrange_from_keyset(const Keyset<T> &keyset,
inline id_type DoubleArrayBuilder::find_valid_offset(id_type id) const { inline id_type DoubleArrayBuilder::find_valid_offset(id_type id) const {
if (extras_head_ >= units_.size()) { if (extras_head_ >= units_.size()) {
return units_.size() | (id & LOWER_MASK); return static_cast<id_type>(units_.size()) | (id & LOWER_MASK);
} }
id_type unfixed_id = extras_head_; id_type unfixed_id = extras_head_;
@ -1781,7 +1781,7 @@ inline id_type DoubleArrayBuilder::find_valid_offset(id_type id) const {
unfixed_id = extras(unfixed_id).next(); unfixed_id = extras(unfixed_id).next();
} while (unfixed_id != extras_head_); } while (unfixed_id != extras_head_);
return units_.size() | (id & LOWER_MASK); return static_cast<id_type>(units_.size()) | (id & LOWER_MASK);
} }
inline bool DoubleArrayBuilder::is_valid_offset(id_type id, inline bool DoubleArrayBuilder::is_valid_offset(id_type id,
@ -1812,7 +1812,7 @@ inline void DoubleArrayBuilder::reserve_id(id_type id) {
if (id == extras_head_) { if (id == extras_head_) {
extras_head_ = extras(id).next(); extras_head_ = extras(id).next();
if (extras_head_ == id) { if (extras_head_ == id) {
extras_head_ = units_.size(); extras_head_ = static_cast<id_type>(units_.size());
} }
} }
extras(extras(id).prev()).set_next(extras(id).next()); extras(extras(id).prev()).set_next(extras(id).next());
@ -1821,8 +1821,8 @@ inline void DoubleArrayBuilder::reserve_id(id_type id) {
} }
inline void DoubleArrayBuilder::expand_units() { inline void DoubleArrayBuilder::expand_units() {
id_type src_num_units = units_.size(); id_type src_num_units = static_cast<id_type>(units_.size());
id_type src_num_blocks = num_blocks(); id_type src_num_blocks = static_cast<id_type>(num_blocks());
id_type dest_num_units = src_num_units + BLOCK_SIZE; id_type dest_num_units = src_num_units + BLOCK_SIZE;
id_type dest_num_blocks = src_num_blocks + 1; id_type dest_num_blocks = src_num_blocks + 1;
@ -1834,7 +1834,7 @@ inline void DoubleArrayBuilder::expand_units() {
units_.resize(dest_num_units); units_.resize(dest_num_units);
if (dest_num_blocks > NUM_EXTRA_BLOCKS) { if (dest_num_blocks > NUM_EXTRA_BLOCKS) {
for (std::size_t id = src_num_units; id < dest_num_units; ++id) { for (id_type id = src_num_units; id < dest_num_units; ++id) {
extras(id).set_is_used(false); extras(id).set_is_used(false);
extras(id).set_is_fixed(false); extras(id).set_is_fixed(false);
} }
@ -1858,9 +1858,9 @@ inline void DoubleArrayBuilder::expand_units() {
inline void DoubleArrayBuilder::fix_all_blocks() { inline void DoubleArrayBuilder::fix_all_blocks() {
id_type begin = 0; id_type begin = 0;
if (num_blocks() > NUM_EXTRA_BLOCKS) { if (num_blocks() > NUM_EXTRA_BLOCKS) {
begin = num_blocks() - NUM_EXTRA_BLOCKS; begin = static_cast<id_type>(num_blocks() - NUM_EXTRA_BLOCKS);
} }
id_type end = num_blocks(); id_type end = static_cast<id_type>(num_blocks());
for (id_type block_id = begin; block_id != end; ++block_id) { for (id_type block_id = begin; block_id != end; ++block_id) {
fix_block(block_id); fix_block(block_id);

View File

@ -257,6 +257,10 @@ int stbi_write_tga_with_rle = 1;
int stbi_write_force_png_filter = -1; int stbi_write_force_png_filter = -1;
#endif #endif
#ifndef STBMIN
#define STBMIN(a, b) ((a) < (b) ? (a) : (b))
#endif // STBMIN
static int stbi__flip_vertically_on_write = 0; static int stbi__flip_vertically_on_write = 0;
STBIWDEF void stbi_flip_vertically_on_write(int flag) STBIWDEF void stbi_flip_vertically_on_write(int flag)
@ -1179,8 +1183,8 @@ STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int s
if (!zlib) return 0; if (!zlib) return 0;
if(parameters != NULL) { if(parameters != NULL) {
param_length = strlen(parameters); param_length = (int)strlen(parameters);
param_length += strlen("parameters") + 1; // For the name and the null-byte param_length += (int)strlen("parameters") + 1; // For the name and the null-byte
} }
// each tag requires 12 bytes of overhead // each tag requires 12 bytes of overhead
@ -1526,11 +1530,11 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, in
if(parameters != NULL) { if(parameters != NULL) {
stbiw__putc(s, 0xFF /* comnent */ ); stbiw__putc(s, 0xFF /* comnent */ );
stbiw__putc(s, 0xFE /* marker */ ); stbiw__putc(s, 0xFE /* marker */ );
size_t param_length = std::min(2 + strlen("parameters") + 1 + strlen(parameters) + 1, (size_t) 0xFFFF); int param_length = STBMIN(2 + (int)strlen("parameters") + 1 + (int)strlen(parameters) + 1, 0xFFFF);
stbiw__putc(s, param_length >> 8); // no need to mask, length < 65536 stbiw__putc(s, param_length >> 8); // no need to mask, length < 65536
stbiw__putc(s, param_length & 0xFF); stbiw__putc(s, param_length & 0xFF);
s->func(s->context, (void*)"parameters", strlen("parameters") + 1); // std::string is zero-terminated s->func(s->context, (void*)"parameters", (int)strlen("parameters") + 1); // std::string is zero-terminated
s->func(s->context, (void*)parameters, std::min(param_length, (size_t) 65534) - 2 - strlen("parameters") - 1); s->func(s->context, (void*)parameters, STBMIN(param_length, 65534) - 2 - (int)strlen("parameters") - 1);
if(param_length > 65534) stbiw__putc(s, 0); // always zero-terminate for safety if(param_length > 65534) stbiw__putc(s, 0); // always zero-terminate for safety
if(param_length & 1) stbiw__putc(s, 0xFF); // pad to even length if(param_length & 1) stbiw__putc(s, 0xFF); // pad to even length
} }