Compare commits

...

130 Commits

Author SHA1 Message Date
RapidMark
19bdfe22d2
feat: set tensor names on block params (#1622) 2026-06-08 23:25:52 +08:00
stduhpf
138da14cc3
apg: normalize diff_norm calculation by tensor size (#1620) 2026-06-08 21:56:15 +08:00
fszontagh
17a2b4a315
perf: cap planner budget when model dwarfs the streaming budget (#1612) 2026-06-08 21:53:54 +08:00
leejet
b3d56d0ba1
refactor: split model loader from model definitions (#1619) 2026-06-07 23:20:12 +08:00
leejet
2a07540c2a
refactor: move photomaker into generation extension (#1618) 2026-06-07 22:40:02 +08:00
Wagner Bruna
81abfb2548
chore: rename and reformat gits_noise.inl (#1617) 2026-06-07 22:30:20 +08:00
leejet
f3fd359b58
refactor: reorganize src model layout (#1615) 2026-06-07 03:21:12 +08:00
leejet
dfb2390dd4
refactor: extract Wan VAE implementation (#1614) 2026-06-07 01:33:49 +08:00
leejet
cfbc19d186
refactor: unify model config detection (#1613) 2026-06-07 01:05:12 +08:00
leejet
b9254dda0d
feat: add ideogram4 support (#1609) 2026-06-06 16:34:16 +08:00
fszontagh
0648f4426b
perf: ratchet streaming budget so plan stops re-merging every step (#1611) 2026-06-06 16:32:03 +08:00
YOSHIDA Keiji
74f513d512
fix: Suppress spurious error output for --help (#1607) (#1608)
Signed-off-by: kei-g <km.8k6ce+github@gmail.com>
2026-06-06 16:23:44 +08:00
fszontagh
064001b524
perf: allocate CPU-offloaded params from runtime device pinned host buffer (#1601) 2026-06-06 16:22:18 +08:00
leejet
1f9ee88e09
fix: zero Wan2.2 TI2V timesteps for fixed frames (#1604) 2026-06-03 23:32:31 +08:00
fszontagh
a7f2e03da4
perf: keep chunk-K residency engaged with runtime LoRA (#1598) 2026-06-03 23:12:00 +08:00
stduhpf
4513e3fda9
refactor: img-cond->img_uncond (#1594)
* refactor: img-cond->img_uncond

* align APG and CFG++ with img-uncond CFG

* set default img_cfg to 1.f

---------

Co-authored-by: leejet <leejet714@gmail.com>
2026-06-03 22:57:42 +08:00
leejet
2d40a8b2ad
feat: make Wan2.2 5B FLF2V work (#1110) 2026-06-02 23:16:09 +08:00
leejet
9c7f9a20b3
chore: embed server web UI in Docker images (#1597) 2026-06-02 22:46:25 +08:00
fszontagh
ed74577c40
feat: --stream-layers for streaming weights from CPU during generation (#1576) 2026-06-02 22:35:28 +08:00
RapidMark
7948df8ac1
fix(cmake): build HIP backend with PIC so the static-lib PIE link succeeds (#1593) 2026-06-02 00:07:48 +08:00
Wagner Bruna
02f06370a7
refactor: call CPU backend functions dynamically (#1591)
Co-authored-by: leejet <leejet714@gmail.com>
2026-06-01 23:41:21 +08:00
stduhpf
f8935d6f25
feat: support img-cfg for edit models (#929)
Co-authored-by: leejet <leejet714@gmail.com>
2026-06-01 22:54:25 +08:00
stduhpf
be65ac7511
feat: add support for APG (adaptive projected guidance) + unconditional SLG (#593) 2026-06-01 00:55:49 +08:00
leejet
20901f6d8e
fix: remove kv padding from flash attention wrapper (#1453) 2026-05-31 23:23:19 +08:00
leejet
0982807139
feat: add PiD support (#1585) 2026-05-31 22:38:39 +08:00
leejet
d2797b8667
fix: correct Gemma3 rope settings and vram limit propagation (#1583) 2026-05-30 22:23:49 +08:00
leejet
d3b2cb047e
fix: split tokens before normalization (#1582) 2026-05-30 18:38:46 +08:00
akleine
b4ba55d8d7
fix: prevent crash in case of a mem alloc error and graceful exit (#1566) 2026-05-30 18:34:07 +08:00
Wagner Bruna
b54bd83a3f
fix: explicitly exclude f8, f64 and i64 tensors from mmap (#1575) 2026-05-30 18:31:08 +08:00
Wagner Bruna
0e4ee04488
fix: correct tae for models that use the flux2 vae (#1571) 2026-05-28 09:13:16 +08:00
leejet
29ab511fc7
fix: resolve LLM norm tensor names by architecture (#1570) 2026-05-28 00:36:16 +08:00
leejet
55c2aed52c
refactor: simplify diffusion model runner params (#1569) 2026-05-28 00:12:35 +08:00
leejet
8eded497e5
fix: preserve frontend tooling in ROCm CI build (#1568) 2026-05-27 21:26:16 +08:00
leejet
92dc7268fc
feat: add microsoft lens support (#1560) 2026-05-27 01:04:17 +08:00
schirik
07b2b18e70
fix: skip permission denied errors in recursive_directory_iterator (#1564)
Co-authored-by: Serge F. Chirik <s.chirik@timbel.info>
2026-05-27 00:56:16 +08:00
leejet
1ceb5bd9df
fix: package ROCm BLAS runtime in Windows artifacts (#1562) 2026-05-26 00:57:37 +08:00
leejet
202c6154a2
fix: use flux flow prediction for LTXAV (#1561) 2026-05-26 00:23:39 +08:00
stduhpf
a397e03488
feat: add Longcat-Image / Longcat-Image-Edit support (#1053)
Co-authored-by: leejet <leejet714@gmail.com>
2026-05-24 02:02:02 +08:00
leejet
72e512a0cc
fix: make macOS binaries use relocatable rpaths (#1552) 2026-05-23 12:27:06 +08:00
leejet
0baf721215
feat: add LTX temporal latent upscaler support (#1551) 2026-05-23 01:35:13 +08:00
leejet
645e6e9089
feat: add LTX rational latent upscaler (#1549) 2026-05-23 00:28:15 +08:00
stduhpf
cbf92191c3
fix: strip trailing latent channels for preview decode (#1548) 2026-05-23 00:26:40 +08:00
stduhpf
8cf55a3b3b
fix: load TAESD preview-only model correctly (#1547) 2026-05-23 00:22:35 +08:00
leejet
3a8788cb7d
refactor: unify extra argument parsing (#1540) 2026-05-22 01:00:03 +08:00
leejet
449165caf5
feat: stream LTX VAE temporal tile decoding (#1539) 2026-05-22 00:25:04 +08:00
stduhpf
adaa599a3b
Feat: Temporal tile custom size with overlap (#1510)
* Temporal tile size + overlap

* add --extra-tiling-args support

---------

Co-authored-by: leejet <leejet714@gmail.com>
2026-05-21 23:44:12 +08:00
leejet
2e3514625a
perf: run LTX audio VAE decode in one ggml graph (#1538) 2026-05-21 22:43:14 +08:00
stduhpf
47d8198b69
feat: add taeltx2_3_wide support (#1535) 2026-05-21 22:34:12 +08:00
leejet
ef92a0027e
feat: add graph cut markers for LTXAV transformer (#1534) 2026-05-20 23:22:10 +08:00
leejet
b3374e6a71
feat: add LTX spatial latent upscale hires support (#1533) 2026-05-20 22:27:09 +08:00
stduhpf
bdd937f29a
feat: add taeltx2/taeltx2.3 support (#1531) 2026-05-20 22:14:05 +08:00
stduhpf
c51ec7cad9
fix: always load runtime lora params on runtime backend (#1532) 2026-05-20 22:13:15 +08:00
leejet
5b0267e941
fix: avoid Vulkan f16 repeat in LTX audio VAE (#1528) 2026-05-19 23:15:26 +08:00
leejet
0045a72b96
fix: trigger ci for docker image changes (#1527) 2026-05-19 22:05:03 +08:00
leejet
99bd062546
fix: update sycl docker image to oneapi 2025.3 (#1526) 2026-05-19 21:59:15 +08:00
leejet
9d8c9e4279
fix: build web UI for Windows ROCm server releases (#1525) 2026-05-19 21:53:48 +08:00
George Sofianos
caa823a8c0
ci: add RDNA1 + RDNA2 targets for ROCm 7.13 (#1511) 2026-05-19 01:38:02 +08:00
leejet
22c8c40b0d
sync: update ggml (#1520) 2026-05-19 01:30:11 +08:00
leejet
b706d682ad
fix: restore singleton dims for LLM outputs (#1518) 2026-05-18 23:47:10 +08:00
leejet
b758b7de13
fix: only enable TAE after successful load (#1517) 2026-05-18 23:32:03 +08:00
Wagner Bruna
f683c88a28
feat: make negative max_vram control the amount of spare vram (#1503) 2026-05-18 23:00:06 +08:00
Christoph
21fd4e6788
ci: add CUDA Docker image support for NVIDIA Spark GB10 (#1512) 2026-05-18 22:52:01 +08:00
leejet
830804262b docs: update news 2026-05-18 00:24:29 +08:00
leejet
82e03ef137 ci: add inactive pr clean up workflow 2026-05-18 00:09:45 +08:00
leejet
baf7eda1e4
refactor: minify vocab files (#1509) 2026-05-17 23:06:58 +08:00
Wagner Bruna
e7eb92fd84
feat: add Gradient Estimation sampler (#1484) 2026-05-17 22:54:28 +08:00
leejet
50134e51dd
refactor: split guidance composition (#1506) 2026-05-17 20:20:16 +08:00
leejet
e43b24cf48
feat: add ltx2.3 flf2v support (#1505) 2026-05-17 18:40:14 +08:00
stduhpf
06accf2b39
feat: add ltxav latent2rgb projection matrix (#1502) 2026-05-17 17:52:05 +08:00
stduhpf
cde20d5ef0 fix: handle stereo format in sd_audio (#1489)
Co-authored-by: leejet <leejet714@gmail.com>
2026-05-17 16:55:39 +08:00
leejet
67dda3f897
feat: add ltx2.3 support (#1463)
* add GemmaTokenizer

* add basic ltx2.3 support

* change vocab file encoding

* fix ci

* fix ubuntu build

* add temporal tiling support

* add ltx audio support

* update ggml submodule url

* fix generate_video

* add i2v support

* minify bundled Gemma tokenizer vocab sources

* pass video fps into temporal rope embeddings

* fix av_ca_timestep_scale_multiplier

* add LTX2Scheduler support

* update docs

* fix ci
2026-05-17 16:46:20 +08:00
Mario Limonciello
3b4d26f3d9
ci: update ROCm builds for Windows and Linux to use ROCm 7.13 (#1504) 2026-05-17 16:32:19 +08:00
Taylor
bd17f53b73
docs: update zit example to 8 steps (#1294) 2026-05-16 21:32:03 +08:00
leejet
d7ecbe1d01
fix: avoid repeated T5 EOS tokens in Anima prompt weights (#1501) 2026-05-16 21:22:46 +08:00
leejet
36330724bd
feat: add module backend assignment support (#1500)
Co-authored-by: Stéphane du Hamel <stephduh@live.fr>
2026-05-16 20:27:06 +08:00
Mario Limonciello
0c1ca170ca
ci: update ROCm Windows builds (#1282) 2026-05-16 20:25:38 +08:00
Mario Limonciello
839f6a94d2
ci: switch over ROCm builds to artifacts both for stable and preview releases (#1281) 2026-05-16 20:23:26 +08:00
leejet
38b14adb67
feat: auto-detect max VRAM budget with --max-vram -1 (#1498) 2026-05-16 16:14:25 +08:00
Wagner Bruna
fd1a2794f3
refactor: unify Euler, Euler Ancestral and DDIM implementations (#1474) 2026-05-16 16:13:28 +08:00
cphlipot
db08b84607
fix: Fix broken GCC 16 build (enforce C11/C++17 compile ) (#1478) 2026-05-16 16:10:16 +08:00
Wagner Bruna
686856edca
chore: do not report the fake VAE "allocation" as an error (#1494) 2026-05-16 16:08:31 +08:00
leejet
0b8296915c docs: add .github/pull_request_template.md 2026-05-15 01:16:21 +08:00
leejet
381e0df50f docs: add CONTRIBUTING.md 2026-05-15 01:09:45 +08:00
leejet
0665a7f8bf
feat: add hidream o1 image support (#1485) 2026-05-15 00:40:21 +08:00
Craig Andrews
eeac950b44
fix: Use PkgConfig for WebP and WebM (#1400) 2026-05-15 00:31:10 +08:00
Wagner Bruna
57ff2eb0f4
feat: support for memory-mapping model weights (#1414)
Co-authored-by: Piotr Wilkin <piotr.wilkin@syndatis.com>
Co-authored-by: Junmo Kim <me@junmo.kim>
Co-authored-by: leejet <leejet714@gmail.com>
2026-05-15 00:30:03 +08:00
Daniele
9d683417cb
feat: add Euler CFG++ and Euler-A CFG++ samplers (#1354) 2026-05-15 00:29:04 +08:00
l8bloom
60477fd50f
docs: add new go bindings for stable-diffusion.cpp (#1480) 2026-05-14 23:59:06 +08:00
cphlipot
6ee0684d74
feat: display server url with "http://" prefix. (#1486) 2026-05-14 23:57:22 +08:00
leejet
90e87bc846
feat: add max-vram based segmented param offload (#1476) 2026-05-06 21:56:02 +08:00
Wagner Bruna
586b6f1481
feat: adapt res samplers for flow models for eta > 0 (#1436) 2026-05-06 21:49:06 +08:00
fszontagh
9097ce5211
fix: skip empty MultiLoraAdapter when no LoRAs target a model (#1469) 2026-05-06 21:45:47 +08:00
leejet
3d6064b37e
perf: speed up tensor_to_sd_image conversion (#1466) 2026-04-30 01:13:56 +08:00
Wagner Bruna
b8079e253d
feat: transition from compile-time to runtime backend discovery (#1448)
Co-authored-by: Stéphane du Hamel <stephduh@live.fr>
Co-authored-by: Cyberhan123 <255542417@qq.com>
Co-authored-by: leejet <leejet714@gmail.com>
2026-04-29 23:26:57 +08:00
Wagner Bruna
331cfa5387
fix: release VAE compute buffer after tiled encoding (#1465) 2026-04-29 22:25:30 +08:00
Douglas Griffith
a81677f59c
docs: performance tips markup (#1460) 2026-04-27 22:55:30 +08:00
leejet
f40a707d0f
feat: add sdcpp-specific generation metadata to image outputs (#1462) 2026-04-27 22:43:13 +08:00
akleine
970c4a3312
chore: replace some NULL with nullptr + use "%zu" for printing some size_t data (#1457) 2026-04-27 22:42:57 +08:00
leejet
b8bdffc199
feat: add more built-in highres upscalers (#1456) 2026-04-23 22:17:58 +08:00
leejet
c97702e105
feat: add sd-webui style Hires. fix support (#1451) 2026-04-22 23:51:09 +08:00
leejet
44cca3d626
feat: support safetensors export in convert mode (#1444) 2026-04-20 00:22:11 +08:00
leejet
0a7ae07f94
feat: add restricted torch legacy checkpoint loading (#1443) 2026-04-19 23:09:43 +08:00
leejet
66143340b6
refactor: move model file IO into dedicated module (#1442) 2026-04-19 17:52:56 +08:00
Wagner Bruna
7023fc4cfb
fix: correct image to image DDIM and TCD (#1410) 2026-04-19 17:51:28 +08:00
Wagner Bruna
e77e4c46bf
feat: adapt LCM for flow models (#1413) 2026-04-19 17:49:46 +08:00
leejet
7d33d4b2dd
chore: enable MSVC parallel compilation with /MP (#1438) 2026-04-18 15:44:43 +08:00
leejet
3c99f700de
ci: skip docker image build job on pull requests (#1439) 2026-04-18 15:25:04 +08:00
leejet
4d626d24b2
feat(server): implement vid_gen async API and mode-aware capabilities (#1437) 2026-04-18 15:06:36 +08:00
Wagner Bruna
f3f69e2fbe
feat: add DPM++ (2S) Ancestral implementation for flow models (#1428) 2026-04-18 15:05:09 +08:00
Erik Scholz
6a9cb31150
fix: tune ernie-image default flow shift (#1433) 2026-04-18 14:58:00 +08:00
Wagner Bruna
2bcff67480
fix: correct dpm++2s_a second model call (#1435) 2026-04-18 14:54:41 +08:00
leejet
a564fdf642
refactor: remove is_xl guard wrapper in get_sd_version (#1430) 2026-04-17 01:53:58 +08:00
leejet
84fc5446d2
fix: skip empty prompt segments around attention range (#1429) 2026-04-17 01:42:14 +08:00
rmatif
1b4e9be643
feat: add er_sde sampler (#1403) 2026-04-17 01:32:16 +08:00
akleine
d73b4198a4
feat: SDXS-09 support and update doc (#1356) 2026-04-17 01:11:44 +08:00
leejet
5c243db9a8
feat: add ernie image support (#1427) 2026-04-17 00:51:42 +08:00
leejet
c41c5ded7a
feat: add left padding support to tokenizers (#1424) 2026-04-15 23:17:47 +08:00
leejet
9ac7b672c2
refactor: introduce shared tokenizer abstraction and split implementations (#1423) 2026-04-15 22:44:39 +08:00
Wagner Bruna
ee5bf956b0
chore: allow building the embedded UI header separately (#1415) 2026-04-15 22:07:31 +08:00
leejet
6b675a5ede docs: update readme 2026-04-11 18:42:38 +08:00
leejet
12a369cc67 docs: update readme 2026-04-11 18:41:12 +08:00
leejet
fd3504760f
feat: use sdcpp-webui as embedded webui (#1408) 2026-04-11 18:33:11 +08:00
leejet
7ade90e478
feat: add sdcpp api support (#1407) 2026-04-11 17:49:00 +08:00
Wagner Bruna
118489eb5c
chore: harden safetensors and gguf loading code (#1404)
Co-authored-by: professor-moody <keys@nimbus.lan>
2026-04-11 17:19:57 +08:00
Wagner Bruna
be9f51b25c
refactor: simplify DiscreteFlowDenoiser (#1405) 2026-04-11 17:18:23 +08:00
leejet
e8323cabb0
feat: add flux2 small decoder support (#1402) 2026-04-08 23:13:25 +08:00
Wagner Bruna
dd753729cc
fix: correct double increment on flow denoisers sigma calculations (#1372) 2026-04-08 23:13:05 +08:00
leejet
8afbeb6ba9
chore: normalize text files to utf-8 without bom (#1394) 2026-04-06 21:25:34 +08:00
leejet
5bf438d568
refactor: split examples common into header and source (#1393) 2026-04-06 21:11:57 +08:00
leejet
359eb8b8de
refactor: apply RAII ownership to examples (#1392) 2026-04-06 20:33:46 +08:00
190 changed files with 37076 additions and 4350837 deletions

15
.github/pull_request_template.md vendored Normal file
View File

@ -0,0 +1,15 @@
## Summary
<!-- Describe what changed and why. Keep the PR focused on one clear change. -->
## Related Issue / Discussion
<!-- Link related issues, discussions, or previous PRs if applicable. -->
## Additional Information
<!-- Add verification notes, screenshots, sample output, or other context when applicable. -->
## Checklist
- [ ] I have read and confirmed this PR follows the [contribution guidelines](https://github.com/leejet/stable-diffusion.cpp/blob/master/CONTRIBUTING.md).

View File

@ -14,6 +14,8 @@ on:
paths: paths:
[ [
".github/workflows/**", ".github/workflows/**",
".dockerignore",
"Dockerfile*",
"**/CMakeLists.txt", "**/CMakeLists.txt",
"**/Makefile", "**/Makefile",
"**/*.h", "**/*.h",
@ -21,6 +23,7 @@ on:
"**/*.c", "**/*.c",
"**/*.cpp", "**/*.cpp",
"**/*.cu", "**/*.cu",
"examples/server/frontend",
"examples/server/frontend/**", "examples/server/frontend/**",
] ]
pull_request: pull_request:
@ -28,6 +31,8 @@ on:
paths: paths:
[ [
".github/workflows/**", ".github/workflows/**",
".dockerignore",
"Dockerfile*",
"**/CMakeLists.txt", "**/CMakeLists.txt",
"**/Makefile", "**/Makefile",
"**/*.h", "**/*.h",
@ -35,6 +40,7 @@ on:
"**/*.c", "**/*.c",
"**/*.cpp", "**/*.cpp",
"**/*.cu", "**/*.cu",
"examples/server/frontend",
"examples/server/frontend/**", "examples/server/frontend/**",
] ]
@ -133,7 +139,7 @@ jobs:
id: depends id: depends
run: | run: |
sudo apt-get update sudo apt-get update
sudo apt-get install build-essential libvulkan-dev glslc sudo apt-get install build-essential libvulkan-dev glslc spirv-headers
- name: Build - name: Build
id: cmake_build id: cmake_build
@ -174,7 +180,8 @@ jobs:
build-and-push-docker-images: build-and-push-docker-images:
name: Build and push container images name: Build and push container images
runs-on: ubuntu-latest if: ${{ github.event_name != 'pull_request' }}
runs-on: ${{ matrix.runner }}
permissions: permissions:
contents: read contents: read
@ -186,6 +193,20 @@ jobs:
strategy: strategy:
matrix: matrix:
variant: [musa, sycl, vulkan, cuda] variant: [musa, sycl, vulkan, cuda]
platform: [linux/amd64]
runner: [ubuntu-latest]
build-args: [""]
tag-suffix: [""]
include:
- variant: cuda
platform: linux/arm64
runner: ubuntu-24.04-arm
tag-suffix: "-spark"
build-args: |
CUDA_VERSION=13.0.0
UBUNTU_VERSION=24.04
CUDA_ARCHITECTURES=121
GGML_CUDA_FA_ALL_QUANTS=ON
env: env:
REGISTRY: ghcr.io REGISTRY: ghcr.io
@ -240,12 +261,13 @@ jobs:
uses: docker/build-push-action@v6 uses: docker/build-push-action@v6
with: with:
context: . context: .
platforms: linux/amd64 platforms: ${{ matrix.platform }}
push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
file: Dockerfile.${{ matrix.variant }} file: Dockerfile.${{ matrix.variant }}
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }} tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}${{ matrix.tag-suffix }}
labels: ${{ steps.meta.outputs.labels }} labels: ${{ steps.meta.outputs.labels }}
annotations: ${{ steps.meta.outputs.annotations }} annotations: ${{ steps.meta.outputs.annotations }}
build-args: ${{ matrix.build-args }}
macOS-latest-cmake: macOS-latest-cmake:
runs-on: macos-latest runs-on: macos-latest
@ -441,12 +463,129 @@ jobs:
path: | path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
windows-latest-rocm:
runs-on: windows-2022
env:
ROCM_VERSION: "7.13.0"
GPU_TARGETS: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1152;gfx1200;gfx1201"
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 10.15.1
- name: Cache ROCm Installation
id: cache-rocm
uses: actions/cache@v4
with:
path: C:\TheRock\build
key: rocm-${{ env.ROCM_VERSION }}-gfx1151-${{ runner.os }}
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-rocm-${{ env.ROCM_VERSION }}-x64
evict-old-files: 1d
- name: Install ROCm
if: steps.cache-rocm.outputs.cache-hit != 'true'
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD ROCm ${{ env.ROCM_VERSION }} tarball"
Invoke-WebRequest -Uri "https://repo.amd.com/rocm/tarball/therock-dist-windows-gfx1151-${{ env.ROCM_VERSION }}.tar.gz" -OutFile "${env:RUNNER_TEMP}\rocm.tar.gz"
write-host "Extracting ROCm tarball"
mkdir C:\TheRock\build -Force
tar -xzf "${env:RUNNER_TEMP}\rocm.tar.gz" -C C:\TheRock\build --strip-components=1
write-host "Completed ROCm extraction"
- name: Setup ROCm Environment
run: |
$rocmPath = "C:\TheRock\build"
echo "HIP_PATH=$rocmPath" >> $env:GITHUB_ENV
echo "HIP_DEVICE_LIB_PATH=$rocmPath\lib\llvm\amdgcn\bitcode" >> $env:GITHUB_ENV
echo "HIP_PLATFORM=amd" >> $env:GITHUB_ENV
echo "LLVM_PATH=$rocmPath\lib\llvm" >> $env:GITHUB_ENV
echo "$rocmPath\bin" >> $env:GITHUB_PATH
echo "$rocmPath\lib\llvm\bin" >> $env:GITHUB_PATH
- name: Build
run: |
mkdir build
cd build
cmake .. `
-G "Unix Makefiles" `
-DCMAKE_PREFIX_PATH="${env:HIP_PATH}" `
-DSD_HIPBLAS=ON `
-DSD_BUILD_SHARED_LIBS=ON `
-DGGML_NATIVE=OFF `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\lib\llvm\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\lib\llvm\bin\clang++.exe" `
-DCMAKE_HIP_COMPILER="${env:HIP_PATH}\lib\llvm\bin\clang.exe" `
-DHIP_PATH="${env:HIP_PATH}" `
-DCMAKE_BUILD_TYPE=Release `
-DGPU_TARGETS="${{ env.GPU_TARGETS }}"
cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Pack artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
$ErrorActionPreference = "Stop"
$dst = "build\bin"
$rocmBin = Join-Path "${env:HIP_PATH}" "bin"
$requiredRocmPaths = @(
(Join-Path $rocmBin "rocblas.dll"),
(Join-Path $rocmBin "rocblas\library")
)
foreach ($path in $requiredRocmPaths) {
if (!(Test-Path $path)) {
throw "Missing ROCm runtime dependency: $path"
}
}
foreach ($pattern in @("rocblas*.dll", "hipblas*.dll", "libhipblas*.dll")) {
Copy-Item -Path (Join-Path $rocmBin $pattern) -Destination $dst -Force -ErrorAction SilentlyContinue
}
foreach ($dir in @("rocblas", "hipblaslt")) {
$src = Join-Path $rocmBin $dir
if (Test-Path $src) {
Copy-Item -Path $src -Destination $dst -Recurse -Force
}
}
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip .\build\bin\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip
windows-latest-cmake-hip: windows-latest-cmake-hip:
runs-on: windows-2022 runs-on: windows-2022
env: env:
HIPSDK_INSTALLER_VERSION: "25.Q3" HIPSDK_INSTALLER_VERSION: "26.Q1"
GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" ROCM_VERSION: "7.1.1"
GPU_TARGETS: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3
@ -481,7 +620,7 @@ jobs:
run: | run: |
$ErrorActionPreference = "Stop" $ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer" write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK" write-host "Installing AMD HIP SDK"
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
$completed = $proc.WaitForExit(600000) $completed = $proc.WaitForExit(600000)
@ -534,47 +673,75 @@ jobs:
run: | run: |
md "build\bin\rocblas\library\" md "build\bin\rocblas\library\"
md "build\bin\hipblaslt\library" md "build\bin\hipblaslt\library"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" cp "${env:HIP_PATH}\bin\libhipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\" cp "${env:HIP_PATH}\bin\libhipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\" cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\* 7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip .\build\bin\*
- name: Upload artifacts - name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip
path: | path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip
ubuntu-latest-rocm: ubuntu-latest-rocm:
runs-on: ubuntu-latest runs-on: ubuntu-24.04
container: rocm/dev-ubuntu-24.04:7.2
env: env:
ROCM_VERSION: "7.2"
UBUNTU_VERSION: "24.04" UBUNTU_VERSION: "24.04"
GPU_TARGETS: "gfx1151;gfx1150;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
strategy:
matrix:
include:
- ROCM_VERSION: "7.2.1"
gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1031;gfx1032;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201"
build: 'x64'
- ROCM_VERSION: "7.13.0"
gpu_targets: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1152;gfx1200;gfx1201"
build: x64
steps: steps:
- run: apt-get update && apt-get install -y git
- name: Clone - name: Clone
id: checkout id: checkout
uses: actions/checkout@v6 uses: actions/checkout@v6
with: with:
submodules: recursive submodules: recursive
- name: Setup Node - name: ccache
uses: actions/setup-node@v4 uses: ggml-org/ccache-action@v1.2.16
with: with:
node-version: 20 key: ubuntu-rocm-cmake-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
evict-old-files: 1d
- name: Setup pnpm - name: Dependencies
uses: pnpm/action-setup@v4 id: depends
with: run: |
version: 10.15.1 sudo apt install -y build-essential cmake wget zip ninja-build
- name: Setup Legacy ROCm
if: matrix.ROCM_VERSION == '7.2.1'
id: legacy_env
run: |
sudo mkdir --parents --mode=0755 /etc/apt/keyrings
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
sudo tee /etc/apt/sources.list.d/rocm.list << EOF
deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${{ matrix.ROCM_VERSION }} noble main
EOF
sudo tee /etc/apt/preferences.d/rocm-pin-600 << EOF
Package: *
Pin: release o=repo.radeon.com
Pin-Priority: 600
EOF
sudo apt update
sudo apt-get install -y libssl-dev rocm-hip-sdk
- name: Free disk space - name: Free disk space
run: | run: |
@ -589,51 +756,29 @@ jobs:
sudo rm -rf /var/lib/apt/lists/* || true sudo rm -rf /var/lib/apt/lists/* || true
sudo apt clean sudo apt clean
- name: Dependencies - name: Setup TheRock
id: depends if: matrix.ROCM_VERSION != '7.2.1'
id: therock_env
run: | run: |
sudo apt-get update wget https://repo.amd.com/rocm/tarball/therock-dist-linux-gfx1151-${{ matrix.ROCM_VERSION }}.tar.gz
sudo apt install -y \ mkdir install
cmake \ tar -xf *.tar.gz -C install
hip-dev \ export ROCM_PATH=$(pwd)/install
hipblas-dev \ echo ROCM_PATH=$ROCM_PATH >> $GITHUB_ENV
ninja-build \ echo PATH=$PATH:$ROCM_PATH/bin >> $GITHUB_ENV
rocm-dev \ echo LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/llvm/lib:$ROCM_PATH/lib/rocprofiler-systems >> $GITHUB_ENV
zip
# Clean apt caches to recover disk space
sudo apt clean
sudo rm -rf /var/lib/apt/lists/* || true
- name: Setup ROCm Environment # setup-node installs into /opt/hostedtoolcache, which is removed above.
run: | # Keep Node/pnpm setup after disk cleanup so the server frontend can be embedded.
# Add ROCm to PATH for current session - name: Setup Node
echo "/opt/rocm/bin" >> $GITHUB_PATH uses: actions/setup-node@v4
with:
node-version: 20
# Build regex pattern from ${{ env.GPU_TARGETS }} (match target as substring) - name: Setup pnpm
TARGET_REGEX="($(printf '%s' "${{ env.GPU_TARGETS }}" | sed 's/;/|/g'))" uses: pnpm/action-setup@v4
with:
# Remove library files for architectures we're not building for to save disk space version: 10.15.1
echo "Cleaning up unneeded architecture files..."
cd /opt/rocm/lib/rocblas/library
# Keep only our target architectures
for file in *; do
if printf '%s' "$file" | grep -q 'gfx'; then
if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
echo "Removing $file" &&
sudo rm -f "$file";
fi
fi
done
cd /opt/rocm/lib/hipblaslt/library
for file in *; do
if printf '%s' "$file" | grep -q 'gfx'; then
if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
echo "Removing $file" &&
sudo rm -f "$file";
fi
fi
done
- name: Build - name: Build
id: cmake_build id: cmake_build
@ -641,12 +786,12 @@ jobs:
mkdir build mkdir build
cd build cd build
cmake .. -G Ninja \ cmake .. -G Ninja \
-DCMAKE_CXX_COMPILER=amdclang++ \ -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DCMAKE_C_COMPILER=amdclang \ -DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \
-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_BUILD_TYPE=Release \
-DSD_HIPBLAS=ON \ -DSD_HIPBLAS=ON \
-DGPU_TARGETS="${{ env.GPU_TARGETS }}" \ -DHIP_PLATFORM=amd \
-DAMDGPU_TARGETS="${{ env.GPU_TARGETS }}" \ -DGPU_TARGETS="${{ matrix.gpu_targets }}" \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DSD_BUILD_SHARED_LIBS=ON -DSD_BUILD_SHARED_LIBS=ON
@ -665,16 +810,6 @@ jobs:
cp ggml/LICENSE ./build/bin/ggml.txt cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt cp LICENSE ./build/bin/stable-diffusion.cpp.txt
# Move ROCm runtime libraries (to avoid double space consumption)
sudo mv /opt/rocm/lib/librocsparse.so* ./build/bin/
sudo mv /opt/rocm/lib/libhsa-runtime64.so* ./build/bin/
sudo mv /opt/rocm/lib/libamdhip64.so* ./build/bin/
sudo mv /opt/rocm/lib/libhipblas.so* ./build/bin/
sudo mv /opt/rocm/lib/libhipblaslt.so* ./build/bin/
sudo mv /opt/rocm/lib/librocblas.so* ./build/bin/
sudo mv /opt/rocm/lib/rocblas/ ./build/bin/
sudo mv /opt/rocm/lib/hipblaslt/ ./build/bin/
- name: Fetch system info - name: Fetch system info
id: system-info id: system-info
run: | run: |
@ -689,15 +824,15 @@ jobs:
run: | run: |
cp ggml/LICENSE ./build/bin/ggml.txt cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt cp LICENSE ./build/bin/stable-diffusion.cpp.txt
zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip ./build/bin zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm-${{ matrix.ROCM_VERSION }}.zip ./build/bin
- name: Upload artifacts - name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm-${{ matrix.ROCM_VERSION }}.zip
path: | path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm-${{ matrix.ROCM_VERSION }}.zip
release: release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -712,6 +847,7 @@ jobs:
- macOS-latest-cmake - macOS-latest-cmake
- windows-latest-cmake - windows-latest-cmake
- windows-latest-cmake-hip - windows-latest-cmake-hip
- windows-latest-rocm
steps: steps:
- name: Clone - name: Clone

55
.github/workflows/stale-prs.yml vendored Normal file
View File

@ -0,0 +1,55 @@
# Workflow that labels long-inactive pull requests and later closes them
# using the actions/stale action.
name: Close inactive PRs
on:
schedule:
# Run daily at 01:30. GitHub cron schedules use UTC.
- cron: "30 1 * * *"
# Manual trigger with an optional dry-run switch (debug_only).
workflow_dispatch:
inputs:
debug_only:
description: "Dry run: log intended actions without changing PRs"
required: false
default: false
type: boolean
permissions:
issues: write
pull-requests: write
# All runs share one group keyed by the workflow name; an in-progress run is
# not cancelled when a newer run is queued.
concurrency:
group: ${{ github.workflow }}
cancel-in-progress: false
jobs:
stale-prs:
runs-on: ubuntu-latest
steps:
- name: Mark and close inactive PRs
uses: actions/stale@v10
with:
# NOTE(review): -1 presumably turns off issue handling so that only PRs are
# processed - confirm against the actions/stale documentation.
days-before-issue-stale: -1
days-before-issue-close: -1
# These two values must stay in sync with the day counts quoted in the
# stale/close messages below.
days-before-pr-stale: 365
days-before-pr-close: 7
stale-pr-label: pr:inactive
close-pr-label: pr:auto-closed
exempt-pr-labels: pr:keep-open
stale-pr-message: >
This PR has been inactive for 365 days. If there is no new activity
within 7 days, it will be closed automatically. Comment, push new
commits, or remove the pr:inactive label to keep it open. Add
pr:keep-open to exempt it from future inactive PR cleanup.
close-pr-message: >
Closing this PR because it has had no activity for 7 days after
being marked inactive. If this is still useful or ready to move
forward, feel free to reopen it with fresh context or updated
details. Sorry for any inconvenience.
remove-pr-stale-when-updated: true
delete-branch: false
operations-per-run: 100
# Dry run only for a manual dispatch with debug_only enabled; any other
# trigger (e.g. the schedule) evaluates to false.
debug-only: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_only || false }}

4
.gitmodules vendored
View File

@ -1,9 +1,9 @@
[submodule "ggml"] [submodule "ggml"]
path = ggml path = ggml
url = https://github.com/ggml-org/ggml.git url = https://github.com/leejet/ggml.git
[submodule "examples/server/frontend"] [submodule "examples/server/frontend"]
path = examples/server/frontend path = examples/server/frontend
url = https://github.com/leejet/stable-ui.git url = https://github.com/leejet/sdcpp-webui.git
[submodule "thirdparty/libwebp"] [submodule "thirdparty/libwebp"]
path = thirdparty/libwebp path = thirdparty/libwebp
url = https://github.com/webmproject/libwebp.git url = https://github.com/webmproject/libwebp.git

View File

@ -11,11 +11,42 @@ endif()
if (MSVC) if (MSVC)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS) add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING) add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
add_compile_options(
$<$<COMPILE_LANGUAGE:C>:/MP>
$<$<COMPILE_LANGUAGE:C>:/utf-8>
$<$<COMPILE_LANGUAGE:CXX>:/MP>
$<$<COMPILE_LANGUAGE:CXX>:/utf-8>
)
endif() endif()
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
if(APPLE)
# sd_set_macos_rpaths(<target>)
# Sets the build and install rpath of <target> to location-relative entries.
# Only executables and shared/module libraries are handled; every other
# target type is left unchanged.
function(sd_set_macos_rpaths target)
get_target_property(target_type ${target} TYPE)
if(target_type STREQUAL "EXECUTABLE")
# Executables: @executable_path and its sibling ../lib directory.
set(runtime_paths "@executable_path" "@executable_path/../lib")
elseif(target_type STREQUAL "SHARED_LIBRARY" OR target_type STREQUAL "MODULE_LIBRARY")
# Shared/module libraries: @loader_path and its sibling ../lib directory,
# plus an @rpath-based install name that is also used in the build tree.
set(runtime_paths "@loader_path" "@loader_path/../lib")
set_target_properties(${target} PROPERTIES
MACOSX_RPATH ON
INSTALL_NAME_DIR "@rpath"
BUILD_WITH_INSTALL_NAME_DIR ON
)
else()
# Any other target type: nothing to configure.
return()
endif()
# Release artifacts zip the build output directly, so keep macOS rpaths relocatable.
# The same rpath list is applied to both BUILD_RPATH and INSTALL_RPATH, and
# BUILD_WITH_INSTALL_RPATH is switched on for the build tree.
set_target_properties(${target} PROPERTIES
BUILD_RPATH "${runtime_paths}"
INSTALL_RPATH "${runtime_paths}"
BUILD_WITH_INSTALL_RPATH ON
)
endfunction()
endif()
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(SD_STANDALONE ON) set(SD_STANDALONE ON)
else() else()
@ -65,40 +96,46 @@ option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" O
option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF) option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
#option(SD_BUILD_SERVER "sd: build server example" ON) #option(SD_BUILD_SERVER "sd: build server example" ON)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED true)
if(SD_CUDA) if(SD_CUDA)
message("-- Use CUDA as backend stable-diffusion") message("-- Use CUDA as backend stable-diffusion")
set(GGML_CUDA ON) set(GGML_CUDA ON)
add_definitions(-DSD_USE_CUDA)
endif() endif()
if(SD_METAL) if(SD_METAL)
message("-- Use Metal as backend stable-diffusion") message("-- Use Metal as backend stable-diffusion")
set(GGML_METAL ON) set(GGML_METAL ON)
add_definitions(-DSD_USE_METAL)
endif() endif()
if (SD_VULKAN) if (SD_VULKAN)
message("-- Use Vulkan as backend stable-diffusion") message("-- Use Vulkan as backend stable-diffusion")
set(GGML_VULKAN ON) set(GGML_VULKAN ON)
add_definitions(-DSD_USE_VULKAN)
endif () endif ()
if (SD_OPENCL) if (SD_OPENCL)
message("-- Use OpenCL as backend stable-diffusion") message("-- Use OpenCL as backend stable-diffusion")
set(GGML_OPENCL ON) set(GGML_OPENCL ON)
add_definitions(-DSD_USE_OPENCL)
endif () endif ()
if (SD_HIPBLAS) if (SD_HIPBLAS)
message("-- Use HIPBLAS as backend stable-diffusion") message("-- Use HIPBLAS as backend stable-diffusion")
set(GGML_HIP ON) set(GGML_HIP ON)
add_definitions(-DSD_USE_CUDA) # ggml-hip's device-stub objects must be position-independent, or the
# default-PIE sd-cli link fails with `relocation R_X86_64_32 ... cannot be
# used when making a PIE object` on distros that default to PIE
# (Ubuntu 24.04, Fedora 40+, Debian 12+). The shared-library branch below
# already sets this; the static build (the HIP default) did not.
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
endif () endif ()
if(SD_MUSA) if(SD_MUSA)
message("-- Use MUSA as backend stable-diffusion") message("-- Use MUSA as backend stable-diffusion")
set(GGML_MUSA ON) set(GGML_MUSA ON)
add_definitions(-DSD_USE_CUDA)
endif() endif()
if(SD_WEBP) if(SD_WEBP)
@ -108,19 +145,28 @@ if(SD_WEBP)
"Or link against system library:\n cmake (...) -DSD_USE_SYSTEM_WEBP=ON") "Or link against system library:\n cmake (...) -DSD_USE_SYSTEM_WEBP=ON")
endif() endif()
if(SD_USE_SYSTEM_WEBP) if(SD_USE_SYSTEM_WEBP)
find_package(WebP REQUIRED) find_package(WebP)
add_library(webp ALIAS WebP::webp) if(WebP_FOUND)
# libwebp CMake target naming is not consistent across versions/distros. add_library(webp ALIAS WebP::webp)
# Some export WebP::libwebpmux, others export WebP::webpmux. # libwebp CMake target naming is not consistent across versions/distros.
if(TARGET WebP::libwebpmux) # Some export WebP::libwebpmux, others export WebP::webpmux.
add_library(libwebpmux ALIAS WebP::libwebpmux) if(TARGET WebP::libwebpmux)
elseif(TARGET WebP::webpmux) add_library(libwebpmux ALIAS WebP::libwebpmux)
add_library(libwebpmux ALIAS WebP::webpmux) elseif(TARGET WebP::webpmux)
add_library(libwebpmux ALIAS WebP::webpmux)
else()
message(FATAL_ERROR
"Could not find a compatible webpmux target in system WebP package. "
"Expected WebP::libwebpmux or WebP::webpmux."
)
endif()
else() else()
message(FATAL_ERROR find_package(PkgConfig REQUIRED)
"Could not find a compatible webpmux target in system WebP package. " pkg_check_modules(WebP REQUIRED IMPORTED_TARGET GLOBAL libwebp)
"Expected WebP::libwebpmux or WebP::webpmux." pkg_check_modules(WebPMux REQUIRED IMPORTED_TARGET GLOBAL libwebpmux)
) link_libraries(PkgConfig::WebP)
link_libraries(PkgConfig::WebPMux)
add_library(libwebpmux ALIAS PkgConfig::WebPMux)
endif() endif()
endif() endif()
endif() endif()
@ -135,29 +181,56 @@ if(SD_WEBM)
"Or link against system library:\n cmake (...) -DSD_USE_SYSTEM_WEBM=ON") "Or link against system library:\n cmake (...) -DSD_USE_SYSTEM_WEBM=ON")
endif() endif()
if(SD_USE_SYSTEM_WEBM) if(SD_USE_SYSTEM_WEBM)
find_path(WEBM_INCLUDE_DIR find_package(PkgConfig)
NAMES mkvmuxer/mkvmuxer.h mkvparser/mkvparser.h common/webmids.h if(PkgConfig_FOUND)
PATH_SUFFIXES webm pkg_check_modules(WebM REQUIRED IMPORTED_TARGET GLOBAL libwebm)
REQUIRED) endif()
find_library(WEBM_LIBRARY if(PkgConfig_FOUND AND WebM_FOUND)
NAMES webm libwebm link_libraries(PkgConfig::WebM)
REQUIRED) else()
find_path(WEBM_INCLUDE_DIR
NAMES mkvmuxer/mkvmuxer.h mkvparser/mkvparser.h common/webmids.h
PATH_SUFFIXES webm
REQUIRED)
find_library(WEBM_LIBRARY
NAMES webm libwebm
REQUIRED)
add_library(webm UNKNOWN IMPORTED) add_library(webm UNKNOWN IMPORTED)
set_target_properties(webm PROPERTIES set_target_properties(webm PROPERTIES
IMPORTED_LOCATION "${WEBM_LIBRARY}" IMPORTED_LOCATION "${WEBM_LIBRARY}"
INTERFACE_INCLUDE_DIRECTORIES "${WEBM_INCLUDE_DIR}") INTERFACE_INCLUDE_DIRECTORIES "${WEBM_INCLUDE_DIR}")
endif()
endif() endif()
endif() endif()
set(SD_LIB stable-diffusion) set(SD_LIB stable-diffusion)
file(GLOB SD_LIB_SOURCES file(GLOB SD_LIB_SOURCES CONFIGURE_DEPENDS
"src/*.h" "src/*.h"
"src/*.cpp" "src/*.cpp"
"src/*.hpp" "src/*.hpp"
"src/vocab/*.h" "src/conditioning/*.h"
"src/vocab/*.cpp" "src/conditioning/*.cpp"
"src/conditioning/*.hpp"
"src/core/*.h"
"src/core/*.cpp"
"src/core/*.hpp"
"src/extensions/*.h"
"src/extensions/*.cpp"
"src/extensions/*.hpp"
"src/model/*/*.h"
"src/model/*/*.cpp"
"src/model/*/*.hpp"
"src/runtime/*.h"
"src/runtime/*.cpp"
"src/runtime/*.hpp"
"src/model_io/*.h"
"src/model_io/*.cpp"
"src/tokenizers/*.h"
"src/tokenizers/*.cpp"
"src/tokenizers/vocab/*.h"
"src/tokenizers/vocab/*.cpp"
) )
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH) find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
@ -210,11 +283,14 @@ else()
add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES}) add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
endif() endif()
if(APPLE)
sd_set_macos_rpaths(${SD_LIB})
endif()
if(SD_SYCL) if(SD_SYCL)
message("-- Use SYCL as backend stable-diffusion") message("-- Use SYCL as backend stable-diffusion")
set(GGML_SYCL ON) set(GGML_SYCL ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
add_definitions(-DSD_USE_SYCL)
# disable fast-math on host, see: # disable fast-math on host, see:
# https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html # https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html
if (WIN32) if (WIN32)
@ -250,7 +326,8 @@ endif()
add_subdirectory(thirdparty) add_subdirectory(thirdparty)
target_link_libraries(${SD_LIB} PUBLIC ggml zip) target_link_libraries(${SD_LIB} PUBLIC ggml zip)
target_include_directories(${SD_LIB} PUBLIC . include) target_include_directories(${SD_LIB} PUBLIC . src include)
target_include_directories(${SD_LIB} PRIVATE src/core)
target_include_directories(${SD_LIB} PUBLIC . thirdparty) target_include_directories(${SD_LIB} PUBLIC . thirdparty)
target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17) target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)

67
CONTRIBUTING.md Normal file
View File

@ -0,0 +1,67 @@
# Contributing
This document collects general contribution conventions for this repository.
## Before You Start
Before opening a PR, please search existing PRs to avoid duplicating ongoing work.
For large-scale refactors or changes with broad impact, please open an issue first to discuss the approach before submitting a PR.
If you want to update a third-party dependency, please open an issue first instead of submitting a direct PR. See [Dependency Updates](#dependency-updates) for details.
## Pull Requests
Keep each PR focused on one clear change. Large or overly complex PRs are harder to review and may not be merged.
Follow Conventional Commit-style subjects seen in history: `feat:`, `fix:`, `refactor:`, `ci:`, `docs:`, `chore:`. Keep subjects imperative and scoped.
PRs should include:
- What changed and why (short problem/solution summary).
- Verification evidence when applicable (commands and key outputs).
- Linked issue/PR context when applicable.
- Screenshots or sample outputs for UI/visual behavior changes.
## Code Style
Format code according to the repository style before submitting changes.
Formatting follows `.clang-format` (Chromium base, 4-space indent, no tabs). Run `format-code.sh` before opening a PR. Keep code compatible with C++17 and follow the patterns already used in this repo.
Naming conventions:
- Use `PascalCase` for class/struct/type names.
- In `PascalCase` names, preserve common abbreviations in uppercase, for example `SD`, `API`, `HTTP`, `JSON`, `RGB`, `VAE`, `TAE`, `LoRA`, and `WebP`.
- Use `snake_case` for functions, methods, variables, and file names unless an existing API requires a different style.
- Use a trailing underscore for private data member names, for example `hidden_size_` or `tokenizer_`.
- Use `.h` for C and C++ header files. Do not introduce new `.hpp` headers.
- Use macro-based header include guards instead of `#pragma once`.
- Format header include guards as `__SD_{PATH}__`, where `{PATH}` is the header path in uppercase snake case, with the `.` before the file extension written as `_`. For example, `src/sample.h` should use `__SD_SAMPLE_H__`.
- Do not introduce anonymous namespaces in new or modified code; prefer `static` file-local functions/variables or an explicit named namespace when scoping is needed.
- In `class`/`struct` definitions, place data members before member functions unless an existing type already clearly follows a different pattern.
- Keep `test_*.cpp` / `test_*.py` naming for tests.
Some older code in the project may not fully follow the current conventions. Please do not submit PRs that only rewrite existing code to match style rules.
When adding or modifying model implementations, follow the model config and weight detection conventions in [docs/model_config.md](docs/model_config.md).
## AI-Assisted Contributions
AI tools may be used to assist development, but contributors are responsible for the quality and correctness of the submitted code.
If any part of a contribution was generated with AI assistance, the contributor must perform a thorough human review before submitting the PR and understand every changed line.
Do not list AI tools as co-authors. The human contributor is the sole responsible author of the submitted code.
Please do not submit AI-generated code that you do not understand, and do not include meaningless experiments, temporary test code, or unrelated generated output in a PR.
## Dependency Updates
Do not submit PRs that update `ggml`. `ggml` updates are performed only after local validation by the maintainer.
Other third-party dependencies are not updated unless necessary. If you want to update a dependency, please open an issue first instead of submitting a direct PR.
## Security & Configuration
Do not commit model weights, secrets, or local absolute paths. Keep large binaries out of git unless intentionally tracked release assets.

View File

@ -2,7 +2,18 @@ ARG UBUNTU_VERSION=24.04
FROM ubuntu:$UBUNTU_VERSION AS build FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake # sd-server embeds the web UI at build time, so the build image needs Node/pnpm.
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake ca-certificates curl gnupg && \
mkdir -p /etc/apt/keyrings && \
curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource-repo.gpg.key && \
gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key && \
rm /tmp/nodesource-repo.gpg.key && \
echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
apt-get update && \
apt-get install -y --no-install-recommends nodejs && \
npm install -g pnpm@10.15.1 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
WORKDIR /sd.cpp WORKDIR /sd.cpp
@ -20,4 +31,4 @@ RUN apt-get update && \
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
ENTRYPOINT [ "/sd-cli" ] ENTRYPOINT [ "/sd-cli" ]

View File

@ -3,14 +3,31 @@ ARG UBUNTU_VERSION=24.04
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake # sd-server embeds the web UI at build time, so the build image needs Node/pnpm.
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake ca-certificates curl gnupg && \
mkdir -p /etc/apt/keyrings && \
curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource-repo.gpg.key && \
gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key && \
rm /tmp/nodesource-repo.gpg.key && \
echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
apt-get update && \
apt-get install -y --no-install-recommends nodejs && \
npm install -g pnpm@10.15.1 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
WORKDIR /sd.cpp WORKDIR /sd.cpp
COPY . . COPY . .
ARG CUDACXX=/usr/local/cuda/bin/nvcc ARG CUDACXX=/usr/local/cuda/bin/nvcc
RUN cmake . -B ./build -DSD_CUDA=ON ARG CUDA_ARCHITECTURES=""
ARG GGML_CUDA_FA_ALL_QUANTS=""
RUN cmake . -B ./build \
-DSD_CUDA=ON \
${CUDA_ARCHITECTURES:+-DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHITECTURES}"} \
${GGML_CUDA_FA_ALL_QUANTS:+-DGGML_CUDA_FA_ALL_QUANTS=${GGML_CUDA_FA_ALL_QUANTS}}
RUN cmake --build ./build --config Release -j$(nproc) RUN cmake --build ./build --config Release -j$(nproc)
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime

View File

@ -3,7 +3,18 @@ ARG UBUNTU_VERSION=22.04
FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 as build FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 as build
RUN apt-get update && apt-get install -y ccache cmake git # sd-server embeds the web UI at build time, so the build image needs Node/pnpm.
RUN apt-get update && apt-get install -y --no-install-recommends ccache cmake git ca-certificates curl gnupg && \
mkdir -p /etc/apt/keyrings && \
curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource-repo.gpg.key && \
gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key && \
rm /tmp/nodesource-repo.gpg.key && \
echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
apt-get update && \
apt-get install -y --no-install-recommends nodejs && \
npm install -g pnpm@10.15.1 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
WORKDIR /sd.cpp WORKDIR /sd.cpp
@ -21,4 +32,4 @@ FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 as runt
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
ENTRYPOINT [ "/sd-cli" ] ENTRYPOINT [ "/sd-cli" ]

View File

@ -1,8 +1,20 @@
ARG SYCL_VERSION=2025.1.0-0 # ggml SYCL hardware detection uses BMG G31/WCL architecture enums added in oneAPI 2025.3.
ARG SYCL_VERSION=2025.3.2-0
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS build FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS build
RUN apt-get update && apt-get install -y cmake # sd-server embeds the web UI at build time, so the build image needs Node/pnpm.
RUN apt-get update && apt-get install -y --no-install-recommends cmake ca-certificates curl gnupg && \
mkdir -p /etc/apt/keyrings && \
curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource-repo.gpg.key && \
gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key && \
rm /tmp/nodesource-repo.gpg.key && \
echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
apt-get update && \
apt-get install -y --no-install-recommends nodejs && \
npm install -g pnpm@10.15.1 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
WORKDIR /sd.cpp WORKDIR /sd.cpp

View File

@ -2,7 +2,18 @@ ARG UBUNTU_VERSION=24.04
FROM ubuntu:$UBUNTU_VERSION AS build FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake libvulkan-dev glslc # sd-server embeds the web UI at build time, so the build image needs Node/pnpm.
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake libvulkan-dev glslc spirv-headers ca-certificates curl gnupg && \
mkdir -p /etc/apt/keyrings && \
curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource-repo.gpg.key && \
gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key && \
rm /tmp/nodesource-repo.gpg.key && \
echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
apt-get update && \
apt-get install -y --no-install-recommends nodejs && \
npm install -g pnpm@10.15.1 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
WORKDIR /sd.cpp WORKDIR /sd.cpp

View File

@ -15,26 +15,18 @@ API and command-line option may change frequently.***
## 🔥Important News ## 🔥Important News
* **2026/06/04** 🚀 stable-diffusion.cpp now supports **Ideogram4**
* **2026/05/31** 🚀 stable-diffusion.cpp now supports **PiD**
* **2026/05/27** 🚀 stable-diffusion.cpp now supports **Lens**
* **2026/05/17** 🚀 stable-diffusion.cpp now supports **LTX-2.3**
* **2026/04/11** 🚀 stable-diffusion.cpp now uses a brand-new embedded web UI.
* **2026/01/18** 🚀 stable-diffusion.cpp now supports **FLUX.2-klein** * **2026/01/18** 🚀 stable-diffusion.cpp now supports **FLUX.2-klein**
👉 Details: [PR #1193](https://github.com/leejet/stable-diffusion.cpp/pull/1193)
* **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image** * **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image**
👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020)
* **2025/11/30** 🚀 stable-diffusion.cpp now supports **FLUX.2-dev** * **2025/11/30** 🚀 stable-diffusion.cpp now supports **FLUX.2-dev**
👉 Details: [PR #1016](https://github.com/leejet/stable-diffusion.cpp/pull/1016)
* **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509** * **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**
👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)
* **2025/10/12** 🚀 stable-diffusion.cpp now supports **Qwen-Image** * **2025/10/12** 🚀 stable-diffusion.cpp now supports **Qwen-Image**
👉 Details: [PR #851](https://github.com/leejet/stable-diffusion.cpp/pull/851)
* **2025/09/14** 🚀 stable-diffusion.cpp now supports **Wan2.1 Vace** * **2025/09/14** 🚀 stable-diffusion.cpp now supports **Wan2.1 Vace**
👉 Details: [PR #819](https://github.com/leejet/stable-diffusion.cpp/pull/819)
* **2025/09/06** 🚀 stable-diffusion.cpp now supports **Wan2.1 / Wan2.2** * **2025/09/06** 🚀 stable-diffusion.cpp now supports **Wan2.1 / Wan2.2**
👉 Details: [PR #778](https://github.com/leejet/stable-diffusion.cpp/pull/778)
## Features ## Features
@ -48,17 +40,25 @@ API and command-line option may change frequently.***
- [SD3/SD3.5](./docs/sd3.md) - [SD3/SD3.5](./docs/sd3.md)
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md) - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
- [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md) - [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
- [Lens](./docs/lens.md)
- [Chroma](./docs/chroma.md) - [Chroma](./docs/chroma.md)
- [Chroma1-Radiance](./docs/chroma_radiance.md) - [Chroma1-Radiance](./docs/chroma_radiance.md)
- [Qwen Image](./docs/qwen_image.md) - [Qwen Image](./docs/qwen_image.md)
- [PiD](./docs/pid.md)
- [LongCat Image](./docs/longcat_image.md)
- [Z-Image](./docs/z_image.md) - [Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md) - [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md) - [Anima](./docs/anima.md)
- [ERNIE-Image](./docs/ernie_image.md)
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
- [Ideogram4](./docs/ideogram4.md)
- Image Edit Models - Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md) - [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Qwen Image Edit series](./docs/qwen_image_edit.md) - [Qwen Image Edit series](./docs/qwen_image_edit.md)
- [LongCat Image Edit](./docs/longcat_image.md)
- Video Models - Video Models
- [Wan2.1/Wan2.2](./docs/wan.md) - [Wan2.1/Wan2.2](./docs/wan.md)
- [LTX-2.3](./docs/ltx2.md)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support. - [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
- Control Net support with SD 1.5 - Control Net support with SD 1.5
- LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora) - LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
@ -73,9 +73,10 @@ API and command-line option may change frequently.***
- OpenCL - OpenCL
- SYCL - SYCL
- Supported weight formats - Supported weight formats
- Pytorch checkpoint (`.ckpt` or `.pth`) - Pytorch checkpoint (`.ckpt` or `.pth` or `.pt`)
- Safetensors (`.safetensors`) - Safetensors (`.safetensors`)
- GGUF (`.gguf`) - GGUF (`.gguf`)
- Convert mode supports converting model weights to `.gguf` or `.safetensors`
- Supported platforms - Supported platforms
- Linux - Linux
- Mac OS - Mac OS
@ -93,6 +94,7 @@ API and command-line option may change frequently.***
- `DPM++ 2M` - `DPM++ 2M`
- [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457) - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
- `DPM++ 2S a` - `DPM++ 2S a`
- `ER-SDE`
- [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952) - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
- Cross-platform reproducibility - Cross-platform reproducibility
- `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG` - `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
@ -126,9 +128,11 @@ API and command-line option may change frequently.***
## Performance ## Performance
If you want to improve performance or reduce VRAM/RAM usage, please refer to [performance guide](./docs/performance.md). If you want to improve performance or reduce VRAM/RAM usage, please refer to [performance guide](./docs/performance.md).
For runtime and parameter backend placement, see the [backend selection guide](./docs/backend.md).
## More Guides ## More Guides
- [Backend selection](./docs/backend.md)
- [SD1.x/SD2.x/SDXL](./docs/sd.md) - [SD1.x/SD2.x/SDXL](./docs/sd.md)
- [SD3/SD3.5](./docs/sd3.md) - [SD3/SD3.5](./docs/sd3.md)
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md) - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
@ -138,9 +142,14 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
- [🔥Qwen Image](./docs/qwen_image.md) - [🔥Qwen Image](./docs/qwen_image.md)
- [🔥Qwen Image Edit series](./docs/qwen_image_edit.md) - [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
- [🔥Wan2.1/Wan2.2](./docs/wan.md) - [🔥Wan2.1/Wan2.2](./docs/wan.md)
- [🔥LTX-2.3](./docs/ltx2.md)
- [🔥Z-Image](./docs/z_image.md) - [🔥Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md) - [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md) - [Anima](./docs/anima.md)
- [ERNIE-Image](./docs/ernie_image.md)
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
- [Lens](./docs/lens.md)
- [LongCat Image / LongCat Image Edit](./docs/longcat_image.md)
- [LoRA](./docs/lora.md) - [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md) - [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md) - [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
@ -156,6 +165,7 @@ These projects wrap `stable-diffusion.cpp` for easier use in other languages/fra
* Golang (non-cgo): [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion) * Golang (non-cgo): [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
* Golang (cgo): [Binozo/GoStableDiffusion](https://github.com/Binozo/GoStableDiffusion) * Golang (cgo): [Binozo/GoStableDiffusion](https://github.com/Binozo/GoStableDiffusion)
* Golang (non-cgo): [l8bloom/gosd](https://github.com/l8bloom/gosd)
* C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET) * C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET)
* Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python) * Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python)
* Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs) * Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs)

Binary file not shown.

After

Width:  |  Height:  |  Size: 595 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 562 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.2 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.5 MiB

BIN
assets/lens/example.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 630 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 555 KiB

BIN
assets/longcat/example.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 423 KiB

BIN
assets/ltx2/flf2v.webm Normal file

Binary file not shown.

BIN
assets/ltx2/hires_i2v.webm Normal file

Binary file not shown.

BIN
assets/ltx2/i2v.webm Normal file

Binary file not shown.

BIN
assets/ltx2/t2v.webm Normal file

Binary file not shown.

BIN
assets/pid/example.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.0 MiB

122
docs/backend.md Normal file
View File

@ -0,0 +1,122 @@
# Backend selection
`stable-diffusion.cpp` has two backend assignments:
- `--backend` selects the runtime backend used to execute model graphs.
- `--params-backend` selects the backend used to allocate model parameters.
If `--params-backend` is not set, each module's parameters are allocated on the same backend as that module's runtime backend.
## Syntax
A backend assignment can be a single backend name:
```shell
sd-cli -m model.safetensors -p "a cat" --backend cpu
```
This applies to every module that does not have a more specific assignment.
Assignments can also target individual modules:
```shell
sd-cli -m model.safetensors -p "a cat" --backend te=cpu,vae=cuda0,diffusion=vulkan0
```
The same syntax is used for parameter placement:
```shell
sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend te=cpu,vae=cpu
```
Module names are case-insensitive. Hyphens and underscores in module names are ignored, so `clip_vision`, `clip-vision`, and `clipvision` are equivalent.
`all=`, `default=`, and `*=` can be used to set the default backend inside a mixed assignment:
```shell
sd-cli -m model.safetensors -p "a cat" --backend all=cuda0,te=cpu
```
## Modules
| Module | Purpose | Accepted names |
| --- | --- | --- |
| `diffusion` | UNet, DiT, MMDiT, Flux, Wan, Qwen Image, and other diffusion models | `diffusion`, `model`, `unet`, `dit` |
| `te` | Text encoders and conditioners | `te`, `clip`, `text`, `textencoder`, `textencoders`, `conditioner`, `cond`, `llm`, `t5`, `t5xxl` |
| `clip_vision` | CLIP vision encoder | `clip_vision`, `clipvision`, `clip-vision`, `vision` |
| `vae` | VAE and TAE | `vae`, `firststage`, `autoencoder`, `tae` |
| `controlnet` | ControlNet | `controlnet`, `control` |
| `photomaker` | PhotoMaker ID encoder and PhotoMaker LoRA | `photomaker`, `photomakerid`, `pmid`, `photo` |
| `upscaler` | ESRGAN upscaler | `upscaler`, `esrgan`, `hires` |
`te` is the preferred module name for text encoders. `clip` is kept as an accepted alias because many existing commands and model names use CLIP terminology.
## Backend names
Backend names are resolved against the GGML backend device list. Matching is case-insensitive and accepts either an exact name or a unique prefix. Common values include:
- `cpu`
- `cuda0`
- `vulkan0`
- `metal`
The special values `auto`, `default`, and an empty backend name select the default backend. The default preference is GPU, then integrated GPU, then CPU.
The special value `gpu` selects the first GPU backend, falling back to the first integrated GPU backend.
## Runtime backend vs. parameter backend
The runtime backend controls where graph execution runs. The parameter backend controls where model weights are allocated.
For example:
```shell
sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend cpu
```
This runs all modules on `cuda0`, but stores parameters in CPU RAM. During execution, parameters are moved to the runtime backend as needed.
Per-module assignments can be mixed:
```shell
sd-cli -m model.safetensors -p "a cat" --backend diffusion=cuda0,te=cpu,vae=cpu --params-backend diffusion=cuda0,te=cpu,vae=cpu
```
This keeps text encoding and VAE execution on CPU while the diffusion model runs on GPU.
## Backend sharing and lifetime
Backends are managed by `SDBackendManager`.
Within one manager, backend instances are cached by resolved backend device name. If multiple modules request the same backend, they share the same `ggml_backend_t`.
For example:
```shell
--backend te=cpu,vae=cpu
```
uses one shared CPU backend for both `te` and `vae` runtime execution.
Runtime and parameter assignments also share the same backend cache. If `--backend diffusion=cuda0` and `--params-backend diffusion=cuda0` resolve to the same device, both use the same backend instance.
`SDBackendManager` owns the backend instances and frees them when the context or upscaler is destroyed. Model runners receive non-owning runtime and parameter backend pointers and do not free them.
## Compatibility flags
The older CPU placement flags are still supported:
- `--clip-on-cpu`
- `--vae-on-cpu`
- `--control-net-cpu`
- `--offload-to-cpu`
`--clip-on-cpu`, `--vae-on-cpu`, and `--control-net-cpu` affect runtime backend assignment only when `--backend` is not set. They map to `te=cpu`, `vae=cpu`, and `controlnet=cpu`.
`--offload-to-cpu` affects parameter backend assignment only when `--params-backend` is not set. It is equivalent to:
```shell
--params-backend cpu
```
Explicit `--backend` and `--params-backend` assignments are preferred for new commands.

View File

@ -102,6 +102,11 @@ cmake --build . --config Release
## Build with Vulkan ## Build with Vulkan
Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/. Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
On Ubuntu, install the Vulkan development packages and SPIR-V headers:
```shell
sudo apt-get install build-essential libvulkan-dev glslc spirv-headers
```
```shell ```shell
mkdir build && cd build mkdir build && cd build

View File

@ -131,8 +131,6 @@ sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum
| `warmup` | Steps to always compute before caching starts | 4 | | `warmup` | Steps to always compute before caching starts | 4 |
| `stop` | Stop caching at this fraction of total steps | 0.9 | | `stop` | Stop caching at this fraction of total steps | 0.9 |
```
### Performance Tips ### Performance Tips
- Start with default thresholds and adjust based on output quality - Start with default thresholds and adjust based on output quality

View File

@ -87,51 +87,32 @@ pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
```bash ```bash
python convert_diffusers_to_original_stable_diffusion.py \ python convert_diffusers_to_original_stable_diffusion.py \
--model_path ./segmindtiny-sd \ --model_path ./segmindtiny-sd \
--checkpoint_path ./segmind_tiny-sd.ckpt --half --checkpoint_path ./segmind_tiny-sd.safetensors --half --use_safetensors
``` ```
The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above. The file segmind_tiny-sd.safetensors will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
##### Another available .ckpt file: ### SDXS-512-DreamShaper
* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
To use this file, you must first adjust its non-contiguous tensors:
```python
import torch
ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
for key, value in ckpt['state_dict'].items():
if isinstance(value, torch.Tensor):
ckpt['state_dict'][key] = value.contiguous()
torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
```
### SDXS-512
Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of U-Net part and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of default _AutoEncoderKL_ for the VAE part. Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of U-Net part and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of default _AutoEncoderKL_ for the VAE part.
##### Some ready-to-run SDXS-512 model files are available online, such as:
##### 1. Download the diffusers model from Hugging Face using Python: * https://huggingface.co/akleine/sdxs-512
* https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF
```python
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
pipe.save_pretrained(save_directory="sdxs")
```
##### 2. Create a safetensors file
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
```
##### 3. Run the model as follows:
##### Run the model as follows:
```bash ```bash
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \ ~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
--cfg-scale 1 --steps 1 --cfg-scale 1 --steps 1
``` ```
Both options, `--cfg-scale 1` and `--steps 1`, are mandatory here.
Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here. ### SDXS-512-0.9
Even though the name "SDXS-512-0.9" is similar to "SDXS-512-DreamShaper", it is a *completely different* model, but it is also **incredibly fast**. Some users prefer its output, so try it yourself.
##### Download a ready-to-run file from here:
* https://huggingface.co/akleine/sdxs-09
As with SDXS-512-DreamShaper, both options `--cfg-scale 1` and `--steps 1` are required when using this model.

35
docs/ernie_image.md Normal file
View File

@ -0,0 +1,35 @@
# How to Use
You can run ERNIE-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or even less.
## Download weights
- Download ERNIE-Image-Turbo
- safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/diffusion_models
- gguf: https://huggingface.co/unsloth/ERNIE-Image-Turbo-GGUF/tree/main
- Download ERNIE-Image
- safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/diffusion_models
- gguf: https://huggingface.co/unsloth/ERNIE-Image-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/vae
- Download ministral 3b
- safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/text_encoders
- gguf: https://huggingface.co/unsloth/Ministral-3-3B-Instruct-2512-GGUF/tree/main
## Examples
### ERNIE-Image-Turbo
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\ernie-image-turbo.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\ministral-3-3b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 8 -v --offload-to-cpu --diffusion-fa
```
<img width="256" alt="ERNIE-Image Turbo example" src="../assets/ernie_image/turbo_example.png" />
### ERNIE-Image
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\ernie-image-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\ministral-3-3b.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa
```
<img width="256" alt="ERNIE-Image example" src="../assets/ernie_image/example.png" />

View File

@ -8,6 +8,8 @@
- gguf: https://huggingface.co/city96/FLUX.2-dev-gguf/tree/main - gguf: https://huggingface.co/city96/FLUX.2-dev-gguf/tree/main
- Download vae - Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
- Download FLUX.2-small-decoder (full_encoder_small_decoder.safetensors) as an alternative VAE option
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-small-decoder/tree/main
- Download Mistral-Small-3.2-24B-Instruct-2506-GGUF - Download Mistral-Small-3.2-24B-Instruct-2506-GGUF
- gguf: https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/tree/main - gguf: https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/tree/main
@ -31,6 +33,8 @@
- gguf: https://huggingface.co/leejet/FLUX.2-klein-base-4B-GGUF/tree/main - gguf: https://huggingface.co/leejet/FLUX.2-klein-base-4B-GGUF/tree/main
- Download vae - Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
- Download FLUX.2-small-decoder (full_encoder_small_decoder.safetensors) as an alternative VAE option
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-small-decoder/tree/main
- Download Qwen3 4b - Download Qwen3 4b
- safetensors: https://huggingface.co/Comfy-Org/flux2-klein-4B/tree/main/split_files/text_encoders - safetensors: https://huggingface.co/Comfy-Org/flux2-klein-4B/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/unsloth/Qwen3-4B-GGUF/tree/main - gguf: https://huggingface.co/unsloth/Qwen3-4B-GGUF/tree/main

20
docs/hidream_o1_image.md Normal file
View File

@ -0,0 +1,20 @@
# How to Use
## Download weights
- Download HiDream-O1-Image-Dev
- safetensors: https://huggingface.co/Comfy-Org/HiDream-O1-Image/tree/main/checkpoints
- Download HiDream-O1-Image
- safetensors: https://huggingface.co/Comfy-Org/HiDream-O1-Image/tree/main/checkpoints
## Examples
### HiDream-O1-Image-Dev
```
.\bin\Release\sd-cli.exe -m ..\..\ComfyUI\models\diffusion_models\hidream_o1_image_dev_bf16.safetensors -p "a lovely cat holding a sign says 'hidream o1 cpp'" --cfg-scale 1.0 -v -H 1024 -W 1024
```
<img width="256" alt="HiDream-O1-Image-Dev example" src="../assets/hidream-o1/dev_example.png" />

View File

@ -26,12 +26,12 @@ Fortunately, `AMD` provides complete help documentation, you can use the help do
Then we must set `ROCM` as environment variables before running cmake. Then we must set `ROCM` as environment variables before running cmake.
Usually if you install according to the official tutorial and do not modify the ROCM path, then there is a high probability that it is here `C:\Program Files\AMD\ROCm\5.5\bin` Usually if you install according to the official tutorial and do not modify the ROCM path, then there is a high probability that it is here `C:\Program Files\AMD\ROCm\7.1.1\bin`
This is what I use to set the clang: This is what I use to set the clang:
```Commandline ```Commandline
set CC=C:\Program Files\AMD\ROCm\5.5\bin\clang.exe set CC=C:\Program Files\AMD\ROCm\7.1.1\bin\clang.exe
set CXX=C:\Program Files\AMD\ROCm\5.5\bin\clang++.exe set CXX=C:\Program Files\AMD\ROCm\7.1.1\bin\clang++.exe
``` ```
## Ninja ## Ninja
@ -46,7 +46,7 @@ set ninja=C:\Program Files\ninja\ninja.exe
## Building stable-diffusion.cpp ## Building stable-diffusion.cpp
The thing different from the regular CPU build is `-DSD_HIPBLAS=ON` , The thing different from the regular CPU build is `-DSD_HIPBLAS=ON` ,
`-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, `-DAMDGPU_TARGETS=gfx1100` `-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, `-DAMDGPU_TARGETS=gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032`
>**Notice**: check the `clang` and `clang++` information: >**Notice**: check the `clang` and `clang++` information:
```Commandline ```Commandline
@ -59,26 +59,29 @@ If you see like this, we can continue:
clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be) clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be)
Target: x86_64-pc-windows-msvc Target: x86_64-pc-windows-msvc
Thread model: posix Thread model: posix
InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin InstalledDir: C:\Program Files\AMD\ROCm\7.1.1\bin
``` ```
``` ```
clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be) clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be)
Target: x86_64-pc-windows-msvc Target: x86_64-pc-windows-msvc
Thread model: posix Thread model: posix
InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin InstalledDir: C:\Program Files\AMD\ROCm\7.1.1\bin
``` ```
>**Notice** that the `gfx1100` is the GPU architecture of my GPU, you can change it to your GPU architecture. Click here to see your architecture [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus) >**Notice** that the GPU targets are now compatible with multiple GPU architectures (ROCm 7.1.1 targets). You can change them to match your GPU architecture. Click here to see your architecture [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus)
My GPU is AMD Radeon™ RX 7900 XTX Graphics, so I set it to `gfx1100`. Examples:
- AMD Radeon™ RX 7900 XTX Graphics: `gfx1100`
- AMD Radeon™ RX 7900 XT Graphics: `gfx1100`
- AMD Radeon™ RX 7900 GRE Graphics: `gfx1100`
- AMD Radeon™ RX 7800 XT Graphics: `gfx1101`
- AMD Radeon™ RX 7600 Graphics: `gfx1102`
option: option:
```commandline ```commandline
mkdir build mkdir build
cd build cd build
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100 cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
cmake --build . --config Release cmake --build . --config Release
``` ```

40
docs/ideogram4.md Normal file
View File

@ -0,0 +1,40 @@
# How to Use
## Download weights
- Download Ideogram4
- safetensors: https://huggingface.co/ideogram-ai/ideogram-4-fp8/tree/main/transformer
- Download Ideogram4 uncond
- safetensors: https://huggingface.co/ideogram-ai/ideogram-4-fp8/tree/main/unconditional_transformer
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
- Download Qwen3-VL-8B-Instruct
- gguf: https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main
## Convert weights
fp8 scale -> bf16
```
python .\convert_fp8_scale_to_bf16.py --input .\ideogram4_fp8.safetensors --output ideogram4_bf16.safetensors
python .\convert_fp8_scale_to_bf16.py --input .\ideogram4_uncond_fp8.safetensors --output ideogram4_uncond_bf16.safetensors
```
bf16 -> q8
```
.\bin\Release\sd-cli.exe -M convert -m ideogram4_bf16.safetensors -o ideogram4-Q8_0.gguf --tensor-type-rules "^layers.*adaln_modulation.*weight=q8_0,layers.*attention.o.*weight=q8_0,layers.*attention.qkv.*weight=q8_0,layers.*feed_forward.*weight=q8_0" -v
.\bin\Release\sd-cli.exe -M convert -m ideogram4_uncond_bf16.safetensors -o ideogram4_uncond-Q8_0.gguf --tensor-type-rules "^layers.*adaln_modulation.*weight=q8_0,layers.*attention.o.*weight=q8_0,layers.*attention.qkv.*weight=q8_0,layers.*feed_forward.*weight=q8_0" -v
```
If you want lower VRAM usage, you can replace `q8_0` in the rules above with a lower-precision quantization type, such as `q4_0`.
## Examples
```sh
.\bin\Release\sd-cli.exe --diffusion-model ideogram4-Q8_0.gguf --uncond-diffusion-model ideogram4_uncond-Q8_0.gguf --llm ..\..\llm\Qwen3VL-8B-Instruct-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors -p '{"high_level_description":"A square 1024 x 1024 luxury fashion magazine cover featuring exactly one short chubby fluffy cat as the main model. The cat sits on a soft ivory studio floor, facing the viewer with a stylish calm expression, wearing tiny black sunglasses, a red silk scarf, and a small gold collar charm. In front of the cat on the floor is a wide horizontal luxury nameplate that clearly reads ideogram4.cpp. The whole design feels premium, fashionable, clean, and editorial.","style_description":{"aesthetics":"luxury fashion magazine cover, high-end pet couture campaign, minimalist editorial design, elegant studio photography, soft paper texture, refined typography, fashionable and polished","lighting":"Soft diffused studio lighting, gentle spotlight on the cat, subtle floor shadow, warm ivory highlights, clean separation between subject and background","photo":"high-resolution fashion editorial photography look, front-facing cat portrait, crisp fur details, glossy sunglasses, clear readable nameplate text, shallow depth of field","medium":"mixed media fashion photography and premium editorial graphic design","color_palette":["#F4EFE7","#111111","#D8B56D","#B73A3A","#FFFFFF","#8A7A6A"]},"compositional_deconstruction":{"canvas":"Square 1024 x 1024 canvas with a normal upright orientation. Do not rotate the poster or any text. Use a clean fashion magazine cover layout.","background":"Warm ivory studio backdrop with subtle paper grain, a soft spotlight gradient, faint floor shadow, and a few minimal gold editorial lines. The background is spacious, premium, and uncluttered.","layout":"Top center has a small elegant headline. Center area features one cat as the main fashion model. 
Lower foreground has a wide horizontal luxury nameplate placed on the floor in front of the cat. Bottom center has a small footer. All text is horizontal, upright, and readable left to right.","elements":[{"type":"text","desc":"Top center headline reading LOOK WHAT I FOUND in a refined high-fashion serif font. The headline is horizontal, centered, elegant, and secondary to the nameplate text."},{"type":"obj","desc":"Exactly one short chubby fluffy cat sitting in the center like a luxury fashion model. The cat has a large round head, compact body, short legs, soft detailed fur, expressive eyes, and a calm confident pose. The cat is cute and rounded, not tall, not stretched, not duplicated."},{"type":"obj","desc":"Tiny glossy black sunglasses worn naturally by the cat, slightly oversized but still showing the cat face clearly. The sunglasses add a chic fashion-editorial attitude."},{"type":"obj","desc":"A red silk scarf tied neatly around the cat neck, with soft folds and a couture feeling. The scarf must not cover the cat face or the nameplate."},{"type":"obj","desc":"A small gold collar charm or fashion accessory under the scarf, subtle and premium, adding a luxury campaign detail."},{"type":"obj","desc":"In the lower foreground, place a wide horizontal luxury nameplate on the floor in front of the cat. The nameplate is low, flat, landscape-oriented, much wider than tall, like a fashion show seat card or premium display plaque. It is centered, front-facing, level, and fully visible. It must not become vertical, tall, standing, rotated, or side-facing."},{"type":"text","desc":"Print the exact text ideogram4.cpp only on the wide horizontal nameplate. Use clean bold black lettering, perfectly spelled, lowercase, with the number 4 and .cpp extension. 
The text must fit completely inside the nameplate, stay horizontal, and be readable from left to right."},{"type":"obj","desc":"Add sparse premium editorial accents around the edges: thin gold lines, small code brackets, tiny cursor marks, subtle dots, and minimal geometric details. No extra cats, no stickers, no animal faces, no busy decorations."},{"type":"text","desc":"Bottom center footer reading tiny paws, big compile energy in a small refined monospace or editorial font. The footer is horizontal, centered, understated, and much smaller than the nameplate text."}]}}' --diffusion-fa -v --offload-to-cpu -H 1024 -W 1024
```
<img alt="ideogram4 image example" src="../assets/ideogram4/example.png" />

32
docs/lens.md Normal file
View File

@ -0,0 +1,32 @@
# How to Use
Lens uses a Lens diffusion transformer, the FLUX.2 VAE, and GPT-OSS-20B as the LLM text encoder.
## Download weights
- Download Lens
- safetensors: https://huggingface.co/Comfy-Org/Lens/tree/main/diffusion_models
- Download Lens Turbo
- safetensors: https://huggingface.co/Comfy-Org/Lens/tree/main/diffusion_models
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
- Download GPT-OSS-20B
- gguf: https://huggingface.co/unsloth/gpt-oss-20b-GGUF/tree/main
## Examples
### Lens
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\lens_bf16.safetensors --llm "..\..\llm\gpt-oss-20b-UD-Q8_K_XL.gguf" --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --cfg-scale 5.0 -p "A crystal dragon soaring through an aurora borealis sky, its entire body made of transparent faceted crystal refracting the green and purple aurora light into rainbow spectra, ice particles trailing from its wings, high fantasy digital art" --diffusion-fa -v
```
<img width="256" alt="Lens example" src="../assets/lens/example.png" />
### Lens Turbo
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\lens_turbo_bf16.safetensors --llm "..\..\llm\gpt-oss-20b-UD-Q8_K_XL.gguf" --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --cfg-scale 1.0 -p "A crystal dragon soaring through an aurora borealis sky, its entire body made of transparent faceted crystal refracting the green and purple aurora light into rainbow spectra, ice particles trailing from its wings, high fantasy digital art" --diffusion-fa -v --steps 4
```
<img width="256" alt="Lens Turbo example" src="../assets/lens/turbo_example.png" />

30
docs/longcat_image.md Normal file
View File

@ -0,0 +1,30 @@
# How to Use
LongCat-Image uses a LongCat diffusion transformer, the FLUX VAE, and Qwen2.5-VL as the LLM text encoder.
## Download weights
- Download LongCat Image
- safetensors: https://huggingface.co/Comfy-Org/LongCat-Image/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/vantagewithai/LongCat-Image-GGUF/tree/main/comfy
- Download LongCat Image Edit
- LongCat Image Edit Turbo: https://huggingface.co/meituan-longcat/LongCat-Image-Edit-Turbo
- gguf: https://huggingface.co/vantagewithai/LongCat-Image-Edit-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download qwen_2.5_vl 7b
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/mradermacher/Qwen2.5-VL-7B-Instruct-GGUF/tree/main
- For image editing with GGUF text encoders, also download the matching mmproj file and pass it with `--llm_vision`.
## Run
LongCat uses quoted text for character-level text rendering. Put target text inside single quotes, double quotes, or Chinese quotes.
### LongCat Image
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\LongCat-Image-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p "a lovely cat holding a sign says 'longcat.cpp'" --cfg-scale 5.0 --sampling-method euler --flow-shift 3 -v --offload-to-cpu --diffusion-fa
```
<img alt="longcat example" src="../assets/longcat/example.png" />

77
docs/ltx2.md Normal file
View File

@ -0,0 +1,77 @@
# How to Use
## Download weights
- Download LTX-2.3
- safetensors: https://huggingface.co/Kijai/LTX2.3_comfy/tree/main/diffusion_models
- gguf: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main
- Download gemma-3-12b-it
- gguf: https://huggingface.co/unsloth/gemma-3-12b-it-GGUF/tree/main
- Download embeddings connectors
- safetensors: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/text_encoders
- Download vae
- safetensors: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/vae
- Download audio vae
- safetensors: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/vae
- Download LTX spatial latent upscaler
- safetensors: https://huggingface.co/Lightricks/LTX-2.3/resolve/main/ltx-2.3-spatial-upscaler-x2-1.1.safetensors
## Examples
### LTX-2.3 dev T2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "worst quality, low quality, blurry, distorted, artifacts" -W 1280 -H 720 --diffusion-fa --offload-to-cpu --video-frames 33 --fps 24 -o t2v.webm
```
<video
src="../assets/ltx2/t2v.webm"
controls
muted
style="max-width: 100%; height: auto;"></video>
### LTX-2.3 dev I2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -W 1280 -H 720 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\ernie_image\turbo_example.png -o i2v.webm
```
<video
src="../assets/ltx2/i2v.webm"
controls
muted
style="max-width: 100%; height: auto;"></video>
### LTX-2.3 dev FLF2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v -W 1280 -H 720 --diffusion-fa --offload-to-cpu --video-frames 33 --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png -o flf2v.webm
```
<video
src="../assets/ltx2/flf2v.webm"
controls
muted
style="max-width: 100%; height: auto;"></video>
### LTX-2.3 spatial latent upscale
LTX spatial latent upscale runs a model-backed x2 latent upsampler between the low-resolution video pass and the high-resolution refine pass. `-W` and `-H` are the pre-upscale generation size; the spatial upsampler produces x2 latent dimensions.
Put `ltx-2.3-spatial-upscaler-x2-1.1.safetensors` under the directory passed to `--hires-upscalers-dir`, then use the model name without path or extension in `--hires-upscaler`.
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors --hires-upscalers-dir ..\..\ComfyUI\models\latent_upscale_models --hires-upscaler ltx-2.3-spatial-upscaler-x2-1.1 --hires --hires-steps 4 -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -W 640 -H 360 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\ernie_image\turbo_example.png -o hires_i2v.webm
```
By default, the hires refine pass uses the main sampler and scheduler, then trims the second-pass sigma schedule by `--hires-denoising-strength` (`0.7` by default). To reproduce a ComfyUI-style explicit refine schedule, pass custom hires sigmas:
```
--hires-sigmas "0.85,0.725,0.421875,0.0"
```
<video
src="../assets/ltx2/hires_i2v.webm"
controls
muted
style="max-width: 100%; height: auto;"></video>

118
docs/model_config.md Normal file
View File

@ -0,0 +1,118 @@
# Model Configuration Conventions
This document describes the conventions for model configuration structs and
weight-based configuration detection.
## Config Types
Model configuration should live in a model-specific `*Config` struct.
Examples:
- `ZImageConfig`
- `UNetConfig`
- `MMDiTConfig`
- `LLMConfig`
Preserve established acronym casing in type names, such as `UNet`, `MMDiT`,
`LLM`, `VAE`, and `T5`.
Place the config struct near the top of the model header, before the main model
blocks and runner types that consume it.
## Config Variables
Variables and members that hold a config should be named `config`.
Examples:
```cpp
UNetConfig config;
UnetModelBlock unet;
MMDiTRunner(...)
: DiffusionModelRunner(backend, params_backend, prefix),
config(MMDiTConfig::detect_from_weights(tensor_storage_map, prefix)),
mmdit(config) {
}
```
Avoid alternate names such as `params`, `params_cfg`, `model_params`, or
model-specific aliases unless an existing public API requires them.
## Weight Detection
If a model can derive configuration from loaded weight metadata, expose that
logic as a static method on the config type:
```cpp
static XxxConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
const std::string& prefix);
```
Additional selector arguments are allowed when required by an existing model
family, for example `SDVersion version` or an architecture enum:
```cpp
static UNetConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
const std::string& prefix,
SDVersion version = VERSION_SD1);
```
Use `TensorStorage` metadata, especially `n_dims` and `ne`, to infer shapes.
Do not load or parse tensor data for config detection.
Detection should respect `prefix`. For nested weights, construct full names from
`prefix + "." + suffix` or filter entries with `starts_with(name, prefix)`.
Do not add persistent config fields such as `inferred_from_weights` only to
record whether detection happened. If the function needs to decide whether to
print a debug line, keep that as local control flow inside `detect_from_weights`.
## Logging
When config values are inferred from weights, print one `LOG_DEBUG` line at the
end of `detect_from_weights`.
Example:
```cpp
LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
config.num_layers,
config.vocab_size,
config.hidden_size,
config.intermediate_size);
```
Only print the config detection log when the function actually inferred values
from weights. Do not duplicate the same config summary in runner constructors or
model loading code.
Use the correct format specifiers for field types, such as `%" PRId64 "` for
`int64_t` and `%d` for `int`.
## Runner And Model Responsibilities
Runners should detect the config once and pass it into the model block:
```cpp
struct XxxRunner : public DiffusionModelRunner {
XxxConfig config;
XxxModel model;
XxxRunner(..., const String2TensorStorage& tensor_storage_map, const std::string prefix)
: DiffusionModelRunner(backend, params_backend, prefix),
config(XxxConfig::detect_from_weights(tensor_storage_map, prefix)),
model(config) {
model.init(params_ctx, tensor_storage_map, prefix);
}
};
```
Model blocks should consume `config` directly instead of re-scanning weights in
their constructors. Keep config-derived behavior centralized in the config
struct.
If a model has no weight-derived config today, it may still provide
`detect_from_weights` for API consistency, but it should not print a config
detection log unless it actually derives values from weights.

39
docs/pid.md Normal file
View File

@ -0,0 +1,39 @@
# How to Use
PiD is NVIDIA's Pixel Diffusion Decoder. It replaces the usual VAE decode or decode-then-upscale path with a pixel-space diffusion decoder conditioned on a
source latent and text prompt.
In stable-diffusion.cpp, PiD currently runs as an image edit pipeline: provide a reference image with `-r`/`--ref-image`, encode that image with a matching VAE, then let the PiD diffusion model decode/upscale directly to RGB.
## Download weights
- Download PiD
- safetensors: https://huggingface.co/Comfy-Org/PixelDiT/tree/main/diffusion_models
- Download Gemma 2 2B
- safetensors: https://huggingface.co/Comfy-Org/PixelDiT/tree/main/text_encoders
- Download the VAE that matches the PiD checkpoint backbone
- safetensors: https://huggingface.co/nvidia/PiD/tree/main/checkpoints
- Flux / Z-Image PiD: use the Flux VAE and pass `--vae-format flux`
- SD3 PiD: use the SD3 VAE and pass `--vae-format sd3`
- Flux.2 PiD: use the Flux.2 VAE and pass `--vae-format flux2`
Check the official PiD model card before use. At the time of the initial PiD release, the official weights were licensed under the NSCLv1 non-commercial license.
## Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\pid_flux1_512_to_2048_4step_bf16.safetensors --llm "..\..\ComfyUI\models\text_encoders\gemma_2_2b_it_elm_bf16.safetensors" --vae ..\..\ComfyUI\models\vae\ae.sft --vae-format flux --cfg-scale 1.0 -p "a lovely cat" -r ..\assets\ernie_image\turbo_example.png --diffusion-fa -v --steps 4 -H 2048 -W 2048 --rng cpu
```
Before:
<img width="256" alt="ERNIE-Image Turbo example" src="../assets/ernie_image/turbo_example.png" />
After:
<img width="1024" alt="PiD example" src="../assets/pid/example.png" />
## Notes
- `-r`/`--ref-image` is required. PiD uses the first reference image as the source latent condition.
- `--vae-format` should match the VAE latent layout used by the PiD checkpoint. This is important when using standalone VAE files because the PiD diffusion
checkpoint alone does not identify the VAE format.

View File

@ -21,7 +21,7 @@ You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or ev
### Z-Image-Turbo ### Z-Image-Turbo
``` ```
.\bin\Release\sd-cli.exe --diffusion-model z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512 .\bin\Release\sd-cli.exe --diffusion-model z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512 --steps 8
``` ```
<img width="256" alt="z-image example" src="../assets/z_image/q3_K.png" /> <img width="256" alt="z-image example" src="../assets/z_image/q3_K.png" />

View File

@ -1,11 +1,19 @@
set(TARGET sd-cli) set(TARGET sd-cli)
add_executable(${TARGET} add_executable(${TARGET}
../common/common.cpp
../common/log.cpp ../common/log.cpp
../common/media_io.cpp ../common/media_io.cpp
image_metadata.cpp image_metadata.cpp
main.cpp main.cpp
) )
if(APPLE)
sd_set_macos_rpaths(${TARGET})
endif()
target_include_directories(${TARGET} PRIVATE
"${CMAKE_CURRENT_SOURCE_DIR}/.."
"${PROJECT_SOURCE_DIR}/src"
)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE stable-diffusion zip ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE stable-diffusion zip ${CMAKE_THREAD_LIBS_INIT})
if(SD_WEBP) if(SD_WEBP)

View File

@ -4,26 +4,29 @@
usage: ./bin/sd-cli [options] usage: ./bin/sd-cli [options]
CLI Options: CLI Options:
-o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image sequences (default: -o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image
./output.png) (eg. output_%03d.png). For video generation, single-file outputs support .avi, .webm, and animated .webp sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs
--preview-path <string> path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp support .avi, .webm, and animated .webp
--preview-interval <int> interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at --image <string> path to the image to inspect (for metadata mode)
every step) --metadata-format <string> metadata output format, one of [text, json] (default: text)
--output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise) --preview-path <string> path to write preview image to (default: ./preview.png). Multi-frame previews support
--image <string> path to the image to inspect (for metadata mode) .avi, .webm, and animated .webp
--metadata-format <string> metadata output format, one of [text, json] (default: text) --preview-interval <int> interval in denoising steps between consecutive updates of the image preview file
--canny apply canny preprocessor (edge detection) (default is 1, meaning updating at every step)
--convert-name convert tensor name (for convert mode) --output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified
-v, --verbose print extra info %d in output path, 1 otherwise)
--color colors the logging tags according to level --canny apply canny preprocessor (edge detection)
--taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae) --convert-name convert tensor name (for convert mode)
--preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs -v, --verbose print extra info
--metadata-raw include raw hex previews for unparsed metadata payloads --color colors the logging tags according to level
--metadata-brief truncate long metadata text values in text output --taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae)
--metadata-all include structural/container entries such as IHDR, IDAT, and non-metadata JPEG segments --preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert, metadata], default: img_gen --metadata-raw include raw hex previews for unparsed metadata payloads
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none) --metadata-brief truncate long metadata text values in text output
-h, --help show this help message and exit --metadata-all include structural/container entries such as IHDR, IDAT, and non-metadata JPEG segments
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert, metadata], default: img_gen
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
-h, --help show this help message and exit
Context Options: Context Options:
-m, --model <string> path to full model -m, --model <string> path to full model
@ -31,28 +34,34 @@ Context Options:
--clip_g <string> path to the clip-g text encoder --clip_g <string> path to the clip-g text encoder
--clip_vision <string> path to the clip-vision encoder --clip_vision <string> path to the clip-vision encoder
--t5xxl <string> path to the t5xxl text encoder --t5xxl <string> path to the t5xxl text encoder
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...) --llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image,
mistral-small3.2 for flux2, ...)
--llm_vision <string> path to the llm vit --llm_vision <string> path to the llm vit
--qwen2vl <string> alias of --llm. Deprecated. --qwen2vl <string> alias of --llm. Deprecated.
--qwen2vl_vision <string> alias of --llm_vision. Deprecated. --qwen2vl_vision <string> alias of --llm_vision. Deprecated.
--diffusion-model <string> path to the standalone diffusion model --diffusion-model <string> path to the standalone diffusion model
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model --high-noise-diffusion-model <string> path to the standalone high noise diffusion model
--uncond-diffusion-model <string> path to the standalone unconditional diffusion model, currently used by
Ideogram4 CFG
--vae <string> path to standalone vae model --vae <string> path to standalone vae model
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality) --taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--tae <string> alias of --taesd --tae <string> alias of --taesd
--control-net <string> path to control net model --control-net <string> path to control net model
--embd-dir <string> embeddings directory --embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory --lora-model-dir <string> lora model directory
--hires-upscalers-dir <string> highres fix upscaler model directory
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--photo-maker <string> path to PHOTOMAKER model --photo-maker <string> path to PHOTOMAKER model
--upscale-model <string> path to esrgan model. --upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of -t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0,
CPU physical cores then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --chroma-t5-mask-pad <int> t5 mask pad size of chroma
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5) --max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
--vae-tiling process vae in tiles to reduce memory usage graph splitting; a negative value auto-detects free VRAM, sparing the
specified value (e.g. -0.5 will keep at least 0.5 GiB free)
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed
--mmap whether to memory-map model --mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram) --control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram) --clip-on-cpu keep clip in cpu (for low vram)
@ -67,20 +76,19 @@ Context Options:
--chroma-disable-dit-mask disable dit mask for chroma --chroma-disable-dit-mask disable dit mask for chroma
--qwen-image-zero-cond-t enable zero_cond_t for qwen image --qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma --chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K,
type of the weight file q4_K). If not specified, the default is the type of the weight file
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui) --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng --sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow] --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow,
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights flux2_flow]
contain any quantized parameters, the at_runtime mode will be used; otherwise, --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is
immediately will be used. The immediately mode may have precision and auto. In auto mode, if the model weights contain any quantized parameters,
compatibility issues with quantized parameters, but it usually offers faster inference the at_runtime mode will be used; otherwise, immediately will be used. The
speed and, in some cases, lower memory usage. The at_runtime mode, on the immediately mode may have precision and compatibility issues with quantized
other hand, is exactly the opposite. parameters, but it usually offers faster inference speed and, in some cases,
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) lower memory usage. The at_runtime mode, on the other hand, is exactly the
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 opposite.
(overrides --vae-tile-size)
Generation Options: Generation Options:
-p, --prompt <string> the prompt to render -p, --prompt <string> the prompt to render
@ -89,69 +97,108 @@ Generation Options:
--end-img <string> path to the end image, required by flf2v --end-img <string> path to the end image, required by flf2v
--mask <string> path to the mask image --mask <string> path to the mask image
--control-image <string> path to control image, control net --control-image <string> path to control image, control net
--control-video <string> path to control video frames. It must be a directory path. The video frames inside should be stored as images in --control-video <string> path to control video frames. It must be a directory path. The video frames
lexicographical (character) order. For example, if the control video path is inside should be stored as images in lexicographical (character) order. For
`frames`, the directory contains images such as 00.png, 01.png, ... etc. example, if the control video path is `frames`, the directory contains images
such as 00.png, 01.png, ... etc.
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir --pm-id-images-dir <string> path to PHOTOMAKER input id images dir
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed --pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
--extra-sample-args <string> extra sampler/scheduler/guidance args, key=value list. APG supports apg_eta,
apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports
slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end;
ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma
--extra-tiling-args <string> extra VAE tiling args, key=value list. LTX video VAE supports
temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)
-H, --height <int> image height, in pixel space (default: 512) -H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512) -W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20) --steps <int> number of sample steps (default: 20)
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto) --high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified, --clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer
will be 1 for SD1.x, 2 for SD2.x (default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count <int> batch count -b, --batch-count <int> batch count
--video-frames <int> video frames (default: 1) --video-frames <int> video frames (default: 1)
--fps <int> fps (default: 24) --fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for --timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for
NitroSD-Vibrant NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1) --upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128) --upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
--hires-width <int> highres fix target width, 0 to use --hires-scale (default: 0)
--hires-height <int> highres fix target height, 0 to use --hires-scale (default: 0)
--hires-steps <int> highres fix second pass sample steps, 0 to reuse --steps (default: 0)
--hires-upscale-tile-size <int> highres fix upscaler tile size, reserved for model-backed upscalers (default:
128)
--cfg-scale <float> unconditional guidance scale: (default: 7.0) --cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) --img-cfg-scale <float> image guidance scale for inpaint or image edit models: (default: same as
--cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5) --guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 --slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means
medium disabled, a value of 2.5 is nice for sd3.5 medium
--skip-layer-start <float> SLG enabling point (default: 0.01) --skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2) --skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a and dpm++2s_a) --eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and
res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto) --flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0) --high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) --high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or image edit models (default:
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5) same as --cfg-scale)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0) --high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input
(default: 3.5)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default:
0)
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01) --high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2) --high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a and dpm++2s_a) --high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd,
res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--strength <float> strength for noising/unnoising (default: 0.75) --strength <float> strength for noising/unnoising (default: 0.75)
--pm-style-strength <float> --pm-style-strength <float>
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image --control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1 destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if
`--high-noise-steps` is set to -1
--vace-strength <float> wan vace strength --vace-strength <float> wan vace strength
--increase-ref-index automatically increase the indices of reference images based on the order they are listed (starting with 1). --vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--hires-scale <float> highres fix scale when target size is not set (default: 2.0)
--hires-denoising-strength <float> highres fix second pass denoising strength (default: 0.7)
--increase-ref-index automatically increase the indices of reference images based on the order
they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images --disable-auto-resize-ref-image disable auto resize of ref images
--disable-image-metadata do not embed generation metadata on image files --disable-image-metadata do not embed generation metadata on image files
--vae-tiling process vae in tiles to reduce memory usage
--temporal-tiling enable temporal tiling for LTX video VAE decode
--hires enable highres fix
-s, --seed RNG seed (default: 42, use random seed for < 0) -s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s,
otherwise) er_sde, euler_cfg_pp, euler_a_cfg_pp] (default: euler for Flux/SD3/Wan, euler_a otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a,
ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
euler_a otherwise res_2s, er_sde, euler_cfg_pp, euler_a_cfg_pp] default: euler for Flux/SD3/Wan, euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
kl_optimal, lcm, bong_tangent], default: discrete smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent, ltx2], default:
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0"). model-specific
--sigmas custom sigma values for the sampler, comma-separated (e.g.,
"14.61,7.8,3.5,0.0").
--hires-sigmas custom sigma values for the highres fix second pass, comma-separated (e.g.,
"0.85,0.725,0.421875,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9]) --skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times) -r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET),
'spectrum' (UNET/DiT Chebyshev+Taylor forecasting) 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT
Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache: --cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit:
spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=.
"threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2" Examples: "threshold=0.25" or "threshold=1.5,reset=0"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g.,
"1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static' --scm-policy SCM policy: 'dynamic' (default) or 'static'
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size
if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
``` ```
Metadata mode inspects PNG/JPEG container metadata without loading any model: Metadata mode inspects PNG/JPEG container metadata without loading any model:

View File

@ -15,10 +15,13 @@
// #include "preprocessing.hpp" // #include "preprocessing.hpp"
#include "stable-diffusion.h" #include "stable-diffusion.h"
#include "common/common.hpp" #include "common/common.h"
#include "common/media_io.h" #include "common/media_io.h"
#include "common/resource_owners.hpp"
#include "image_metadata.h" #include "image_metadata.h"
namespace fs = std::filesystem;
const char* previews_str[] = { const char* previews_str[] = {
"none", "none",
"proj", "proj",
@ -166,8 +169,9 @@ struct SDCliParams {
return 1; return 1;
}; };
auto on_help_arg = [&](int argc, const char** argv, int index) { auto on_help_arg = [&](int argc, const char** argv, int index, bool& valid) {
normal_exit = true; normal_exit = true;
valid = true;
return -1; return -1;
}; };
@ -189,17 +193,22 @@ struct SDCliParams {
return options; return options;
}; };
bool process_and_check() { bool resolve() {
if (mode != METADATA && output_path.length() == 0) {
LOG_ERROR("error: the following arguments are required: output_path");
return false;
}
if (mode == CONVERT) { if (mode == CONVERT) {
if (output_path == "output.png") { if (output_path == "output.png") {
output_path = "output.gguf"; output_path = "output.gguf";
} }
} else if (mode == METADATA) { }
return true;
}
bool validate() {
if (mode != METADATA) {
if (output_path.length() == 0) {
LOG_ERROR("error: the following arguments are required: output_path");
return false;
}
} else {
if (image_path.empty()) { if (image_path.empty()) {
LOG_ERROR("error: metadata mode needs an image path (--image)"); LOG_ERROR("error: metadata mode needs an image path (--image)");
return false; return false;
@ -213,6 +222,16 @@ struct SDCliParams {
return true; return true;
} }
bool resolve_and_validate() {
if (!resolve()) {
return false;
}
if (!validate()) {
return false;
}
return true;
}
std::string to_string() const { std::string to_string() const {
std::ostringstream oss; std::ostringstream oss;
oss << "SDCliParams {\n" oss << "SDCliParams {\n"
@ -257,10 +276,12 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP
exit(cli_params.normal_exit ? 0 : 1); exit(cli_params.normal_exit ? 0 : 1);
} }
bool valid = cli_params.process_and_check(); bool valid = cli_params.resolve_and_validate();
if (valid && cli_params.mode != METADATA) { if (valid && cli_params.mode != METADATA) {
valid = ctx_params.process_and_check(cli_params.mode) && valid = ctx_params.resolve_and_validate(cli_params.mode) &&
gen_params.process_and_check(cli_params.mode, ctx_params.lora_model_dir); gen_params.resolve_and_validate(cli_params.mode,
ctx_params.lora_model_dir,
ctx_params.hires_upscalers_dir);
} }
if (!valid) { if (!valid) {
@ -275,7 +296,7 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
} }
bool load_images_from_dir(const std::string dir, bool load_images_from_dir(const std::string dir,
std::vector<sd_image_t>& images, std::vector<SDImageOwner>& images,
int expected_width = 0, int expected_width = 0,
int expected_height = 0, int expected_height = 0,
int max_image_num = 0, int max_image_num = 0,
@ -312,12 +333,12 @@ bool load_images_from_dir(const std::string dir,
return false; return false;
} }
images.push_back({(uint32_t)width, images.emplace_back(sd_image_t{(uint32_t)width,
(uint32_t)height, (uint32_t)height,
3, 3,
image_buffer}); image_buffer});
if (max_image_num > 0 && images.size() >= max_image_num) { if (max_image_num > 0 && static_cast<int>(images.size()) >= max_image_num) {
break; break;
} }
} }
@ -365,11 +386,32 @@ std::string format_frame_idx(std::string pattern, int frame_idx) {
return result; return result;
} }
// Derives the path of the ".wav" file written next to the configured output.
//
// If the output path carries an extension that is either a recognised encoded
// image format (JPEG/PNG/WEBP) or one of ".avi"/".webm" (compared
// case-insensitively), that extension is stripped first. Any other extension
// is kept. ".wav" is then appended to the resulting path.
static fs::path get_video_audio_sidecar_path(const SDCliParams& cli_params) {
    const fs::path out_path = cli_params.output_path;

    // extension() already yields an empty path when there is no extension.
    std::string ext_lower = out_path.extension().string();
    // ::tolower has undefined behaviour for negative char values (e.g. bytes of
    // a non-ASCII extension), so route every character through unsigned char.
    std::transform(ext_lower.begin(), ext_lower.end(), ext_lower.begin(),
                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });

    const EncodedImageFormat output_format = encoded_image_format_from_path(out_path.string());
    const bool replaceable_extension = output_format == EncodedImageFormat::JPEG ||
                                       output_format == EncodedImageFormat::PNG ||
                                       output_format == EncodedImageFormat::WEBP ||
                                       ext_lower == ".avi" ||
                                       ext_lower == ".webm";

    fs::path base_path = out_path;
    if (!ext_lower.empty() && replaceable_extension) {
        base_path.replace_extension();
    }
    base_path += ".wav";
    return base_path;
}
bool save_results(const SDCliParams& cli_params, bool save_results(const SDCliParams& cli_params,
const SDContextParams& ctx_params, const SDContextParams& ctx_params,
const SDGenerationParams& gen_params, const SDGenerationParams& gen_params,
sd_image_t* results, sd_image_t* results,
int num_results) { int num_results,
const sd_audio_t* generated_audio = nullptr) {
if (results == nullptr || num_results <= 0) { if (results == nullptr || num_results <= 0) {
return false; return false;
} }
@ -413,14 +455,30 @@ bool save_results(const SDCliParams& cli_params,
if (!img.data) if (!img.data)
return false; return false;
std::string params = gen_params.embed_image_metadata const int64_t metadata_seed = cli_params.mode == VID_GEN ? gen_params.seed : gen_params.seed + idx;
? get_image_params(ctx_params, gen_params, gen_params.seed + idx) std::string params = gen_params.embed_image_metadata
: ""; ? get_image_params(ctx_params, gen_params, metadata_seed, cli_params.mode)
const bool ok = write_image_to_file(path.string(), img.data, img.width, img.height, img.channel, params, 90); : "";
const bool ok = write_image_to_file(path.string(), img.data, img.width, img.height, img.channel, params, 90);
LOG_INFO("save result image %d to '%s' (%s)", idx, path.string().c_str(), ok ? "success" : "failure"); LOG_INFO("save result image %d to '%s' (%s)", idx, path.string().c_str(), ok ? "success" : "failure");
return ok; return ok;
}; };
auto write_audio_sidecar = [&](const fs::path& wav_path) {
if (generated_audio == nullptr) {
return;
}
if (write_wav_to_file(wav_path.string(),
generated_audio->data,
generated_audio->sample_count,
generated_audio->channels,
generated_audio->sample_rate)) {
LOG_INFO("save result audio to '%s'", wav_path.string().c_str());
} else {
LOG_WARN("failed to save result audio to '%s'", wav_path.string().c_str());
}
};
int sucessful_reults = 0; int sucessful_reults = 0;
if (std::regex_search(cli_params.output_path, format_specifier_regex)) { if (std::regex_search(cli_params.output_path, format_specifier_regex)) {
@ -444,8 +502,16 @@ bool save_results(const SDCliParams& cli_params,
ext = ".avi"; ext = ".avi";
fs::path video_path = base_path; fs::path video_path = base_path;
video_path += ext; video_path += ext;
if (create_video_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps) == 0) { std::string final_ext_lower = ext.string();
std::transform(final_ext_lower.begin(), final_ext_lower.end(), final_ext_lower.begin(), ::tolower);
const bool mux_audio = generated_audio != nullptr && (final_ext_lower == ".avi" || final_ext_lower == ".webm");
if (create_video_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps, 90, mux_audio ? generated_audio : nullptr) == 0) {
LOG_INFO("save result video to '%s'", video_path.string().c_str()); LOG_INFO("save result video to '%s'", video_path.string().c_str());
if (generated_audio != nullptr && !mux_audio) {
fs::path wav_path = video_path;
wav_path.replace_extension(".wav");
write_audio_sidecar(wav_path);
}
return true; return true;
} else { } else {
LOG_ERROR("Failed to save result video to '%s'", video_path.string().c_str()); LOG_ERROR("Failed to save result video to '%s'", video_path.string().c_str());
@ -467,6 +533,9 @@ bool save_results(const SDCliParams& cli_params,
} }
} }
LOG_INFO("%d/%d images saved", sucessful_reults, num_results); LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
if (generated_audio != nullptr) {
write_audio_sidecar(get_video_audio_sidecar_path(cli_params));
}
return sucessful_reults != 0; return sucessful_reults != 0;
} }
@ -554,39 +623,10 @@ int main(int argc, const char* argv[]) {
} }
} }
bool vae_decode_only = true; bool vae_decode_only = true;
sd_image_t init_image = {0, 0, 3, nullptr};
sd_image_t end_image = {0, 0, 3, nullptr};
sd_image_t control_image = {0, 0, 3, nullptr};
sd_image_t mask_image = {0, 0, 1, nullptr};
std::vector<sd_image_t> ref_images;
std::vector<sd_image_t> pmid_images;
std::vector<sd_image_t> control_frames;
auto release_all_resources = [&]() {
free(init_image.data);
free(end_image.data);
free(control_image.data);
free(mask_image.data);
for (auto image : ref_images) {
free(image.data);
image.data = nullptr;
}
ref_images.clear();
for (auto image : pmid_images) {
free(image.data);
image.data = nullptr;
}
pmid_images.clear();
for (auto image : control_frames) {
free(image.data);
image.data = nullptr;
}
control_frames.clear();
};
auto load_image_and_update_size = [&](const std::string& path, auto load_image_and_update_size = [&](const std::string& path,
sd_image_t& image, SDImageOwner& image,
bool resize_image = true, bool resize_image = true,
int expected_channel = 3) -> bool { int expected_channel = 3) -> bool {
int expected_width = 0; int expected_width = 0;
@ -596,74 +636,73 @@ int main(int argc, const char* argv[]) {
expected_height = gen_params.height; expected_height = gen_params.height;
} }
if (!load_sd_image_from_file(&image, path.c_str(), expected_width, expected_height, expected_channel)) { if (!load_sd_image_from_file(image.put(), path.c_str(), expected_width, expected_height, expected_channel)) {
LOG_ERROR("load image from '%s' failed", path.c_str()); LOG_ERROR("load image from '%s' failed", path.c_str());
release_all_resources();
return false; return false;
} }
gen_params.set_width_and_height_if_unset(image.width, image.height); gen_params.set_width_and_height_if_unset(image.get().width, image.get().height);
return true; return true;
}; };
if (gen_params.init_image_path.size() > 0) { if (gen_params.init_image_path.size() > 0) {
vae_decode_only = false; vae_decode_only = false;
if (!load_image_and_update_size(gen_params.init_image_path, init_image)) { if (!load_image_and_update_size(gen_params.init_image_path, gen_params.init_image)) {
return 1; return 1;
} }
} }
if (gen_params.end_image_path.size() > 0) { if (gen_params.end_image_path.size() > 0) {
vae_decode_only = false; vae_decode_only = false;
if (!load_image_and_update_size(gen_params.end_image_path, end_image)) { if (!load_image_and_update_size(gen_params.end_image_path, gen_params.end_image)) {
return 1; return 1;
} }
} }
if (gen_params.ref_image_paths.size() > 0) { if (gen_params.ref_image_paths.size() > 0) {
vae_decode_only = false; vae_decode_only = false;
gen_params.ref_images.clear();
for (auto& path : gen_params.ref_image_paths) { for (auto& path : gen_params.ref_image_paths) {
sd_image_t ref_image = {0, 0, 3, nullptr}; SDImageOwner ref_image({0, 0, 3, nullptr});
if (!load_image_and_update_size(path, ref_image, false)) { if (!load_image_and_update_size(path, ref_image, false)) {
return 1; return 1;
} }
ref_images.push_back(ref_image); gen_params.ref_images.push_back(std::move(ref_image));
} }
} }
if (gen_params.mask_image_path.size() > 0) { if (gen_params.mask_image_path.size() > 0) {
if (!load_sd_image_from_file(&mask_image, if (!load_sd_image_from_file(gen_params.mask_image.put(),
gen_params.mask_image_path.c_str(), gen_params.mask_image_path.c_str(),
gen_params.get_resolved_width(), gen_params.get_resolved_width(),
gen_params.get_resolved_height(), gen_params.get_resolved_height(),
1)) { 1)) {
LOG_ERROR("load image from '%s' failed", gen_params.mask_image_path.c_str()); LOG_ERROR("load image from '%s' failed", gen_params.mask_image_path.c_str());
release_all_resources();
return 1; return 1;
} }
} else { } else {
mask_image.data = (uint8_t*)malloc(gen_params.get_resolved_width() * gen_params.get_resolved_height()); sd_image_t generated_mask = {0, 0, 1, nullptr};
if (mask_image.data == nullptr) { generated_mask.data = (uint8_t*)malloc(gen_params.get_resolved_width() * gen_params.get_resolved_height());
if (generated_mask.data == nullptr) {
LOG_ERROR("malloc mask image failed"); LOG_ERROR("malloc mask image failed");
release_all_resources();
return 1; return 1;
} }
mask_image.width = gen_params.get_resolved_width(); generated_mask.width = gen_params.get_resolved_width();
mask_image.height = gen_params.get_resolved_height(); generated_mask.height = gen_params.get_resolved_height();
memset(mask_image.data, 255, gen_params.get_resolved_width() * gen_params.get_resolved_height()); memset(generated_mask.data, 255, gen_params.get_resolved_width() * gen_params.get_resolved_height());
gen_params.mask_image.reset(generated_mask);
} }
if (gen_params.control_image_path.size() > 0) { if (gen_params.control_image_path.size() > 0) {
if (!load_sd_image_from_file(&control_image, if (!load_sd_image_from_file(gen_params.control_image.put(),
gen_params.control_image_path.c_str(), gen_params.control_image_path.c_str(),
gen_params.get_resolved_width(), gen_params.get_resolved_width(),
gen_params.get_resolved_height())) { gen_params.get_resolved_height())) {
LOG_ERROR("load image from '%s' failed", gen_params.control_image_path.c_str()); LOG_ERROR("load image from '%s' failed", gen_params.control_image_path.c_str());
release_all_resources();
return 1; return 1;
} }
if (cli_params.canny_preprocess) { // apply preprocessor if (cli_params.canny_preprocess) { // apply preprocessor
preprocess_canny(control_image, preprocess_canny(gen_params.control_image.get(),
0.08f, 0.08f,
0.08f, 0.08f,
0.8f, 0.8f,
@ -673,25 +712,25 @@ int main(int argc, const char* argv[]) {
} }
if (!gen_params.control_video_path.empty()) { if (!gen_params.control_video_path.empty()) {
gen_params.control_frames.clear();
if (!load_images_from_dir(gen_params.control_video_path, if (!load_images_from_dir(gen_params.control_video_path,
control_frames, gen_params.control_frames,
gen_params.get_resolved_width(), gen_params.get_resolved_width(),
gen_params.get_resolved_height(), gen_params.get_resolved_height(),
gen_params.video_frames, gen_params.video_frames,
cli_params.verbose)) { cli_params.verbose)) {
release_all_resources();
return 1; return 1;
} }
} }
if (!gen_params.pm_id_images_dir.empty()) { if (!gen_params.pm_id_images_dir.empty()) {
gen_params.pm_id_images.clear();
if (!load_images_from_dir(gen_params.pm_id_images_dir, if (!load_images_from_dir(gen_params.pm_id_images_dir,
pmid_images, gen_params.pm_id_images,
0, 0,
0, 0,
0, 0,
cli_params.verbose)) { cli_params.verbose)) {
release_all_resources();
return 1; return 1;
} }
} }
@ -700,119 +739,71 @@ int main(int argc, const char* argv[]) {
vae_decode_only = false; vae_decode_only = false;
} }
if (gen_params.hires_enabled &&
(gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_MODEL ||
gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_LANCZOS ||
gen_params.resolved_hires_upscaler == SD_HIRES_UPSCALER_NEAREST)) {
vae_decode_only = false;
}
sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(vae_decode_only, true, cli_params.taesd_preview); sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(vae_decode_only, true, cli_params.taesd_preview);
sd_image_t* results = nullptr; SDImageVec results;
int num_results = 0; int num_results = 0;
sd_audio_t* generated_audio = nullptr;
if (cli_params.mode == UPSCALE) { if (cli_params.mode == UPSCALE) {
num_results = 1; num_results = 1;
results = (sd_image_t*)calloc(num_results, sizeof(sd_image_t)); results.push_back(gen_params.init_image.release());
if (results == nullptr) {
LOG_INFO("failed to allocate results array");
release_all_resources();
return 1;
}
results[0] = init_image;
init_image.data = nullptr;
} else { } else {
sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params); SDCtxPtr sd_ctx(new_sd_ctx(&sd_ctx_params));
if (sd_ctx == nullptr) { if (sd_ctx == nullptr) {
LOG_INFO("new_sd_ctx_t failed"); LOG_INFO("new_sd_ctx_t failed");
release_all_resources();
return 1; return 1;
} }
if (gen_params.sample_params.sample_method == SAMPLE_METHOD_COUNT) { if (gen_params.sample_params.sample_method == SAMPLE_METHOD_COUNT) {
gen_params.sample_params.sample_method = sd_get_default_sample_method(sd_ctx); gen_params.sample_params.sample_method = sd_get_default_sample_method(sd_ctx.get());
} }
if (gen_params.high_noise_sample_params.sample_method == SAMPLE_METHOD_COUNT) { if (gen_params.high_noise_sample_params.sample_method == SAMPLE_METHOD_COUNT) {
gen_params.high_noise_sample_params.sample_method = sd_get_default_sample_method(sd_ctx); gen_params.high_noise_sample_params.sample_method = sd_get_default_sample_method(sd_ctx.get());
} }
if (gen_params.sample_params.scheduler == SCHEDULER_COUNT) { if (gen_params.sample_params.scheduler == SCHEDULER_COUNT) {
gen_params.sample_params.scheduler = sd_get_default_scheduler(sd_ctx, gen_params.sample_params.sample_method); gen_params.sample_params.scheduler = sd_get_default_scheduler(sd_ctx.get(), gen_params.sample_params.sample_method);
} }
if (cli_params.mode == IMG_GEN) { if (cli_params.mode == IMG_GEN) {
sd_img_gen_params_t img_gen_params = { sd_img_gen_params_t img_gen_params = gen_params.to_sd_img_gen_params_t();
gen_params.lora_vec.data(),
static_cast<uint32_t>(gen_params.lora_vec.size()),
gen_params.prompt.c_str(),
gen_params.negative_prompt.c_str(),
gen_params.clip_skip,
init_image,
ref_images.data(),
(int)ref_images.size(),
gen_params.auto_resize_ref_image,
gen_params.increase_ref_index,
mask_image,
gen_params.get_resolved_width(),
gen_params.get_resolved_height(),
gen_params.sample_params,
gen_params.strength,
gen_params.seed,
gen_params.batch_count,
control_image,
gen_params.control_strength,
{
pmid_images.data(),
(int)pmid_images.size(),
gen_params.pm_id_embed_path.c_str(),
gen_params.pm_style_strength,
}, // pm_params
gen_params.vae_tiling_params,
gen_params.cache_params,
};
results = generate_image(sd_ctx, &img_gen_params);
num_results = gen_params.batch_count; num_results = gen_params.batch_count;
results.adopt(generate_image(sd_ctx.get(), &img_gen_params), num_results);
} else if (cli_params.mode == VID_GEN) { } else if (cli_params.mode == VID_GEN) {
sd_vid_gen_params_t vid_gen_params = { sd_vid_gen_params_t vid_gen_params = gen_params.to_sd_vid_gen_params_t();
gen_params.lora_vec.data(), sd_image_t* generated_video = nullptr;
static_cast<uint32_t>(gen_params.lora_vec.size()), if (!generate_video(sd_ctx.get(), &vid_gen_params, &generated_video, &num_results, &generated_audio)) {
gen_params.prompt.c_str(), generated_video = nullptr;
gen_params.negative_prompt.c_str(), }
gen_params.clip_skip, results.adopt(generated_video, num_results);
init_image,
end_image,
control_frames.data(),
(int)control_frames.size(),
gen_params.get_resolved_width(),
gen_params.get_resolved_height(),
gen_params.sample_params,
gen_params.high_noise_sample_params,
gen_params.moe_boundary,
gen_params.strength,
gen_params.seed,
gen_params.video_frames,
gen_params.vace_strength,
gen_params.vae_tiling_params,
gen_params.cache_params,
};
results = generate_video(sd_ctx, &vid_gen_params, &num_results);
} }
if (results == nullptr) { if (!results) {
LOG_ERROR("generate failed"); LOG_ERROR("generate failed");
free_sd_ctx(sd_ctx);
return 1; return 1;
} }
free_sd_ctx(sd_ctx);
} }
int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth
if (ctx_params.esrgan_path.size() > 0 && gen_params.upscale_repeats > 0) { if (ctx_params.esrgan_path.size() > 0 && gen_params.upscale_repeats > 0) {
upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(ctx_params.esrgan_path.c_str(), UpscalerCtxPtr upscaler_ctx(new_upscaler_ctx(ctx_params.esrgan_path.c_str(),
ctx_params.offload_params_to_cpu, ctx_params.offload_params_to_cpu,
ctx_params.diffusion_conv_direct, ctx_params.diffusion_conv_direct,
ctx_params.n_threads, ctx_params.n_threads,
gen_params.upscale_tile_size); gen_params.upscale_tile_size,
ctx_params.backend.c_str(),
ctx_params.params_backend.c_str()));
if (upscaler_ctx == nullptr) { if (upscaler_ctx == nullptr) {
LOG_ERROR("new_upscaler_ctx failed"); LOG_ERROR("new_upscaler_ctx failed");
@ -821,32 +812,27 @@ int main(int argc, const char* argv[]) {
if (results[i].data == nullptr) { if (results[i].data == nullptr) {
continue; continue;
} }
sd_image_t current_image = results[i]; SDImageOwner current_image(results[i]);
results[i] = {0, 0, 0, nullptr};
for (int u = 0; u < gen_params.upscale_repeats; ++u) { for (int u = 0; u < gen_params.upscale_repeats; ++u) {
sd_image_t upscaled_image = upscale(upscaler_ctx, current_image, upscale_factor); SDImageOwner upscaled_image(upscale(upscaler_ctx.get(), current_image.get(), upscale_factor));
if (upscaled_image.data == nullptr) { if (upscaled_image.get().data == nullptr) {
LOG_ERROR("upscale failed"); LOG_ERROR("upscale failed");
break; break;
} }
free(current_image.data); current_image = std::move(upscaled_image);
current_image = upscaled_image;
} }
results[i] = current_image; // Set the final upscaled image as the result results[i] = current_image.release(); // Set the final upscaled image as the result
} }
} }
} }
if (!save_results(cli_params, ctx_params, gen_params, results, num_results)) { if (!save_results(cli_params, ctx_params, gen_params, results.data(), num_results, generated_audio)) {
free_sd_audio(generated_audio);
return 1; return 1;
} }
for (int i = 0; i < num_results; i++) { free_sd_audio(generated_audio);
free(results[i].data);
results[i].data = nullptr;
}
free(results);
release_all_resources();
return 0; return 0;
} }

2721
examples/common/common.cpp Normal file

File diff suppressed because it is too large Load Diff

304
examples/common/common.h Normal file
View File

@ -0,0 +1,304 @@
#ifndef __EXAMPLES_COMMON_COMMON_H__
#define __EXAMPLES_COMMON_COMMON_H__
#include <cmath>
#include <cstdint>
#include <functional>
#include <map>
#include <string>
#include <vector>
#include "log.h"
#include "resource_owners.hpp"
#include "stable-diffusion.h"
#define SAFE_STR(s) ((s) ? (s) : "")
#define BOOL_STR(b) ((b) ? "true" : "false")
extern const char* const modes_str[];
#define SD_ALL_MODES_STR "img_gen, vid_gen, convert, upscale, metadata"
// Operating modes of the example programs.
// The enumerators appear in the same order as the names in SD_ALL_MODES_STR.
// NOTE(review): modes_str[] is presumably indexed by these values — confirm in common.cpp.
enum SDMode {
IMG_GEN,
VID_GEN,
CONVERT,
UPSCALE,
METADATA,
MODE_COUNT  // number of modes above; not a selectable mode itself
};
// Descriptor for a command-line option bound to a std::string.
// NOTE(review): field roles are inferred from their names; the parser is not in this header — confirm.
struct StringOption {
std::string short_name;
std::string long_name;
std::string desc;
std::string* target;  // non-owning; presumably where the parsed value is written
};
// Descriptor for a command-line option bound to an int.
// NOTE(review): field roles are inferred from their names; the parser is not in this header — confirm.
struct IntOption {
std::string short_name;
std::string long_name;
std::string desc;
int* target;  // non-owning; presumably where the parsed value is written
};
// Descriptor for a command-line option bound to a float.
// NOTE(review): field roles are inferred from their names; the parser is not in this header — confirm.
struct FloatOption {
std::string short_name;
std::string long_name;
std::string desc;
float* target;  // non-owning; presumably where the parsed value is written
};
// Descriptor for a command-line flag bound to a bool.
// NOTE(review): field roles are inferred from their names; the parser is not in this header — confirm.
struct BoolOption {
std::string short_name;
std::string long_name;
std::string desc;
bool keep_true;  // NOTE(review): presumably selects whether the flag sets *target to true or false — confirm
bool* target;    // non-owning; presumably where the flag value is written
};
// Type-erased callback for an option that is parsed by hand.
//
// The stored callable always has the signature
//   int(int argc, const char** argv, int index, bool& valid)
// but it can be built from two callable shapes:
//   * one that accepts (argc, argv, index, valid) - stored as is;
//   * one that accepts only (argc, argv, index)   - wrapped in an adapter that
//     leaves `valid` untouched.
//
// A default-constructed ManualFunction holds an empty std::function; invoking
// it throws std::bad_function_call.
struct ManualFunction {
    std::function<int(int, const char**, int, bool&)> _func;

    ManualFunction() = default;

    ManualFunction(std::function<int(int argc, const char** argv, int index, bool& valid)> func)
        : _func(std::move(func)) {
    }

    // Accepts any callable of either supported shape. The callable is moved
    // (not copied) through make_function into the stored std::function.
    template <typename F>
    ManualFunction(F func)
        : _func(make_function(std::move(func))) {
    }

    // Forwards the call to the stored callable and returns its result.
    int operator()(int argc, const char** argv, int index, bool& valid) const {
        return _func(argc, argv, index, valid);
    }

private:
    // Normalises `func` to the four-argument signature, adapting the
    // three-argument shape when necessary.
    template <typename F>
    static std::function<int(int, const char**, int, bool&)> make_function(F func) {
        if constexpr (std::is_invocable_v<F, int, const char**, int, bool&>) {
            return std::function<int(int, const char**, int, bool&)>(std::move(func));
        } else {
            return [func = std::move(func)](int argc, const char** argv, int index, bool&) {
                return func(argc, argv, index);
            };
        }
    }
};
// Descriptor for a command-line option handled by a custom callback.
// NOTE(review): the meaning of the callback's return value is defined by the parser, which is not in this header — confirm.
struct ManualOption {
std::string short_name;
std::string long_name;
std::string desc;
ManualFunction cb;  // callback invoked with (argc, argv, index, valid)
};
// A set of option descriptors, grouped by the kind of value each option carries.
struct ArgOptions {
std::vector<StringOption> string_options;
std::vector<IntOption> int_options;
std::vector<FloatOption> float_options;
std::vector<BoolOption> bool_options;
std::vector<ManualOption> manual_options;
// NOTE(review): presumably wraps `text` to `width` columns with `indent` leading spaces — confirm in common.cpp.
static std::string wrap_text(const std::string& text, size_t width, size_t indent);
// NOTE(review): presumably prints the option descriptions (help text) — confirm in common.cpp.
void print() const;
};
bool parse_options(int argc, const char** argv, const std::vector<ArgOptions>& options_list);
bool decode_base64_image(const std::string& encoded_input,
int target_channels,
int expected_width,
int expected_height,
SDImageOwner& out_image);
// Context-level settings for the example programs: model file locations,
// backend/placement switches and model-specific toggles.
// to_sd_ctx_params_t() produces the sd_ctx_params_t built from these fields.
// NOTE(review): the *_COUNT and INFINITY defaults below look like "not specified"
// sentinels — confirm against resolve()/to_sd_ctx_params_t() in common.cpp.
struct SDContextParams {
int n_threads = -1;
std::string model_path;
std::string clip_l_path;
std::string clip_g_path;
std::string clip_vision_path;
std::string t5xxl_path;
std::string llm_path;
std::string llm_vision_path;
std::string diffusion_model_path;
std::string high_noise_diffusion_model_path;
std::string uncond_diffusion_model_path;
std::string embeddings_connectors_path;
std::string vae_path;
std::string vae_format = "auto";
std::string audio_vae_path;
std::string taesd_path;
std::string esrgan_path;
std::string control_net_path;
std::string embedding_dir;
std::string photo_maker_path;
sd_type_t wtype = SD_TYPE_COUNT;
std::string tensor_type_rules;
std::string lora_model_dir = ".";
std::string hires_upscalers_dir;
// NOTE(review): embedding_map/embedding_vec are presumably filled by build_embedding_map() — confirm.
std::map<std::string, std::string> embedding_map;
std::vector<sd_embedding_t> embedding_vec;
rng_type_t rng_type = CUDA_RNG;
rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
bool offload_params_to_cpu = false;
float max_vram = 0.f;
bool stream_layers = false;
std::string backend;
std::string params_backend;
bool enable_mmap = false;
bool control_net_cpu = false;
bool clip_on_cpu = false;
bool vae_on_cpu = false;
bool flash_attn = false;
bool diffusion_flash_attn = false;
bool diffusion_conv_direct = false;
bool vae_conv_direct = false;
bool circular = false;
bool circular_x = false;
bool circular_y = false;
bool chroma_use_dit_mask = true;
bool chroma_use_t5_mask = false;
int chroma_t5_mask_pad = 1;
bool qwen_image_zero_cond_t = false;
prediction_t prediction = PREDICTION_COUNT;
lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;
bool force_sdxl_vae_conv_scale = false;
float flow_shift = INFINITY;
// Returns the command-line option descriptors for this struct.
ArgOptions get_options();
void build_embedding_map();
// resolve_and_validate() is declared alongside resolve() and validate();
// NOTE(review): presumably it runs them in that order — confirm in common.cpp.
bool resolve(SDMode mode);
bool validate(SDMode mode);
bool resolve_and_validate(SDMode mode);
std::string to_string() const;
sd_ctx_params_t to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview);
};
// Per-generation settings plus the image payload they refer to.
// The struct holds three kinds of state, marked by the section comments below:
// raw user input, values derived from that input, and owned image buffers
// together with the plain sd_image_t views handed to the C API.
struct SDGenerationParams {
// User-facing input fields.
std::string prompt;
std::string negative_prompt;
int clip_skip = -1; // <= 0 represents unspecified
// NOTE(review): -1 presumably means "unset" (see set_width_and_height_if_unset) — confirm.
int width = -1;
int height = -1;
int batch_count = 1;
int64_t seed = 42;
float strength = 0.75f;
float control_strength = 0.9f;
bool auto_resize_ref_image = true;
bool increase_ref_index = false;
bool embed_image_metadata = true;
std::string init_image_path;
std::string end_image_path;
std::string mask_image_path;
std::string control_image_path;
std::vector<std::string> ref_image_paths;
std::string control_video_path;
sd_sample_params_t sample_params;
sd_sample_params_t high_noise_sample_params;
std::string extra_sample_args;
std::string high_noise_extra_sample_args;
std::vector<int> skip_layers = {7, 8, 9};
std::vector<int> high_noise_skip_layers = {7, 8, 9};
std::vector<float> custom_sigmas;
std::string cache_mode;
std::string cache_option;
std::string scm_mask;
bool scm_policy_dynamic = true;
sd_cache_params_t cache_params{};
float moe_boundary = 0.875f;
int video_frames = 1;
int fps = 16;
float vace_strength = 1.f;
sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
std::string extra_tiling_args;
std::string pm_id_images_dir;
std::string pm_id_embed_path;
float pm_style_strength = 20.f;
int upscale_repeats = 1;
int upscale_tile_size = 128;
bool hires_enabled = false;
std::string hires_upscaler = "Latent";
std::string hires_upscaler_model_path;
float hires_scale = 2.f;
int hires_width = 0;
int hires_height = 0;
int hires_steps = 0;
float hires_denoising_strength = 0.7f;
int hires_upscale_tile_size = 128;
std::vector<float> hires_custom_sigmas;
std::map<std::string, float> lora_map;
std::map<std::string, float> high_noise_lora_map;
// Derived and normalized fields.
std::string prompt_with_lora; // for metadata record only
std::vector<sd_lora_t> lora_vec;
// NOTE(review): no in-class initializer here; confirm that the constructor or
// resolve() assigns it before it is read.
sd_hires_upscaler_t resolved_hires_upscaler;
// Owned execution payload.
SDImageOwner init_image;
SDImageOwner end_image;
std::vector<SDImageOwner> ref_images;
SDImageOwner mask_image;
SDImageOwner control_image;
std::vector<SDImageOwner> pm_id_images;
std::vector<SDImageOwner> control_frames;
// Backing storage for sd_img_gen_params_t view fields.
// NOTE(review): the defaulted copy operations duplicate these vectors element by
// element, so a copy's views would still reference the source object's buffers
// until they are rebuilt — presumably by to_sd_img_gen_params_t() /
// to_sd_vid_gen_params_t(); confirm.
std::vector<sd_image_t> ref_image_views;
std::vector<sd_image_t> pm_id_image_views;
std::vector<sd_image_t> control_frame_views;
SDGenerationParams();
SDGenerationParams(const SDGenerationParams& other) = default;
SDGenerationParams& operator=(const SDGenerationParams& other) = default;
SDGenerationParams(SDGenerationParams&& other) noexcept = default;
SDGenerationParams& operator=(SDGenerationParams&& other) noexcept = default;
// Returns the command-line option descriptors for this struct.
ArgOptions get_options();
// Populates fields from a JSON string; `lora_path_resolver` is optional (defaults to an empty function).
bool from_json_str(const std::string& json_str,
const std::function<std::string(const std::string&)>& lora_path_resolver = {});
bool initialize_cache_params();
void extract_and_remove_lora(const std::string& lora_model_dir);
bool width_and_height_are_set() const;
void set_width_and_height_if_unset(int w, int h);
int get_resolved_width() const;
int get_resolved_height() const;
bool resolve(const std::string& lora_model_dir, const std::string& hires_upscalers_dir, bool strict = false);
bool validate(SDMode mode);
bool resolve_and_validate(SDMode mode,
const std::string& lora_model_dir,
const std::string& hires_upscalers_dir,
bool strict = false);
// Build the C API parameter structs from the current field values.
sd_img_gen_params_t to_sd_img_gen_params_t();
sd_vid_gen_params_t to_sd_vid_gen_params_t();
std::string to_string() const;
};
std::string version_string();
std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
const SDGenerationParams& gen_params,
int64_t seed,
SDMode mode = IMG_GEN);
std::string get_image_params(const SDContextParams& ctx_params,
const SDGenerationParams& gen_params,
int64_t seed,
SDMode mode = IMG_GEN);
#endif // __EXAMPLES_COMMON_COMMON_H__

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,7 @@
#include "log.h" #include "log.h"
#include <vector>
bool log_verbose = false; bool log_verbose = false;
bool log_color = false; bool log_color = false;
@ -34,17 +36,12 @@ void print_utf8(FILE* stream, const char* utf8) {
return; return;
} }
wchar_t* wbuf = (wchar_t*)malloc(wlen * sizeof(wchar_t)); std::vector<wchar_t> wbuf(static_cast<size_t>(wlen));
if (!wbuf) {
return;
}
MultiByteToWideChar(CP_UTF8, 0, utf8, -1, wbuf, wlen); MultiByteToWideChar(CP_UTF8, 0, utf8, -1, wbuf.data(), wlen);
DWORD written; DWORD written;
WriteConsoleW(h, wbuf, wlen - 1, &written, NULL); WriteConsoleW(h, wbuf.data(), wlen - 1, &written, NULL);
free(wbuf);
} else { } else {
DWORD written; DWORD written;
WriteFile(h, utf8, (DWORD)strlen(utf8), &written, NULL); WriteFile(h, utf8, (DWORD)strlen(utf8), &written, NULL);

File diff suppressed because it is too large Load Diff

View File

@ -57,7 +57,13 @@ int create_mjpg_avi_from_sd_images(const char* filename,
sd_image_t* images, sd_image_t* images,
int num_images, int num_images,
int fps, int fps,
int quality = 90); int quality = 90,
const sd_audio_t* audio = nullptr);
std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images,
int num_images,
int fps,
int quality = 90,
const sd_audio_t* audio = nullptr);
#ifdef SD_USE_WEBP #ifdef SD_USE_WEBP
int create_animated_webp_from_sd_images(const char* filename, int create_animated_webp_from_sd_images(const char* filename,
@ -65,6 +71,10 @@ int create_animated_webp_from_sd_images(const char* filename,
int num_images, int num_images,
int fps, int fps,
int quality = 90); int quality = 90);
std::vector<uint8_t> create_animated_webp_from_sd_images_to_vector(sd_image_t* images,
int num_images,
int fps,
int quality = 90);
#endif #endif
#ifdef SD_USE_WEBM #ifdef SD_USE_WEBM
@ -72,13 +82,32 @@ int create_webm_from_sd_images(const char* filename,
sd_image_t* images, sd_image_t* images,
int num_images, int num_images,
int fps, int fps,
int quality = 90); int quality = 90,
const sd_audio_t* audio = nullptr);
std::vector<uint8_t> create_webm_from_sd_images_to_vector(sd_image_t* images,
int num_images,
int fps,
int quality = 90,
const sd_audio_t* audio = nullptr);
#endif #endif
int create_video_from_sd_images(const char* filename, int create_video_from_sd_images(const char* filename,
sd_image_t* images, sd_image_t* images,
int num_images, int num_images,
int fps, int fps,
int quality = 90); int quality = 90,
const sd_audio_t* audio = nullptr);
std::vector<uint8_t> create_video_from_sd_images_to_vector(const std::string& output_format,
sd_image_t* images,
int num_images,
int fps,
int quality = 90,
const sd_audio_t* audio = nullptr);
bool write_wav_to_file(const std::string& path,
const float* interleaved_samples,
uint64_t sample_count,
uint32_t channels,
uint32_t sample_rate);
#endif // __MEDIA_IO_H__ #endif // __MEDIA_IO_H__

View File

@ -0,0 +1,236 @@
#ifndef __EXAMPLE_RESOURCE_OWNERS_H__
#define __EXAMPLE_RESOURCE_OWNERS_H__
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <utility>
#include <vector>
#include "stable-diffusion.h"
// Deleter for std::unique_ptr that releases memory with free().
struct FreeDeleter {
    void operator()(void* ptr) const {
        // free(nullptr) is a no-op, so no null check is required.
        std::free(ptr);
    }
};
// Deleter for std::unique_ptr that closes a FILE* with fclose().
struct FileCloser {
    void operator()(FILE* file) const {
        // Null handles are ignored.
        if (file == nullptr) {
            return;
        }
        fclose(file);
    }
};
// Deleter for std::unique_ptr that releases an sd_ctx_t with free_sd_ctx().
struct SDCtxDeleter {
    void operator()(sd_ctx_t* ctx) const {
        // Null contexts are ignored.
        if (ctx == nullptr) {
            return;
        }
        free_sd_ctx(ctx);
    }
};
// Deleter for std::unique_ptr that releases an upscaler_ctx_t with free_upscaler_ctx().
struct UpscalerCtxDeleter {
    void operator()(upscaler_ctx_t* ctx) const {
        // Null contexts are ignored.
        if (ctx == nullptr) {
            return;
        }
        free_upscaler_ctx(ctx);
    }
};
// std::unique_ptr aliases that pair each resource type with its deleter.
// FreeUniquePtr<T>: memory released with free().
template <typename T>
using FreeUniquePtr = std::unique_ptr<T, FreeDeleter>;
// FilePtr: FILE* closed with fclose().
using FilePtr = std::unique_ptr<FILE, FileCloser>;
// SDCtxPtr: sd_ctx_t released with free_sd_ctx().
using SDCtxPtr = std::unique_ptr<sd_ctx_t, SDCtxDeleter>;
// UpscalerCtxPtr: upscaler_ctx_t released with free_upscaler_ctx().
using UpscalerCtxPtr = std::unique_ptr<upscaler_ctx_t, UpscalerCtxDeleter>;
class SDImageOwner {
private:
static sd_image_t copy_image(const sd_image_t& image) {
if (image.data == nullptr) {
return {image.width, image.height, image.channel, nullptr};
}
const size_t byte_count = static_cast<size_t>(image.width) * image.height * image.channel;
uint8_t* raw_copy = static_cast<uint8_t*>(malloc(byte_count));
if (raw_copy == nullptr) {
return {0, 0, 0, nullptr};
}
std::memcpy(raw_copy, image.data, byte_count);
return {image.width, image.height, image.channel, raw_copy};
}
sd_image_t image_ = {0, 0, 0, nullptr};
public:
SDImageOwner() = default;
explicit SDImageOwner(sd_image_t image)
: image_(image) {
}
SDImageOwner(const SDImageOwner& other)
: image_(copy_image(other.image_)) {
}
SDImageOwner& operator=(const SDImageOwner& other) {
if (this != &other) {
reset(copy_image(other.image_));
}
return *this;
}
SDImageOwner(SDImageOwner&& other) noexcept
: image_(other.release()) {
}
SDImageOwner& operator=(SDImageOwner&& other) noexcept {
if (this != &other) {
reset();
image_ = other.release();
}
return *this;
}
~SDImageOwner() {
reset();
}
sd_image_t* put() {
if (image_.data != nullptr) {
free(image_.data);
image_.data = nullptr;
}
image_.width = 0;
image_.height = 0;
image_.channel = 0;
return &image_;
}
sd_image_t& get() {
return image_;
}
const sd_image_t& get() const {
return image_;
}
sd_image_t release() {
sd_image_t image = image_;
image_ = {0, 0, 0, nullptr};
return image;
}
void reset(sd_image_t image = {0, 0, 0, nullptr}) {
if (image_.data != nullptr) {
free(image_.data);
}
image_ = image;
}
};
class SDImageVec {
private:
std::vector<sd_image_t> images_;
public:
SDImageVec() = default;
SDImageVec(const SDImageVec&) = delete;
SDImageVec& operator=(const SDImageVec&) = delete;
SDImageVec(SDImageVec&& other) noexcept
: images_(std::move(other.images_)) {
}
SDImageVec& operator=(SDImageVec&& other) noexcept {
if (this != &other) {
clear();
images_ = std::move(other.images_);
}
return *this;
}
~SDImageVec() {
clear();
}
void push_back(sd_image_t image) {
images_.push_back(image);
}
void push_back(SDImageOwner&& image) {
images_.push_back(image.release());
}
void reserve(size_t count) {
images_.reserve(count);
}
void adopt(sd_image_t* images, int count) {
clear();
if (images == nullptr || count <= 0) {
free(images);
return;
}
images_.reserve(static_cast<size_t>(count));
for (int i = 0; i < count; ++i) {
images_.push_back(images[i]);
}
free(images);
}
size_t size() const {
return images_.size();
}
bool empty() const {
return images_.empty();
}
int count() const {
return static_cast<int>(images_.size());
}
explicit operator bool() const {
return !images_.empty();
}
sd_image_t* data() {
return images_.data();
}
const sd_image_t* data() const {
return images_.data();
}
sd_image_t& operator[](size_t index) {
return images_[index];
}
const sd_image_t& operator[](size_t index) const {
return images_[index];
}
std::vector<sd_image_t>& raw() {
return images_;
}
const std::vector<sd_image_t>& raw() const {
return images_;
}
void clear() {
for (sd_image_t& image : images_) {
free(image.data);
image.data = nullptr;
}
images_.clear();
}
};
#endif // __EXAMPLE_RESOURCE_OWNERS_H__

View File

@ -50,17 +50,33 @@ if(SD_SERVER_BUILD_FRONTEND AND EXISTS "${FRONTEND_DIR}")
set_source_files_properties("${GENERATED_HTML_HEADER}" PROPERTIES GENERATED TRUE) set_source_files_properties("${GENERATED_HTML_HEADER}" PROPERTIES GENERATED TRUE)
else() else()
message(WARNING "pnpm not found, frontend build disabled") if(EXISTS "${GENERATED_HTML_HEADER}")
message(STATUS "pnpm not found; using pre-built frontend header detected at ${GENERATED_HTML_HEADER}")
set(HAVE_FRONTEND_BUILD ON)
add_custom_target(${TARGET}_frontend)
else()
message(WARNING "pnpm not found; frontend build disabled.")
endif()
endif() endif()
else() else()
message(STATUS "Frontend disabled or directory not found: ${FRONTEND_DIR}") message(STATUS "Frontend disabled or directory not found: ${FRONTEND_DIR}")
endif() endif()
add_executable(${TARGET} add_executable(${TARGET}
../common/common.cpp
../common/log.cpp ../common/log.cpp
../common/media_io.cpp ../common/media_io.cpp
main.cpp main.cpp
runtime.cpp
async_jobs.cpp
routes_index.cpp
routes_openai.cpp
routes_sdapi.cpp
routes_sdcpp.cpp
) )
if(APPLE)
sd_set_macos_rpaths(${TARGET})
endif()
if(HAVE_FRONTEND_BUILD) if(HAVE_FRONTEND_BUILD)
add_dependencies(${TARGET} ${TARGET}_frontend) add_dependencies(${TARGET} ${TARGET}_frontend)

View File

@ -1,3 +1,33 @@
# Example
The following example starts `sd-server` with a standalone diffusion model, VAE, and LLM text encoder:
```
.\bin\Release\sd-server.exe --diffusion-model ..\models\diffusion_models\z_image_turbo_bf16.safetensors --vae ..\models\vae\ae.sft --llm ..\models\text_encoders\qwen_3_4b.safetensors --diffusion-fa --offload-to-cpu -v --cfg-scale 1.0
```
What this example does:
* `--diffusion-model` selects the standalone diffusion model
* `--vae` selects the VAE decoder
* `--llm` selects the text encoder / language model used by this pipeline
* `--diffusion-fa` enables flash attention in the diffusion model
* `--offload-to-cpu` reduces VRAM pressure by keeping weights in RAM when possible
* `-v` enables verbose logging
* `--cfg-scale 1.0` sets the default CFG scale for generation
After the server starts successfully:
* the web UI is available at `http://127.0.0.1:1234/`
* the native async API is available under `/sdcpp/v1/...`
* the compatibility APIs are available under `/v1/...` and `/sdapi/v1/...`
If you want to use a different host or port, pass:
```bash
--listen-ip <ip> --listen-port <port>
```
# Frontend # Frontend
## Build with Frontend ## Build with Frontend
@ -8,7 +38,7 @@ The server can optionally build the web frontend and embed it into the binary as
Install the following tools: Install the following tools:
* **Node.js** ≥ 22.18 * **Node.js** ≥ 20
https://nodejs.org/ https://nodejs.org/
* **pnpm** ≥ 10 * **pnpm** ≥ 10
@ -54,7 +84,7 @@ and embed the generated frontend into the server binary.
## Frontend Repository ## Frontend Repository
The web frontend is maintained in a **separate repository**, https://github.com/leejet/stable-ui. The web frontend is maintained in a **separate repository**, https://github.com/leejet/sdcpp-webui.
If you want to modify the UI or frontend logic, please submit pull requests to the **frontend repository**. If you want to modify the UI or frontend logic, please submit pull requests to the **frontend repository**.
@ -93,11 +123,11 @@ In this case, the server will load and serve the specified `index.html` file ins
usage: ./bin/sd-server [options] usage: ./bin/sd-server [options]
Svr Options: Svr Options:
-l, --listen-ip <string> server listen ip (default: 127.0.0.1) -l, --listen-ip <string> server listen ip (default: 127.0.0.1)
--serve-html-path <string> path to HTML file to serve at root (optional) --serve-html-path <string> path to HTML file to serve at root (optional)
--listen-port <int> server listen port (default: 1234) --listen-port <int> server listen port (default: 1234)
-v, --verbose print extra info -v, --verbose print extra info
--color colors the logging tags according to level --color colors the logging tags according to level
-h, --help show this help message and exit -h, --help show this help message and exit
Context Options: Context Options:
@ -106,28 +136,34 @@ Context Options:
--clip_g <string> path to the clip-g text encoder --clip_g <string> path to the clip-g text encoder
--clip_vision <string> path to the clip-vision encoder --clip_vision <string> path to the clip-vision encoder
--t5xxl <string> path to the t5xxl text encoder --t5xxl <string> path to the t5xxl text encoder
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...) --llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image,
mistral-small3.2 for flux2, ...)
--llm_vision <string> path to the llm vit --llm_vision <string> path to the llm vit
--qwen2vl <string> alias of --llm. Deprecated. --qwen2vl <string> alias of --llm. Deprecated.
--qwen2vl_vision <string> alias of --llm_vision. Deprecated. --qwen2vl_vision <string> alias of --llm_vision. Deprecated.
--diffusion-model <string> path to the standalone diffusion model --diffusion-model <string> path to the standalone diffusion model
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model --high-noise-diffusion-model <string> path to the standalone high noise diffusion model
--uncond-diffusion-model <string> path to the standalone unconditional diffusion model, currently used by
Ideogram4 CFG
--vae <string> path to standalone vae model --vae <string> path to standalone vae model
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality) --taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--tae <string> alias of --taesd --tae <string> alias of --taesd
--control-net <string> path to control net model --control-net <string> path to control net model
--embd-dir <string> embeddings directory --embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory --lora-model-dir <string> lora model directory
--hires-upscalers-dir <string> highres fix upscaler model directory
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--photo-maker <string> path to PHOTOMAKER model --photo-maker <string> path to PHOTOMAKER model
--upscale-model <string> path to esrgan model. --upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of -t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0,
CPU physical cores then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --chroma-t5-mask-pad <int> t5 mask pad size of chroma
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5) --max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
--vae-tiling process vae in tiles to reduce memory usage graph splitting; a negative value auto-detects free VRAM, sparing the
specified value (e.g. -0.5 will keep at least 0.5 GiB free)
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed
--mmap whether to memory-map model --mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram) --control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram) --clip-on-cpu keep clip in cpu (for low vram)
@ -142,20 +178,19 @@ Context Options:
--chroma-disable-dit-mask disable dit mask for chroma --chroma-disable-dit-mask disable dit mask for chroma
--qwen-image-zero-cond-t enable zero_cond_t for qwen image --qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma --chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K,
type of the weight file q4_K). If not specified, the default is the type of the weight file
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui) --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng --sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow] --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow,
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights flux2_flow]
contain any quantized parameters, the at_runtime mode will be used; otherwise, --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is
immediately will be used.The immediately mode may have precision and auto. In auto mode, if the model weights contain any quantized parameters,
compatibility issues with quantized parameters, but it usually offers faster inference the at_runtime mode will be used; otherwise, immediately will be used.The
speed and, in some cases, lower memory usage. The at_runtime mode, on the immediately mode may have precision and compatibility issues with quantized
other hand, is exactly the opposite. parameters, but it usually offers faster inference speed and, in some cases,
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) lower memory usage. The at_runtime mode, on the other hand, is exactly the
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 opposite.
(overrides --vae-tile-size)
Default Generation Options: Default Generation Options:
-p, --prompt <string> the prompt to render -p, --prompt <string> the prompt to render
@ -164,65 +199,106 @@ Default Generation Options:
--end-img <string> path to the end image, required by flf2v --end-img <string> path to the end image, required by flf2v
--mask <string> path to the mask image --mask <string> path to the mask image
--control-image <string> path to control image, control net --control-image <string> path to control image, control net
--control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in --control-video <string> path to control video frames, It must be a directory path. The video frames
lexicographical (character) order. For example, if the control video path is inside should be stored as images in lexicographical (character) order. For
`frames`, the directory contain images such as 00.png, 01.png, ... etc. example, if the control video path is `frames`, the directory contain images
such as 00.png, 01.png, ... etc.
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir --pm-id-images-dir <string> path to PHOTOMAKER input id images dir
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed --pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
--extra-sample-args <string> extra sampler/scheduler/guidance args, key=value list. APG supports apg_eta,
apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports
slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end;
ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma
--extra-tiling-args <string> extra VAE tiling args, key=value list. LTX video VAE supports
temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)
-H, --height <int> image height, in pixel space (default: 512) -H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512) -W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20) --steps <int> number of sample steps (default: 20)
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto) --high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified, --clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer
will be 1 for SD1.x, 2 for SD2.x (default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count <int> batch count -b, --batch-count <int> batch count
--video-frames <int> video frames (default: 1) --video-frames <int> video frames (default: 1)
--fps <int> fps (default: 24) --fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for --timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for
NitroSD-Vibrant NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1) --upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128) --upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
--hires-width <int> highres fix target width, 0 to use --hires-scale (default: 0)
--hires-height <int> highres fix target height, 0 to use --hires-scale (default: 0)
--hires-steps <int> highres fix second pass sample steps, 0 to reuse --steps (default: 0)
--hires-upscale-tile-size <int> highres fix upscaler tile size, reserved for model-backed upscalers (default:
128)
--cfg-scale <float> unconditional guidance scale: (default: 7.0) --cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) --img-cfg-scale <float> image guidance scale for inpaint or image edit models: (default: same as
--cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5) --guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 --slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means
medium disabled, a value of 2.5 is nice for sd3.5 medium
--skip-layer-start <float> SLG enabling point (default: 0.01) --skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2) --skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a and dpm++2s_a) --eta <float> noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and
res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto) --flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0) --high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) --high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or image edit models (default:
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5) same as --cfg-scale)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0) --high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input
(default: 3.5)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default:
0)
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01) --high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2) --high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
--high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a and dpm++2s_a) --high-noise-eta <float> (high noise) noise multiplier (default: 0 for ddim_trailing, tcd,
res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
--strength <float> strength for noising/unnoising (default: 0.75) --strength <float> strength for noising/unnoising (default: 0.75)
--pm-style-strength <float> --pm-style-strength <float>
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image --control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1 destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if
`--high-noise-steps` is set to -1
--vace-strength <float> wan vace strength --vace-strength <float> wan vace strength
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). --vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--hires-scale <float> highres fix scale when target size is not set (default: 2.0)
--hires-denoising-strength <float> highres fix second pass denoising strength (default: 0.7)
--increase-ref-index automatically increase the indices of references images based on the order
they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images --disable-auto-resize-ref-image disable auto resize of ref images
--disable-image-metadata do not embed generation metadata on image files --disable-image-metadata do not embed generation metadata on image files
--vae-tiling process vae in tiles to reduce memory usage
--temporal-tiling enable temporal tiling for LTX video VAE decode
--hires enable highres fix
-s, --seed RNG seed (default: 42, use random seed for < 0) -s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m,
tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s,
otherwise) er_sde, euler_cfg_pp, euler_a_cfg_pp] (default: euler for Flux/SD3/Wan, euler_a otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a,
ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep,
euler_a otherwise res_2s, er_sde, euler_cfg_pp, euler_a_cfg_pp] default: euler for Flux/SD3/Wan, euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits,
kl_optimal, lcm, bong_tangent], default: discrete smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent, ltx2], default:
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0"). model-specific
--sigmas custom sigma values for the sampler, comma-separated (e.g.,
"14.61,7.8,3.5,0.0").
--hires-sigmas custom sigma values for the highres fix second pass, comma-separated (e.g.,
"0.85,0.725,0.421875,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9]) --skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times) -r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting) --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET),
'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT
Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache: --cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit:
"threshold=0.25" or "threshold=1.5,reset=0" Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=.
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache Examples: "threshold=0.25" or "threshold=1.5,reset=0"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g.,
"1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static' --scm-policy SCM policy: 'dynamic' (default) or 'static'
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size
if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
``` ```

1315
examples/server/api.md Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,356 @@
// Extracted from main.cpp during server refactor.
#include "async_jobs.h"
#include <iomanip>
#include <sstream>
#include "common/log.h"
#include "common/media_io.h"
#include "common/resource_owners.hpp"
// Maps a job kind to its wire name. Every value other than VidGen
// (ImgGen included) is reported as "img_gen".
const char* async_job_kind_name(AsyncJobKind kind) {
    if (kind == AsyncJobKind::VidGen) {
        return "vid_gen";
    }
    return "img_gen";
}
// Maps a job status to its wire name. Unrecognised values are reported as
// "failed", the same string used for AsyncJobStatus::Failed.
const char* async_job_status_name(AsyncJobStatus status) {
    if (status == AsyncJobStatus::Queued) {
        return "queued";
    }
    if (status == AsyncJobStatus::Generating) {
        return "generating";
    }
    if (status == AsyncJobStatus::Completed) {
        return "completed";
    }
    if (status == AsyncJobStatus::Cancelled) {
        return "cancelled";
    }
    return "failed";
}
// Drops stale bookkeeping from the manager in two passes:
//   1. removes entries of `expired_jobs` whose stored timestamp is <= now;
//   2. removes finished jobs (completed_at != 0) that have outlived their TTL
//      and records their id in `expired_jobs` with a timestamp of
//      now + max(ttl, 60).
// Completed jobs use `completed_ttl_seconds`; every other finished status uses
// `failed_ttl_seconds`.
void purge_expired_jobs(AsyncJobManager& manager) {
    const int64_t current_time = unix_timestamp_now();

    auto marker = manager.expired_jobs.begin();
    while (marker != manager.expired_jobs.end()) {
        if (marker->second > current_time) {
            ++marker;
            continue;
        }
        marker = manager.expired_jobs.erase(marker);
    }

    auto entry = manager.jobs.begin();
    while (entry != manager.jobs.end()) {
        const auto& job = entry->second;

        // Jobs that have not finished yet never expire.
        if (job->completed_at == 0) {
            ++entry;
            continue;
        }

        const bool succeeded = job->status == AsyncJobStatus::Completed;
        int64_t ttl_seconds  = succeeded ? manager.completed_ttl_seconds
                                         : manager.failed_ttl_seconds;

        const int64_t age = current_time - job->completed_at;
        if (age < ttl_seconds) {
            ++entry;
            continue;
        }

        // Record the id before erasing; `job` refers into the erased entry.
        manager.expired_jobs[job->id] = current_time + std::max<int64_t>(ttl_seconds, 60);
        entry                         = manager.jobs.erase(entry);
    }
}
// Returns how many jobs are still waiting or running, i.e. whose status is
// Queued or Generating.
size_t count_pending_jobs(const AsyncJobManager& manager) {
    size_t active_jobs = 0;
    for (const auto& id_and_job : manager.jobs) {
        const auto state      = id_and_job.second->status;
        const bool is_waiting = state == AsyncJobStatus::Queued;
        const bool is_running = state == AsyncJobStatus::Generating;
        if (is_waiting || is_running) {
            active_jobs += 1;
        }
    }
    return active_jobs;
}
// Builds a job id of the form "job_<timestamp>_<counter>". Both numbers are
// written in hexadecimal; the counter is left-padded with '0' to at least
// eight characters. Post-increments `manager.next_id`.
std::string make_async_job_id(AsyncJobManager& manager) {
    std::ostringstream id_stream;
    // std::hex is sticky, so it applies to both numbers below.
    id_stream << std::hex;
    id_stream << "job_" << unix_timestamp_now() << "_";
    // std::setw only affects the next insertion, i.e. the counter.
    id_stream << std::setfill('0') << std::setw(8) << manager.next_id++;
    return id_stream.str();
}
// Removes `job` from the pending queue and marks it as cancelled.
// Returns false, leaving `job` untouched, when its id is not in the queue;
// otherwise clears any stored results, stamps the completion time, sets the
// "cancelled" error fields and returns true.
bool cancel_queued_job(AsyncJobManager& manager, AsyncGenerationJob& job) {
    auto& pending            = manager.queue;
    const auto removed_begin = std::remove(pending.begin(), pending.end(), job.id);
    const bool was_queued    = removed_begin != pending.end();
    if (!was_queued) {
        return false;
    }
    pending.erase(removed_begin, pending.end());

    job.status       = AsyncJobStatus::Cancelled;
    job.completed_at = unix_timestamp_now();

    // Discard any result payload.
    job.result_images_b64.clear();
    job.result_media_b64.clear();
    job.result_media_mime_type.clear();
    job.result_frame_count = 0;
    job.result_fps         = 0;

    job.error_code    = "cancelled";
    job.error_message = "job cancelled by client";
    return true;
}
// Serialises the public view of `job`.
//
// Always emits id, kind, status, created, started, completed, queue_position,
// result and error:
//   - started/completed are null while the corresponding timestamp is 0;
//   - queue_position is the 1-based index of the job in `manager.queue` while
//     the job is Queued and found there, otherwise 0;
//   - result is filled only for Completed jobs (video payload for VidGen, an
//     image list otherwise);
//   - error is filled only for Failed/Cancelled jobs, with a fallback code
//     when `error_code` is empty.
json make_async_job_json(const AsyncJobManager& manager, const AsyncGenerationJob& job) {
    json payload;
    payload["id"]      = job.id;
    payload["kind"]    = async_job_kind_name(job.kind);
    payload["status"]  = async_job_status_name(job.status);
    payload["created"] = job.created_at;

    if (job.started_at == 0) {
        payload["started"] = nullptr;
    } else {
        payload["started"] = job.started_at;
    }
    if (job.completed_at == 0) {
        payload["completed"] = nullptr;
    } else {
        payload["completed"] = job.completed_at;
    }

    payload["queue_position"] = 0;
    if (job.status == AsyncJobStatus::Queued) {
        size_t ordinal = 0;
        for (const auto& queued_id : manager.queue) {
            ++ordinal;
            if (queued_id == job.id) {
                payload["queue_position"] = ordinal;
                break;
            }
        }
    }

    switch (job.status) {
        case AsyncJobStatus::Completed: {
            if (job.kind == AsyncJobKind::VidGen) {
                payload["result"] = {
                    {"output_format", job.vid_gen.output_format},
                    {"mime_type", job.result_media_mime_type},
                    {"fps", job.result_fps},
                    {"frame_count", job.result_frame_count},
                    {"b64_json", job.result_media_b64},
                };
            } else {
                json images      = json::array();
                size_t image_idx = 0;
                for (const auto& encoded : job.result_images_b64) {
                    images.push_back({{"index", image_idx}, {"b64_json", encoded}});
                    ++image_idx;
                }
                payload["result"] = {
                    {"output_format", job.img_gen.output_format},
                    {"images", images},
                };
            }
            payload["error"] = nullptr;
            break;
        }
        case AsyncJobStatus::Failed:
        case AsyncJobStatus::Cancelled: {
            std::string code = job.error_code;
            if (code.empty()) {
                code = job.status == AsyncJobStatus::Cancelled ? "cancelled" : "generation_failed";
            }
            payload["result"] = nullptr;
            payload["error"]  = {
                {"code", code},
                {"message", job.error_message},
            };
            break;
        }
        default: {
            payload["result"] = nullptr;
            payload["error"]  = nullptr;
            break;
        }
    }
    return payload;
}
// Runs one image-generation job and appends the base64-encoded results to
// `output_images`.
//
// generate_image() is called while holding `runtime.sd_ctx_mutex`; encoding
// happens after the lock is released. Images with null data, or whose encoding
// yields no bytes, are skipped. Returns false and sets `error_message` when
// generation yields no images or when `output_images` is still empty after
// encoding.
bool execute_img_gen_job(ServerRuntime& runtime,
                         AsyncGenerationJob& job,
                         std::vector<std::string>& output_images,
                         std::string& error_message) {
    sd_img_gen_params_t params = job.img_gen.to_sd_img_gen_params_t();

    SDImageVec results;
    {
        std::lock_guard<std::mutex> lock(*runtime.sd_ctx_mutex);
        // adopt() takes ownership of the returned array and its pixel buffers.
        results.adopt(generate_image(runtime.sd_ctx, &params), params.batch_count);
    }

    const int num_results = results.count();
    if (num_results <= 0) {
        error_message = "generate_image returned no results";
        return false;
    }

    // PNG unless the request asked for "jpeg" or "webp".
    const auto& requested_format            = job.img_gen.output_format;
    const EncodedImageFormat encoded_format = requested_format == "jpeg"   ? EncodedImageFormat::JPEG
                                              : requested_format == "webp" ? EncodedImageFormat::WEBP
                                                                           : EncodedImageFormat::PNG;

    for (int i = 0; i < num_results; ++i) {
        const sd_image_t& frame = results[i];
        if (frame.data != nullptr) {
            std::string metadata;
            if (job.img_gen.gen_params.embed_image_metadata) {
                metadata = get_image_params(*runtime.ctx_params,
                                            job.img_gen.gen_params,
                                            job.img_gen.gen_params.seed + i);
            }
            auto image_bytes = encode_image_to_vector(encoded_format,
                                                      frame.data,
                                                      frame.width,
                                                      frame.height,
                                                      frame.channel,
                                                      metadata,
                                                      job.img_gen.output_compression);
            if (!image_bytes.empty()) {
                output_images.push_back(base64_encode(image_bytes));
            }
        }
    }

    if (!output_images.empty()) {
        return true;
    }
    error_message = "generate_image returned empty encoded outputs";
    return false;
}
// Runs one video-generation job and returns the encoded container as base64.
//
// generate_video() is called while holding `runtime.sd_ctx_mutex`; encoding
// happens after the lock is released. On success the four output parameters
// are filled and true is returned. On failure `error_message` is set, the
// outputs are left untouched and false is returned.
//
// The generated frames are owned by an SDImageVec and the generated audio by
// a scoped guard, so both are released on every exit path, including an
// exception thrown while adopting or encoding.
bool execute_vid_gen_job(ServerRuntime& runtime,
                         AsyncGenerationJob& job,
                         std::string& output_media_b64,
                         std::string& output_media_mime_type,
                         int& output_frame_count,
                         int& output_fps,
                         std::string& error_message) {
    sd_vid_gen_params_t params = job.vid_gen.to_sd_vid_gen_params_t();

    SDImageVec results;
    int num_results = 0;
    // Released with free_sd_audio() when this function returns or unwinds.
    std::unique_ptr<sd_audio_t, decltype(&free_sd_audio)> generated_audio(nullptr, &free_sd_audio);
    {
        std::lock_guard<std::mutex> lock(*runtime.sd_ctx_mutex);
        sd_image_t* raw_results = nullptr;
        sd_audio_t* raw_audio   = nullptr;
        if (!generate_video(runtime.sd_ctx, &params, &raw_results, &num_results, &raw_audio)) {
            raw_results = nullptr;
        }
        // Guard the audio before anything that might throw.
        generated_audio.reset(raw_audio);
        results.adopt(raw_results, num_results);
    }

    // Trust the adopted count rather than the value reported by the callee.
    num_results = results.count();
    if (num_results <= 0) {
        error_message = "generate_video returned no results";
        return false;
    }

    std::vector<uint8_t> video_bytes = create_video_from_sd_images_to_vector(job.vid_gen.output_format,
                                                                             results.data(),
                                                                             num_results,
                                                                             job.vid_gen.gen_params.fps,
                                                                             job.vid_gen.output_compression,
                                                                             generated_audio.get());
    if (video_bytes.empty()) {
        error_message = "failed to encode generated video container";
        return false;
    }

    output_media_b64       = base64_encode(video_bytes);
    output_media_mime_type = video_mime_type(job.vid_gen.output_format);
    output_frame_count     = num_results;
    output_fps             = job.vid_gen.gen_params.fps;
    return true;
}
// Worker loop that drains the async job queue.
//
// Each iteration:
//   1. waits (under manager.mutex) until a job id is queued or `stop` is set,
//      and exits once `stop` is set and the queue is empty;
//   2. pops the front job id, marks the job Generating and stamps started_at;
//   3. runs the job WITHOUT holding manager.mutex;
//   4. re-acquires the mutex and records the outcome on the job, unless the
//      job has been removed from manager.jobs in the meantime.
//
// Exceptions raised while running a job are caught and recorded as a failed
// job, so a single bad request cannot escape this function (which would
// terminate the process if it is used as a thread entry point).
void async_job_worker(ServerRuntime& runtime) {
    AsyncJobManager& manager = *runtime.async_job_manager;
    while (true) {
        std::shared_ptr<AsyncGenerationJob> job;
        {
            std::unique_lock<std::mutex> lock(manager.mutex);
            manager.cv.wait(lock, [&]() { return manager.stop || !manager.queue.empty(); });
            if (manager.stop && manager.queue.empty()) {
                break;
            }
            purge_expired_jobs(manager);
            if (manager.queue.empty()) {
                continue;
            }
            const std::string job_id = manager.queue.front();
            manager.queue.pop_front();
            auto it = manager.jobs.find(job_id);
            if (it == manager.jobs.end()) {
                // The id was queued but its job record no longer exists.
                continue;
            }
            job             = it->second;
            job->status     = AsyncJobStatus::Generating;
            job->started_at = unix_timestamp_now();
        }

        std::vector<std::string> output_images;
        std::string output_media_b64;
        std::string output_media_mime_type;
        int output_frame_count = 0;
        int output_fps         = 0;
        std::string error_message;
        bool ok = false;

        try {
            if (job->kind == AsyncJobKind::ImgGen) {
                ok = execute_img_gen_job(runtime, *job, output_images, error_message);
            } else if (job->kind == AsyncJobKind::VidGen) {
                ok = execute_vid_gen_job(runtime,
                                         *job,
                                         output_media_b64,
                                         output_media_mime_type,
                                         output_frame_count,
                                         output_fps,
                                         error_message);
            } else {
                error_message = "unsupported job kind";
            }
        } catch (const std::exception& e) {
            // Partial outputs are discarded by the failure branch below.
            ok            = false;
            error_message = std::string("generation raised an exception: ") + e.what();
        } catch (...) {
            ok            = false;
            error_message = "generation raised an unknown exception";
        }

        {
            std::lock_guard<std::mutex> lock(manager.mutex);
            auto it = manager.jobs.find(job->id);
            if (it == manager.jobs.end()) {
                // Job was removed while it was running; drop the result.
                continue;
            }
            job->completed_at = unix_timestamp_now();
            if (ok) {
                job->status                 = AsyncJobStatus::Completed;
                job->result_images_b64      = std::move(output_images);
                job->result_media_b64       = std::move(output_media_b64);
                job->result_media_mime_type = std::move(output_media_mime_type);
                job->result_frame_count     = output_frame_count;
                job->result_fps             = output_fps;
                job->error_code.clear();
                job->error_message.clear();
            } else {
                job->status        = AsyncJobStatus::Failed;
                job->error_code    = "generation_failed";
                job->error_message = error_message.empty() ? "unknown generation error" : error_message;
                job->result_images_b64.clear();
                job->result_media_b64.clear();
                job->result_media_mime_type.clear();
                job->result_frame_count = 0;
                job->result_fps         = 0;
            }
            purge_expired_jobs(manager);
        }
    }
}

View File

@ -0,0 +1,78 @@
#pragma once
#include <condition_variable>
#include <cstdint>
#include <deque>
#include <exception>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
#include "runtime.h"
// Kind of generation an async job performs. async_job_worker dispatches on
// this value to pick the matching execute_*_job function.
enum class AsyncJobKind {
ImgGen,  // run through execute_img_gen_job
VidGen,  // run through execute_vid_gen_job
};
// Lifecycle state of an async job.
enum class AsyncJobStatus {
Queued,      // default state of a newly constructed AsyncGenerationJob
Generating,  // set by async_job_worker when it dequeues the job
Completed,   // set by async_job_worker after a successful run
Failed,      // set by async_job_worker after an unsuccessful run
Cancelled,   // NOTE(review): presumably set by cancel_queued_job — confirm
};
// Map an enum value to a C string (definitions live outside this header;
// presumably the names used in API responses — confirm).
const char* async_job_kind_name(AsyncJobKind kind);
const char* async_job_status_name(AsyncJobStatus status);
// One asynchronous generation request together with its state and outcome.
// async_job_worker mutates status/timestamps/results while holding
// AsyncJobManager::mutex.
struct AsyncGenerationJob {
// Key of this job in AsyncJobManager::jobs and entry in AsyncJobManager::queue.
std::string id;
AsyncJobKind kind = AsyncJobKind::ImgGen;
AsyncJobStatus status = AsyncJobStatus::Queued;
// Timestamps come from unix_timestamp_now(); started_at/completed_at stay 0
// until the worker sets them.
int64_t created_at = unix_timestamp_now();
int64_t started_at = 0;
int64_t completed_at = 0;
// Request payloads: img_gen is read by execute_img_gen_job, vid_gen by
// execute_vid_gen_job.
ImgGenJobRequest img_gen;
VidGenJobRequest vid_gen;
// Outputs, written by the worker on success and cleared on failure.
std::vector<std::string> result_images_b64;  // base64-encoded images
std::string result_media_b64;                // base64-encoded video container
std::string result_media_mime_type;
int result_frame_count = 0;
int result_fps = 0;
// Failure details; the worker clears both on success.
std::string error_code;
std::string error_message;
};
// Shared state of the async job system. async_job_worker accesses `jobs`,
// `queue` and `stop` only while holding `mutex`.
struct AsyncJobManager {
std::mutex mutex;
// The worker waits on this until `stop` is set or `queue` is non-empty.
std::condition_variable cv;
// Job records keyed by job id.
std::unordered_map<std::string, std::shared_ptr<AsyncGenerationJob>> jobs;
// NOTE(review): presumably ids of purged jobs with a timestamp — confirm
// against purge_expired_jobs.
std::unordered_map<std::string, int64_t> expired_jobs;
// Ids of jobs waiting to run; the worker pops from the front.
std::deque<std::string> queue;
// NOTE(review): presumably the counter behind make_async_job_id — confirm.
uint64_t next_id = 0;
// When set, the worker exits once the queue has been drained.
bool stop = false;
// NOTE(review): limits/TTLs are not consumed in the code visible here;
// presumably enforced by the request handlers and purge_expired_jobs.
size_t max_pending_jobs = 64;
int64_t completed_ttl_seconds = 600;
int64_t failed_ttl_seconds = 600;
};
// Job bookkeeping helpers. async_job_worker calls purge_expired_jobs with
// manager.mutex held; NOTE(review): the other helpers presumably expect the
// same — confirm at their call sites.
void purge_expired_jobs(AsyncJobManager& manager);
size_t count_pending_jobs(const AsyncJobManager& manager);
std::string make_async_job_id(AsyncJobManager& manager);
bool cancel_queued_job(AsyncJobManager& manager, AsyncGenerationJob& job);
// Builds the JSON representation of a job.
json make_async_job_json(const AsyncJobManager& manager, const AsyncGenerationJob& job);
// Runs an image job; on success appends base64-encoded images to
// output_images, otherwise returns false and sets error_message.
bool execute_img_gen_job(ServerRuntime& runtime,
AsyncGenerationJob& job,
std::vector<std::string>& output_images,
std::string& error_message);
// Runs a video job; on success fills the output_* parameters, otherwise
// returns false and sets error_message.
bool execute_vid_gen_job(ServerRuntime& runtime,
AsyncGenerationJob& job,
std::string& output_media_b64,
std::string& output_media_mime_type,
int& output_frame_count,
int& output_fps,
std::string& error_message);
// Worker loop: processes queued jobs until the manager's `stop` flag is set
// and the queue is empty.
void async_job_worker(ServerRuntime& runtime);

@ -1 +1 @@
Subproject commit 1a34176cd6d39ad3a226b2b69047e71f6797f6bc Subproject commit 797ccf80825cc035508ba9b599b2a21953e7f835

File diff suppressed because it is too large Load Diff

11
examples/server/routes.h Normal file
View File

@ -0,0 +1,11 @@
#pragma once
#include <string>
#include "httplib.h"
#include "runtime.h"
// Route registration entry points; each installs its handlers on `svr`.
// Registers GET "/" (serves the configured HTML file or `index_html`).
void register_index_endpoints(httplib::Server& svr, const SDSvrParams& svr_params, const std::string& index_html);
// Registers the OpenAI-style routes under "/v1/".
void register_openai_api_endpoints(httplib::Server& svr, ServerRuntime& rt);
// Registers the routes under "/sdapi/v1/".
void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt);
// NOTE(review): routes for the native sd.cpp API; the definition is not visible here.
void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt);

View File

@ -0,0 +1,22 @@
#include "routes.h"
#include <fstream>
#include <iterator>
// Registers GET "/".
//
// When `svr_params.serve_html_path` is non-empty the file is read on every
// request and served as text/html (HTTP 500 if it cannot be opened);
// otherwise the `index_html` string is served.
void register_index_endpoints(httplib::Server& svr, const SDSvrParams& svr_params, const std::string& index_html) {
    // Both values are copied into the handler so it does not depend on the
    // lifetime of the arguments.
    svr.Get("/", [html_path = std::string(svr_params.serve_html_path), embedded_html = index_html](const httplib::Request&, httplib::Response& res) {
        if (html_path.empty()) {
            res.set_content(embedded_html, "text/html");
            return;
        }

        std::ifstream page(html_path);
        if (!page) {
            res.status = 500;
            res.set_content("Error: Unable to read HTML file", "text/plain");
            return;
        }

        std::string body((std::istreambuf_iterator<char>(page)), std::istreambuf_iterator<char>());
        res.set_content(body, "text/html");
    });
}

View File

@ -0,0 +1,388 @@
#include "routes.h"
#include <algorithm>
#include <ctime>
#include <regex>
#include "common/common.h"
#include "common/media_io.h"
#include "common/resource_owners.hpp"
// Extracts the payload of the first <sd_cpp_extra_args>...</sd_cpp_extra_args>
// tag in `text` and removes every such tag from `text`.
//
// Returns the payload of the first tag, or an empty string (leaving `text`
// untouched) when no tag is present.
static std::string extract_and_remove_sd_cpp_extra_args(std::string& text) {
    // `[\s\S]` is used instead of `.` because ECMAScript `.` does not match
    // line terminators, which would make multi-line payloads (e.g.
    // pretty-printed JSON) invisible. The regex is compiled once.
    static const std::regex tag_re(R"(<sd_cpp_extra_args>([\s\S]*?)</sd_cpp_extra_args>)");

    std::smatch match;
    if (!std::regex_search(text, match, tag_re)) {
        return {};
    }
    // Copy the capture before `text` is reassigned; `match` refers into it.
    std::string extracted = match[1].str();
    text                  = std::regex_replace(text, tag_re, "");
    return extracted;
}
// Builds an image-generation request from the JSON body of an OpenAI-style
// /v1/images/generations call.
//
// Reads "prompt", "n", "size" ("<width>x<height>"), "output_format" and
// "output_compression"; everything else starts from the runtime defaults.
// Returns false and sets `error_message` on invalid input. Exceptions thrown
// by json::parse / json::value are not caught here; the route handler wraps
// this call in a try/catch.
static bool build_openai_generation_request(const httplib::Request& req,
                                            ServerRuntime& runtime,
                                            ImgGenJobRequest& request,
                                            std::string& error_message) {
    if (req.body.empty()) {
        error_message = "empty body";
        return false;
    }
    json j                    = json::parse(req.body);
    std::string prompt        = j.value("prompt", "");
    int n                     = std::max(1, j.value("n", 1));
    std::string size          = j.value("size", "");
    std::string output_format = j.value("output_format", "png");
    int output_compression    = j.value("output_compression", 100);

    // Fall back to 512 per dimension when the runtime default is unset.
    // Each dimension is checked on its own (the height used to be gated on
    // the default *width*).
    int width  = runtime.default_gen_params->width > 0 ? runtime.default_gen_params->width : 512;
    int height = runtime.default_gen_params->height > 0 ? runtime.default_gen_params->height : 512;
    if (!size.empty()) {
        auto pos = size.find('x');
        if (pos != std::string::npos) {
            try {
                width  = std::stoi(size.substr(0, pos));
                height = std::stoi(size.substr(pos + 1));
            } catch (...) {
                // Unparsable size: keep whatever has been resolved so far.
            }
        }
    }
    if (prompt.empty()) {
        error_message = "prompt required";
        return false;
    }

    request.gen_params = *runtime.default_gen_params;
    if (!assign_output_options(request, output_format, output_compression, true, error_message)) {
        return false;
    }
    request.gen_params.prompt      = prompt;
    request.gen_params.width       = width;
    request.gen_params.height      = height;
    request.gen_params.batch_count = n;

    // Extra arguments embedded in the prompt are applied on top of the
    // fields set above.
    std::string sd_cpp_extra_args_str = extract_and_remove_sd_cpp_extra_args(request.gen_params.prompt);
    if (!sd_cpp_extra_args_str.empty() && !request.gen_params.from_json_str(sd_cpp_extra_args_str)) {
        error_message = "invalid sd_cpp_extra_args";
        return false;
    }

    // Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
    if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) {
        error_message = "invalid params";
        return false;
    }
    return true;
}
// Builds an image-edit request from the multipart form of an OpenAI-style
// /v1/images/edits call.
//
// Form fields: "prompt" (required), "image[]" or legacy "image" (at least
// one required), optional "mask", "n", "size" ("<width>x<height>"),
// "output_format" and "output_compression".
// Returns false and sets `error_message` on invalid input.
static bool build_openai_edit_request(const httplib::Request& req,
                                      ServerRuntime& runtime,
                                      ImgGenJobRequest& request,
                                      std::string& error_message) {
    if (!req.is_multipart_form_data()) {
        error_message = "Content-Type must be multipart/form-data";
        return false;
    }
    std::string prompt = req.form.get_field("prompt");
    if (prompt.empty()) {
        error_message = "prompt required";
        return false;
    }

    size_t image_count    = req.form.get_file_count("image[]");
    bool has_legacy_image = req.form.has_file("image");
    if (image_count == 0 && !has_legacy_image) {
        error_message = "at least one image[] required";
        return false;
    }

    std::vector<std::vector<uint8_t>> images_bytes;
    for (size_t i = 0; i < image_count; ++i) {
        auto file = req.form.get_file("image[]", i);
        images_bytes.emplace_back(file.content.begin(), file.content.end());
    }
    // The single "image" field is only consulted when no "image[]" was sent.
    if (image_count == 0 && has_legacy_image) {
        auto file = req.form.get_file("image");
        images_bytes.emplace_back(file.content.begin(), file.content.end());
    }

    std::vector<uint8_t> mask_bytes;
    if (req.form.has_file("mask")) {
        auto file = req.form.get_file("mask");
        mask_bytes.assign(file.content.begin(), file.content.end());
    }

    int n = 1;
    if (req.form.has_field("n")) {
        try {
            // Clamp to at least one image, matching the generations endpoint.
            n = std::max(1, std::stoi(req.form.get_field("n")));
        } catch (...) {
            // Unparsable "n": keep the default of 1.
        }
    }

    std::string size = req.form.get_field("size");
    int width        = -1;
    int height       = -1;
    if (!size.empty()) {
        auto pos = size.find('x');
        if (pos != std::string::npos) {
            try {
                width  = std::stoi(size.substr(0, pos));
                height = std::stoi(size.substr(pos + 1));
            } catch (...) {
                // Unparsable size: keep whatever has been resolved so far.
            }
        }
    }

    std::string output_format = req.form.has_field("output_format")
                                    ? req.form.get_field("output_format")
                                    : "png";
    int output_compression    = 100;
    try {
        output_compression = std::stoi(req.form.get_field("output_compression"));
    } catch (...) {
        // Missing or unparsable value: keep the default of 100.
    }

    request.gen_params = *runtime.default_gen_params;
    if (!assign_output_options(request, output_format, output_compression, false, error_message)) {
        return false;
    }
    request.gen_params.prompt      = prompt;
    request.gen_params.width       = width;
    request.gen_params.height      = height;
    request.gen_params.batch_count = n;

    for (auto& bytes : images_bytes) {
        int img_w           = 0;
        int img_h           = 0;
        uint8_t* raw_pixels = load_image_from_memory(
            reinterpret_cast<const char*>(bytes.data()),
            static_cast<int>(bytes.size()),
            img_w, img_h,
            width, height, 3);
        if (raw_pixels == nullptr) {
            // Undecodable upload: skip it, the others may still be usable.
            continue;
        }
        SDImageOwner image_owner({(uint32_t)img_w, (uint32_t)img_h, 3, raw_pixels});
        request.gen_params.set_width_and_height_if_unset(image_owner.get().width, image_owner.get().height);
        request.gen_params.ref_images.push_back(std::move(image_owner));
    }

    // An edit without any usable input image would silently degrade into a
    // plain text-to-image run, so report it instead.
    if (request.gen_params.ref_images.empty()) {
        error_message = "failed to decode any input image";
        return false;
    }
    request.gen_params.init_image = request.gen_params.ref_images.front();

    if (!mask_bytes.empty()) {
        // 0 is passed as the expected size when no size is known yet.
        int expected_width  = 0;
        int expected_height = 0;
        if (request.gen_params.width_and_height_are_set()) {
            expected_width  = request.gen_params.width;
            expected_height = request.gen_params.height;
        }
        int mask_w        = 0;
        int mask_h        = 0;
        uint8_t* mask_raw = load_image_from_memory(
            reinterpret_cast<const char*>(mask_bytes.data()),
            static_cast<int>(mask_bytes.size()),
            mask_w, mask_h,
            expected_width, expected_height, 1);
        request.gen_params.mask_image.reset({(uint32_t)mask_w, (uint32_t)mask_h, 1, mask_raw});
        const sd_image_t& mask_image = request.gen_params.mask_image.get();
        request.gen_params.set_width_and_height_if_unset(mask_image.width, mask_image.height);
    } else {
        // No mask uploaded: record only the dimensions, with no pixel data.
        request.gen_params.mask_image.reset({
            (uint32_t)request.gen_params.get_resolved_width(),
            (uint32_t)request.gen_params.get_resolved_height(),
            1,
            nullptr,
        });
    }

    std::string sd_cpp_extra_args_str = extract_and_remove_sd_cpp_extra_args(request.gen_params.prompt);
    if (!sd_cpp_extra_args_str.empty() && !request.gen_params.from_json_str(sd_cpp_extra_args_str)) {
        error_message = "invalid sd_cpp_extra_args";
        return false;
    }

    // Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
    if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) {
        error_message = "invalid params";
        return false;
    }
    return true;
}
// Runs image generation synchronously for `request` and hands the produced
// images to `results`.
//
// Returns false and sets `error_message` when `results` ends up empty.
static bool execute_sync_img_gen_request(ServerRuntime& runtime,
                                         ImgGenJobRequest& request,
                                         SDImageVec& results,
                                         std::string& error_message) {
    sd_img_gen_params_t sd_params = request.to_sd_img_gen_params_t();
    {
        // generate_image is only invoked while the context mutex is held.
        std::lock_guard<std::mutex> guard(*runtime.sd_ctx_mutex);
        sd_image_t* generated    = generate_image(runtime.sd_ctx, &sd_params);
        const int expected_count = request.gen_params.batch_count;
        results.adopt(generated, expected_count);
    }

    const bool produced = !results.empty();
    if (!produced) {
        error_message = "generate_image returned no results";
    }
    return produced;
}
// Registers the OpenAI-style routes:
//   GET  /v1/models              static single-entry model list
//   POST /v1/images/generations  text-to-image from a JSON body
//   POST /v1/images/edits        image edit from a multipart form
//
// Both POST handlers answer 400 for unsupported mode / invalid requests and
// 500 for generation failures or exceptions. Images that fail to encode are
// logged and left out of "data" in both handlers (the edits handler used to
// emit an empty "b64_json" entry instead).
void register_openai_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
    // The handlers outlive this call, so they capture a pointer to the
    // caller-owned runtime rather than a reference to the parameter.
    ServerRuntime* runtime = &rt;

    svr.Get("/v1/models", [](const httplib::Request&, httplib::Response& res) {
        json r;
        r["data"] = json::array();
        r["data"].push_back({{"id", "sd-cpp-local"}, {"object", "model"}, {"owned_by", "local"}});
        res.set_content(r.dump(), "application/json");
    });

    svr.Post("/v1/images/generations", [runtime](const httplib::Request& req, httplib::Response& res) {
        try {
            if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
                res.status = 400;
                res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
                return;
            }
            ImgGenJobRequest request;
            std::string error_message;
            if (!build_openai_generation_request(req, *runtime, request, error_message)) {
                res.status = 400;
                res.set_content(json({{"error", error_message}}).dump(), "application/json");
                return;
            }
            LOG_DEBUG("%s\n", request.gen_params.to_string().c_str());
            SDImageVec results;
            if (!execute_sync_img_gen_request(*runtime, request, results, error_message)) {
                res.status = 500;
                res.set_content(json({{"error", error_message}}).dump(), "application/json");
                return;
            }
            json out;
            out["created"]       = static_cast<long long>(std::time(nullptr));
            out["data"]          = json::array();
            out["output_format"] = request.output_format;
            for (int i = 0; i < request.gen_params.batch_count; ++i) {
                if (results[i].data == nullptr) {
                    continue;
                }
                // Image i is described with seed + i.
                std::string params = request.gen_params.embed_image_metadata
                                         ? get_image_params(*runtime->ctx_params,
                                                            request.gen_params,
                                                            request.gen_params.seed + i)
                                         : "";
                auto image_bytes   = encode_image_to_vector(request.output_format == "jpeg"
                                                                ? EncodedImageFormat::JPEG
                                                            : request.output_format == "webp"
                                                                ? EncodedImageFormat::WEBP
                                                                : EncodedImageFormat::PNG,
                                                            results[i].data,
                                                            results[i].width,
                                                            results[i].height,
                                                            results[i].channel,
                                                            params,
                                                            request.output_compression);
                if (image_bytes.empty()) {
                    LOG_ERROR("write image to mem failed");
                    continue;
                }
                json item;
                item["b64_json"] = base64_encode(image_bytes);
                out["data"].push_back(item);
            }
            res.set_content(out.dump(), "application/json");
            res.status = 200;
        } catch (const std::exception& e) {
            res.status = 500;
            json err;
            err["error"]   = "server_error";
            err["message"] = e.what();
            res.set_content(err.dump(), "application/json");
        }
    });

    svr.Post("/v1/images/edits", [runtime](const httplib::Request& req, httplib::Response& res) {
        try {
            if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
                res.status = 400;
                res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
                return;
            }
            ImgGenJobRequest request;
            std::string error_message;
            if (!build_openai_edit_request(req, *runtime, request, error_message)) {
                res.status = 400;
                res.set_content(json({{"error", error_message}}).dump(), "application/json");
                return;
            }
            LOG_DEBUG("%s\n", request.gen_params.to_string().c_str());
            SDImageVec results;
            if (!execute_sync_img_gen_request(*runtime, request, results, error_message)) {
                res.status = 500;
                res.set_content(json({{"error", error_message}}).dump(), "application/json");
                return;
            }
            json out;
            out["created"]       = static_cast<long long>(std::time(nullptr));
            out["data"]          = json::array();
            out["output_format"] = request.output_format;
            for (int i = 0; i < request.gen_params.batch_count; ++i) {
                if (results[i].data == nullptr) {
                    continue;
                }
                std::string params = request.gen_params.embed_image_metadata
                                         ? get_image_params(*runtime->ctx_params,
                                                            request.gen_params,
                                                            request.gen_params.seed + i)
                                         : "";
                // This handler only distinguishes "jpeg" from everything else.
                auto image_bytes   = encode_image_to_vector(request.output_format == "jpeg" ? EncodedImageFormat::JPEG : EncodedImageFormat::PNG,
                                                            results[i].data,
                                                            results[i].width,
                                                            results[i].height,
                                                            results[i].channel,
                                                            params,
                                                            request.output_compression);
                if (image_bytes.empty()) {
                    // Same policy as the generations handler: never return an
                    // empty "b64_json" for an image that failed to encode.
                    LOG_ERROR("write image to mem failed");
                    continue;
                }
                json item;
                item["b64_json"] = base64_encode(image_bytes);
                out["data"].push_back(item);
            }
            res.set_content(out.dump(), "application/json");
            res.status = 200;
        } catch (const std::exception& e) {
            res.status = 500;
            json err;
            err["error"]   = "server_error";
            err["message"] = e.what();
            res.set_content(err.dump(), "application/json");
        }
    });
}

View File

@ -0,0 +1,473 @@
#include "routes.h"
#include <algorithm>
#include <cctype>
#include <cstring>
#include <regex>
#include <string_view>
#include <unordered_map>
#include "common/common.h"
#include "common/media_io.h"
#include "common/resource_owners.hpp"
namespace fs = std::filesystem;
// Extracts the payload of the first <sd_cpp_extra_args>...</sd_cpp_extra_args>
// tag in `text` and removes every such tag from `text`.
//
// Returns the payload of the first tag, or an empty string (leaving `text`
// untouched) when no tag is present.
static std::string extract_and_remove_sd_cpp_extra_args(std::string& text) {
    // `[\s\S]` is used instead of `.` because ECMAScript `.` does not match
    // line terminators, which would make multi-line payloads (e.g.
    // pretty-printed JSON) invisible. The regex is compiled once.
    static const std::regex tag_re(R"(<sd_cpp_extra_args>([\s\S]*?)</sd_cpp_extra_args>)");

    std::smatch match;
    if (!std::regex_search(text, match, tag_re)) {
        return {};
    }
    // Copy the capture before `text` is reassigned; `match` refers into it.
    std::string extracted = match[1].str();
    text                  = std::regex_replace(text, tag_re, "");
    return extracted;
}
// Picks the path reported to API clients as "the model": model_path when it
// is set, otherwise diffusion_model_path, otherwise an empty path.
static fs::path resolve_display_model_path(const ServerRuntime& runtime) {
    const auto& params = *runtime.ctx_params;
    if (params.model_path.empty() && params.diffusion_model_path.empty()) {
        return {};
    }
    return params.model_path.empty() ? fs::path(params.diffusion_model_path)
                                     : fs::path(params.model_path);
}
// Returns `value` with every character passed through std::tolower.
static std::string lower_ascii(std::string value) {
    for (char& ch : value) {
        // Go through unsigned char: std::tolower is undefined for negative
        // values other than EOF.
        ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
    }
    return value;
}
static enum sample_method_t get_sdapi_sample_method(std::string name) {
enum sample_method_t result = str_to_sample_method(name.c_str());
if (result != SAMPLE_METHOD_COUNT) {
return result;
}
name = lower_ascii(name);
static const std::unordered_map<std::string_view, sample_method_t> hardcoded{
{"euler a", EULER_A_SAMPLE_METHOD},
{"k_euler_a", EULER_A_SAMPLE_METHOD},
{"euler", EULER_SAMPLE_METHOD},
{"k_euler", EULER_SAMPLE_METHOD},
{"heun", HEUN_SAMPLE_METHOD},
{"k_heun", HEUN_SAMPLE_METHOD},
{"dpm2", DPM2_SAMPLE_METHOD},
{"k_dpm_2", DPM2_SAMPLE_METHOD},
{"lcm", LCM_SAMPLE_METHOD},
{"ddim", DDIM_TRAILING_SAMPLE_METHOD},
{"dpm++ 2m", DPMPP2M_SAMPLE_METHOD},
{"k_dpmpp_2m", DPMPP2M_SAMPLE_METHOD},
{"res multistep", RES_MULTISTEP_SAMPLE_METHOD},
{"k_res_multistep", RES_MULTISTEP_SAMPLE_METHOD},
{"res 2s", RES_2S_SAMPLE_METHOD},
{"k_res_2s", RES_2S_SAMPLE_METHOD},
{"euler_cfg_pp", EULER_CFG_PP_SAMPLE_METHOD},
{"k_euler_cfg_pp", EULER_CFG_PP_SAMPLE_METHOD},
{"euler_a_cfg_pp", EULER_CFG_PP_SAMPLE_METHOD},
{"k_euler_a_cfg_pp", EULER_CFG_PP_SAMPLE_METHOD},
};
auto it = hardcoded.find(name);
return it != hardcoded.end() ? it->second : SAMPLE_METHOD_COUNT;
}
// Replaces `mask_owner` with a single-channel width x height mask whose
// bytes are all 255. If the buffer cannot be allocated, the owner is reset
// to a 0x0 image with no data.
static void assign_solid_mask(SDImageOwner& mask_owner, int width, int height) {
    const size_t byte_count = static_cast<size_t>(width) * static_cast<size_t>(height);
    // Allocated with malloc; the buffer is handed over to mask_owner below.
    void* buffer = malloc(byte_count);
    if (buffer != nullptr) {
        std::memset(buffer, 255, byte_count);
        mask_owner.reset({(uint32_t)width, (uint32_t)height, 1, static_cast<uint8_t*>(buffer)});
        return;
    }
    mask_owner.reset({0, 0, 1, nullptr});
}
// Builds an image-generation request from the JSON body of an
// /sdapi/v1/txt2img or /sdapi/v1/img2img call.
//
// `img2img` enables the init image / mask / denoising-strength handling and
// disables the hires ("enable_hr") options. Returns false and sets
// `error_message` on invalid input. Type errors raised by the JSON accessors
// are not caught here; the route handler wraps this call in a try/catch.
static bool build_sdapi_img_gen_request(const json& j,
                                        ServerRuntime& runtime,
                                        bool img2img,
                                        ImgGenJobRequest& request,
                                        std::string& error_message) {
    std::string prompt          = j.value("prompt", "");
    std::string negative_prompt = j.value("negative_prompt", "");
    // These two are only used to reject non-positive sizes; the request
    // itself keeps -1 ("unset") when the client sent no size (see below).
    int width       = j.value("width", 512);
    int height      = j.value("height", 512);
    int steps       = j.value("steps", runtime.default_gen_params->sample_params.sample_steps);
    float cfg_scale = j.value("cfg_scale", runtime.default_gen_params->sample_params.guidance.txt_cfg);
    // The default must be int64_t: json::value() converts to the type of the
    // default, so a plain `-1` would read the seed as int and truncate seeds
    // that do not fit in 32 bits.
    int64_t seed               = j.value("seed", static_cast<int64_t>(-1));
    int batch_size             = j.value("batch_size", 1);
    int clip_skip              = j.value("clip_skip", -1);
    std::string sampler_name   = j.value("sampler_name", "");
    std::string scheduler_name = j.value("scheduler", "");

    if (width <= 0 || height <= 0) {
        error_message = "width and height must be positive";
        return false;
    }
    if (prompt.empty()) {
        error_message = "prompt required";
        return false;
    }

    request.gen_params                                = *runtime.default_gen_params;
    request.gen_params.prompt                         = prompt;
    request.gen_params.negative_prompt                = negative_prompt;
    request.gen_params.seed                           = seed;
    request.gen_params.sample_params.sample_steps     = steps;
    request.gen_params.batch_count                    = batch_size;
    request.gen_params.sample_params.guidance.txt_cfg = cfg_scale;
    request.gen_params.width                          = j.value("width", -1);
    request.gen_params.height                         = j.value("height", -1);

    if (!img2img && j.value("enable_hr", false)) {
        request.gen_params.hires_enabled = true;
        request.gen_params.hires_scale   = j.value("hr_scale", request.gen_params.hires_scale);
        request.gen_params.hires_width   = j.value("hr_resize_x", request.gen_params.hires_width);
        request.gen_params.hires_height  = j.value("hr_resize_y", request.gen_params.hires_height);
        request.gen_params.hires_steps   = j.value("hr_steps", request.gen_params.hires_steps);
        request.gen_params.hires_denoising_strength =
            j.value("denoising_strength", request.gen_params.hires_denoising_strength);
        request.gen_params.hires_upscaler = j.value("hr_upscaler", request.gen_params.hires_upscaler);
    }

    std::string sd_cpp_extra_args_str = extract_and_remove_sd_cpp_extra_args(request.gen_params.prompt);
    if (!sd_cpp_extra_args_str.empty() && !request.gen_params.from_json_str(sd_cpp_extra_args_str)) {
        error_message = "invalid sd_cpp_extra_args";
        return false;
    }

    // Applied after the extra args, so these JSON fields win when recognised.
    if (clip_skip > 0) {
        request.gen_params.clip_skip = clip_skip;
    }
    enum sample_method_t sample_method = get_sdapi_sample_method(sampler_name);
    if (sample_method != SAMPLE_METHOD_COUNT) {
        request.gen_params.sample_params.sample_method = sample_method;
    }
    enum scheduler_t scheduler = str_to_scheduler(scheduler_name.c_str());
    if (scheduler != SCHEDULER_COUNT) {
        request.gen_params.sample_params.scheduler = scheduler;
    }

    if (j.contains("lora") && j["lora"].is_array()) {
        // A "lora" array replaces whatever LoRAs were configured before.
        request.gen_params.lora_map.clear();
        request.gen_params.high_noise_lora_map.clear();
        for (const auto& item : j["lora"]) {
            if (!item.is_object()) {
                continue;
            }
            std::string path   = item.value("path", "");
            float multiplier   = item.value("multiplier", 1.0f);
            bool is_high_noise = item.value("is_high_noise", false);
            if (path.empty()) {
                error_message = "lora.path required";
                return false;
            }
            std::string fullpath = get_lora_full_path(runtime, path);
            if (fullpath.empty()) {
                error_message = "invalid lora path: " + path;
                return false;
            }
            // Multipliers of repeated entries for the same file add up.
            if (is_high_noise) {
                request.gen_params.high_noise_lora_map[fullpath] += multiplier;
            } else {
                request.gen_params.lora_map[fullpath] += multiplier;
            }
        }
    }

    if (img2img) {
        // 0 is passed as the expected size when no size is known yet.
        const int expected_width  = request.gen_params.width_and_height_are_set() ? request.gen_params.width : 0;
        const int expected_height = request.gen_params.width_and_height_are_set() ? request.gen_params.height : 0;

        if (j.contains("init_images") && j["init_images"].is_array() && !j["init_images"].empty()) {
            // Only the first init image is used.
            if (decode_base64_image(j["init_images"][0].get<std::string>(),
                                    3,
                                    expected_width,
                                    expected_height,
                                    request.gen_params.init_image)) {
                const sd_image_t& image = request.gen_params.init_image.get();
                request.gen_params.set_width_and_height_if_unset(image.width, image.height);
            }
        }

        if (j.contains("mask") && j["mask"].is_string()) {
            if (decode_base64_image(j["mask"].get<std::string>(),
                                    1,
                                    expected_width,
                                    expected_height,
                                    request.gen_params.mask_image)) {
                const sd_image_t& image = request.gen_params.mask_image.get();
                request.gen_params.set_width_and_height_if_unset(image.width, image.height);
            }
            sd_image_t& mask_image      = request.gen_params.mask_image.get();
            bool inpainting_mask_invert = j.value("inpainting_mask_invert", 0) != 0;
            if (inpainting_mask_invert && mask_image.data != nullptr) {
                // Computed once and in size_t so the product cannot wrap in
                // 32-bit arithmetic.
                const size_t pixel_count = static_cast<size_t>(mask_image.width) * static_cast<size_t>(mask_image.height);
                for (size_t i = 0; i < pixel_count; ++i) {
                    mask_image.data[i] = 255 - mask_image.data[i];
                }
            }
        } else {
            // No mask supplied: use a mask filled with 255.
            const int resolved_width  = request.gen_params.get_resolved_width();
            const int resolved_height = request.gen_params.get_resolved_height();
            assign_solid_mask(request.gen_params.mask_image, resolved_width, resolved_height);
        }

        float denoising_strength = j.value("denoising_strength", -1.f);
        if (denoising_strength >= 0.f) {
            request.gen_params.strength = std::min(denoising_strength, 1.0f);
        }
    }

    if (j.contains("extra_images") && j["extra_images"].is_array()) {
        for (const auto& extra_image : j["extra_images"]) {
            if (!extra_image.is_string()) {
                continue;
            }
            SDImageOwner image_owner;
            if (decode_base64_image(extra_image.get<std::string>(),
                                    3,
                                    request.gen_params.width_and_height_are_set() ? request.gen_params.width : 0,
                                    request.gen_params.width_and_height_are_set() ? request.gen_params.height : 0,
                                    image_owner)) {
                const sd_image_t& image = image_owner.get();
                request.gen_params.set_width_and_height_if_unset(image.width, image.height);
                request.gen_params.ref_images.push_back(std::move(image_owner));
            }
        }
    }

    // Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
    if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) {
        error_message = "invalid params";
        return false;
    }
    return true;
}
// Registers the routes under /sdapi/v1/:
//   POST txt2img, img2img        image generation from a JSON body
//   GET  loras, upscalers        contents of the runtime caches
//   GET  latent-upscale-modes    fixed list of mode names
//   GET  samplers, schedulers    "default" plus every enum name
//   GET  sd-models, options      description of the loaded model
void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) {
    // The handlers outlive this call, so they hold a pointer to the
    // caller-owned runtime.
    ServerRuntime* runtime = &rt;

    // Shared implementation of txt2img / img2img.
    auto sdapi_any2img = [runtime](const httplib::Request& req, httplib::Response& res, bool img2img) {
        try {
            if (req.body.empty()) {
                res.status = 400;
                res.set_content(R"({"error":"empty body"})", "application/json");
                return;
            }
            if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
                res.status = 400;
                res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
                return;
            }

            json body = json::parse(req.body);
            ImgGenJobRequest request;
            std::string build_error;
            if (!build_sdapi_img_gen_request(body, *runtime, img2img, request, build_error)) {
                res.status = 400;
                res.set_content(json({{"error", build_error}}).dump(), "application/json");
                return;
            }
            LOG_DEBUG("%s\n", request.gen_params.to_string().c_str());

            sd_img_gen_params_t sd_params = request.to_sd_img_gen_params_t();
            SDImageVec generated;
            int image_count = 0;
            {
                // generate_image is only invoked while the context mutex is held.
                std::lock_guard<std::mutex> guard(*runtime->sd_ctx_mutex);
                sd_image_t* raw_images = generate_image(runtime->sd_ctx, &sd_params);
                image_count            = request.gen_params.batch_count;
                generated.adopt(raw_images, image_count);
            }
            if (generated.empty()) {
                res.status = 500;
                res.set_content(R"({"error":"generate_image returned no results"})", "application/json");
                return;
            }

            json out;
            out["images"]     = json::array();
            out["parameters"] = body;  // the request JSON is echoed back
            out["info"]       = "";
            for (int idx = 0; idx < image_count; ++idx) {
                if (generated[idx].data == nullptr) {
                    continue;
                }
                // Image idx is described with seed + idx.
                std::string metadata;
                if (request.gen_params.embed_image_metadata) {
                    metadata = get_image_params(*runtime->ctx_params,
                                                request.gen_params,
                                                request.gen_params.seed + idx);
                }
                // These routes always answer with PNG.
                auto encoded = encode_image_to_vector(EncodedImageFormat::PNG,
                                                      generated[idx].data,
                                                      generated[idx].width,
                                                      generated[idx].height,
                                                      generated[idx].channel,
                                                      metadata);
                if (encoded.empty()) {
                    LOG_ERROR("write image to mem failed");
                    continue;
                }
                out["images"].push_back(base64_encode(encoded));
            }
            res.set_content(out.dump(), "application/json");
            res.status = 200;
        } catch (const std::exception& e) {
            res.status = 500;
            json err;
            err["error"]   = "server_error";
            err["message"] = e.what();
            res.set_content(err.dump(), "application/json");
        }
    };

    svr.Post("/sdapi/v1/txt2img", [sdapi_any2img](const httplib::Request& req, httplib::Response& res) {
        sdapi_any2img(req, res, false);
    });

    svr.Post("/sdapi/v1/img2img", [sdapi_any2img](const httplib::Request& req, httplib::Response& res) {
        sdapi_any2img(req, res, true);
    });

    svr.Get("/sdapi/v1/loras", [runtime](const httplib::Request&, httplib::Response& res) {
        refresh_lora_cache(*runtime);
        json listing = json::array();
        {
            std::lock_guard<std::mutex> guard(*runtime->lora_mutex);
            for (const auto& lora : *runtime->lora_cache) {
                json item;
                item["name"] = lora.name;
                item["path"] = lora.path;
                listing.push_back(item);
            }
        }
        res.set_content(listing.dump(), "application/json");
    });

    svr.Get("/sdapi/v1/upscalers", [runtime](const httplib::Request&, httplib::Response& res) {
        refresh_upscaler_cache(*runtime);

        // Entries that are not backed by a model file.
        auto builtin_entry = [](const char* label) {
            json item;
            item["name"]       = label;
            item["model_name"] = nullptr;
            item["model_path"] = nullptr;
            item["model_url"]  = nullptr;
            item["scale"]      = 4;
            return item;
        };

        json listing = json::array();
        for (const char* label : {"None", "Lanczos", "Nearest"}) {
            listing.push_back(builtin_entry(label));
        }
        {
            std::lock_guard<std::mutex> guard(*runtime->upscaler_mutex);
            for (const auto& upscaler : *runtime->upscaler_cache) {
                json item;
                item["name"]       = upscaler.name;
                item["model_name"] = upscaler.model_name;
                item["model_path"] = upscaler.fullpath;
                item["model_url"]  = nullptr;
                item["scale"]      = upscaler.scale;
                listing.push_back(item);
            }
        }
        res.set_content(listing.dump(), "application/json");
    });

    svr.Get("/sdapi/v1/latent-upscale-modes", [](const httplib::Request&, httplib::Response& res) {
        static const char* const mode_names[] = {
            "Latent",
            "Latent (nearest)",
            "Latent (nearest-exact)",
            "Latent (antialiased)",
            "Latent (bicubic)",
            "Latent (bicubic antialiased)",
        };
        json listing = json::array();
        for (const char* mode_name : mode_names) {
            json item;
            item["name"] = mode_name;
            listing.push_back(item);
        }
        res.set_content(listing.dump(), "application/json");
    });

    svr.Get("/sdapi/v1/samplers", [](const httplib::Request&, httplib::Response& res) {
        std::vector<std::string> names{"default"};
        for (int method = 0; method < SAMPLE_METHOD_COUNT; ++method) {
            names.push_back(sd_sample_method_name((sample_method_t)method));
        }
        json listing = json::array();
        for (const std::string& sampler : names) {
            json entry;
            entry["name"]    = sampler;
            entry["aliases"] = json::array({sampler});
            entry["options"] = json::object();
            listing.push_back(entry);
        }
        res.set_content(listing.dump(), "application/json");
    });

    svr.Get("/sdapi/v1/schedulers", [](const httplib::Request&, httplib::Response& res) {
        std::vector<std::string> names{"default"};
        for (int sched = 0; sched < SCHEDULER_COUNT; ++sched) {
            names.push_back(sd_scheduler_name((scheduler_t)sched));
        }
        json listing = json::array();
        for (const std::string& scheduler : names) {
            json entry;
            entry["name"]  = scheduler;
            entry["label"] = scheduler;
            listing.push_back(entry);
        }
        res.set_content(listing.dump(), "application/json");
    });

    svr.Get("/sdapi/v1/sd-models", [runtime](const httplib::Request&, httplib::Response& res) {
        const fs::path shown_model = resolve_display_model_path(*runtime);
        json entry;
        entry["title"]      = shown_model.stem();
        entry["model_name"] = shown_model.stem();
        entry["filename"]   = shown_model.filename();
        // Fixed placeholder values; no hash is computed here.
        entry["hash"]   = "8888888888";
        entry["sha256"] = "8888888888888888888888888888888888888888888888888888888888888888";
        entry["config"] = nullptr;

        json listing = json::array();
        listing.push_back(entry);
        res.set_content(listing.dump(), "application/json");
    });

    svr.Get("/sdapi/v1/options", [runtime](const httplib::Request&, httplib::Response& res) {
        const fs::path shown_model = resolve_display_model_path(*runtime);
        json options;
        options["samples_format"]      = "png";
        options["sd_model_checkpoint"] = shown_model.stem();
        res.set_content(options.dump(), "application/json");
    });
}

View File

@ -0,0 +1,595 @@
#include "routes.h"
#include <algorithm>
#include <cmath>
#include <filesystem>
#include "async_jobs.h"
#include "common/common.h"
namespace fs = std::filesystem;
static bool parse_cache_mode(const std::string& mode_str, sd_cache_mode_t& mode_out) {
if (mode_str == "disabled") {
mode_out = SD_CACHE_DISABLED;
return true;
}
if (mode_str == "easycache") {
mode_out = SD_CACHE_EASYCACHE;
return true;
}
if (mode_str == "ucache") {
mode_out = SD_CACHE_UCACHE;
return true;
}
if (mode_str == "dbcache") {
mode_out = SD_CACHE_DBCACHE;
return true;
}
if (mode_str == "taylorseer") {
mode_out = SD_CACHE_TAYLORSEER;
return true;
}
if (mode_str == "cache-dit") {
mode_out = SD_CACHE_CACHE_DIT;
return true;
}
if (mode_str == "spectrum") {
mode_out = SD_CACHE_SPECTRUM;
return true;
}
return false;
}
// Wraps a float for JSON output: finite values become JSON numbers, while
// NaN and +/-infinity become JSON null.
static json finite_number_or_null(float value) {
    if (!std::isfinite(value)) {
        return json(nullptr);
    }
    return json(value);
}
// Returns the display name of a scheduler, or "default" when the value is
// not below SCHEDULER_COUNT.
static const char* capability_scheduler_name(enum scheduler_t scheduler) {
    if (scheduler < SCHEDULER_COUNT) {
        return sd_scheduler_name(scheduler);
    }
    return "default";
}
// Returns the display name of a sample method, or "default" when the value
// is not below SAMPLE_METHOD_COUNT.
static const char* capability_sample_method_name(enum sample_method_t sample_method) {
    if (sample_method < SAMPLE_METHOD_COUNT) {
        return sd_sample_method_name(sample_method);
    }
    return "default";
}
// Serialises the VAE tiling parameters into a JSON object. A null
// extra_tiling_args pointer is emitted as an empty string.
static json make_vae_tiling_json(const sd_tiling_params_t& params) {
    json tiling = json::object();
    tiling["enabled"]           = params.enabled;
    tiling["temporal_tiling"]   = params.temporal_tiling;
    tiling["tile_size_x"]       = params.tile_size_x;
    tiling["tile_size_y"]       = params.tile_size_y;
    tiling["target_overlap"]    = params.target_overlap;
    tiling["rel_size_x"]        = params.rel_size_x;
    tiling["rel_size_y"]        = params.rel_size_y;
    tiling["extra_tiling_args"] = params.extra_tiling_args ? params.extra_tiling_args : "";
    return tiling;
}
// Picks the path used to describe the loaded model: model_path when set,
// otherwise diffusion_model_path, otherwise an empty path.
static fs::path resolve_display_model_path(const ServerRuntime& runtime) {
    const auto& params = *runtime.ctx_params;
    fs::path display_path;
    if (!params.model_path.empty()) {
        display_path = fs::path(params.model_path);
    } else if (!params.diffusion_model_path.empty()) {
        display_path = fs::path(params.diffusion_model_path);
    }
    return display_path;
}
// Serialises sampling parameters (scheduler, method, steps, guidance and
// skip-layer guidance) into a JSON object. eta, flow_shift and img_cfg are
// emitted as null when they are not finite.
static json make_sample_params_json(const sd_sample_params_t& sample_params, const std::vector<int>& skip_layers) {
    const auto& guidance = sample_params.guidance;

    json slg = json::object();
    slg["layers"]      = skip_layers;
    slg["layer_start"] = guidance.slg.layer_start;
    slg["layer_end"]   = guidance.slg.layer_end;
    slg["scale"]       = guidance.slg.scale;

    json guidance_json = json::object();
    guidance_json["txt_cfg"]            = guidance.txt_cfg;
    guidance_json["img_cfg"]            = finite_number_or_null(guidance.img_cfg);
    guidance_json["distilled_guidance"] = guidance.distilled_guidance;
    guidance_json["slg"]                = slg;

    json out = json::object();
    out["scheduler"]        = capability_scheduler_name(sample_params.scheduler);
    out["sample_method"]    = capability_sample_method_name(sample_params.sample_method);
    out["sample_steps"]     = sample_params.sample_steps;
    out["eta"]              = finite_number_or_null(sample_params.eta);
    out["shifted_timestep"] = sample_params.shifted_timestep;
    out["flow_shift"]       = finite_number_or_null(sample_params.flow_shift);
    out["guidance"]         = guidance_json;
    return out;
}
// Serialises the hires-related default generation parameters into a JSON
// object.
static json make_hires_json(const SDGenerationParams& defaults) {
    json hires = json::object();
    hires["enabled"]            = defaults.hires_enabled;
    hires["upscaler"]           = defaults.hires_upscaler;
    hires["scale"]              = defaults.hires_scale;
    hires["target_width"]       = defaults.hires_width;
    hires["target_height"]      = defaults.hires_height;
    hires["steps"]              = defaults.hires_steps;
    hires["denoising_strength"] = defaults.hires_denoising_strength;
    hires["custom_sigmas"]      = defaults.hires_custom_sigmas;
    hires["upscale_tile_size"]  = defaults.hires_upscale_tile_size;
    return hires;
}
// Builds the JSON object describing the default image-generation request.
// Non-positive width/height defaults are reported as 512, and
// output_compression is always reported as 100.
static json make_img_gen_defaults_json(const SDGenerationParams& defaults, const std::string& output_format) {
    json out = json::object();
    out["prompt"]                = defaults.prompt;
    out["negative_prompt"]       = defaults.negative_prompt;
    out["clip_skip"]             = defaults.clip_skip;
    out["width"]                 = defaults.width > 0 ? defaults.width : 512;
    out["height"]                = defaults.height > 0 ? defaults.height : 512;
    out["strength"]              = defaults.strength;
    out["seed"]                  = defaults.seed;
    out["batch_count"]           = defaults.batch_count;
    out["auto_resize_ref_image"] = defaults.auto_resize_ref_image;
    out["increase_ref_index"]    = defaults.increase_ref_index;
    out["control_strength"]      = defaults.control_strength;
    out["sample_params"]         = make_sample_params_json(defaults.sample_params, defaults.skip_layers);
    out["hires"]                 = make_hires_json(defaults);
    out["vae_tiling_params"]     = make_vae_tiling_json(defaults.vae_tiling_params);
    out["cache_mode"]            = defaults.cache_mode;
    out["cache_option"]          = defaults.cache_option;
    out["scm_mask"]              = defaults.scm_mask;
    out["scm_policy_dynamic"]    = defaults.scm_policy_dynamic;
    out["output_format"]         = output_format;
    out["output_compression"]    = 100;
    return out;
}
// Builds the JSON object describing the default video-generation request.
// Non-positive width/height defaults are reported as 512, and
// output_compression is always reported as 100.
static json make_vid_gen_defaults_json(const SDGenerationParams& defaults, const std::string& output_format) {
    json out = json::object();
    out["prompt"]            = defaults.prompt;
    out["negative_prompt"]   = defaults.negative_prompt;
    out["clip_skip"]         = defaults.clip_skip;
    out["width"]             = defaults.width > 0 ? defaults.width : 512;
    out["height"]            = defaults.height > 0 ? defaults.height : 512;
    out["strength"]          = defaults.strength;
    out["seed"]              = defaults.seed;
    out["video_frames"]      = defaults.video_frames;
    out["fps"]               = defaults.fps;
    out["moe_boundary"]      = defaults.moe_boundary;
    out["vace_strength"]     = defaults.vace_strength;
    out["sample_params"]     = make_sample_params_json(defaults.sample_params, defaults.skip_layers);
    out["high_noise_sample_params"] =
        make_sample_params_json(defaults.high_noise_sample_params, defaults.high_noise_skip_layers);
    out["hires"]              = make_hires_json(defaults);
    out["vae_tiling_params"]  = make_vae_tiling_json(defaults.vae_tiling_params);
    out["cache_mode"]         = defaults.cache_mode;
    out["cache_option"]       = defaults.cache_option;
    out["scm_mask"]           = defaults.scm_mask;
    out["scm_policy_dynamic"] = defaults.scm_policy_dynamic;
    out["output_format"]      = output_format;
    out["output_compression"] = 100;
    return out;
}
// Feature flags advertised for image generation. Every listed feature is
// reported as available except cancelling a job that is already generating.
static json make_img_gen_features_json() {
    json features = json::object();
    for (const char* available : {"init_image", "mask_image", "control_image", "ref_images", "lora",
                                  "vae_tiling", "hires", "cache", "cancel_queued"}) {
        features[available] = true;
    }
    features["cancel_generating"] = false;
    return features;
}
// Feature flags advertised for video generation. Every listed feature is
// reported as available except cancelling a job that is already generating.
static json make_vid_gen_features_json() {
    json features = json::object();
    for (const char* available : {"init_image", "end_image", "control_frames", "high_noise_sample_params",
                                  "lora", "vae_tiling", "cache", "cancel_queued"}) {
        features[available] = true;
    }
    features["cancel_generating"] = false;
    return features;
}
// Builds the payload served by GET /sdcpp/v1/capabilities: model identity,
// supported generation modes with their defaults / output formats / feature
// flags, request limits, sampler and scheduler names, and the LoRA and
// upscaler listings. Both caches are refreshed first so the listings reflect
// the directories' current contents.
static json make_capabilities_json(ServerRuntime& runtime) {
    refresh_lora_cache(runtime);
    refresh_upscaler_cache(runtime);

    AsyncJobManager& manager  = *runtime.async_job_manager;
    const auto& defaults      = *runtime.default_gen_params;
    const fs::path model_path = resolve_display_model_path(runtime);
    const bool supports_img   = runtime_supports_generation_mode(runtime, IMG_GEN);
    const bool supports_vid   = runtime_supports_generation_mode(runtime, VID_GEN);

    // Sampler and scheduler names, in enum order.
    json samplers = json::array();
    for (int method = 0; method < SAMPLE_METHOD_COUNT; ++method) {
        samplers.push_back(sd_sample_method_name((sample_method_t)method));
    }
    json schedulers = json::array();
    for (int sched = 0; sched < SCHEDULER_COUNT; ++sched) {
        schedulers.push_back(sd_scheduler_name((scheduler_t)sched));
    }

    json image_output_formats = supported_img_output_formats();
    json video_output_formats = supported_vid_output_formats();

    // LoRA listing: snapshot taken under the cache lock.
    json available_loras = json::array();
    {
        std::lock_guard<std::mutex> lock(*runtime.lora_mutex);
        for (const auto& lora : *runtime.lora_cache) {
            available_loras.push_back({
                {"name", lora.name},
                {"path", lora.path},
            });
        }
    }

    // Upscaler listing: fixed built-in names first, then the cached entries.
    json available_upscalers = json::array();
    static const char* const builtin_upscalers[] = {
        "None",
        "Lanczos",
        "Nearest",
        "Latent",
        "Latent (nearest)",
        "Latent (nearest-exact)",
        "Latent (antialiased)",
        "Latent (bicubic)",
        "Latent (bicubic antialiased)",
    };
    for (const char* builtin : builtin_upscalers) {
        available_upscalers.push_back({
            {"name", builtin},
        });
    }
    {
        std::lock_guard<std::mutex> lock(*runtime.upscaler_mutex);
        for (const auto& upscaler : *runtime.upscaler_cache) {
            available_upscalers.push_back({
                {"name", upscaler.name},
            });
        }
    }

    // The advertised default format is the first supported one, with a
    // literal fallback when the list is empty.
    std::string default_img_output_format = "png";
    if (!image_output_formats.empty()) {
        default_img_output_format = image_output_formats[0].get<std::string>();
    }
    std::string default_vid_output_format = "avi";
    if (!video_output_formats.empty()) {
        default_vid_output_format = video_output_formats[0].get<std::string>();
    }

    // Per-mode sections, filled only for modes the loaded model supports.
    json supported_modes        = json::array();
    json defaults_by_mode       = json::object();
    json output_formats_by_mode = json::object();
    json features_by_mode       = json::object();
    if (supports_img) {
        supported_modes.push_back("img_gen");
        defaults_by_mode["img_gen"]       = make_img_gen_defaults_json(defaults, default_img_output_format);
        output_formats_by_mode["img_gen"] = image_output_formats;
        features_by_mode["img_gen"]       = make_img_gen_features_json();
    }
    if (supports_vid) {
        supported_modes.push_back("vid_gen");
        defaults_by_mode["vid_gen"]       = make_vid_gen_defaults_json(defaults, default_vid_output_format);
        output_formats_by_mode["vid_gen"] = video_output_formats;
        features_by_mode["vid_gen"]       = make_vid_gen_features_json();
    }

    // Top-level fields mirror one mode: img_gen when supported, else vid_gen.
    // With neither supported they keep the placeholder values below.
    json top_level_defaults       = json::object();
    json top_level_output_formats = json::array();
    json top_level_features       = {
        {"cancel_queued", true},
        {"cancel_generating", false},
    };
    std::string current_mode;
    if (supports_img) {
        current_mode = "img_gen";
    } else if (supports_vid) {
        current_mode = "vid_gen";
    }
    if (!current_mode.empty()) {
        top_level_defaults       = defaults_by_mode[current_mode];
        top_level_output_formats = output_formats_by_mode[current_mode];
        top_level_features       = features_by_mode[current_mode];
    }

    json result;
    result["model"] = {
        {"name", model_path.filename().u8string()},
        {"stem", model_path.stem().u8string()},
        {"path", model_path.u8string()},
    };
    result["current_mode"]     = current_mode;
    result["supported_modes"]  = supported_modes;
    result["defaults"]         = top_level_defaults;
    result["defaults_by_mode"] = defaults_by_mode;
    result["limits"]           = {
        {"min_width", 64},
        {"max_width", 4096},
        {"min_height", 64},
        {"max_height", 4096},
        {"max_batch_count", 8},
        {"max_queue_size", manager.max_pending_jobs},
    };
    result["samplers"]               = samplers;
    result["schedulers"]             = schedulers;
    result["output_formats"]         = top_level_output_formats;
    result["output_formats_by_mode"] = output_formats_by_mode;
    result["features"]               = top_level_features;
    result["features_by_mode"]       = features_by_mode;
    result["loras"]                  = available_loras;
    result["upscalers"]              = available_upscalers;
    return result;
}
static bool parse_img_gen_request(const json& body,
ServerRuntime& runtime,
ImgGenJobRequest& request,
std::string& error_message) {
request.gen_params = *runtime.default_gen_params;
refresh_lora_cache(runtime);
if (!request.gen_params.from_json_str(body.dump(), [&](const std::string& path) {
return get_lora_full_path(runtime, path);
})) {
error_message = "invalid generation parameters";
return false;
}
std::string output_format = body.value("output_format", "png");
int output_compression = body.value("output_compression", 100);
if (!assign_output_options(request, output_format, output_compression, true, error_message)) {
return false;
}
// Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) {
error_message = "invalid generation parameters";
return false;
}
return true;
}
static bool parse_vid_gen_request(const json& body,
ServerRuntime& runtime,
VidGenJobRequest& request,
std::string& error_message) {
request.gen_params = *runtime.default_gen_params;
refresh_lora_cache(runtime);
if (!request.gen_params.from_json_str(body.dump(), [&](const std::string& path) {
return get_lora_full_path(runtime, path);
})) {
error_message = "invalid generation parameters";
return false;
}
std::string output_format = body.value("output_format", "webm");
int output_compression = body.value("output_compression", 100);
if (!assign_output_options(request, output_format, output_compression, error_message)) {
return false;
}
// Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
if (!request.gen_params.resolve_and_validate(VID_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) {
error_message = "invalid generation parameters";
return false;
}
return true;
}
void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
ServerRuntime* runtime = &rt;
svr.Get("/sdcpp/v1/capabilities", [runtime](const httplib::Request&, httplib::Response& res) {
res.status = 200;
res.set_content(make_capabilities_json(*runtime).dump(), "application/json");
});
svr.Post("/sdcpp/v1/img_gen", [runtime](const httplib::Request& req, httplib::Response& res) {
try {
if (req.body.empty()) {
res.status = 400;
res.set_content(R"({"error":"empty body"})", "application/json");
return;
}
if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
res.status = 400;
res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
return;
}
json body = json::parse(req.body);
ImgGenJobRequest request;
std::string error_message;
if (!parse_img_gen_request(body, *runtime, request, error_message)) {
res.status = 400;
res.set_content(json({{"error", error_message}}).dump(), "application/json");
return;
}
AsyncJobManager& manager = *runtime->async_job_manager;
std::shared_ptr<AsyncGenerationJob> job = std::make_shared<AsyncGenerationJob>();
job->kind = AsyncJobKind::ImgGen;
job->status = AsyncJobStatus::Queued;
job->created_at = unix_timestamp_now();
job->img_gen = std::move(request);
{
std::lock_guard<std::mutex> lock(manager.mutex);
purge_expired_jobs(manager);
if (count_pending_jobs(manager) >= manager.max_pending_jobs) {
res.status = 429;
res.set_content(R"({"error":"job queue is full"})", "application/json");
return;
}
job->id = make_async_job_id(manager);
manager.jobs[job->id] = job;
manager.queue.push_back(job->id);
}
manager.cv.notify_one();
json out;
out["id"] = job->id;
out["kind"] = async_job_kind_name(job->kind);
out["status"] = async_job_status_name(job->status);
out["created"] = job->created_at;
out["poll_url"] = "/sdcpp/v1/jobs/" + job->id;
res.status = 202;
res.set_content(out.dump(), "application/json");
} catch (const json::parse_error& e) {
res.status = 400;
res.set_content(json({{"error", "invalid json"}, {"message", e.what()}}).dump(), "application/json");
} catch (const std::exception& e) {
res.status = 500;
res.set_content(json({{"error", "server_error"}, {"message", e.what()}}).dump(), "application/json");
}
});
svr.Post("/sdcpp/v1/vid_gen", [runtime](const httplib::Request& req, httplib::Response& res) {
try {
if (req.body.empty()) {
res.status = 400;
res.set_content(R"({"error":"empty body"})", "application/json");
return;
}
if (!runtime_supports_generation_mode(*runtime, VID_GEN)) {
res.status = 400;
res.set_content(json({{"error", unsupported_generation_mode_error(VID_GEN)}}).dump(), "application/json");
return;
}
json body = json::parse(req.body);
VidGenJobRequest request;
std::string error_message;
if (!parse_vid_gen_request(body, *runtime, request, error_message)) {
res.status = 400;
res.set_content(json({{"error", error_message}}).dump(), "application/json");
return;
}
AsyncJobManager& manager = *runtime->async_job_manager;
std::shared_ptr<AsyncGenerationJob> job = std::make_shared<AsyncGenerationJob>();
job->kind = AsyncJobKind::VidGen;
job->status = AsyncJobStatus::Queued;
job->created_at = unix_timestamp_now();
job->vid_gen = std::move(request);
{
std::lock_guard<std::mutex> lock(manager.mutex);
purge_expired_jobs(manager);
if (count_pending_jobs(manager) >= manager.max_pending_jobs) {
res.status = 429;
res.set_content(R"({"error":"job queue is full"})", "application/json");
return;
}
job->id = make_async_job_id(manager);
manager.jobs[job->id] = job;
manager.queue.push_back(job->id);
}
manager.cv.notify_one();
json out;
out["id"] = job->id;
out["kind"] = async_job_kind_name(job->kind);
out["status"] = async_job_status_name(job->status);
out["created"] = job->created_at;
out["poll_url"] = "/sdcpp/v1/jobs/" + job->id;
res.status = 202;
res.set_content(out.dump(), "application/json");
} catch (const json::parse_error& e) {
res.status = 400;
res.set_content(json({{"error", "invalid json"}, {"message", e.what()}}).dump(), "application/json");
} catch (const std::exception& e) {
res.status = 500;
res.set_content(json({{"error", "server_error"}, {"message", e.what()}}).dump(), "application/json");
}
});
svr.Get(R"(/sdcpp/v1/jobs/([A-Za-z0-9_\-]+))", [runtime](const httplib::Request& req, httplib::Response& res) {
AsyncJobManager& manager = *runtime->async_job_manager;
std::lock_guard<std::mutex> lock(manager.mutex);
purge_expired_jobs(manager);
std::string job_id = req.matches[1];
auto it = manager.jobs.find(job_id);
if (it == manager.jobs.end()) {
if (manager.expired_jobs.find(job_id) != manager.expired_jobs.end()) {
res.status = 410;
res.set_content(R"({"error":"job expired"})", "application/json");
} else {
res.status = 404;
res.set_content(R"({"error":"job not found"})", "application/json");
}
return;
}
res.status = 200;
res.set_content(make_async_job_json(manager, *it->second).dump(), "application/json");
});
svr.Post(R"(/sdcpp/v1/jobs/([A-Za-z0-9_\-]+)/cancel)", [runtime](const httplib::Request& req, httplib::Response& res) {
AsyncJobManager& manager = *runtime->async_job_manager;
std::lock_guard<std::mutex> lock(manager.mutex);
purge_expired_jobs(manager);
std::string job_id = req.matches[1];
auto it = manager.jobs.find(job_id);
if (it == manager.jobs.end()) {
if (manager.expired_jobs.find(job_id) != manager.expired_jobs.end()) {
res.status = 410;
res.set_content(R"({"error":"job expired"})", "application/json");
} else {
res.status = 404;
res.set_content(R"({"error":"job not found"})", "application/json");
}
return;
}
auto& job = *it->second;
if (job.status == AsyncJobStatus::Queued) {
if (!cancel_queued_job(manager, job)) {
res.status = 409;
res.set_content(R"({"error":"job queue state changed before cancellation"})", "application/json");
return;
}
res.status = 200;
res.set_content(make_async_job_json(manager, job).dump(), "application/json");
return;
}
if (job.status == AsyncJobStatus::Generating) {
res.status = 409;
res.set_content(R"({"error":"job is currently generating and cannot be interrupted yet"})", "application/json");
return;
}
res.status = 200;
res.set_content(make_async_job_json(manager, job).dump(), "application/json");
});
}

333
examples/server/runtime.cpp Normal file
View File

@ -0,0 +1,333 @@
#include "runtime.h"
#include <algorithm>
#include <cctype>
#include <chrono>
#include <cstdlib>
#include <filesystem>
#include <mutex>
#include <regex>
#include <sstream>
#include "common/common.h"
#include "common/log.h"
namespace fs = std::filesystem;
// Returns a copy of `value` with every character passed through
// std::tolower (each char is widened via unsigned char first).
static std::string lower_ascii(std::string value) {
    for (char& ch : value) {
        const unsigned char as_unsigned = static_cast<unsigned char>(ch);
        ch                              = static_cast<char>(std::tolower(as_unsigned));
    }
    return value;
}
// True when the path's extension, compared case-insensitively, is one of
// .gguf, .pt, .pth or .safetensors.
static bool is_supported_model_ext(const fs::path& p) {
    const std::string ext = lower_ascii(p.extension().string());
    for (const char* accepted : {".gguf", ".pt", ".pth", ".safetensors"}) {
        if (ext == accepted) {
            return true;
        }
    }
    return false;
}
// Standard base64 alphabet, indexed by 6-bit value.
static const std::string k_base64_chars =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";

// Encodes `bytes` as base64 text, padded with '=' to a multiple of four
// characters. An empty input yields an empty string.
std::string base64_encode(const std::vector<uint8_t>& bytes) {
    std::string ret;
    ret.reserve(((bytes.size() + 2) / 3) * 4);

    // Bit accumulator. It is unsigned so that bits shifted off the top are
    // discarded with well-defined wrap-around; with a signed int the repeated
    // left shift overflows (undefined behaviour) after a few input bytes.
    uint32_t val = 0;
    // Number of not-yet-emitted bits in `val`, minus 6.
    int valb = -6;
    for (uint8_t c : bytes) {
        val = (val << 8) + c;
        valb += 8;
        while (valb >= 0) {
            ret.push_back(k_base64_chars[(val >> valb) & 0x3F]);
            valb -= 6;
        }
    }
    if (valb > -6) {
        // Emit the remaining 2 or 4 bits, left-aligned in a final 6-bit group.
        ret.push_back(k_base64_chars[((val << 8) >> (valb + 8)) & 0x3F]);
    }
    while (ret.size() % 4) {
        ret.push_back('=');
    }
    return ret;
}
// Returns the lower-cased form of an output format name, so format
// comparisons are case-insensitive.
std::string normalize_output_format(std::string output_format) {
    for (char& ch : output_format) {
        ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
    }
    return output_format;
}
// Lists the image output formats this build can produce: always "png" and
// "jpeg", plus "webp" when compiled with SD_USE_WEBP and allow_webp is true.
std::vector<std::string> supported_img_output_formats(bool allow_webp) {
    std::vector<std::string> formats;
    formats.emplace_back("png");
    formats.emplace_back("jpeg");
#ifdef SD_USE_WEBP
    if (allow_webp) {
        formats.emplace_back("webp");
    }
#else
    (void)allow_webp;  // unused without WebP support
#endif
    return formats;
}
// Lists the video output formats this build can produce, in order: "webm"
// (with SD_USE_WEBM), "webp" (with SD_USE_WEBP), then "avi", which is always
// present.
std::vector<std::string> supported_vid_output_formats() {
    std::vector<std::string> formats;
#ifdef SD_USE_WEBM
    formats.emplace_back("webm");
#endif
#ifdef SD_USE_WEBP
    formats.emplace_back("webp");
#endif
    formats.emplace_back("avi");
    return formats;
}
// Builds the error text for a rejected video output format, listing the
// supported formats separated by ", ".
static std::string valid_vid_output_formats_message() {
    std::string message = "invalid output_format, must be one of [";
    bool first          = true;
    for (const std::string& format : supported_vid_output_formats()) {
        if (!first) {
            message += ", ";
        }
        message += format;
        first = false;
    }
    message += "]";
    return message;
}
// Stores the normalised output format and the compression level (clamped to
// [0, 100]) on an image request. Returns false, with error_message listing
// the accepted formats, when the format is not supported.
bool assign_output_options(ImgGenJobRequest& request,
                           std::string output_format,
                           int output_compression,
                           bool allow_webp,
                           std::string& error_message) {
    request.output_format      = normalize_output_format(std::move(output_format));
    request.output_compression = std::clamp(output_compression, 0, 100);

    const std::vector<std::string> valid_formats = supported_img_output_formats(allow_webp);
    for (const std::string& candidate : valid_formats) {
        if (candidate == request.output_format) {
            return true;
        }
    }

    // Unsupported format: report the accepted ones.
    error_message = "invalid output_format, must be one of [";
    for (size_t idx = 0; idx < valid_formats.size(); ++idx) {
        if (idx != 0) {
            error_message += ", ";
        }
        error_message += valid_formats[idx];
    }
    error_message += "]";
    return false;
}
// Stores the normalised output format and the compression level (clamped to
// [0, 100]) on a video request. Returns false, with error_message listing
// the accepted formats, when the format is not one this build supports
// ("avi" always; "webm"/"webp" only when compiled in).
bool assign_output_options(VidGenJobRequest& request,
                           std::string output_format,
                           int output_compression,
                           std::string& error_message) {
    request.output_format      = normalize_output_format(std::move(output_format));
    request.output_compression = std::clamp(output_compression, 0, 100);

    // supported_vid_output_formats() contains exactly the formats accepted
    // here under the same SD_USE_WEBM / SD_USE_WEBP build flags.
    const std::vector<std::string> accepted = supported_vid_output_formats();
    const bool is_supported =
        std::find(accepted.begin(), accepted.end(), request.output_format) != accepted.end();
    if (is_supported) {
        return true;
    }
    error_message = valid_vid_output_formats_message();
    return false;
}
// Maps a video output format name to its MIME type. Anything other than
// "webm" or "webp" is reported as AVI.
std::string video_mime_type(const std::string& output_format) {
    const char* mime = "video/x-msvideo";
    if (output_format == "webm") {
        mime = "video/webm";
    } else if (output_format == "webp") {
        mime = "image/webp";
    }
    return mime;
}
// Reports whether the loaded context can serve the given mode. VID_GEN and
// IMG_GEN are answered by the context; any other mode is reported as
// supported.
bool runtime_supports_generation_mode(const ServerRuntime& runtime, SDMode mode) {
    switch (mode) {
        case VID_GEN:
            return sd_ctx_supports_video_generation(runtime.sd_ctx);
        case IMG_GEN:
            return sd_ctx_supports_image_generation(runtime.sd_ctx);
        default:
            return true;
    }
}
// Returns the error text used when the loaded model cannot serve `mode`.
std::string unsupported_generation_mode_error(SDMode mode) {
    switch (mode) {
        case VID_GEN:
            return "loaded model does not support vid_gen";
        case IMG_GEN:
            return "loaded model does not support img_gen";
        default:
            return "loaded model does not support requested mode";
    }
}
// Builds the command-line option table for the server parameters. Each entry
// carries a short flag, a long flag, help text and a pointer to the member
// that receives the value, so the returned ArgOptions refers to this object.
ArgOptions SDSvrParams::get_options() {
ArgOptions options;
// Options taking a string value.
options.string_options = {
{"-l", "--listen-ip", "server listen ip (default: 127.0.0.1)", &listen_ip},
{"", "--serve-html-path", "path to HTML file to serve at root (optional)", &serve_html_path},
};
// Options taking an integer value.
options.int_options = {
{"", "--listen-port", "server listen port (default: 1234)", &listen_port},
};
// Boolean flags.
// NOTE(review): the `true` field is presumably the value stored when the flag is present; confirm against ArgOptions.
options.bool_options = {
{"-v", "--verbose", "print extra info", true, &verbose},
{"", "--color", "colors the logging tags according to level", true, &color},
};
// --help marks the run as a normal exit and returns -1.
// NOTE(review): -1 presumably tells the parser to stop; confirm against the ArgOptions parser.
auto on_help_arg = [&](int, const char**, int, bool& valid) {
normal_exit = true;
valid = true;
return -1;
};
options.manual_options = {
{"-h", "--help", "show this help message and exit", on_help_arg},
};
return options;
}
// Checks the server parameters: listen_ip must be non-empty, listen_port
// must be within [0, 65535], and serve_html_path, when given, must exist.
// Logs the first problem found and returns false; returns true otherwise.
bool SDSvrParams::validate() {
    if (listen_ip.empty()) {
        LOG_ERROR("error: the following arguments are required: listen_ip");
        return false;
    }

    const bool port_in_range = (0 <= listen_port) && (listen_port <= 65535);
    if (!port_in_range) {
        LOG_ERROR("error: listen_port should be in the range [0, 65535]");
        return false;
    }

    const bool html_path_missing = !serve_html_path.empty() && !fs::exists(serve_html_path);
    if (html_path_missing) {
        LOG_ERROR("error: serve_html_path file does not exist: %s", serve_html_path.c_str());
        return false;
    }

    return true;
}
// There is currently nothing to resolve for the server parameters, so this
// simply forwards to validate().
bool SDSvrParams::resolve_and_validate() {
    return validate();
}
// Renders the listen address, port and HTML path as a multi-line,
// human-readable block. String fields are quoted; the numeric port is not
// (previously the quotes were on the port instead of the IP).
std::string SDSvrParams::to_string() const {
    std::ostringstream oss;
    oss << "SDSvrParams {\n"
        << "  listen_ip: \"" << listen_ip << "\",\n"
        << "  listen_port: " << listen_port << ",\n"
        << "  serve_html_path: \"" << serve_html_path << "\",\n"
        << "}";
    return oss.str();
}
// Rebuilds the LoRA cache by recursively scanning lora_model_dir for regular
// files with a supported model extension. Entries are sorted by their path
// relative to that directory (with '/' separators) and swapped into the
// shared cache under lora_mutex. A missing or non-directory path produces an
// empty cache.
void refresh_lora_cache(ServerRuntime& rt) {
    std::vector<LoraEntry> scanned;
    fs::path lora_dir = rt.ctx_params->lora_model_dir;

    const bool dir_usable = fs::exists(lora_dir) && fs::is_directory(lora_dir);
    if (dir_usable) {
        for (auto& dir_entry :
             fs::recursive_directory_iterator(lora_dir, fs::directory_options::skip_permission_denied)) {
            if (!dir_entry.is_regular_file()) {
                continue;
            }
            const fs::path& file = dir_entry.path();
            if (!is_supported_model_ext(file)) {
                continue;
            }

            // Normalise Windows separators in the relative path to '/'.
            std::string relative = file.lexically_relative(lora_dir).u8string();
            std::replace(relative.begin(), relative.end(), '\\', '/');

            LoraEntry item;
            item.name     = file.stem().u8string();
            item.fullpath = file.u8string();
            item.path     = relative;
            scanned.push_back(std::move(item));
        }
    }

    std::sort(scanned.begin(), scanned.end(), [](const LoraEntry& lhs, const LoraEntry& rhs) {
        return lhs.path < rhs.path;
    });

    // Only the final swap happens under the lock; the scan runs unlocked.
    std::lock_guard<std::mutex> lock(*rt.lora_mutex);
    *rt.lora_cache = std::move(scanned);
}
// Resolves a LoRA's relative path (as stored in the cache) to its full path.
// Returns an empty string when no cached entry has that path.
std::string get_lora_full_path(ServerRuntime& rt, const std::string& path) {
    std::lock_guard<std::mutex> lock(*rt.lora_mutex);
    for (const LoraEntry& candidate : *rt.lora_cache) {
        if (candidate.path == path) {
            return candidate.fullpath;
        }
    }
    return "";
}
// Rebuilds the upscaler cache from the regular files directly inside
// hires_upscalers_dir (no recursion) that have a supported model extension.
// Entries are sorted by name and swapped into the shared cache under
// upscaler_mutex. A missing or non-directory path produces an empty cache.
void refresh_upscaler_cache(ServerRuntime& rt) {
    std::vector<UpscalerEntry> scanned;
    fs::path upscaler_dir = rt.ctx_params->hires_upscalers_dir;

    const bool dir_usable = fs::exists(upscaler_dir) && fs::is_directory(upscaler_dir);
    if (dir_usable) {
        for (auto& dir_entry : fs::directory_iterator(upscaler_dir)) {
            if (!dir_entry.is_regular_file()) {
                continue;
            }
            const fs::path& file = dir_entry.path();
            if (!is_supported_model_ext(file)) {
                continue;
            }

            UpscalerEntry item;
            item.name       = file.stem().u8string();
            item.fullpath   = fs::absolute(file).lexically_normal().u8string();
            item.model_name = "ESRGAN_4x";
            item.path       = file.filename().u8string();
            scanned.push_back(std::move(item));
        }
    }

    std::sort(scanned.begin(), scanned.end(), [](const UpscalerEntry& lhs, const UpscalerEntry& rhs) {
        return lhs.name < rhs.name;
    });

    // Only the final swap happens under the lock; the scan runs unlocked.
    std::lock_guard<std::mutex> lock(*rt.upscaler_mutex);
    *rt.upscaler_cache = std::move(scanned);
}
// Returns the current system-clock time as whole seconds since the clock's
// epoch.
int64_t unix_timestamp_now() {
    using std::chrono::duration_cast;
    using std::chrono::seconds;
    using std::chrono::system_clock;

    const auto since_epoch = system_clock::now().time_since_epoch();
    return duration_cast<seconds>(since_epoch).count();
}

100
examples/server/runtime.h Normal file
View File

@ -0,0 +1,100 @@
#pragma once
#include <algorithm>
#include <cstdint>
#include <mutex>
#include <string>
#include <vector>
#include <json.hpp>
#include "common/common.h"
#include "common/resource_owners.hpp"
#include "stable-diffusion.h"
using json = nlohmann::json;
struct ArgOptions;
struct SDContextParams;
struct AsyncJobManager;
// Command-line configurable parameters of the HTTP server.
struct SDSvrParams {
// Address the server listens on.
std::string listen_ip = "127.0.0.1";
// Port the server listens on; validate() accepts [0, 65535].
int listen_port = 1234;
// Optional HTML file to serve at the root; empty means none.
std::string serve_html_path;
// Set when --help was requested.
bool normal_exit = false;
// --verbose: print extra info.
bool verbose = false;
// --color: colour the logging tags according to level.
bool color = false;
// Builds the option table binding CLI flags to the members above.
ArgOptions get_options();
// Checks the members and logs the first problem; returns false on failure.
bool validate();
// Currently equivalent to validate().
bool resolve_and_validate();
// Human-readable dump of the listen address, port and HTML path.
std::string to_string() const;
};
// One LoRA file found by refresh_lora_cache().
struct LoraEntry {
// File name without extension.
std::string name;
// Path relative to the LoRA directory, with '/' separators; this is the key get_lora_full_path() matches on.
std::string path;
// Path of the file as produced by the directory scan.
std::string fullpath;
};
// One upscaler model file found by refresh_upscaler_cache().
struct UpscalerEntry {
// File name without extension.
std::string name;
// File name including extension.
std::string path;
// Absolute, lexically normalised path of the file.
std::string fullpath;
// Model label; refresh_upscaler_cache() always sets it to "ESRGAN_4x".
std::string model_name;
// NOTE(review): presumably the upscale factor; not set by the scan, so it keeps this default. Confirm against users.
int scale = 4;
};
// Bundle of non-owning pointers to the state shared by the server's request
// handlers. Every member defaults to nullptr so that a partially populated
// instance never carries indeterminate pointers; the objects pointed to must
// outlive every handler that uses this runtime.
struct ServerRuntime {
    // Loaded stable-diffusion context.
    sd_ctx_t* sd_ctx = nullptr;
    // NOTE(review): presumably guards use of sd_ctx; confirm against the generation code.
    std::mutex* sd_ctx_mutex = nullptr;
    // Parsed server parameters.
    const SDSvrParams* svr_params = nullptr;
    // Context parameters (model paths, LoRA and upscaler directories).
    const SDContextParams* ctx_params = nullptr;
    // Default generation parameters each request starts from.
    const SDGenerationParams* default_gen_params = nullptr;
    // LoRA listing, rebuilt by refresh_lora_cache(); guarded by lora_mutex.
    std::vector<LoraEntry>* lora_cache = nullptr;
    std::mutex* lora_mutex             = nullptr;
    // Upscaler listing, rebuilt by refresh_upscaler_cache(); guarded by upscaler_mutex.
    std::vector<UpscalerEntry>* upscaler_cache = nullptr;
    std::mutex* upscaler_mutex                 = nullptr;
    // Queue and registry of asynchronous generation jobs.
    AsyncJobManager* async_job_manager = nullptr;
};
// A validated image generation request: generation parameters plus the
// output encoding options.
struct ImgGenJobRequest {
SDGenerationParams gen_params;
// Normalised (lower-case) output format name.
std::string output_format = "png";
// Compression level; assign_output_options() clamps it to [0, 100].
int output_compression = 100;
// Converts the stored parameters to the library's image generation struct.
sd_img_gen_params_t to_sd_img_gen_params_t() {
return gen_params.to_sd_img_gen_params_t();
}
};
// A validated video generation request: generation parameters plus the
// output encoding options.
struct VidGenJobRequest {
SDGenerationParams gen_params;
// Normalised (lower-case) output format name.
std::string output_format = "webm";
// Compression level; assign_output_options() clamps it to [0, 100].
int output_compression = 100;
// Converts the stored parameters to the library's video generation struct.
sd_vid_gen_params_t to_sd_vid_gen_params_t() {
return gen_params.to_sd_vid_gen_params_t();
}
};
// Encodes bytes as '='-padded base64 text.
std::string base64_encode(const std::vector<uint8_t>& bytes);
// Lower-cases an output format name.
std::string normalize_output_format(std::string output_format);
// Image formats this build can produce ("png", "jpeg", optionally "webp").
std::vector<std::string> supported_img_output_formats(bool allow_webp = true);
// Video formats this build can produce; always includes "avi".
std::vector<std::string> supported_vid_output_formats();
// Stores the normalised format and clamped compression on an image request;
// returns false and fills error_message when the format is unsupported.
bool assign_output_options(ImgGenJobRequest& request,
std::string output_format,
int output_compression,
bool allow_webp,
std::string& error_message);
// Stores the normalised format and clamped compression on a video request;
// returns false and fills error_message when the format is unsupported.
bool assign_output_options(VidGenJobRequest& request,
std::string output_format,
int output_compression,
std::string& error_message);
// MIME type for a video output format name; unknown names map to AVI.
std::string video_mime_type(const std::string& output_format);
// Whether the loaded context can serve the given generation mode.
bool runtime_supports_generation_mode(const ServerRuntime& runtime, SDMode mode);
// Error text for a mode the loaded model does not support.
std::string unsupported_generation_mode_error(SDMode mode);
// Rescans the LoRA directory and replaces the shared LoRA cache.
void refresh_lora_cache(ServerRuntime& rt);
// Full path of the cached LoRA with the given relative path, or "" if none.
std::string get_lora_full_path(ServerRuntime& rt, const std::string& path);
// Rescans the upscaler directory and replaces the shared upscaler cache.
void refresh_upscaler_cache(ServerRuntime& rt);
// Current system-clock time in whole seconds since the epoch.
int64_t unix_timestamp_now();

View File

@ -1,8 +1,17 @@
for f in src/*.cpp src/*.h src/*.hpp src/vocab/*.h src/vocab/*.cpp examples/cli/*.cpp examples/common/*.hpp examples/cli/*.h examples/server/*.cpp; do for f in src/*.cpp src/*.h src/*.hpp \
src/conditioning/*.cpp src/conditioning/*.h src/conditioning/*.hpp \
src/core/*.cpp src/core/*.h src/core/*.hpp \
src/extensions/*.cpp src/extensions/*.h src/extensions/*.hpp \
src/runtime/*.cpp src/runtime/*.h src/runtime/*.hpp \
src/model/*/*.cpp src/model/*/*.h src/model/*/*.hpp \
src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \
src/model_io/*.h src/model_io/*.cpp examples/cli/*.cpp examples/cli/*.h examples/server/*.cpp \
examples/common/*.hpp examples/common/*.h examples/common/*.cpp; do
[[ -e "$f" ]] || continue
[[ "$f" == vocab* ]] && continue [[ "$f" == vocab* ]] && continue
echo "formatting '$f'" echo "formatting '$f'"
# if [ "$f" != "stable-diffusion.h" ]; then # if [ "$f" != "stable-diffusion.h" ]; then
# clang-tidy -fix -p build_linux/ "$f" # clang-tidy -fix -p build_linux/ "$f"
# fi # fi
clang-format -style=file -i "$f" clang-format -style=file -i "$f"
done done

2
ggml

@ -1 +1 @@
Subproject commit 404fcb9d7c96989569e68c9e7881ee3465a05c50 Subproject commit 0ce7ad348a3151e1da9f65d962044546bcaad421

View File

@ -50,6 +50,10 @@ enum sample_method_t {
TCD_SAMPLE_METHOD, TCD_SAMPLE_METHOD,
RES_MULTISTEP_SAMPLE_METHOD, RES_MULTISTEP_SAMPLE_METHOD,
RES_2S_SAMPLE_METHOD, RES_2S_SAMPLE_METHOD,
ER_SDE_SAMPLE_METHOD,
EULER_CFG_PP_SAMPLE_METHOD,
EULER_A_CFG_PP_SAMPLE_METHOD,
EULER_GE_SAMPLE_METHOD,
SAMPLE_METHOD_COUNT SAMPLE_METHOD_COUNT
}; };
@ -65,6 +69,7 @@ enum scheduler_t {
KL_OPTIMAL_SCHEDULER, KL_OPTIMAL_SCHEDULER,
LCM_SCHEDULER, LCM_SCHEDULER,
BONG_TANGENT_SCHEDULER, BONG_TANGENT_SCHEDULER,
LTX2_SCHEDULER,
SCHEDULER_COUNT SCHEDULER_COUNT
}; };
@ -121,7 +126,8 @@ enum sd_type_t {
// SD_TYPE_IQ4_NL_8_8 = 38, // SD_TYPE_IQ4_NL_8_8 = 38,
SD_TYPE_MXFP4 = 39, // MXFP4 (1 block) SD_TYPE_MXFP4 = 39, // MXFP4 (1 block)
SD_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale) SD_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
SD_TYPE_COUNT = 41, SD_TYPE_Q1_0 = 41,
SD_TYPE_COUNT = 42,
}; };
enum sd_log_level_t { enum sd_log_level_t {
@ -148,11 +154,13 @@ enum lora_apply_mode_t {
typedef struct { typedef struct {
bool enabled; bool enabled;
bool temporal_tiling;
int tile_size_x; int tile_size_x;
int tile_size_y; int tile_size_y;
float target_overlap; float target_overlap;
float rel_size_x; float rel_size_x;
float rel_size_y; float rel_size_y;
const char* extra_tiling_args;
} sd_tiling_params_t; } sd_tiling_params_t;
typedef struct { typedef struct {
@ -160,6 +168,14 @@ typedef struct {
const char* path; const char* path;
} sd_embedding_t; } sd_embedding_t;
enum sd_vae_format_t {
SD_VAE_FORMAT_AUTO = -1,
SD_VAE_FORMAT_FLUX,
SD_VAE_FORMAT_SD3,
SD_VAE_FORMAT_FLUX2,
SD_VAE_FORMAT_COUNT,
};
typedef struct { typedef struct {
const char* model_path; const char* model_path;
const char* clip_l_path; const char* clip_l_path;
@ -170,7 +186,10 @@ typedef struct {
const char* llm_vision_path; const char* llm_vision_path;
const char* diffusion_model_path; const char* diffusion_model_path;
const char* high_noise_diffusion_model_path; const char* high_noise_diffusion_model_path;
const char* uncond_diffusion_model_path;
const char* embeddings_connectors_path;
const char* vae_path; const char* vae_path;
const char* audio_vae_path;
const char* taesd_path; const char* taesd_path;
const char* control_net_path; const char* control_net_path;
const sd_embedding_t* embeddings; const sd_embedding_t* embeddings;
@ -202,8 +221,20 @@ typedef struct {
bool chroma_use_t5_mask; bool chroma_use_t5_mask;
int chroma_t5_mask_pad; int chroma_t5_mask_pad;
bool qwen_image_zero_cond_t; bool qwen_image_zero_cond_t;
enum sd_vae_format_t vae_format;
float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
const char* backend;
const char* params_backend;
} sd_ctx_params_t; } sd_ctx_params_t;
typedef struct {
uint32_t sample_rate;
uint32_t channels;
uint64_t sample_count;
float* data;
} sd_audio_t;
typedef struct { typedef struct {
uint32_t width; uint32_t width;
uint32_t height; uint32_t height;
@ -236,6 +267,7 @@ typedef struct {
float* custom_sigmas; float* custom_sigmas;
int custom_sigmas_count; int custom_sigmas_count;
float flow_shift; float flow_shift;
const char* extra_sample_args;
} sd_sample_params_t; } sd_sample_params_t;
typedef struct { typedef struct {
@ -288,6 +320,34 @@ typedef struct {
const char* path; const char* path;
} sd_lora_t; } sd_lora_t;
enum sd_hires_upscaler_t {
SD_HIRES_UPSCALER_NONE,
SD_HIRES_UPSCALER_LATENT,
SD_HIRES_UPSCALER_LATENT_NEAREST,
SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT,
SD_HIRES_UPSCALER_LATENT_ANTIALIASED,
SD_HIRES_UPSCALER_LATENT_BICUBIC,
SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED,
SD_HIRES_UPSCALER_LANCZOS,
SD_HIRES_UPSCALER_NEAREST,
SD_HIRES_UPSCALER_MODEL,
SD_HIRES_UPSCALER_COUNT,
};
typedef struct {
bool enabled;
enum sd_hires_upscaler_t upscaler;
const char* model_path;
float scale;
int target_width;
int target_height;
int steps;
float denoising_strength;
int upscale_tile_size;
float* custom_sigmas;
int custom_sigmas_count;
} sd_hires_params_t;
typedef struct { typedef struct {
const sd_lora_t* loras; const sd_lora_t* loras;
uint32_t lora_count; uint32_t lora_count;
@ -311,6 +371,7 @@ typedef struct {
sd_pm_params_t pm_params; sd_pm_params_t pm_params;
sd_tiling_params_t vae_tiling_params; sd_tiling_params_t vae_tiling_params;
sd_cache_params_t cache; sd_cache_params_t cache;
sd_hires_params_t hires;
} sd_img_gen_params_t; } sd_img_gen_params_t;
typedef struct { typedef struct {
@ -331,9 +392,11 @@ typedef struct {
float strength; float strength;
int64_t seed; int64_t seed;
int video_frames; int video_frames;
int fps;
float vace_strength; float vace_strength;
sd_tiling_params_t vae_tiling_params; sd_tiling_params_t vae_tiling_params;
sd_cache_params_t cache; sd_cache_params_t cache;
sd_hires_params_t hires;
} sd_vid_gen_params_t; } sd_vid_gen_params_t;
typedef struct sd_ctx_t sd_ctx_t; typedef struct sd_ctx_t sd_ctx_t;
@ -347,6 +410,8 @@ SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy, void* data); SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy, void* data);
SD_API int32_t sd_get_num_physical_cores(); SD_API int32_t sd_get_num_physical_cores();
SD_API const char* sd_get_system_info(); SD_API const char* sd_get_system_info();
SD_API bool sd_ctx_supports_image_generation(const sd_ctx_t* sd_ctx);
SD_API bool sd_ctx_supports_video_generation(const sd_ctx_t* sd_ctx);
SD_API const char* sd_type_name(enum sd_type_t type); SD_API const char* sd_type_name(enum sd_type_t type);
SD_API enum sd_type_t str_to_sd_type(const char* str); SD_API enum sd_type_t str_to_sd_type(const char* str);
@ -362,14 +427,18 @@ SD_API const char* sd_preview_name(enum preview_t preview);
SD_API enum preview_t str_to_preview(const char* str); SD_API enum preview_t str_to_preview(const char* str);
SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode); SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode);
SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str); SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str);
SD_API const char* sd_hires_upscaler_name(enum sd_hires_upscaler_t upscaler);
SD_API enum sd_hires_upscaler_t str_to_sd_hires_upscaler(const char* str);
SD_API void sd_cache_params_init(sd_cache_params_t* cache_params); SD_API void sd_cache_params_init(sd_cache_params_t* cache_params);
SD_API void sd_hires_params_init(sd_hires_params_t* hires_params);
SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params); SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params); SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params); SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
SD_API void free_sd_audio(sd_audio_t* audio);
SD_API void sd_sample_params_init(sd_sample_params_t* sample_params); SD_API void sd_sample_params_init(sd_sample_params_t* sample_params);
SD_API char* sd_sample_params_to_str(const sd_sample_params_t* sample_params); SD_API char* sd_sample_params_to_str(const sd_sample_params_t* sample_params);
@ -382,7 +451,11 @@ SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_para
SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params); SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params); SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out); SD_API bool generate_video(sd_ctx_t* sd_ctx,
const sd_vid_gen_params_t* sd_vid_gen_params,
sd_image_t** frames_out,
int* num_frames_out,
sd_audio_t** audio_out);
typedef struct upscaler_ctx_t upscaler_ctx_t; typedef struct upscaler_ctx_t upscaler_ctx_t;
@ -390,7 +463,9 @@ SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
bool offload_params_to_cpu, bool offload_params_to_cpu,
bool direct, bool direct,
int n_threads, int n_threads,
int tile_size); int tile_size,
const char* backend,
const char* params_backend);
SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,

View File

@ -0,0 +1,283 @@
#!/usr/bin/env python
import argparse
import json
import math
import os
import struct
from collections import Counter
from pathlib import Path
import torch
from safetensors import safe_open
FLOAT_DTYPES = {
"BF16",
"F16",
"F32",
"F64",
"F8_E4M3",
"F8_E4M3FN",
"F8_E5M2",
}
FP8_DTYPES = {
"F8_E4M3",
"F8_E4M3FN",
"F8_E5M2",
}
DTYPE_SIZES = {
"BOOL": 1,
"U8": 1,
"I8": 1,
"F8_E4M3": 1,
"F8_E4M3FN": 1,
"F8_E5M2": 1,
"U16": 2,
"I16": 2,
"F16": 2,
"BF16": 2,
"U32": 4,
"I32": 4,
"F32": 4,
"U64": 8,
"I64": 8,
"F64": 8,
}
def read_safetensors_header(path: Path):
    """Read and parse the JSON header at the start of a safetensors file.

    The first 8 bytes are read as a little-endian unsigned 64-bit length; that
    many bytes are then decoded as UTF-8, right-stripped of whitespace and
    parsed as JSON.

    Returns:
        The parsed header object.
    """
    with path.open("rb") as stream:
        (json_length,) = struct.unpack("<Q", stream.read(8))
        raw_json = stream.read(json_length)
    return json.loads(raw_json.decode("utf-8").rstrip())
def numel(shape):
    """Return the number of elements described by ``shape``.

    A falsy shape (e.g. an empty list, used for scalars) counts as one element.
    """
    if not shape:
        return 1
    return math.prod(shape)
def scale_key_for_weight(name: str):
    """Return the name of the scale tensor paired with weight tensor ``name``.

    Any name ending in "weight" (including ".weight") maps to the same name
    with "_scale" appended; every other name maps to ``None``.
    """
    # A ".weight" suffix is also a "weight" suffix, and stripping ".weight" then
    # appending ".weight_scale" equals appending "_scale", so one check suffices.
    if not name.endswith("weight"):
        return None
    return f"{name}_scale"
def tensor_nbytes(dtype: str, shape):
    """Return the byte size of a tensor with the given safetensors dtype and shape.

    Raises:
        KeyError: if ``dtype`` is not listed in ``DTYPE_SIZES``.
    """
    element_count = numel(shape)
    return element_count * DTYPE_SIZES[dtype]
def build_output_plan(header):
    """Decide how every tensor in ``header`` is written to the output file.

    Each fp8 tensor whose derived scale key exists in the header is planned as
    "fp8_scaled_weight" (output BF16) and its scale tensor is left out of the
    plan. Other float tensors are planned as "float_to_bf16"; everything else
    is planned as "copy" with its dtype unchanged.

    Returns:
        (plan, output_header, data_size): the ordered list of plan items, the
        output safetensors header (with contiguous ``data_offsets``), and the
        total size in bytes of the tensor data section.
    """
    tensors = {key: value for key, value in header.items() if key != "__metadata__"}

    def paired_scale_key(tensor_name, tensor_info):
        # Scale key consumed by this tensor, or None when it is not a scaled fp8 weight.
        candidate = scale_key_for_weight(tensor_name)
        if tensor_info["dtype"] in FP8_DTYPES and candidate in tensors:
            return candidate
        return None

    consumed_scale_keys = set()
    for tensor_name, tensor_info in tensors.items():
        candidate = paired_scale_key(tensor_name, tensor_info)
        if candidate is not None:
            consumed_scale_keys.add(candidate)

    plan = []
    for tensor_name, tensor_info in tensors.items():
        if tensor_name in consumed_scale_keys:
            continue

        source_dtype = tensor_info["dtype"]
        item = {
            "name": tensor_name,
            "source_dtype": source_dtype,
            "output_dtype": "BF16",
            "shape": tensor_info["shape"],
        }
        candidate = paired_scale_key(tensor_name, tensor_info)
        if candidate is not None:
            item["mode"] = "fp8_scaled_weight"
            item["scale_key"] = candidate
        elif source_dtype in FLOAT_DTYPES:
            item["mode"] = "float_to_bf16"
        else:
            item["output_dtype"] = source_dtype
            item["mode"] = "copy"
        plan.append(item)

    metadata = dict(header.get("__metadata__", {}) or {})
    metadata.update({"format": "pt", "conversion": "fp8_weight_scale_to_bf16"})

    output_header = {"__metadata__": metadata}
    cursor = 0
    for item in plan:
        next_cursor = cursor + tensor_nbytes(item["output_dtype"], item["shape"])
        output_header[item["name"]] = {
            "dtype": item["output_dtype"],
            "shape": item["shape"],
            "data_offsets": [cursor, next_cursor],
        }
        cursor = next_cursor

    return plan, output_header, cursor
def write_tensor_bytes(out, tensor):
    """Write the raw bytes of ``tensor`` to the open binary file ``out``.

    The tensor is detached, moved to CPU and made contiguous first. Tensors
    with zero elements write nothing. bfloat16 and fp8 tensors are
    reinterpreted as unsigned integers of the same width before the NumPy
    conversion.
    """
    host = tensor.detach().cpu().contiguous()
    if host.numel() == 0:
        return

    # getattr keeps this working on torch builds that lack the fp8 dtypes.
    fp8_dtypes = tuple(getattr(torch, attr, None) for attr in ("float8_e4m3fn", "float8_e5m2"))
    if host.dtype == torch.bfloat16:
        raw = host.view(torch.uint16)
    elif host.dtype in fp8_dtypes:
        raw = host.view(torch.uint8)
    else:
        raw = host
    raw.numpy().tofile(out)
def scale_view_for_chunk(scale, chunk, first_dim_start=0, first_dim_end=None):
    """Return ``scale`` as float32, shaped to multiply against ``chunk``.

    * A single-element scale is reshaped to all-ones dimensions matching the
      rank of ``chunk``.
    * A 1-D scale is sliced to ``[first_dim_start:first_dim_end]`` when
      ``first_dim_end`` is given and the scale is long enough; if its length
      then equals ``chunk.shape[0]`` it is reshaped to ``(rows, 1, ..., 1)``.
    * Anything else is returned as float32 without reshaping.
    """
    scale_f32 = scale.to(torch.float32)
    if scale_f32.numel() == 1:
        return scale_f32.reshape((1,) * chunk.ndim)

    if chunk.ndim == 0 or scale_f32.ndim != 1:
        return scale_f32

    covers_rows = first_dim_end is not None and scale_f32.shape[0] >= first_dim_end
    if covers_rows:
        scale_f32 = scale_f32[first_dim_start:first_dim_end]

    row_count = scale_f32.shape[0]
    if row_count != chunk.shape[0]:
        return scale_f32
    trailing_ones = (1,) * (chunk.ndim - 1)
    return scale_f32.reshape((row_count,) + trailing_ones)
def write_scaled_fp8_weight(out, weight, scale, chunk_rows):
    """Dequantize ``weight`` with ``scale`` and write it to ``out`` as bfloat16.

    The multiplication is done in float32. Tensors with at least one dimension
    are processed ``chunk_rows`` first-dimension rows at a time; 0-d tensors are
    handled in one step.
    """

    def emit(block, *row_span):
        # Scale factor shaped for this block; row_span is (start, end) or empty.
        factor = scale_view_for_chunk(scale, block, *row_span)
        dequantized = block.to(torch.float32) * factor
        write_tensor_bytes(out, dequantized.to(torch.bfloat16))

    if weight.ndim == 0:
        emit(weight)
        return

    total_rows = weight.shape[0]
    for row_start in range(0, total_rows, chunk_rows):
        row_end = min(row_start + chunk_rows, total_rows)
        emit(weight[row_start:row_end], row_start, row_end)
def write_float_as_bf16(out, tensor, chunk_rows):
    """Write ``tensor`` to ``out`` converted to bfloat16.

    Tensors that are already bfloat16, and 0-d tensors, are written in a single
    call. Everything else is converted and written ``chunk_rows``
    first-dimension rows at a time.
    """
    if tensor.dtype == torch.bfloat16 or tensor.ndim == 0:
        # .to() is a no-op for tensors that are already bfloat16.
        write_tensor_bytes(out, tensor.to(torch.bfloat16))
        return

    total_rows = tensor.shape[0]
    row_start = 0
    while row_start < total_rows:
        row_end = min(row_start + chunk_rows, total_rows)
        write_tensor_bytes(out, tensor[row_start:row_end].to(torch.bfloat16))
        row_start = row_end
def convert(input_path: Path, output_path: Path, chunk_rows: int, dry_run: bool):
    """Convert a safetensors checkpoint to bf16, folding fp8 weight scales in.

    Prints a summary of the planned conversion. Unless ``dry_run`` is set, the
    result is written to ``<output_path>.tmp`` and renamed to ``output_path``
    only after the written size matches the expected size.

    Args:
        input_path: source safetensors file.
        output_path: destination file; must not exist yet.
        chunk_rows: number of first-dimension rows converted per write.
        dry_run: when true, only print the summary and return.

    Raises:
        FileExistsError: if ``output_path`` or its ``.tmp`` sibling already exists.
        RuntimeError: if the number of bytes written differs from the plan.
    """
    header = read_safetensors_header(input_path)
    plan, output_header, data_size = build_output_plan(header)

    source_counts = Counter(item["source_dtype"] for item in plan)
    output_counts = Counter(item["output_dtype"] for item in plan)
    scaled_count = sum(item["mode"] == "fp8_scaled_weight" for item in plan)
    # Each dequantized fp8 weight consumes exactly one weight_scale tensor.
    dropped_scales = scaled_count

    header_bytes = json.dumps(output_header, separators=(",", ":")).encode("utf-8")
    expected_size = 8 + len(header_bytes) + data_size

    print(f"input: {input_path}")
    print(f"output: {output_path}")
    print(f"tensors written: {len(plan)}")
    print(f"scaled fp8 weights dequantized: {scaled_count}")
    print(f"weight_scale tensors dropped: {dropped_scales}")
    print(f"source dtypes: {dict(sorted(source_counts.items()))}")
    print(f"output dtypes: {dict(sorted(output_counts.items()))}")
    print(f"expected output size: {expected_size / (1024 ** 3):.2f} GiB")

    if dry_run:
        return

    if output_path.exists():
        raise FileExistsError(f"{output_path} already exists; pass --overwrite to replace it")

    tmp_path = output_path.with_suffix(output_path.suffix + ".tmp")
    if tmp_path.exists():
        raise FileExistsError(f"{tmp_path} already exists; remove it or choose another output")

    try:
        with safe_open(str(input_path), framework="pt", device="cpu") as sf, tmp_path.open("wb") as out:
            out.write(struct.pack("<Q", len(header_bytes)))
            out.write(header_bytes)

            for index, item in enumerate(plan, 1):
                name = item["name"]
                print(f"[{index:04d}/{len(plan):04d}] {name} -> {item['output_dtype']}")
                tensor = sf.get_tensor(name)
                if item["mode"] == "fp8_scaled_weight":
                    scale = sf.get_tensor(item["scale_key"])
                    write_scaled_fp8_weight(out, tensor, scale, chunk_rows)
                elif item["mode"] == "float_to_bf16":
                    write_float_as_bf16(out, tensor, chunk_rows)
                else:
                    write_tensor_bytes(out, tensor)

            actual_size = out.tell()

        if actual_size != expected_size:
            raise RuntimeError(f"wrote {actual_size} bytes, expected {expected_size} bytes")

        tmp_path.replace(output_path)
    except BaseException:
        # Do not leave a partial temp file behind: the next run would otherwise
        # refuse to start because the .tmp path already exists.
        tmp_path.unlink(missing_ok=True)
        raise

    print("done")
def main():
    """Command-line entry point: parse arguments, validate them and run ``convert``.

    ``--overwrite`` removes an existing output file before converting. The file
    is left untouched when ``--dry-run`` is also given, because a dry run
    writes nothing to replace it.

    Raises:
        ValueError: if ``--chunk-rows`` is smaller than 1.
        FileNotFoundError: if the input file does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Convert an fp8 safetensors checkpoint with weight_scale tensors to bf16."
    )
    parser.add_argument("--input", default="ideogram4_fp8.safetensors", type=Path)
    parser.add_argument("--output", default="ideogram4_bf16.safetensors", type=Path)
    parser.add_argument("--chunk-rows", default=1024, type=int)
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args()

    input_path = args.input.resolve()
    output_path = args.output.resolve()

    if args.chunk_rows < 1:
        raise ValueError("--chunk-rows must be >= 1")
    if not input_path.exists():
        raise FileNotFoundError(input_path)
    # Only delete the old output when a new one is actually going to be written.
    if args.overwrite and not args.dry_run and output_path.exists():
        output_path.unlink()

    convert(input_path, output_path, args.chunk_rows, args.dry_run)
if __name__ == "__main__":
main()

File diff suppressed because it is too large Load Diff

138
src/convert.cpp Normal file
View File

@ -0,0 +1,138 @@
#include <cstring>
#include <mutex>
#include <regex>
#include <vector>
#include "model_io/gguf_io.h"
#include "model_io/safetensors_io.h"
#include "model_loader.h"
#include "util.h"
#include "ggml_extend_backend.h"
// Picks the ggml type a tensor is exported as.
// The requested type is `type`, unless one of `tensor_type_rules` has a regex
// (rule.first) that matches the tensor name; the first matching rule wins and
// supplies rule.second. The requested type is only used when the loader agrees
// the tensor should be converted to it; otherwise the stored type is kept.
static ggml_type get_export_tensor_type(ModelLoader& model_loader,
                                        const TensorStorage& tensor_storage,
                                        ggml_type type,
                                        const TensorTypeRules& tensor_type_rules) {
    ggml_type requested_type = type;
    for (const auto& rule : tensor_type_rules) {
        const std::regex rule_pattern(rule.first);
        if (!std::regex_search(tensor_storage.name, rule_pattern)) {
            continue;
        }
        requested_type = rule.second;
        break;
    }

    const bool convert_tensor = model_loader.tensor_should_be_converted(tensor_storage, requested_type);
    return convert_tensor ? requested_type : tensor_storage.type;
}
// Loads every tensor from `model_loader` into `ggml_ctx` and appends one
// TensorWriteInfo per tensor to `tensors`.
// Each tensor is allocated with the export type chosen by
// get_export_tensor_type(). Allocation and the append to `tensors` happen under
// a mutex. Returns the result of ModelLoader::load_tensors().
static bool load_tensors_for_export(ModelLoader& model_loader,
                                    ggml_context* ggml_ctx,
                                    ggml_type type,
                                    const TensorTypeRules& tensor_type_rules,
                                    std::vector<TensorWriteInfo>& tensors) {
    std::mutex export_mutex;

    auto create_export_tensor = [&](const TensorStorage& storage, ggml_tensor** dst_tensor) -> bool {
        const std::string& tensor_name = storage.name;
        // Resolved before taking the lock, as it does not touch ggml_ctx or `tensors`.
        const ggml_type export_type = get_export_tensor_type(model_loader, storage, type, tensor_type_rules);

        std::lock_guard<std::mutex> guard(export_mutex);

        ggml_tensor* created = ggml_new_tensor(ggml_ctx, export_type, storage.n_dims, storage.ne);
        if (created == nullptr) {
            LOG_ERROR("ggml_new_tensor failed");
            return false;
        }
        ggml_set_name(created, tensor_name.c_str());

        if (created->data == nullptr) {
            GGML_ASSERT(ggml_nelements(created) == 0);
            // Avoid crashing writers by setting a dummy pointer for zero-sized tensors.
            LOG_DEBUG("setting dummy pointer for zero-sized tensor %s", tensor_name.c_str());
            created->data = ggml_get_mem_buffer(ggml_ctx);
        }

        TensorWriteInfo info;
        info.tensor = created;
        info.n_dims = storage.n_dims;
        for (int dim = 0; dim < storage.n_dims; ++dim) {
            info.ne[dim] = storage.ne[dim];
        }

        *dst_tensor = created;
        tensors.push_back(std::move(info));
        return true;
    };

    const bool loaded = model_loader.load_tensors(create_export_tensor);
    LOG_INFO("load tensors done");
    return loaded;
}
bool convert(const char* input_path,
const char* vae_path,
const char* output_path,
sd_type_t output_type,
const char* tensor_type_rules,
bool convert_name) {
ModelLoader model_loader;
if (!model_loader.init_from_file(input_path)) {
LOG_ERROR("init model loader from file failed: '%s'", input_path);
return false;
}
if (vae_path != nullptr && strlen(vae_path) > 0) {
if (!model_loader.init_from_file(vae_path, "vae.")) {
LOG_ERROR("init model loader from file failed: '%s'", vae_path);
return false;
}
}
if (convert_name) {
model_loader.convert_tensors_name();
}
ggml_type type = (ggml_type)output_type;
bool output_is_safetensors = ends_with(output_path, ".safetensors");
TensorTypeRules type_rules = parse_tensor_type_rules(tensor_type_rules);
auto backend = sd_backend_cpu_init();
size_t mem_size = 1 * 1024 * 1024; // for padding
mem_size += model_loader.get_tensor_storage_map().size() * ggml_tensor_overhead();
mem_size += model_loader.get_params_mem_size(backend, type);
LOG_INFO("model tensors mem size: %.2fMB", mem_size / 1024.f / 1024.f);
ggml_context* ggml_ctx = ggml_init({mem_size, nullptr, false});
if (ggml_ctx == nullptr) {
LOG_ERROR("ggml_init failed for converter");
ggml_backend_free(backend);
return false;
}
std::vector<TensorWriteInfo> tensors;
bool success = load_tensors_for_export(model_loader, ggml_ctx, type, type_rules, tensors);
ggml_backend_free(backend);
std::string error;
if (success) {
if (output_is_safetensors) {
success = write_safetensors_file(output_path, tensors, &error);
} else {
success = write_gguf_file(output_path, tensors, &error);
}
}
if (!success && !error.empty()) {
LOG_ERROR("%s", error.c_str());
}
ggml_free(ggml_ctx);
return success;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,656 @@
#include "core/ggml_extend_backend.h"
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <vector>
#include "core/util.h"
#include "stable-diffusion.h"
// Returns a copy of `value` without leading and trailing whitespace
// (as classified by std::isspace).
static std::string trim_copy(const std::string& value) {
    const auto is_space = [](unsigned char ch) { return std::isspace(ch) != 0; };

    const auto first = std::find_if_not(value.begin(), value.end(), is_space);
    // Scan backwards, but never past `first`, so an all-whitespace input yields "".
    const auto last = std::find_if_not(value.rbegin(),
                                       std::string::const_reverse_iterator(first),
                                       is_space)
                          .base();
    return std::string(first, last);
}
// Returns `value` with every character passed through std::tolower.
static std::string lower_copy(std::string value) {
    for (char& ch : value) {
        // tolower requires a value representable as unsigned char.
        ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
    }
    return value;
}
// Splits `value` at every `delimiter`.
// Empty fields between delimiters are kept, but an empty field after a trailing
// delimiter is not produced, and an empty input yields an empty vector.
static std::vector<std::string> split_copy(const std::string& value, char delimiter) {
    std::vector<std::string> pieces;
    size_t cursor = 0;
    while (cursor < value.size()) {
        const size_t hit = value.find(delimiter, cursor);
        if (hit == std::string::npos) {
            pieces.emplace_back(value, cursor);
            break;
        }
        pieces.emplace_back(value, cursor, hit - cursor);
        cursor = hit + 1;
    }
    return pieces;
}
// True when `name`, after trimming and lower-casing, is empty, "default" or "auto".
static bool is_default_backend_token(const std::string& name) {
    const std::string token = lower_copy(trim_copy(name));
    if (token.empty()) {
        return true;
    }
    return token == "default" || token == "auto";
}
static bool parse_backend_module(const std::string& raw_name, SDBackendModule* module) {
std::string name = lower_copy(trim_copy(raw_name));
name.erase(std::remove(name.begin(), name.end(), '-'), name.end());
name.erase(std::remove(name.begin(), name.end(), '_'), name.end());
if (name == "diffusion" || name == "model" || name == "unet" || name == "dit") {
*module = SDBackendModule::DIFFUSION;
return true;
}
if (name == "te" || name == "clip" || name == "text" || name == "textencoder" || name == "textencoders" || name == "conditioner" || name == "cond" || name == "llm" || name == "t5" || name == "t5xxl") {
*module = SDBackendModule::TE;
return true;
}
if (name == "clipvision" || name == "vision") {
*module = SDBackendModule::CLIP_VISION;
return true;
}
if (name == "vae" || name == "firststage" || name == "autoencoder" || name == "tae") {
*module = SDBackendModule::VAE;
return true;
}
if (name == "controlnet" || name == "control") {
*module = SDBackendModule::CONTROL_NET;
return true;
}
if (name == "photomaker" || name == "photomakerid" || name == "pmid" || name == "photo") {
*module = SDBackendModule::PHOTOMAKER;
return true;
}
if (name == "upscaler" || name == "esrgan" || name == "hires") {
*module = SDBackendModule::UPSCALER;
return true;
}
return false;
}
// Returns the backend name assigned to `module`, or the assignment's default
// name when the module has no entry of its own.
static std::string module_assignment_name(const SDBackendAssignment& assignment, SDBackendModule module) {
    const auto entry = assignment.module_names.find(module);
    const bool has_override = entry != assignment.module_names.end();
    return has_override ? entry->second : assignment.default_name;
}
// Builds a lower-case key for `backend`: the device name when the backend has
// a device, otherwise the backend name. Returns "" for a null backend or when
// no name is available.
static std::string backend_cache_key(ggml_backend_t backend) {
    if (backend == nullptr) {
        return "";
    }

    if (ggml_backend_dev_t device = ggml_backend_get_device(backend)) {
        return lower_copy(ggml_backend_dev_name(device));
    }

    const char* fallback_name = ggml_backend_name(backend);
    if (fallback_name == nullptr) {
        return "";
    }
    return lower_copy(fallback_name);
}
static std::string resolve_first_device_by_type(enum ggml_backend_dev_type type) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
if (dev == nullptr) {
return "";
}
return ggml_backend_dev_name(dev);
}
// Returns the buffer backing `tensor`: the buffer of its view source when the
// tensor is a view, otherwise its own buffer. Returns nullptr for a null tensor.
static ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) {
    if (tensor == nullptr) {
        return nullptr;
    }
    if (tensor->view_src) {
        return tensor->view_src->buffer;
    }
    return tensor->buffer;
}
static bool ggml_backend_tensor_is_host_accessible(const struct ggml_tensor* tensor) {
if (tensor == nullptr || tensor->data == nullptr) {
return false;
}
ggml_backend_buffer_t buffer = ggml_backend_tensor_buffer(tensor);
return buffer == nullptr || ggml_backend_buffer_is_host(buffer);
}
// Byte offset of element (i0, i1, i2, i3) computed from the tensor's strides nb[0..3].
static size_t ggml_backend_tensor_offset(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
    const auto along_dim0 = i0 * tensor->nb[0];
    const auto along_dim1 = i1 * tensor->nb[1];
    const auto along_dim2 = i2 * tensor->nb[2];
    const auto along_dim3 = i3 * tensor->nb[3];
    return static_cast<size_t>(along_dim0 + along_dim1 + along_dim2 + along_dim3);
}
// Stores one value of type T at element (i0, i1, i2, i3) of `tensor`.
// Host-accessible tensors are written through the data pointer; all others go
// through ggml_backend_tensor_set().
template <typename T>
static void ggml_backend_tensor_write_scalar(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, T value) {
    const size_t byte_offset = ggml_backend_tensor_offset(tensor, i0, i1, i2, i3);

    if (!ggml_backend_tensor_is_host_accessible(tensor)) {
        ggml_backend_tensor_set(const_cast<struct ggml_tensor*>(tensor), &value, byte_offset, sizeof(T));
        return;
    }

    char* element_bytes                   = reinterpret_cast<char*>(tensor->data) + byte_offset;
    *reinterpret_cast<T*>(element_bytes) = value;
}
// Writes `value` at element (i0, i1, i2, i3), converted to the tensor's element
// type. Supports I8, I16, I32, F16, BF16 and F32; any other type aborts.
static void ggml_set_f32_nd(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float value) {
    // Stores an already-converted scalar at the requested element.
    const auto store = [&](auto converted) {
        ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, converted);
    };

    switch (tensor->type) {
        case GGML_TYPE_I8:
            store(static_cast<int8_t>(value));
            return;
        case GGML_TYPE_I16:
            store(static_cast<int16_t>(value));
            return;
        case GGML_TYPE_I32:
            store(static_cast<int32_t>(value));
            return;
        case GGML_TYPE_F16:
            store(ggml_fp32_to_fp16(value));
            return;
        case GGML_TYPE_BF16:
            store(ggml_fp32_to_bf16(value));
            return;
        case GGML_TYPE_F32:
            store(value);
            return;
        default:
            GGML_ABORT("fatal error");
    }
}
// Writes `value` at flat element index `i`, converted to the tensor's element type.
// Non-contiguous tensors have the flat index unravelled into 4-D coordinates
// first; contiguous tensors are addressed as (i, 0, 0, 0).
void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value) {
    int64_t coords[4] = {i, 0, 0, 0};
    if (!ggml_is_contiguous(tensor)) {
        coords[0] = 0;
        ggml_unravel_index(tensor, i, &coords[0], &coords[1], &coords[2], &coords[3]);
    }
    // The per-type conversion and the abort on unsupported types live in the nd setter.
    ggml_set_f32_nd(tensor, coords[0], coords[1], coords[2], coords[3], value);
}
// Ensures backend modules have been loaded, calling ggml_backend_load_all()
// at most once through a std::once_flag.
static void ggml_backend_load_all_once() {
    // If the registry already has devices and the CPU backend is present,
    // assume either static registration or explicit host-side preloading has
    // completed and avoid rescanning the default paths.
    const auto registry_ready = []() {
        return ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr;
    };
    if (registry_ready()) {
        return;
    }

    // In dynamic-backend mode the backend modules are discovered at runtime,
    // so we must load them before asking for the CPU backend or its proc table.
    // If the host preloaded only a subset of backends, allow one default-path
    // scan so missing modules can still be discovered.
    static std::once_flag load_flag;
    std::call_once(load_flag, [registry_ready]() {
        // Re-checked here: the registry may have been filled since the test above.
        if (!registry_ready()) {
            ggml_backend_load_all();
        }
    });
}
// True when the device name of `backend` contains `name`, compared
// case-insensitively. False for a null backend or a backend without a device.
bool sd_backend_is(ggml_backend_t backend, const std::string& name) {
    ggml_backend_dev_t device = backend ? ggml_backend_get_device(backend) : nullptr;
    if (!device) {
        return false;
    }
    const std::string device_lower = lower_copy(std::string(ggml_backend_dev_name(device)));
    const std::string wanted_lower = lower_copy(name);
    return device_lower.find(wanted_lower) != std::string::npos;
}
static std::string get_default_backend_name() {
ggml_backend_load_all_once();
// should pick the same backend preference as ggml_backend_init_best
std::string name = resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
if (!name.empty()) {
return name;
}
name = resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
if (!name.empty()) {
return name;
}
return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
}
static std::string sd_resolve_backend_name(const std::string& name) {
ggml_backend_load_all_once();
std::string requested = trim_copy(name);
std::string lower = lower_copy(requested);
if (is_default_backend_token(lower)) {
return get_default_backend_name();
}
if (lower == "gpu") {
std::string result = resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
if (!result.empty()) {
return result;
}
return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
}
const size_t device_count = ggml_backend_dev_count();
for (size_t i = 0; i < device_count; ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
std::string dev_name = ggml_backend_dev_name(dev);
if (lower_copy(dev_name) == lower) {
return dev_name;
}
}
for (size_t i = 0; i < device_count; ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
std::string dev_name = ggml_backend_dev_name(dev);
std::string dev_lower = lower_copy(dev_name);
if (dev_lower.rfind(lower, 0) == 0) {
return dev_name;
}
}
return "";
}
// True when `name` resolves to a registered device name
// (sd_resolve_backend_name() returns a non-empty string).
static bool backend_name_exists(const std::string& name) {
    return !sd_resolve_backend_name(name).empty();
}
// Initialises the backend called `name`.
// A blank name selects ggml_backend_init_best(). Otherwise the name is resolved
// to a device name first; nullptr is returned when it cannot be resolved.
static ggml_backend_t init_named_backend(const std::string& name) {
    ggml_backend_load_all_once();
    LOG_DEBUG("Initializing backend: %s", name.c_str());

    const bool name_is_blank = trim_copy(name).empty();
    if (name_is_blank) {
        return ggml_backend_init_best();
    }

    const std::string device_name = sd_resolve_backend_name(name);
    return device_name.empty() ? nullptr : ggml_backend_init_by_name(device_name.c_str(), nullptr);
}
// True when `backend` is non-null and its device is of CPU type.
bool sd_backend_is_cpu(ggml_backend_t backend) {
    if (backend == nullptr) {
        return false;
    }
    auto device = ggml_backend_get_device(backend);
    if (device == nullptr) {
        return false;
    }
    return ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU;
}
// Initialises a backend for the CPU device type, loading backend modules
// first if that has not happened yet. Returns whatever
// ggml_backend_init_by_type() returns.
ggml_backend_t sd_backend_cpu_init() {
    ggml_backend_load_all_once();
    return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
}
// Sets the thread count of a CPU backend.
// The setter is looked up by name ("ggml_backend_set_n_threads") in the
// registry of the backend's device. Returns true only when the backend is a CPU
// backend and the setter was found and called.
bool sd_backend_cpu_set_n_threads(ggml_backend_t backend, int n_threads) {
    if (backend == nullptr) {
        return false;
    }

    auto device = ggml_backend_get_device(backend);
    if (device == nullptr || ggml_backend_dev_type(device) != GGML_BACKEND_DEVICE_TYPE_CPU) {
        return false;
    }

    auto registry    = ggml_backend_dev_backend_reg(device);
    auto set_threads = (ggml_backend_set_n_threads_t)ggml_backend_reg_get_proc_address(registry, "ggml_backend_set_n_threads");
    if (set_threads == nullptr) {
        return false;
    }

    set_threads(backend, n_threads);
    return true;
}
const char* sd_get_system_info() {
static std::string cache_info = []() -> std::string {
ggml_backend_load_all_once();
std::stringstream ss;
ss << "System Info: \n";
auto dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (dev != nullptr) {
auto reg = ggml_backend_dev_backend_reg(dev);
auto ggml_backend_get_features_fn = (ggml_backend_get_features_t)ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
if (ggml_backend_get_features_fn != nullptr) {
ggml_backend_feature* feat = ggml_backend_get_features_fn(reg);
while (feat->name && feat->value) {
ss << " " << feat->name << " = " << feat->value << " | ";
feat++;
}
} else {
LOG_WARN("unable to get CPU features");
}
} else {
LOG_WARN("unable to get CPU features");
}
return ss.str();
}();
return cache_info.c_str();
}
// Creates the backend used when no explicit backend was requested.
// Selection order:
//   1. "Vulkan<N>" when the SD_VK_DEVICE environment variable holds an integer
//      N and that device exists and initialises;
//   2. the device named by get_default_backend_name();
//   3. the CPU backend.
// The list of available devices is logged once, on the first call.
static ggml_backend_t sd_get_default_backend() {
    ggml_backend_load_all_once();

    // Log the device list a single time, however often a backend is requested.
    static std::once_flag once;
    std::call_once(once, []() {
        size_t dev_count = ggml_backend_dev_count();
        if (dev_count == 0) {
            LOG_ERROR("No devices found!");
        } else {
            LOG_DEBUG("Found %zu backend devices:", dev_count);
            for (size_t i = 0; i < dev_count; ++i) {
                auto dev = ggml_backend_dev_get(i);
                LOG_DEBUG("#%zu: %s", i, ggml_backend_dev_name(dev));
            }
        }
    });

    ggml_backend_t backend = nullptr;

    // Optional override: SD_VK_DEVICE=<index> selects the device "Vulkan<index>".
    const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE");
    if (SD_VK_DEVICE != nullptr) {
        std::string sd_vk_device_str = SD_VK_DEVICE;
        try {
            unsigned long long device  = std::stoull(sd_vk_device_str);
            std::string vk_device_name = "Vulkan" + std::to_string(device);
            if (backend_name_exists(vk_device_name)) {
                LOG_INFO("Selecting %s as main device by env var SD_VK_DEVICE", vk_device_name.c_str());
                backend = init_named_backend(vk_device_name);
                if (!backend) {
                    LOG_WARN("Device %s requested by SD_VK_DEVICE failed to init. Falling back to the default device.", vk_device_name.c_str());
                }
            } else {
                LOG_WARN("Device %s requested by SD_VK_DEVICE was not found. Falling back to the default device.", vk_device_name.c_str());
            }
        } catch (const std::invalid_argument&) {
            // std::stoull could not parse any number from the variable.
            LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to the default device.", SD_VK_DEVICE);
        } catch (const std::out_of_range&) {
            LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to the default device.", SD_VK_DEVICE);
        }
    }

    // No usable override: try the preferred default device.
    if (!backend) {
        std::string dev_name = get_default_backend_name();
        backend              = init_named_backend(dev_name);
        if (!backend && !dev_name.empty()) {
            LOG_WARN("device %s failed to init", dev_name.c_str());
        }
    }

    // Last resort: the CPU backend.
    if (!backend) {
        LOG_WARN("loading CPU backend");
        backend = sd_backend_cpu_init();
    }

    if (sd_backend_is_cpu(backend)) {
        LOG_DEBUG("Using CPU backend");
    }

    return backend;
}
// Parses a comma-separated backend specification into `*assignment`.
// Each non-empty entry is either a bare backend name, which sets the default,
// or "key=value". The keys "all", "default" and "*" set the default backend;
// any other key must be a module name accepted by parse_backend_module().
// Returns false for a null `assignment` (without touching `error`), for an
// entry with an empty key or value, or for an unknown module; in the last two
// cases a message is stored in `*error` when `error` is non-null.
static bool sd_parse_backend_assignment(const std::string& spec, SDBackendAssignment* assignment, std::string* error) {
    if (assignment == nullptr) {
        return false;
    }
    *assignment = {};

    // Records the message (when the caller wants it) and yields the failure result.
    const auto fail = [error](const std::string& message) {
        if (error != nullptr) {
            *error = message;
        }
        return false;
    };

    const std::string trimmed_spec = trim_copy(spec);
    if (trimmed_spec.empty()) {
        return true;
    }

    for (const std::string& raw_entry : split_copy(trimmed_spec, ',')) {
        const std::string entry = trim_copy(raw_entry);
        if (entry.empty()) {
            continue;
        }

        const size_t separator = entry.find('=');
        if (separator == std::string::npos) {
            assignment->set_default(entry);
            continue;
        }

        const std::string key   = trim_copy(entry.substr(0, separator));
        const std::string value = trim_copy(entry.substr(separator + 1));
        if (key.empty() || value.empty()) {
            return fail("invalid backend assignment '" + entry + "'");
        }

        const std::string key_lower = lower_copy(key);
        const bool targets_default  = key_lower == "all" || key_lower == "default" || key_lower == "*";
        if (targets_default) {
            assignment->set_default(value);
            continue;
        }

        SDBackendModule module = SDBackendModule::DIFFUSION;
        if (!parse_backend_module(key, &module)) {
            return fail("unknown backend module '" + key + "'");
        }
        assignment->set_module(module, value);
    }

    return true;
}
// True when neither a default backend name nor any per-module name has been set.
bool SDBackendAssignment::empty() const {
    const bool has_default = !default_name.empty();
    const bool has_modules = !module_names.empty();
    return !(has_default || has_modules);
}
// Returns the backend name to use for `module`; the lookup itself is delegated to
// module_assignment_name(), which is defined outside this block.
std::string SDBackendAssignment::get(SDBackendModule module) const {
return module_assignment_name(*this, module);
}
// Stores `name` (passed through trim_copy) as the default backend name, replacing any previous one.
void SDBackendAssignment::set_default(const std::string& name) {
default_name = trim_copy(name);
}
// Stores `name` (passed through trim_copy) as the backend name for `module`,
// inserting the entry or overwriting an existing one.
void SDBackendAssignment::set_module(SDBackendModule module, const std::string& name) {
module_names[module] = trim_copy(name);
}
// Deleter used by SDBackendHandle: releases the backend through ggml_backend_free().
void SDBackendHandleDeleter::operator()(ggml_backend_t backend) const {
ggml_backend_free(backend);
}
// Releases every cached backend and clears both assignments by delegating to reset().
SDBackendManager::~SDBackendManager() {
reset();
}
void SDBackendManager::reset() {
backends_.clear();
runtime_assignment_ = {};
params_assignment_ = {};
}
// Returns the backend assigned to run `module`, initializing and caching it on demand.
// Yields nullptr when the backend cannot be resolved or initialized.
ggml_backend_t SDBackendManager::runtime_backend(SDBackendModule module) {
    const std::string assigned = runtime_assignment_.get(module);
    return init_cached_backend(assigned);
}
// Returns the backend that should hold the parameters of `module`.
// When no params backend name is assigned for the module, the module's runtime backend is used.
ggml_backend_t SDBackendManager::params_backend(SDBackendModule module) {
    const std::string assigned = params_assignment_.get(module);
    return assigned.empty() ? runtime_backend(module) : init_cached_backend(assigned);
}
// Reports whether the runtime backend resolved for `module` is the CPU backend
// (as decided by sd_backend_is_cpu()).
bool SDBackendManager::runtime_backend_is_cpu(SDBackendModule module) {
    ggml_backend_t resolved = runtime_backend(module);
    return sd_backend_is_cpu(resolved);
}
// Reports whether the params backend resolved for `module` is the CPU backend
// (as decided by sd_backend_is_cpu()).
bool SDBackendManager::params_backend_is_cpu(SDBackendModule module) {
    ggml_backend_t resolved = params_backend(module);
    return sd_backend_is_cpu(resolved);
}
// Reports whether the runtime backend of `module` can work with host memory buffers.
// A CPU backend always qualifies; any other backend qualifies when its device properties
// advertise `caps.buffer_from_host_ptr`. Returns false when no backend or device is available.
bool SDBackendManager::runtime_backend_supports_host_buffer(SDBackendModule module) {
    ggml_backend_t target = runtime_backend(module);
    if (!target) {
        return false;
    }
    if (sd_backend_is_cpu(target)) {
        return true;
    }

    ggml_backend_dev_t device = ggml_backend_get_device(target);
    if (!device) {
        return false;
    }

    ggml_backend_dev_props device_props;
    ggml_backend_dev_get_props(device, &device_props);
    return device_props.caps.buffer_from_host_ptr;
}
bool SDBackendManager::init(const char* backend_spec,
const char* params_backend_spec,
bool offload_params_to_cpu,
bool keep_clip_on_cpu,
bool keep_vae_on_cpu,
bool keep_control_net_on_cpu,
std::string* error) {
reset();
if (!sd_parse_backend_assignment(SAFE_STR(backend_spec), &runtime_assignment_, error)) {
return false;
}
if (!sd_parse_backend_assignment(SAFE_STR(params_backend_spec), &params_assignment_, error)) {
return false;
}
if (runtime_assignment_.empty()) {
if (keep_clip_on_cpu) {
runtime_assignment_.set_module(SDBackendModule::TE, "cpu");
}
if (keep_vae_on_cpu) {
runtime_assignment_.set_module(SDBackendModule::VAE, "cpu");
}
if (keep_control_net_on_cpu) {
runtime_assignment_.set_module(SDBackendModule::CONTROL_NET, "cpu");
}
}
if (params_assignment_.empty() && offload_params_to_cpu) {
params_assignment_.set_default("cpu");
}
return validate(error);
}
bool SDBackendManager::validate(std::string* error) const {
auto validate_name = [&](const std::string& name) -> bool {
if (is_default_backend_token(name)) {
return true;
}
if (!sd_resolve_backend_name(name).empty()) {
return true;
}
if (error != nullptr) {
*error = "backend '" + name + "' was not found";
}
return false;
};
if (!validate_name(runtime_assignment_.default_name) ||
!validate_name(params_assignment_.default_name)) {
return false;
}
for (const auto& kv : runtime_assignment_.module_names) {
if (!validate_name(kv.second)) {
return false;
}
}
for (const auto& kv : params_assignment_.module_names) {
if (!validate_name(kv.second)) {
return false;
}
}
return true;
}
// Returns the backend identified by `name`, creating it on first use and caching it in
// `backends_` (which owns the handle).
//
// Lookup happens twice: first under the lower-cased resolved name, and - after a backend had
// to be created - under the key reported by backend_cache_key() for the created instance. If
// that second lookup hits, the freshly created duplicate is freed and the cached one returned.
// Returns nullptr (after logging) when the name cannot be resolved or initialization fails.
//
// NOTE(review): when the requested key differs from the key the backend is stored under
// (presumably the case for default-backend tokens), each call appears to create and then free
// a backend before returning the cached one - confirm whether that cost matters to callers.
ggml_backend_t SDBackendManager::init_cached_backend(const std::string& name) {
    const std::string resolved      = sd_resolve_backend_name(name);
    const std::string requested_key = lower_copy(resolved);

    if (requested_key.empty()) {
        // Only default-backend tokens may legitimately resolve to nothing.
        if (!is_default_backend_token(name)) {
            LOG_ERROR("backend '%s' was not found", name.c_str());
            return nullptr;
        }
    } else {
        const auto cached = backends_.find(requested_key);
        if (cached != backends_.end()) {
            return cached->second.get();
        }
    }

    ggml_backend_t created = nullptr;
    if (is_default_backend_token(name)) {
        created = sd_get_default_backend();
    } else {
        created = init_named_backend(resolved);
    }
    if (created == nullptr) {
        LOG_ERROR("failed to initialize backend '%s'", name.c_str());
        return nullptr;
    }

    std::string storage_key = backend_cache_key(created);
    if (storage_key.empty()) {
        storage_key = requested_key.empty() ? lower_copy(trim_copy(name)) : requested_key;
    }

    const auto existing = backends_.find(storage_key);
    if (existing != backends_.end()) {
        // Same backend is already cached under its own key: drop the duplicate.
        ggml_backend_free(created);
        return existing->second.get();
    }

    backends_.emplace(storage_key, SDBackendHandle(created));
    return created;
}
// Maps a module enumerator to its lower-case textual name.
// Values outside the enumerators listed here map to "unknown".
const char* sd_backend_module_name(SDBackendModule module) {
    const char* label = "unknown";
    switch (module) {
        case SDBackendModule::DIFFUSION:
            label = "diffusion";
            break;
        case SDBackendModule::TE:
            label = "te";
            break;
        case SDBackendModule::CLIP_VISION:
            label = "clip_vision";
            break;
        case SDBackendModule::VAE:
            label = "vae";
            break;
        case SDBackendModule::CONTROL_NET:
            label = "controlnet";
            break;
        case SDBackendModule::PHOTOMAKER:
            label = "photomaker";
            break;
        case SDBackendModule::UPSCALER:
            label = "upscaler";
            break;
    }
    return label;
}

View File

@ -0,0 +1,79 @@
#ifndef __SD_CORE_GGML_EXTEND_BACKEND_H__
#define __SD_CORE_GGML_EXTEND_BACKEND_H__
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <unordered_map>
#include "ggml-backend.h"
#include "ggml.h"
// Model components that can each be assigned their own backend.
// sd_backend_module_name() provides the textual name of each enumerator.
enum class SDBackendModule {
DIFFUSION,
TE,
CLIP_VISION,
VAE,
CONTROL_NET,
PHOTOMAKER,
UPSCALER,
};
// Backend names chosen by the user: one optional default plus optional per-module overrides.
struct SDBackendAssignment {
// Backend name used as the default; empty when not set.
std::string default_name;
// Per-module backend names.
std::unordered_map<SDBackendModule, std::string> module_names;
// True when neither a default nor any per-module name is set.
bool empty() const;
// Backend name to use for `module`.
std::string get(SDBackendModule module) const;
// Sets the default backend name (the stored value is trimmed).
void set_default(const std::string& name);
// Sets the backend name for one module (the stored value is trimmed).
void set_module(SDBackendModule module, const std::string& name);
};
// Deleter functor for SDBackendHandle; its call operator frees the backend.
struct SDBackendHandleDeleter {
void operator()(ggml_backend_t backend) const;
};
// Owning handle for a ggml backend; destruction goes through SDBackendHandleDeleter.
using SDBackendHandle = std::unique_ptr<struct ggml_backend, SDBackendHandleDeleter>;
// Resolves, creates and owns the ggml backends used by the individual model modules.
// Backends are created lazily and cached; the manager is non-copyable because it owns them.
class SDBackendManager {
private:
// Backend names used to run each module.
SDBackendAssignment runtime_assignment_;
// Backend names used to hold each module's parameters.
SDBackendAssignment params_assignment_;
// Cache of created backends, keyed by a lower-case backend key; owns the handles.
std::unordered_map<std::string, SDBackendHandle> backends_;
public:
SDBackendManager() = default;
~SDBackendManager();
SDBackendManager(const SDBackendManager&) = delete;
SDBackendManager& operator=(const SDBackendManager&) = delete;
// Resets the manager, parses both specs, applies the boolean fallbacks and validates the
// resulting names. Returns false on failure; `error` (if non-null) may receive the reason.
bool init(const char* backend_spec,
const char* params_backend_spec,
bool offload_params_to_cpu,
bool keep_clip_on_cpu,
bool keep_vae_on_cpu,
bool keep_control_net_on_cpu,
std::string* error);
// Frees all cached backends and clears both assignments.
void reset();
// Backend that runs `module`; nullptr when it cannot be resolved or initialized.
ggml_backend_t runtime_backend(SDBackendModule module);
// Backend that holds the params of `module`; falls back to the runtime backend when no
// params backend name is assigned.
ggml_backend_t params_backend(SDBackendModule module);
bool runtime_backend_is_cpu(SDBackendModule module);
bool params_backend_is_cpu(SDBackendModule module);
// True for a CPU runtime backend, or when the backend's device reports
// `caps.buffer_from_host_ptr`.
bool runtime_backend_supports_host_buffer(SDBackendModule module);
private:
// Checks that every assigned backend name is a default token or resolves to a known backend.
bool validate(std::string* error) const;
// Returns the cached backend for `name`, creating and caching it on first use.
ggml_backend_t init_cached_backend(const std::string& name);
};
// Free helper declarations. Except for sd_backend_module_name(), their definitions are not
// shown next to this header, so no behaviour is documented for them here.
bool sd_backend_is(ggml_backend_t backend, const std::string& name);
bool sd_backend_is_cpu(ggml_backend_t backend);
ggml_backend_t sd_backend_cpu_init();
bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
// Lower-case textual name of `module` ("unknown" for unrecognised values).
const char* sd_backend_module_name(SDBackendModule module);
void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value);
#endif // __SD_CORE_GGML_EXTEND_BACKEND_H__

806
src/core/ggml_graph_cut.cpp Normal file
View File

@ -0,0 +1,806 @@
#include "core/ggml_graph_cut.h"
#include <algorithm>
#include <cstring>
#include <map>
#include <set>
#include <sstream>
#include <stack>
#include <unordered_map>
#include "core/util.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml/src/ggml-impl.h"
namespace sd::ggml_graph_cut {
// Number of bytes in one GiB (1024^3), kept as a double for GiB <-> byte conversions.
static constexpr double MAX_VRAM_BYTES_PER_GIB = 1024.0 * 1024.0 * 1024.0;
// Produces a printable label for `tensor`: "<null>" for a null pointer, the tensor's own name
// when it has one, and otherwise a placeholder built from the tensor's address.
static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
    if (!tensor) {
        return "<null>";
    }
    const bool unnamed = tensor->name[0] == '\0';
    if (unnamed) {
        return sd_format("<tensor@%p>", static_cast<const void*>(tensor));
    }
    return tensor->name;
}
// Returns the position of `tensor` in the leaf array of `gf`, or -1 when it is not a leaf.
// Both arguments must be non-null.
static int graph_leaf_index(ggml_cgraph* gf, const ggml_tensor* tensor) {
    GGML_ASSERT(gf != nullptr);
    GGML_ASSERT(tensor != nullptr);
    ggml_tensor* const* const first = gf->leafs;
    ggml_tensor* const* const last  = first + gf->n_leafs;
    ggml_tensor* const* const hit   = std::find(first, last, tensor);
    return hit == last ? -1 : static_cast<int>(hit - first);
}
// True when `tensor` is non-null and contained in `params_tensor_set`.
static bool is_params_tensor(const std::unordered_set<const ggml_tensor*>& params_tensor_set,
                             const ggml_tensor* tensor) {
    return tensor != nullptr && params_tensor_set.count(tensor) != 0;
}
// Returns the index of the first node of `gf` whose name equals `name`, or -1 when `name` is
// null/empty or no node matches.
static int graph_node_index_by_name(ggml_cgraph* gf, const char* name) {
    GGML_ASSERT(gf != nullptr);
    const bool has_name = name != nullptr && name[0] != '\0';
    if (!has_name) {
        return -1;
    }
    for (int idx = 0, count = ggml_graph_n_nodes(gf); idx < count; ++idx) {
        const ggml_tensor* candidate = ggml_graph_node(gf, idx);
        if (candidate == nullptr) {
            continue;
        }
        if (std::strcmp(candidate->name, name) == 0) {
            return idx;
        }
    }
    return -1;
}
// Captures the element type and all GGML_MAX_DIMS extents of `tensor`.
// A null tensor yields a default-constructed shape; `leaf_index` is left for the caller to set.
static Plan::InputShape input_shape(const ggml_tensor* tensor) {
    Plan::InputShape result;
    if (tensor != nullptr) {
        result.type = tensor->type;
        std::copy(tensor->ne, tensor->ne + GGML_MAX_DIMS, result.ne.begin());
    }
    return result;
}
// Footprint used for budgeting a segment: compute buffer + param inputs + inputs coming from
// previous cuts + outputs. External inputs are not part of this sum.
static size_t graph_cut_segment_vram_bytes(const Segment& segment) {
    size_t total = segment.compute_buffer_size;
    total += segment.input_param_bytes;
    total += segment.input_previous_cut_bytes;
    total += segment.output_bytes;
    return total;
}
// Converts a VRAM amount given in GiB to bytes (fraction truncated).
// Returns 0 for zero, negative and NaN inputs, and saturates at SIZE_MAX for values too large
// to represent, so the float -> size_t conversion is never performed out of range.
size_t max_vram_gib_to_bytes(float max_vram) {
    // Written as a negated '>' so that NaN (for which every comparison is false) also yields 0.
    if (!(max_vram > 0.f)) {
        return 0;
    }
    const double bytes = static_cast<double>(max_vram) * MAX_VRAM_BYTES_PER_GIB;
    // Converting a double that does not fit into size_t is undefined behaviour; clamp first.
    constexpr double size_limit = static_cast<double>(SIZE_MAX);
    if (bytes >= size_limit) {
        return SIZE_MAX;
    }
    return static_cast<size_t>(bytes);
}
// Converts a byte count to GiB; the division is done in double precision before narrowing to float.
static float max_vram_bytes_to_gib(size_t max_vram_bytes) {
    const double gib = static_cast<double>(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB;
    return static_cast<float>(gib);
}
static size_t resolve_auto_max_vram_bytes(float spare_vram, ggml_backend_t backend) {
if (backend == nullptr) {
LOG_WARN("--max-vram < 0 requested, but no backend is available; disabling graph splitting");
return 0;
}
ggml_backend_dev_t dev = ggml_backend_get_device(backend);
if (dev == nullptr) {
LOG_WARN("--max-vram < 0 requested, but no backend device is available; disabling graph splitting");
return 0;
}
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
LOG_WARN("--max-vram < 0 requested, but the main backend is CPU; disabling graph splitting");
return 0;
}
size_t free_vram = 0;
size_t total_vram = 0;
ggml_backend_dev_memory(dev, &free_vram, &total_vram);
size_t spare_bytes = static_cast<size_t>(MAX_VRAM_BYTES_PER_GIB * spare_vram);
if (free_vram <= spare_bytes) {
LOG_WARN("--max-vram < 0 requested, but free VRAM is %.2f GiB; reserving %.2f GiB leaves no graph budget",
free_vram / MAX_VRAM_BYTES_PER_GIB, spare_vram);
return 0;
}
const size_t max_vram_bytes = free_vram - spare_bytes;
LOG_INFO("--max-vram < 0 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving %.2f GiB; using %.2f GiB",
free_vram / MAX_VRAM_BYTES_PER_GIB,
total_vram / MAX_VRAM_BYTES_PER_GIB,
spare_vram,
max_vram_bytes / MAX_VRAM_BYTES_PER_GIB);
return max_vram_bytes;
}
// Resolves the user-facing max-vram setting to a concrete GiB value.
// A value >= 0 is returned unchanged. Any other value requests auto-detection: its negation is
// the amount of GiB to keep free, and the result is the detected budget converted back to GiB.
float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) {
    const bool is_explicit = max_vram >= 0.f;
    if (!is_explicit) {
        const size_t auto_bytes = resolve_auto_max_vram_bytes(-max_vram, backend);
        return max_vram_bytes_to_gib(auto_bytes);
    }
    return max_vram;
}
// Creates the starting point for a segment covering plan segments
// [start_segment_index, end_segment_index]: only the group name and the de-duplicated union of
// the covered segments' output node indices (in first-seen order) are filled in.
// A single-segment range keeps that segment's group name; a wider range is named
// "<first>..<last>".
static Segment make_segment_seed(const Plan& plan,
                                 size_t start_segment_index,
                                 size_t end_segment_index) {
    GGML_ASSERT(start_segment_index < plan.segments.size());
    GGML_ASSERT(end_segment_index < plan.segments.size());
    GGML_ASSERT(start_segment_index <= end_segment_index);

    const std::string& first_group = plan.segments[start_segment_index].group_name;
    const std::string& last_group  = plan.segments[end_segment_index].group_name;

    Segment seed;
    if (start_segment_index != end_segment_index) {
        seed.group_name = sd_format("%s..%s",
                                    first_group.c_str(),
                                    last_group.c_str());
    } else {
        seed.group_name = last_group;
    }

    std::unordered_set<int> already_added;
    for (size_t idx = start_segment_index; idx <= end_segment_index; ++idx) {
        for (const int node_index : plan.segments[idx].output_node_indices) {
            const bool is_new = already_added.insert(node_index).second;
            if (is_new) {
                seed.output_node_indices.push_back(node_index);
            }
        }
    }
    return seed;
}
// Completes `segment` and appends it to `plan.segments`.
//
// On entry the callers in this file have filled in only `output_node_indices` and `group_name`.
// Starting from the output nodes, the graph is walked backwards through `src` links:
//   - a tensor with no entry in `producer_index` becomes an INPUT_PARAM (when it is in
//     `params_tensor_set`) or INPUT_EXTERNAL input, referenced by its leaf index;
//   - a tensor produced by a node listed in `available_cut_output_node_indices` becomes an
//     INPUT_PREVIOUS_CUT input, referenced by node index, and is not traversed further;
//   - every other producing node is recorded as internal to this segment.
// Afterwards the byte totals and the compute buffer size are filled in and the segment's own
// outputs are added to `available_cut_output_node_indices`.
// `segment` is moved into `plan.segments`, so the caller must not rely on its contents afterwards.
static void build_segment(ggml_cgraph* gf,
Plan& plan,
Segment& segment,
const std::unordered_map<const ggml_tensor*, int>& producer_index,
std::unordered_set<int>& available_cut_output_node_indices,
ggml_backend_t backend,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc) {
// Ordered set: internal node indices come out sorted ascending.
std::set<int> internal_nodes;
std::unordered_set<const ggml_tensor*> input_seen;
std::vector<Segment::InputRef> input_refs;
std::stack<ggml_tensor*> work_stack;
for (int output_node_index : segment.output_node_indices) {
ggml_tensor* output = ggml_graph_node(gf, output_node_index);
if (output != nullptr) {
work_stack.push(output);
}
}
// Depth-first walk from the outputs towards the inputs.
while (!work_stack.empty()) {
ggml_tensor* tensor = work_stack.top();
work_stack.pop();
if (tensor == nullptr) {
continue;
}
auto producer_it = producer_index.find(tensor);
if (producer_it == producer_index.end()) {
// Not produced by any graph node: a param or an external input.
if (input_seen.insert(tensor).second) {
Segment::InputRef input_ref;
input_ref.type = is_params_tensor(params_tensor_set, tensor) ? Segment::INPUT_PARAM : Segment::INPUT_EXTERNAL;
input_ref.display_name = graph_cut_tensor_display_name(tensor);
input_ref.leaf_index = graph_leaf_index(gf, tensor);
input_refs.push_back(std::move(input_ref));
}
continue;
}
int node_idx = producer_it->second;
if (available_cut_output_node_indices.find(node_idx) != available_cut_output_node_indices.end()) {
// Already an output of an earlier segment: consume it as an input, do not descend.
if (input_seen.insert(tensor).second) {
Segment::InputRef input_ref;
input_ref.type = Segment::INPUT_PREVIOUS_CUT;
input_ref.display_name = graph_cut_tensor_display_name(tensor);
input_ref.node_index = node_idx;
input_refs.push_back(std::move(input_ref));
}
continue;
}
// Skip nodes that were already visited.
if (!internal_nodes.insert(node_idx).second) {
continue;
}
ggml_tensor* node = ggml_graph_node(gf, node_idx);
for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) {
if (node->src[src_idx] != nullptr) {
work_stack.push(node->src[src_idx]);
}
}
}
if (!internal_nodes.empty()) {
segment.internal_node_indices.assign(internal_nodes.begin(), internal_nodes.end());
}
// Order inputs by type, then by display name.
std::sort(input_refs.begin(),
input_refs.end(),
[](const Segment::InputRef& a, const Segment::InputRef& b) {
if (a.type != b.type) {
return a.type < b.type;
}
return a.display_name < b.display_name;
});
segment.input_refs = input_refs;
// Accumulate input sizes per input type. Inputs from previous cuts are sized via
// cache_tensor_bytes(), all others via ggml_nbytes(); unresolvable inputs count as 0.
for (const auto& input : input_refs) {
ggml_tensor* current_input = input_tensor(gf, input);
size_t tensor_bytes = current_input == nullptr
? 0
: (input.type == Segment::INPUT_PREVIOUS_CUT
? cache_tensor_bytes(current_input)
: ggml_nbytes(current_input));
switch (input.type) {
case Segment::INPUT_PREVIOUS_CUT:
segment.input_previous_cut_bytes += tensor_bytes;
break;
case Segment::INPUT_PARAM:
segment.input_param_bytes += tensor_bytes;
break;
case Segment::INPUT_EXTERNAL:
default:
segment.input_external_bytes += tensor_bytes;
break;
}
}
for (int output_node_index : segment.output_node_indices) {
ggml_tensor* output = ggml_graph_node(gf, output_node_index);
segment.output_bytes += cache_tensor_bytes(output);
}
segment.compute_buffer_size = measure_segment_compute_buffer(backend, gf, segment, log_desc);
// From now on later segments may consume these outputs as INPUT_PREVIOUS_CUT.
for (int output_node_index : segment.output_node_indices) {
available_cut_output_node_indices.insert(output_node_index);
}
plan.segments.push_back(std::move(segment));
}
// True when `tensor` is non-null and its name starts with GGML_RUNNER_CUT_PREFIX.
bool is_graph_cut_tensor(const ggml_tensor* tensor) {
    if (tensor == nullptr) {
        return false;
    }
    const char* name = tensor->name;
    if (*name == '\0') {
        return false;
    }
    const size_t prefix_len = std::strlen(GGML_RUNNER_CUT_PREFIX);
    return std::strncmp(name, GGML_RUNNER_CUT_PREFIX, prefix_len) == 0;
}
// Builds a cut tensor name of the form "<prefix><group>|<output>".
std::string make_graph_cut_name(const std::string& group, const std::string& output) {
    std::string name(GGML_RUNNER_CUT_PREFIX);
    name += group;
    name += "|";
    name += output;
    return name;
}
// Tags `tensor` as a graph-cut output by renaming it to the cut name built from `group` and
// `output`. A null tensor is ignored.
void mark_graph_cut(ggml_tensor* tensor, const std::string& group, const std::string& output) {
    if (tensor != nullptr) {
        const std::string cut_name = make_graph_cut_name(group, output);
        ggml_set_name(tensor, cut_name.c_str());
    }
}
// Returns the number of leaf tensors in `gf`, which must be non-null.
int leaf_count(ggml_cgraph* gf) {
GGML_ASSERT(gf != nullptr);
return gf->n_leafs;
}
// Returns the leaf tensor at `leaf_index`, or nullptr when the index is out of range.
ggml_tensor* leaf_tensor(ggml_cgraph* gf, int leaf_index) {
    GGML_ASSERT(gf != nullptr);
    const bool in_range = leaf_index >= 0 && leaf_index < gf->n_leafs;
    return in_range ? gf->leafs[leaf_index] : nullptr;
}
// Returns the backend buffer backing `tensor`: for a view, the buffer of its view source;
// otherwise the tensor's own buffer. Null tensors yield nullptr.
ggml_backend_buffer_t tensor_buffer(const ggml_tensor* tensor) {
    if (!tensor) {
        return nullptr;
    }
    const ggml_tensor* owner = tensor->view_src != nullptr ? tensor->view_src : tensor;
    return owner->buffer;
}
// Finds the tensor whose storage should be used when caching `tensor`.
// While the current tensor has no buffer and its first source has the same element count and
// byte size, the search moves on to that source. The result is the view source of the tensor
// where the search stops, or that tensor itself when it is not a view. Null input yields nullptr.
ggml_tensor* cache_source_tensor(ggml_tensor* tensor) {
    ggml_tensor* current = tensor;
    while (current != nullptr) {
        ggml_tensor* upstream  = current->src[0];
        const bool passthrough = tensor_buffer(current) == nullptr && upstream != nullptr &&
                                 ggml_nelements(upstream) == ggml_nelements(current) &&
                                 ggml_nbytes(upstream) == ggml_nbytes(current);
        if (!passthrough) {
            return current->view_src ? current->view_src : current;
        }
        current = upstream;
    }
    return nullptr;
}
size_t cache_tensor_bytes(const ggml_tensor* tensor) {
if (tensor == nullptr) {
return 0;
}
const ggml_tensor* cache_src = tensor->view_src ? tensor->view_src : tensor;
return ggml_nbytes(cache_src);
}
// Checks whether `plan` still describes `gf`: node and leaf counts must be equal, and every
// recorded input shape must match the type and all GGML_MAX_DIMS extents of the leaf at the
// recorded index.
bool plan_matches_graph(ggml_cgraph* gf, const Plan& plan) {
    GGML_ASSERT(gf != nullptr);
    const bool same_counts = ggml_graph_n_nodes(gf) == plan.n_nodes && gf->n_leafs == plan.n_leafs;
    if (!same_counts) {
        return false;
    }
    for (const Plan::InputShape& expected : plan.input_shapes) {
        const int idx = expected.leaf_index;
        if (idx < 0 || idx >= gf->n_leafs) {
            return false;
        }
        const ggml_tensor* leaf = gf->leafs[idx];
        if (leaf == nullptr) {
            return false;
        }
        if (expected.type != leaf->type) {
            return false;
        }
        if (!std::equal(expected.ne.begin(), expected.ne.end(), leaf->ne)) {
            return false;
        }
    }
    return true;
}
// Returns the graph node that is output number `output_index` of `segment`, or nullptr when
// either the output index or the stored node index is out of range.
ggml_tensor* output_tensor(ggml_cgraph* gf, const Segment& segment, size_t output_index) {
    GGML_ASSERT(gf != nullptr);
    if (output_index < segment.output_node_indices.size()) {
        const int node_index = segment.output_node_indices[output_index];
        if (node_index >= 0 && node_index < ggml_graph_n_nodes(gf)) {
            return ggml_graph_node(gf, node_index);
        }
    }
    return nullptr;
}
// Resolves an input reference to a tensor of `gf`: inputs from a previous cut are looked up by
// node index, all other inputs by leaf index. Out-of-range indices yield nullptr.
ggml_tensor* input_tensor(ggml_cgraph* gf, const Segment::InputRef& input_ref) {
    GGML_ASSERT(gf != nullptr);
    if (input_ref.type != Segment::INPUT_PREVIOUS_CUT) {
        const int leaf_index    = input_ref.leaf_index;
        const bool leaf_in_range = leaf_index >= 0 && leaf_index < gf->n_leafs;
        return leaf_in_range ? leaf_tensor(gf, leaf_index) : nullptr;
    }
    const int node_index     = input_ref.node_index;
    const bool node_in_range = node_index >= 0 && node_index < ggml_graph_n_nodes(gf);
    return node_in_range ? ggml_graph_node(gf, node_index) : nullptr;
}
// Collects the distinct tensors behind the INPUT_PARAM references of `segment`, in reference
// order. References that cannot be resolved are skipped.
std::vector<ggml_tensor*> param_tensors(ggml_cgraph* gf, const Segment& segment) {
    GGML_ASSERT(gf != nullptr);
    const size_t capacity = segment.input_refs.size();
    std::vector<ggml_tensor*> unique_params;
    std::unordered_set<ggml_tensor*> visited;
    unique_params.reserve(capacity);
    visited.reserve(capacity);
    for (const Segment::InputRef& ref : segment.input_refs) {
        if (ref.type == Segment::INPUT_PARAM) {
            ggml_tensor* candidate = input_tensor(gf, ref);
            if (candidate != nullptr && visited.insert(candidate).second) {
                unique_params.push_back(candidate);
            }
        }
    }
    return unique_params;
}
// Like param_tensors(), but keeps only tensors that have a backing buffer.
// Each dropped tensor is reported with a warning; a null `log_desc` is logged as "unknown".
std::vector<ggml_tensor*> runtime_param_tensors(ggml_cgraph* gf, const Segment& segment, const char* log_desc) {
    const std::vector<ggml_tensor*> candidates = param_tensors(gf, segment);
    std::vector<ggml_tensor*> usable;
    usable.reserve(candidates.size());
    for (ggml_tensor* candidate : candidates) {
        if (tensor_buffer(candidate) != nullptr) {
            usable.push_back(candidate);
            continue;
        }
        LOG_WARN("%s graph cut skipping param input without buffer: segment=%s tensor=%s",
                 log_desc == nullptr ? "unknown" : log_desc,
                 segment.group_name.c_str(),
                 candidate->name);
    }
    return usable;
}
// Gathers the names of all previous-cut inputs consumed by segments that come after
// `current_segment_index` in `plan`. Unresolvable or unnamed inputs are ignored.
std::unordered_set<std::string> collect_future_input_names(ggml_cgraph* gf,
                                                           const Plan& plan,
                                                           size_t current_segment_index) {
    GGML_ASSERT(gf != nullptr);
    std::unordered_set<std::string> names;
    const size_t segment_count = plan.segments.size();
    for (size_t idx = current_segment_index + 1; idx < segment_count; ++idx) {
        for (const Segment::InputRef& ref : plan.segments[idx].input_refs) {
            if (ref.type == Segment::INPUT_PREVIOUS_CUT) {
                const ggml_tensor* consumed = input_tensor(gf, ref);
                if (consumed != nullptr && consumed->name[0] != '\0') {
                    names.insert(consumed->name);
                }
            }
        }
    }
    return names;
}
// Builds a standalone ggml graph containing just the internal nodes of `segment`.
//
// A new ggml context (created with no_alloc = true and sized for the graph bookkeeping) is
// created and handed back through `graph_ctx_out`; the caller visible in this file releases it
// with ggml_free(). Tensors are not copied: the new graph references the tensors of `gf`.
// Side effect: ggml_set_output() is called on every resolvable output node of the segment.
ggml_cgraph* build_segment_graph(ggml_cgraph* gf,
const Segment& segment,
ggml_context** graph_ctx_out) {
GGML_ASSERT(gf != nullptr);
GGML_ASSERT(graph_ctx_out != nullptr);
// Capacity: all internal nodes and inputs plus 8 spare slots.
const size_t graph_size = segment.internal_node_indices.size() + segment.input_refs.size() + 8;
ggml_init_params params = {
/*.mem_size =*/ggml_graph_overhead_custom(graph_size, false) + 1024,
/*.mem_buffer =*/nullptr,
/*.no_alloc =*/true,
};
ggml_context* graph_ctx = ggml_init(params);
GGML_ASSERT(graph_ctx != nullptr);
ggml_cgraph* segment_graph = ggml_new_graph_custom(graph_ctx, graph_size, false);
GGML_ASSERT(segment_graph != nullptr);
// Register every resolvable segment input as a leaf of the new graph.
for (const auto& input : segment.input_refs) {
ggml_tensor* current_input = input_tensor(gf, input);
if (current_input == nullptr) {
continue;
}
GGML_ASSERT(segment_graph->n_leafs < segment_graph->size);
segment_graph->leafs[segment_graph->n_leafs++] = current_input;
}
for (int output_node_index : segment.output_node_indices) {
ggml_tensor* output = ggml_graph_node(gf, output_node_index);
if (output == nullptr) {
continue;
}
ggml_set_output(output);
}
// Nodes are added in the order stored in internal_node_indices.
for (int node_idx : segment.internal_node_indices) {
ggml_graph_add_node(segment_graph, ggml_graph_node(gf, node_idx));
}
*graph_ctx_out = graph_ctx;
return segment_graph;
}
// Determines the compute buffer size for running `segment` on `backend`: builds the segment's
// graph, asks a graph allocator created for the backend's default buffer type for the size it
// would reserve, then releases both the allocator and the temporary graph context.
// A segment without internal nodes needs no compute buffer and yields 0.
// `log_desc` is currently unused by this function.
size_t measure_segment_compute_buffer(ggml_backend_t backend,
                                      ggml_cgraph* gf,
                                      const Segment& segment,
                                      const char* log_desc) {
    GGML_ASSERT(backend != nullptr);
    GGML_ASSERT(gf != nullptr);
    if (segment.internal_node_indices.empty()) {
        return 0;
    }

    ggml_context* temp_ctx  = nullptr;
    ggml_cgraph* temp_graph = build_segment_graph(gf, segment, &temp_ctx);

    ggml_gallocr_t measurer = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
    size_t sizes[1]         = {0};
    ggml_gallocr_reserve_n_size(measurer, temp_graph, nullptr, nullptr, sizes);
    const size_t measured = sizes[0];

    ggml_gallocr_free(measurer);
    ggml_free(temp_ctx);
    return measured;
}
// Builds the base graph-cut plan for `gf`.
//
// Every node whose name carries the cut prefix is an output of the segment named by the group
// part of its name ("<prefix><group>|<output>"; without '|' the whole payload is the group).
// Segments are created in order of first appearance of their group and completed with
// build_segment(). If the final result node is not already a cut output, one extra segment
// named "ggml_runner.final" is appended for it.
// The returned plan always has `available == true`; for an empty graph or a graph without cut
// tensors it contains no segments.
Plan build_plan(ggml_backend_t backend,
ggml_cgraph* gf,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc) {
GGML_ASSERT(backend != nullptr);
GGML_ASSERT(gf != nullptr);
Plan plan;
plan.available = true;
const int n_nodes = ggml_graph_n_nodes(gf);
if (n_nodes <= 0) {
return plan;
}
plan.n_nodes = n_nodes;
plan.n_leafs = gf->n_leafs;
// Record type and extents of every non-param leaf so a later graph can be compared against
// this plan.
for (int i = 0; i < gf->n_leafs; ++i) {
ggml_tensor* leaf = gf->leafs[i];
if (is_params_tensor(params_tensor_set, leaf)) {
continue;
}
auto shape = input_shape(leaf);
shape.leaf_index = i;
plan.input_shapes.push_back(shape);
}
// Map each node tensor to its index in the graph.
std::unordered_map<const ggml_tensor*, int> producer_index;
producer_index.reserve(static_cast<size_t>(n_nodes));
for (int i = 0; i < n_nodes; ++i) {
producer_index[ggml_graph_node(gf, i)] = i;
}
// Group the cut tensors by the group part of their name.
std::vector<Segment> grouped_segments;
std::unordered_map<std::string, size_t> group_to_segment;
for (int i = 0; i < n_nodes; ++i) {
ggml_tensor* node = ggml_graph_node(gf, i);
if (!is_graph_cut_tensor(node)) {
continue;
}
plan.has_cuts = true;
std::string full_name(node->name);
std::string payload = full_name.substr(std::strlen(GGML_RUNNER_CUT_PREFIX));
size_t sep = payload.find('|');
std::string group = sep == std::string::npos ? payload : payload.substr(0, sep);
auto it = group_to_segment.find(group);
if (it == group_to_segment.end()) {
Segment segment;
segment.group_name = group;
segment.output_node_indices.push_back(i);
group_to_segment[group] = grouped_segments.size();
grouped_segments.push_back(std::move(segment));
} else {
auto& segment = grouped_segments[it->second];
segment.output_node_indices.push_back(i);
}
}
if (!plan.has_cuts) {
return plan;
}
std::unordered_set<int> available_cut_output_node_indices;
available_cut_output_node_indices.reserve(static_cast<size_t>(n_nodes));
// build_segment() moves each segment into plan.segments.
for (auto& segment : grouped_segments) {
build_segment(gf,
plan,
segment,
producer_index,
available_cut_output_node_indices,
backend,
params_tensor_set,
log_desc);
}
// Locate the final result by name, falling back to the last node of the graph.
int final_output_index = graph_node_index_by_name(gf, "ggml_runner_final_result_tensor");
if (final_output_index < 0) {
final_output_index = n_nodes - 1;
}
ggml_tensor* final_output = final_output_index >= 0 ? ggml_graph_node(gf, final_output_index) : nullptr;
if (final_output != nullptr && available_cut_output_node_indices.find(final_output_index) == available_cut_output_node_indices.end()) {
Segment final_segment;
final_segment.group_name = "ggml_runner.final";
final_segment.output_node_indices.push_back(final_output_index);
build_segment(gf,
plan,
final_segment,
producer_index,
available_cut_output_node_indices,
backend,
params_tensor_set,
log_desc);
}
return plan;
}
// Greedily merges consecutive segments of `base_plan` so that fewer segments are needed
// while each merged segment's footprint (see graph_cut_segment_vram_bytes) stays within
// `max_graph_vram_bytes`.
//
// For every start segment the function first measures that segment alone and then extends the
// range one base segment at a time until a candidate exceeds the budget or the plan ends. A
// single segment that is already over budget is kept unmerged.
//
// Returns `base_plan` unchanged when the budget is 0, the plan has no cuts, or it has at most
// one segment. Otherwise returns the merged plan, which carries over the graph signature
// (node/leaf counts and input shapes) of `base_plan`.
Plan apply_max_vram_budget(ggml_cgraph* gf,
                           const Plan& base_plan,
                           size_t max_graph_vram_bytes,
                           ggml_backend_t backend,
                           const std::unordered_set<const ggml_tensor*>& params_tensor_set,
                           const char* log_desc) {
    GGML_ASSERT(backend != nullptr);
    GGML_ASSERT(gf != nullptr);
    const int64_t t_budget_begin = ggml_time_ms();
    if (max_graph_vram_bytes == 0 || !base_plan.has_cuts || base_plan.segments.size() <= 1) {
        return base_plan;
    }

    const int n_nodes = ggml_graph_n_nodes(gf);
    std::unordered_map<const ggml_tensor*, int> producer_index;
    producer_index.reserve(static_cast<size_t>(n_nodes));
    for (int i = 0; i < n_nodes; ++i) {
        producer_index[ggml_graph_node(gf, i)] = i;
    }

    Plan merged_plan;
    merged_plan.available = true;
    merged_plan.has_cuts  = base_plan.has_cuts;
    merged_plan.valid     = base_plan.valid;
    merged_plan.n_nodes   = base_plan.n_nodes;
    merged_plan.n_leafs   = base_plan.n_leafs;
    // Keep the recorded input shapes so plan_matches_graph() checks the merged plan against
    // the same graph signature as the base plan instead of only the node/leaf counts.
    merged_plan.input_shapes = base_plan.input_shapes;

    std::unordered_set<int> available_cut_output_node_indices;
    available_cut_output_node_indices.reserve(static_cast<size_t>(n_nodes));

    size_t start_segment_index = 0;
    while (start_segment_index < base_plan.segments.size()) {
        // Measure the start segment on its own. Trial builds work on copies of the
        // "available outputs" set so they do not affect the real bookkeeping.
        Plan single_plan;
        auto single_available_cut_output_node_indices = available_cut_output_node_indices;
        auto single_seed                              = make_segment_seed(base_plan,
                                                                          start_segment_index,
                                                                          start_segment_index);
        build_segment(gf,
                      single_plan,
                      single_seed,
                      producer_index,
                      single_available_cut_output_node_indices,
                      backend,
                      params_tensor_set,
                      log_desc);
        GGML_ASSERT(!single_plan.segments.empty());

        size_t best_end_segment_index = start_segment_index;
        // A segment that is over budget by itself cannot gain anything from merging.
        const bool can_merge_next_segment =
            graph_cut_segment_vram_bytes(single_plan.segments.back()) <= max_graph_vram_bytes;
        while (can_merge_next_segment && best_end_segment_index + 1 < base_plan.segments.size()) {
            const size_t next_end_segment_index = best_end_segment_index + 1;
            Plan candidate_plan;
            auto candidate_available_cut_output_node_indices = available_cut_output_node_indices;
            auto candidate_seed                              = make_segment_seed(base_plan,
                                                                                 start_segment_index,
                                                                                 next_end_segment_index);
            build_segment(gf,
                          candidate_plan,
                          candidate_seed,
                          producer_index,
                          candidate_available_cut_output_node_indices,
                          backend,
                          params_tensor_set,
                          log_desc);
            GGML_ASSERT(!candidate_plan.segments.empty());
            const auto& candidate_segment = candidate_plan.segments.back();
            if (graph_cut_segment_vram_bytes(candidate_segment) > max_graph_vram_bytes) {
                break;
            }
            best_end_segment_index = next_end_segment_index;
        }

        // Commit the widest range that fit; this updates the real bookkeeping set.
        auto best_seed = make_segment_seed(base_plan,
                                           start_segment_index,
                                           best_end_segment_index);
        build_segment(gf,
                      merged_plan,
                      best_seed,
                      producer_index,
                      available_cut_output_node_indices,
                      backend,
                      params_tensor_set,
                      log_desc);
        start_segment_index = best_end_segment_index + 1;
    }

    if (log_desc != nullptr && merged_plan.segments.size() != base_plan.segments.size()) {
        LOG_INFO("%s graph cut max_vram=%.2f MB merged %zu segments -> %zu segments",
                 log_desc,
                 max_graph_vram_bytes / 1024.0 / 1024.0,
                 base_plan.segments.size(),
                 merged_plan.segments.size());
    }
    if (log_desc != nullptr) {
        // int64_t is not `long long` on every platform; cast to match the %lld specifier.
        LOG_DEBUG("%s graph cut max_vram budget merge took %lld ms",
                  log_desc,
                  static_cast<long long>(ggml_time_ms() - t_budget_begin));
    }
    return merged_plan;
}
// Returns the plan to execute `gf` with, reusing cached plans where possible.
//
// The base plan is taken from `cache` when it is available and still matches the graph;
// otherwise it is rebuilt, stored, and the cached budgeted plan is invalidated. When a budget
// is given and the base plan has cuts, the budgeted (merged) plan is likewise taken from the
// cache if it was built for the same budget and still matches the graph, and is otherwise
// recomputed and cached. Without a budget (or without cuts) the base plan is returned.
Plan resolve_plan(ggml_backend_t backend,
                  ggml_cgraph* gf,
                  PlanCache* cache,
                  size_t max_graph_vram_bytes,
                  const std::unordered_set<const ggml_tensor*>& params_tensor_set,
                  const char* log_desc) {
    GGML_ASSERT(backend != nullptr);
    GGML_ASSERT(gf != nullptr);
    GGML_ASSERT(cache != nullptr);

    Plan base_plan;
    const int64_t t_plan_begin = ggml_time_ms();
    if (cache->graph_cut_plan.available && plan_matches_graph(gf, cache->graph_cut_plan)) {
        base_plan = cache->graph_cut_plan;
    } else {
        base_plan                                = build_plan(backend, gf, params_tensor_set, log_desc);
        cache->graph_cut_plan                    = base_plan;
        cache->graph_cut_plan.available          = true;
        // The budgeted plan was derived from the old base plan and is stale now.
        cache->budgeted_graph_cut_plan.available = false;
        if (log_desc != nullptr) {
            // int64_t is not `long long` on every platform; cast to match the %lld specifier.
            LOG_INFO("%s build cached graph cut plan done (taking %lld ms)",
                     log_desc,
                     static_cast<long long>(ggml_time_ms() - t_plan_begin));
        }
    }

    Plan resolved_plan = base_plan;
    if (max_graph_vram_bytes > 0 && base_plan.has_cuts) {
        if (cache->budgeted_graph_cut_plan.available &&
            cache->budgeted_graph_cut_plan_max_vram_bytes == max_graph_vram_bytes &&
            plan_matches_graph(gf, cache->budgeted_graph_cut_plan)) {
            resolved_plan = cache->budgeted_graph_cut_plan;
        } else {
            resolved_plan = apply_max_vram_budget(gf,
                                                  base_plan,
                                                  max_graph_vram_bytes,
                                                  backend,
                                                  params_tensor_set,
                                                  log_desc);
            cache->budgeted_graph_cut_plan                = resolved_plan;
            cache->budgeted_graph_cut_plan.available      = true;
            cache->budgeted_graph_cut_plan_max_vram_bytes = max_graph_vram_bytes;
        }
    }
    return resolved_plan;
}
// Marks which segments of `plan` may keep their params resident for the given budget.
//
// All segments are first reset to STREAMED (a cached plan may be reused with a smaller live
// budget than the one it was last annotated with). Nothing else happens when there is no
// budget, fewer than two segments, or no segment with param bytes. Otherwise room for the
// largest single segment footprint plus a fixed 512 MiB safety margin is set aside, and
// segments are marked RESIDENT in plan order for as long as their cumulative param bytes fit
// in what remains; the first segment that does not fit ends the marking.
void annotate_residency(Plan& plan, size_t max_graph_vram_bytes) {
    for (Segment& segment : plan.segments) {
        segment.residency = SegmentResidency::STREAMED;
    }
    if (max_graph_vram_bytes == 0 || plan.segments.size() < 2) {
        return;
    }

    const bool has_params = std::any_of(plan.segments.begin(),
                                        plan.segments.end(),
                                        [](const Segment& segment) { return segment.input_param_bytes > 0; });
    if (!has_params) {
        return;
    }

    // Largest footprint any one segment needs while it is the active streamed segment.
    size_t peak_footprint = 0;
    for (const Segment& segment : plan.segments) {
        size_t footprint = segment.input_param_bytes;
        footprint += segment.compute_buffer_size;
        footprint += segment.output_bytes;
        footprint += segment.input_previous_cut_bytes;
        footprint += segment.input_external_bytes;
        peak_footprint = std::max(peak_footprint, footprint);
    }

    constexpr size_t safety = 512ull * 1024 * 1024;
    const size_t reserved   = safety + peak_footprint;
    if (max_graph_vram_bytes <= reserved) {
        return;
    }

    const size_t available = max_graph_vram_bytes - reserved;
    size_t pinned_bytes    = 0;
    for (Segment& segment : plan.segments) {
        const size_t with_segment = pinned_bytes + segment.input_param_bytes;
        if (with_segment > available) {
            break;
        }
        segment.residency = SegmentResidency::RESIDENT;
        pinned_bytes      = with_segment;
    }
}
} // namespace sd::ggml_graph_cut

117
src/core/ggml_graph_cut.h Normal file
View File

@ -0,0 +1,117 @@
#ifndef __SD_CORE_GGML_GRAPH_CUT_H__
#define __SD_CORE_GGML_GRAPH_CUT_H__
#include <array>
#include <cstdint>
#include <string>
#include <unordered_set>
#include <vector>
#include "ggml-backend.h"
#include "ggml.h"
namespace sd::ggml_graph_cut {
// Streaming residency for a segment's params.
// annotate_residency() resets every segment to STREAMED and then marks as RESIDENT the
// segments whose cumulative param bytes fit into the remaining budget.
enum class SegmentResidency : uint8_t {
STREAMED = 0,
RESIDENT = 1,
};
// One executable slice of a compute graph: the nodes it runs, the tensors it consumes and
// produces, and the sizes used for budgeting.
struct Segment {
// Where an input of the segment comes from.
enum InputType {
INPUT_EXTERNAL = 0,
INPUT_PREVIOUS_CUT,
INPUT_PARAM,
};
// Reference to one input tensor. Inputs from a previous cut are addressed by `node_index`,
// all other inputs by `leaf_index`; an index of -1 means "not set / not found".
struct InputRef {
InputType type = INPUT_EXTERNAL;
std::string display_name;
int leaf_index = -1;
int node_index = -1;
};
// Compute buffer size measured for the segment's graph.
size_t compute_buffer_size = 0;
// Sum of the cache sizes of the segment's outputs.
size_t output_bytes = 0;
// Byte totals of the inputs, split by InputType.
size_t input_external_bytes = 0;
size_t input_previous_cut_bytes = 0;
size_t input_param_bytes = 0;
// Cut group name; merged segments are named "<first>..<last>".
std::string group_name;
// Graph node indices computed inside this segment.
std::vector<int> internal_node_indices;
// Graph node indices of the tensors this segment outputs.
std::vector<int> output_node_indices;
std::vector<InputRef> input_refs;
SegmentResidency residency = SegmentResidency::STREAMED;
};
// A graph-cut plan: the ordered list of segments for one graph, plus the
// graph dimensions and input shapes it was recorded with.
// NOTE(review): n_nodes / n_leafs / input_shapes are presumably what
// plan_matches_graph() compares against — confirm in the .cpp.
struct Plan {
// Type and dimensions recorded for one graph leaf.
struct InputShape {
int leaf_index = -1;                  // -1 means "not set"
ggml_type type = GGML_TYPE_COUNT;     // GGML_TYPE_COUNT serves as the "unset" default
std::array<int64_t, GGML_MAX_DIMS> ne = {0, 0, 0, 0};
};
// Status flags; note that `valid` defaults to true while the others default to false.
bool available = false;
bool has_cuts = false;
bool valid = true;
int n_nodes = 0;
int n_leafs = 0;
std::vector<InputShape> input_shapes;
std::vector<Segment> segments;
};
// Cache holding two plans, passed by pointer to resolve_plan().
// NOTE(review): presumably graph_cut_plan is the unbudgeted base plan and
// budgeted_graph_cut_plan the result of apply_max_vram_budget() for the budget
// stored in budgeted_graph_cut_plan_max_vram_bytes — confirm in the .cpp.
struct PlanCache {
Plan graph_cut_plan;
Plan budgeted_graph_cut_plan;
size_t budgeted_graph_cut_plan_max_vram_bytes = 0;
};
// Name prefix for graph-cut tensors.
// NOTE(review): presumably what make_graph_cut_name() prepends and
// is_graph_cut_tensor() tests for — confirm in the .cpp.
static constexpr const char* GGML_RUNNER_CUT_PREFIX = "ggml_runner_cut:";

// --- Cut marking -----------------------------------------------------------
bool is_graph_cut_tensor(const ggml_tensor* tensor);
std::string make_graph_cut_name(const std::string& group, const std::string& output);
void mark_graph_cut(ggml_tensor* tensor, const std::string& group, const std::string& output);

// --- Graph / tensor accessors ----------------------------------------------
int leaf_count(ggml_cgraph* gf);
ggml_tensor* leaf_tensor(ggml_cgraph* gf, int leaf_index);
ggml_backend_buffer_t tensor_buffer(const ggml_tensor* tensor);
ggml_tensor* cache_source_tensor(ggml_tensor* tensor);
size_t cache_tensor_bytes(const ggml_tensor* tensor);

// --- Resolving plan entries against a concrete graph -------------------------
bool plan_matches_graph(ggml_cgraph* gf, const Plan& plan);
ggml_tensor* output_tensor(ggml_cgraph* gf, const Segment& segment, size_t output_index);
ggml_tensor* input_tensor(ggml_cgraph* gf, const Segment::InputRef& input_ref);
std::vector<ggml_tensor*> param_tensors(ggml_cgraph* gf, const Segment& segment);
std::vector<ggml_tensor*> runtime_param_tensors(ggml_cgraph* gf, const Segment& segment, const char* log_desc);
std::unordered_set<std::string> collect_future_input_names(ggml_cgraph* gf,
const Plan& plan,
size_t current_segment_index);

// --- Per-segment graph construction and measurement --------------------------
// NOTE(review): ownership of *graph_ctx_out is not visible from this header;
// presumably the caller frees it — confirm.
ggml_cgraph* build_segment_graph(ggml_cgraph* gf,
const Segment& segment,
ggml_context** graph_ctx_out);
size_t measure_segment_compute_buffer(ggml_backend_t backend,
ggml_cgraph* gf,
const Segment& segment,
const char* log_desc);

// --- VRAM budget helpers -----------------------------------------------------
// max_vram is expressed in GiB (per the function names).
size_t max_vram_gib_to_bytes(float max_vram);
float resolve_max_vram_gib(float max_vram, ggml_backend_t backend);

// --- Plan construction -------------------------------------------------------
Plan build_plan(ggml_backend_t backend,
ggml_cgraph* gf,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc);
Plan apply_max_vram_budget(ggml_cgraph* gf,
const Plan& base_plan,
size_t max_graph_vram_bytes,
ggml_backend_t backend,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc);
Plan resolve_plan(ggml_backend_t backend,
ggml_cgraph* gf,
PlanCache* cache,
size_t max_graph_vram_bytes,
const std::unordered_set<const ggml_tensor*>& params_tensor_set,
const char* log_desc);
// Mark leading segments resident when they fit after streamed-segment headroom.
// The plan is modified in place. In the implementation the headroom is a fixed
// 512 MiB safety margin plus the largest single-segment footprint; nothing is
// changed when no segment has param bytes or when the budget does not exceed
// that headroom.
void annotate_residency(Plan& plan, size_t max_graph_vram_bytes);
} // namespace sd::ggml_graph_cut
#endif // __SD_CORE_GGML_GRAPH_CUT_H__

132
src/core/layer_registry.cpp Normal file
View File

@ -0,0 +1,132 @@
#include "core/layer_registry.h"
#include <utility>
#include "core/util.h"
namespace sd::layer_registry {
void LayerRegistry::register_layer(const std::string& name, ggml_tensor* tensor) {
auto& info = layers_[name];
info.tensors.push_back(tensor);
info.bytes += ggml_nbytes(tensor);
}
bool LayerRegistry::move_layer_to_gpu(const std::string& name) {
auto it = layers_.find(name);
if (it == layers_.end())
return false;
LayerInfo& info = it->second;
if (info.on_gpu)
return true;
if (gpu_backend_ == nullptr || cpu_backend_ == nullptr) {
LOG_ERROR("layer_registry: backends not set; cannot move '%s' to GPU",
name.c_str());
return false;
}
if (info.tensors.empty()) {
info.on_gpu = true;
return true;
}
// 1. Build a no_alloc context big enough to hold one twin tensor per CPU
// tensor, plus a little overhead.
const size_t ctx_size = info.tensors.size() * ggml_tensor_overhead() + 1024;
ggml_init_params ctx_params{ctx_size, /*mem_buffer=*/nullptr, /*no_alloc=*/true};
ggml_context* twin_ctx = ggml_init(ctx_params);
if (twin_ctx == nullptr) {
LOG_ERROR("layer_registry: failed to allocate twin context for '%s'",
name.c_str());
return false;
}
// 2. Create one GPU twin per CPU tensor. The twin shares the original
// name so any name-based lookup keeps working.
std::vector<ggml_tensor*> gpu_twins;
gpu_twins.reserve(info.tensors.size());
for (ggml_tensor* cpu_t : info.tensors) {
ggml_tensor* twin = ggml_dup_tensor(twin_ctx, cpu_t);
if (cpu_t->name[0] != '\0') {
ggml_set_name(twin, cpu_t->name);
}
gpu_twins.push_back(twin);
}
// 3. Back the twins with a GPU buffer in one alloc call.
ggml_backend_buffer_t gpu_buffer = ggml_backend_alloc_ctx_tensors(twin_ctx, gpu_backend_);
if (gpu_buffer == nullptr) {
LOG_ERROR("layer_registry: failed to allocate GPU buffer for '%s'",
name.c_str());
ggml_free(twin_ctx);
return false;
}
// 4. H2D copy + sync.
for (size_t i = 0; i < info.tensors.size(); ++i) {
ggml_backend_tensor_copy(info.tensors[i], gpu_twins[i]);
}
ggml_backend_synchronize(gpu_backend_);
// 5. Swap buffer/data/extra so the originals now point at GPU memory.
for (size_t i = 0; i < info.tensors.size(); ++i) {
std::swap(info.tensors[i]->buffer, gpu_twins[i]->buffer);
std::swap(info.tensors[i]->data, gpu_twins[i]->data);
std::swap(info.tensors[i]->extra, gpu_twins[i]->extra);
}
info.gpu_twins = std::move(gpu_twins);
info.twin_ctx = twin_ctx;
info.gpu_buffer = gpu_buffer;
info.on_gpu = true;
return true;
}
bool LayerRegistry::move_layer_to_cpu(const std::string& name) {
auto it = layers_.find(name);
if (it == layers_.end())
return false;
LayerInfo& info = it->second;
if (!info.on_gpu)
return true;
if (info.tensors.size() != info.gpu_twins.size()) {
LOG_ERROR("layer_registry: twin/tensor count mismatch for '%s'",
name.c_str());
return false;
}
// 1. Swap back: originals point at CPU memory again.
for (size_t i = 0; i < info.tensors.size(); ++i) {
if (info.gpu_twins[i] == nullptr)
continue;
std::swap(info.tensors[i]->buffer, info.gpu_twins[i]->buffer);
std::swap(info.tensors[i]->data, info.gpu_twins[i]->data);
std::swap(info.tensors[i]->extra, info.gpu_twins[i]->extra);
}
// 2. Free the GPU buffer + twin context.
if (info.gpu_buffer != nullptr) {
ggml_backend_buffer_free(info.gpu_buffer);
info.gpu_buffer = nullptr;
}
if (info.twin_ctx != nullptr) {
ggml_free(info.twin_ctx);
info.twin_ctx = nullptr;
}
info.gpu_twins.clear();
info.on_gpu = false;
return true;
}
// Returns true only when layer `name` is registered and currently flagged as
// being on the GPU; unknown layers report false.
bool LayerRegistry::is_layer_on_gpu(const std::string& name) const {
    const auto entry = layers_.find(name);
    if (entry == layers_.end()) {
        return false;
    }
    return entry->second.on_gpu;
}
// Returns the accumulated byte size of layer `name`, or 0 when no such layer
// has been registered.
size_t LayerRegistry::get_layer_size(const std::string& name) const {
    const auto entry = layers_.find(name);
    if (entry == layers_.end()) {
        return 0;
    }
    return entry->second.bytes;
}
} // namespace sd::layer_registry

50
src/core/layer_registry.h Normal file
View File

@ -0,0 +1,50 @@
#ifndef __SD_CORE_LAYER_REGISTRY_H__
#define __SD_CORE_LAYER_REGISTRY_H__
#include <map>
#include <set>
#include <string>
#include <vector>
#include "ggml-backend.h"
#include "ggml.h"
namespace sd::layer_registry {
// Bookkeeping for one named layer tracked by LayerRegistry.
struct LayerInfo {
// Tensors registered for this layer, in registration order.
std::vector<ggml_tensor*> tensors;
// Twin tensors created by move_layer_to_gpu(), index-aligned with `tensors`.
// While the layer is on the GPU they hold the original buffer/data/extra
// pointers that were swapped out of the registered tensors.
std::vector<ggml_tensor*> gpu_twins;
// no_alloc ggml context that owns the twin tensor metadata; null when not on GPU.
ggml_context* twin_ctx = nullptr;
// Backend buffer allocated for the twins; null when not on GPU.
ggml_backend_buffer_t gpu_buffer = nullptr;
// True between a successful move_layer_to_gpu() and move_layer_to_cpu().
bool on_gpu = false;
// Sum of ggml_nbytes() over the registered tensors.
size_t bytes = 0;
};
// Groups tensors into named layers and moves whole layers between CPU and GPU
// memory by swapping the tensors' storage pointers.
//
// NOTE(review): no destructor is declared here, so GPU buffers and twin
// contexts of layers still on the GPU do not appear to be released when the
// registry is destroyed — confirm whether the owner calls move_layer_to_cpu()
// for every layer first.
class LayerRegistry {
public:
// Creates a registry without backends; move_layer_to_gpu() fails until
// set_backends() supplies both.
LayerRegistry() = default;
// The backends are stored as given; the registry does not free them.
LayerRegistry(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend)
: gpu_backend_(gpu_backend), cpu_backend_(cpu_backend) {}
// Replaces both backend handles.
void set_backends(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) {
gpu_backend_ = gpu_backend;
cpu_backend_ = cpu_backend;
}
// Appends `tensor` to layer `name`, creating the layer if needed.
void register_layer(const std::string& name, ggml_tensor* tensor);
// Both return false for unknown layers and true when the layer is already
// in the requested location.
bool move_layer_to_gpu(const std::string& name);
bool move_layer_to_cpu(const std::string& name);
// False for unknown layers.
bool is_layer_on_gpu(const std::string& name) const;
// Byte size of the layer; 0 for unknown layers.
size_t get_layer_size(const std::string& name) const;
// Number of distinct layer names registered.
size_t get_layer_count() const { return layers_.size(); }
// Read-only view of all layers, ordered by name (std::map).
const std::map<std::string, LayerInfo>& layers() const { return layers_; }
private:
ggml_backend_t gpu_backend_ = nullptr;
ggml_backend_t cpu_backend_ = nullptr;
std::map<std::string, LayerInfo> layers_;
};
} // namespace sd::layer_registry
#endif // __SD_CORE_LAYER_REGISTRY_H__

View File

@ -1,5 +1,5 @@
#ifndef __ORDERED_MAP_HPP__ #ifndef __SD_CORE_ORDERED_MAP_HPP__
#define __ORDERED_MAP_HPP__ #define __SD_CORE_ORDERED_MAP_HPP__
#include <iostream> #include <iostream>
#include <list> #include <list>
@ -174,4 +174,4 @@ public:
} }
}; };
#endif // __ORDERED_MAP_HPP__ #endif // __SD_CORE_ORDERED_MAP_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __RNG_H__ #ifndef __SD_CORE_RNG_HPP__
#define __RNG_H__ #define __SD_CORE_RNG_HPP__
#include <random> #include <random>
#include <vector> #include <vector>
@ -32,4 +32,4 @@ public:
} }
}; };
#endif // __RNG_H__ #endif // __SD_CORE_RNG_HPP__

View File

@ -1,10 +1,10 @@
#ifndef __RNG_MT19937_HPP__ #ifndef __SD_CORE_RNG_MT19937_HPP__
#define __RNG_MT19937_HPP__ #define __SD_CORE_RNG_MT19937_HPP__
#include <cmath> #include <cmath>
#include <vector> #include <vector>
#include "rng.hpp" #include "core/rng.hpp"
// RNG imitating torch cpu randn on CPU. // RNG imitating torch cpu randn on CPU.
// Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE // Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE
@ -144,4 +144,4 @@ public:
} }
}; };
#endif // __RNG_MT19937_HPP__ #endif // __SD_CORE_RNG_MT19937_HPP__

View File

@ -1,10 +1,10 @@
#ifndef __RNG_PHILOX_H__ #ifndef __SD_CORE_RNG_PHILOX_HPP__
#define __RNG_PHILOX_H__ #define __SD_CORE_RNG_PHILOX_HPP__
#include <cmath> #include <cmath>
#include <vector> #include <vector>
#include "rng.hpp" #include "core/rng.hpp"
// RNG imitating torch cuda randn on CPU. // RNG imitating torch cuda randn on CPU.
// Port from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/5ef669de080814067961f28357256e8fe27544f4/modules/rng_philox.py // Port from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/5ef669de080814067961f28357256e8fe27544f4/modules/rng_philox.py
@ -122,4 +122,4 @@ public:
} }
}; };
#endif // __RNG_PHILOX_H__ #endif // __SD_CORE_RNG_PHILOX_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __SD_TENSOR_HPP__ #ifndef __SD_CORE_TENSOR_HPP__
#define __SD_TENSOR_HPP__ #define __SD_CORE_TENSOR_HPP__
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
@ -16,7 +16,7 @@
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "rng.hpp" #include "core/rng.hpp"
namespace sd { namespace sd {
@ -235,6 +235,7 @@ namespace sd {
Tensor& masked_fill_(const Tensor<uint8_t>& mask, const T& value); Tensor& masked_fill_(const Tensor<uint8_t>& mask, const T& value);
T sum() const;
T mean() const; T mean() const;
static Tensor zeros(std::vector<int64_t> shape) { static Tensor zeros(std::vector<int64_t> shape) {
@ -327,6 +328,24 @@ namespace sd {
std::vector<int64_t> shape_; std::vector<int64_t> shape_;
}; };
// Adds up every element of the tensor, accumulating in T itself.
// An empty tensor yields a value-initialized T.
template <typename T>
inline T Tensor<T>::sum() const {
    T accumulator = T{};
    for (auto it = data_.begin(); it != data_.end(); ++it) {
        accumulator += *it;
    }
    return accumulator;
}
// float specialization: accumulates in double and narrows once at the end,
// which limits rounding error growth compared to summing in float.
template <>
inline float Tensor<float>::sum() const {
    double accumulator = 0.0;
    for (auto it = data_.begin(); it != data_.end(); ++it) {
        accumulator += static_cast<double>(*it);
    }
    return static_cast<float>(accumulator);
}
template <typename T> template <typename T>
inline T Tensor<T>::mean() const { inline T Tensor<T>::mean() const {
if (empty()) { if (empty()) {
@ -815,11 +834,202 @@ namespace sd {
namespace ops { namespace ops {
enum class InterpolateMode { enum class InterpolateMode {
Nearest, Nearest,
NearestExact,
NearestMax, NearestMax,
NearestMin, NearestMin,
NearestAvg, NearestAvg,
Bilinear,
Bicubic,
Lanczos,
}; };
// True for the nearest-neighbour family of modes (Nearest, NearestExact,
// NearestMax, NearestMin, NearestAvg); false for every other value.
inline bool is_nearest_like_interpolate_mode(InterpolateMode mode) {
    switch (mode) {
        case InterpolateMode::Nearest:
        case InterpolateMode::NearestExact:
        case InterpolateMode::NearestMax:
        case InterpolateMode::NearestMin:
        case InterpolateMode::NearestAvg:
            return true;
        default:
            return false;
    }
}
// True for the filter-based modes (Bilinear, Bicubic, Lanczos); false for
// every other value.
inline bool is_2d_filter_interpolate_mode(InterpolateMode mode) {
    switch (mode) {
        case InterpolateMode::Bilinear:
        case InterpolateMode::Bicubic:
        case InterpolateMode::Lanczos:
            return true;
        default:
            return false;
    }
}
// Maps an output index to the nearest input index using pixel-centre
// alignment: the output sample centre (output_index + 0.5) is scaled by
// input_size / output_size, shifted back by 0.5, rounded half-up, and clamped
// to [0, input_size - 1].
// Callers are expected to pass positive sizes; output_size == 0 divides by zero.
inline int64_t nearest_exact_interpolate_index(int64_t output_index,
                                               int64_t input_size,
                                               int64_t output_size) {
    const double ratio         = static_cast<double>(input_size) / static_cast<double>(output_size);
    const double source_center = (static_cast<double>(output_index) + 0.5) * ratio - 0.5;

    int64_t nearest = static_cast<int64_t>(std::floor(source_center + 0.5));
    if (nearest < 0) {
        nearest = 0;
    }
    const int64_t last_index = input_size - 1;
    return last_index < nearest ? last_index : nearest;
}
// Triangle (tent) kernel: 1 - |x| inside (-1, 1), 0 elsewhere.
inline double linear_interpolate_weight(double x) {
    const double distance = std::abs(x);
    // Written as a negated "<" so that NaN still maps to 0, as before.
    if (!(distance < 1.0)) {
        return 0.0;
    }
    return 1.0 - distance;
}
// Cubic convolution kernel with a = -0.75 (the value chosen to match PyTorch
// bicubic interpolation). Non-zero only for |x| < 2.
inline double cubic_interpolate_weight(double x) {
    constexpr double a = -0.75;  // Match PyTorch bicubic interpolation.
    const double t     = std::abs(x);

    // Inner lobe: |x| <= 1.
    if (t <= 1.0) {
        return ((a + 2.0) * t - (a + 3.0)) * t * t + 1.0;
    }
    // Outer lobe: 1 < |x| < 2; everything beyond contributes nothing.
    return t < 2.0 ? ((a * t - 5.0 * a) * t + 8.0 * a) * t - 4.0 * a
                   : 0.0;
}
// Normalized sinc: sin(pi * x) / (pi * x), with the removable singularity at
// x = 0 handled by returning 1 for |x| < 1e-12.
inline double sinc(double x) {
    constexpr double pi = 3.14159265358979323846;
    const double angle  = pi * x;
    return std::abs(x) < 1e-12 ? 1.0 : std::sin(angle) / angle;
}
// Lanczos kernel with radius 3: sinc(x) * sinc(x / 3) for |x| < 3, else 0.
inline double lanczos_interpolate_weight(double x) {
    constexpr double radius = 3.0;
    const double distance   = std::abs(x);
    return distance >= radius ? 0.0
                              : sinc(distance) * sinc(distance / radius);
}
// One tap of a 1-D resampling filter: which input sample to read and how much
// it contributes to the output sample.
struct InterpolateContributor {
// Input index along the axis; make_interpolate_contributors() clamps it to [0, input_size - 1].
int64_t index;
// Filter weight for that input sample.
double weight;
};
inline std::vector<std::vector<InterpolateContributor>> make_interpolate_contributors(
int64_t input_size,
int64_t output_size,
InterpolateMode mode,
bool antialias) {
std::vector<std::vector<InterpolateContributor>> contributors(static_cast<size_t>(output_size));
const double scale = static_cast<double>(input_size) / static_cast<double>(output_size);
const double filter_scale = antialias ? std::max(1.0, scale) : 1.0;
for (int64_t out = 0; out < output_size; ++out) {
const double center = (static_cast<double>(out) + 0.5) * scale - 0.5;
int64_t start = 0;
int64_t end = 0;
if (mode == InterpolateMode::Bilinear) {
const double support = filter_scale;
start = static_cast<int64_t>(std::ceil(center - support));
end = static_cast<int64_t>(std::floor(center + support));
} else if (mode == InterpolateMode::Bicubic) {
const double support = 2.0 * filter_scale;
start = static_cast<int64_t>(std::ceil(center - support));
end = static_cast<int64_t>(std::floor(center + support));
} else if (mode == InterpolateMode::Lanczos) {
const double support = 3.0 * filter_scale;
start = static_cast<int64_t>(std::ceil(center - support));
end = static_cast<int64_t>(std::floor(center + support));
} else {
tensor_throw_invalid_argument("Unsupported 2D filter interpolate mode: mode=" +
std::to_string(static_cast<int>(mode)));
}
double weight_sum = 0.0;
std::vector<InterpolateContributor>& axis_contributors = contributors[static_cast<size_t>(out)];
axis_contributors.reserve(static_cast<size_t>(end - start + 1));
for (int64_t in = start; in <= end; ++in) {
double weight = 0.0;
if (mode == InterpolateMode::Bilinear) {
weight = linear_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
} else if (mode == InterpolateMode::Bicubic) {
weight = cubic_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
} else {
weight = lanczos_interpolate_weight((center - static_cast<double>(in)) / filter_scale);
}
if (weight == 0.0) {
continue;
}
const int64_t clamped_index = std::min(std::max<int64_t>(in, 0), input_size - 1);
axis_contributors.push_back({clamped_index, weight});
weight_sum += weight;
}
if ((antialias || mode == InterpolateMode::Lanczos) &&
std::abs(weight_sum) > 1e-12) {
for (auto& contributor : axis_contributors) {
contributor.weight /= weight_sum;
}
}
if (axis_contributors.empty()) {
const int64_t nearest = std::min(
std::max<int64_t>(static_cast<int64_t>(std::floor(center + 0.5)), 0),
input_size - 1);
axis_contributors.push_back({nearest, 1.0});
}
}
return contributors;
}
// Resizes dimensions 0 (width) and 1 (height) of `input` with a separable
// filter (Bilinear, Bicubic or Lanczos); all further dimensions must be
// unchanged and are treated as a stack of independent width*height planes.
//
// Accumulation is done in double and converted to T once per output element.
// Invalid rank or a change in a higher dimension is reported through
// tensor_throw_invalid_argument.
template <typename T>
inline Tensor<T> interpolate_2d_filter(const Tensor<T>& input,
                                       const std::vector<int64_t>& output_shape,
                                       InterpolateMode mode,
                                       bool antialias) {
    if (input.dim() < 2) {
        tensor_throw_invalid_argument("2D filter interpolate requires rank >= 2: input_shape=" +
                                      tensor_shape_to_string(input.shape()) + ", output_shape=" +
                                      tensor_shape_to_string(output_shape));
    }
    for (size_t axis = 2; axis < output_shape.size(); ++axis) {
        if (input.shape()[axis] != output_shape[axis]) {
            tensor_throw_invalid_argument("2D filter interpolate only supports resizing dimensions 0 and 1: input_shape=" +
                                          tensor_shape_to_string(input.shape()) + ", output_shape=" +
                                          tensor_shape_to_string(output_shape));
        }
    }

    Tensor<T> output(output_shape);

    const int64_t src_w     = input.shape()[0];
    const int64_t src_h     = input.shape()[1];
    const int64_t dst_w     = output_shape[0];
    const int64_t dst_h     = output_shape[1];
    const int64_t src_plane = src_w * src_h;
    const int64_t dst_plane = dst_w * dst_h;
    const int64_t planes    = input.numel() / src_plane;

    // Per-axis tap tables are shared by every plane.
    const auto col_taps = make_interpolate_contributors(src_w, dst_w, mode, antialias);
    const auto row_taps = make_interpolate_contributors(src_h, dst_h, mode, antialias);

    for (int64_t p = 0; p < planes; ++p) {
        const int64_t src_base = p * src_plane;
        const int64_t dst_base = p * dst_plane;
        for (int64_t oy = 0; oy < dst_h; ++oy) {
            const auto& rows = row_taps[static_cast<size_t>(oy)];
            for (int64_t ox = 0; ox < dst_w; ++ox) {
                const auto& cols = col_taps[static_cast<size_t>(ox)];
                double acc       = 0.0;
                for (const auto& row : rows) {
                    const int64_t row_base = src_base + row.index * src_w;
                    for (const auto& col : cols) {
                        acc += static_cast<double>(input.data()[row_base + col.index]) *
                               col.weight * row.weight;
                    }
                }
                output.data()[dst_base + oy * dst_w + ox] = static_cast<T>(acc);
            }
        }
    }
    return output;
}
inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) { inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) {
if (index < 0) { if (index < 0) {
index += dim_size; index += dim_size;
@ -1014,17 +1224,20 @@ namespace sd {
inline Tensor<T> interpolate(const Tensor<T>& input, inline Tensor<T> interpolate(const Tensor<T>& input,
std::vector<int64_t> output_shape, std::vector<int64_t> output_shape,
InterpolateMode mode = InterpolateMode::Nearest, InterpolateMode mode = InterpolateMode::Nearest,
bool align_corners = false) { bool align_corners = false,
const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest || bool antialias = false) {
mode == InterpolateMode::NearestMax || const bool is_nearest_like_mode = is_nearest_like_interpolate_mode(mode);
mode == InterpolateMode::NearestMin || const bool is_2d_filter_mode = is_2d_filter_interpolate_mode(mode);
mode == InterpolateMode::NearestAvg); if (!is_nearest_like_mode && !is_2d_filter_mode) {
if (!is_nearest_like_mode) { tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" + std::to_string(static_cast<int>(mode)));
}
if (antialias && !is_2d_filter_mode) {
tensor_throw_invalid_argument("Tensor interpolate antialias requires a 2D filter mode: mode=" +
std::to_string(static_cast<int>(mode))); std::to_string(static_cast<int>(mode)));
} }
if (align_corners) { if (align_corners) {
tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" + tensor_throw_invalid_argument("align_corners is not supported for tensor interpolate: input_shape=" +
tensor_shape_to_string(input.shape()) + ", output_shape=" + tensor_shape_to_string(input.shape()) + ", output_shape=" +
tensor_shape_to_string(output_shape)); tensor_shape_to_string(output_shape));
} }
@ -1051,6 +1264,10 @@ namespace sd {
} }
} }
if (is_2d_filter_mode) {
return interpolate_2d_filter(input, output_shape, mode, antialias);
}
bool has_downsampling = false; bool has_downsampling = false;
for (int64_t i = 0; i < input.dim(); ++i) { for (int64_t i = 0; i < input.dim(); ++i) {
if (input.shape()[i] > output_shape[i]) { if (input.shape()[i] > output_shape[i]) {
@ -1060,12 +1277,20 @@ namespace sd {
} }
Tensor<T> output(std::move(output_shape)); Tensor<T> output(std::move(output_shape));
if (mode == InterpolateMode::Nearest || !has_downsampling) { if (mode == InterpolateMode::Nearest ||
mode == InterpolateMode::NearestExact ||
!has_downsampling) {
for (int64_t flat = 0; flat < output.numel(); ++flat) { for (int64_t flat = 0; flat < output.numel(); ++flat) {
std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape()); std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape());
std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0); std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0);
for (size_t i = 0; i < static_cast<size_t>(input.dim()); ++i) { for (size_t i = 0; i < static_cast<size_t>(input.dim()); ++i) {
input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i]; if (mode == InterpolateMode::NearestExact) {
input_coord[i] = nearest_exact_interpolate_index(output_coord[i],
input.shape()[i],
output.shape()[i]);
} else {
input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i];
}
} }
output[flat] = input.index(input_coord); output[flat] = input.index(input_coord);
} }
@ -1083,6 +1308,12 @@ namespace sd {
return T(0); return T(0);
case InterpolateMode::Nearest: case InterpolateMode::Nearest:
return T(0); return T(0);
case InterpolateMode::NearestExact:
return T(0);
case InterpolateMode::Bilinear:
case InterpolateMode::Bicubic:
case InterpolateMode::Lanczos:
break;
} }
tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" + tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
@ -1102,6 +1333,12 @@ namespace sd {
break; break;
case InterpolateMode::Nearest: case InterpolateMode::Nearest:
break; break;
case InterpolateMode::NearestExact:
break;
case InterpolateMode::Bilinear:
case InterpolateMode::Bicubic:
case InterpolateMode::Lanczos:
break;
} }
}; };
@ -1157,17 +1394,20 @@ namespace sd {
const std::optional<std::vector<int64_t>>& size, const std::optional<std::vector<int64_t>>& size,
const std::optional<std::vector<double>>& scale_factor, const std::optional<std::vector<double>>& scale_factor,
InterpolateMode mode = InterpolateMode::Nearest, InterpolateMode mode = InterpolateMode::Nearest,
bool align_corners = false) { bool align_corners = false,
const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest || bool antialias = false) {
mode == InterpolateMode::NearestMax || const bool is_nearest_like_mode = is_nearest_like_interpolate_mode(mode);
mode == InterpolateMode::NearestMin || const bool is_2d_filter_mode = is_2d_filter_interpolate_mode(mode);
mode == InterpolateMode::NearestAvg); if (!is_nearest_like_mode && !is_2d_filter_mode) {
if (!is_nearest_like_mode) { tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" + std::to_string(static_cast<int>(mode)));
}
if (antialias && !is_2d_filter_mode) {
tensor_throw_invalid_argument("Tensor interpolate antialias requires a 2D filter mode: mode=" +
std::to_string(static_cast<int>(mode))); std::to_string(static_cast<int>(mode)));
} }
if (align_corners) { if (align_corners) {
tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" + tensor_throw_invalid_argument("align_corners is not supported for tensor interpolate: input_shape=" +
tensor_shape_to_string(input.shape())); tensor_shape_to_string(input.shape()));
} }
if (size.has_value() == scale_factor.has_value()) { if (size.has_value() == scale_factor.has_value()) {
@ -1211,7 +1451,7 @@ namespace sd {
} }
} }
return interpolate(input, std::move(output_shape), mode, align_corners); return interpolate(input, std::move(output_shape), mode, align_corners, antialias);
} }
template <typename T> template <typename T>
@ -1219,12 +1459,14 @@ namespace sd {
const std::optional<std::vector<int64_t>>& size, const std::optional<std::vector<int64_t>>& size,
double scale_factor, double scale_factor,
InterpolateMode mode = InterpolateMode::Nearest, InterpolateMode mode = InterpolateMode::Nearest,
bool align_corners = false) { bool align_corners = false,
bool antialias = false) {
return interpolate(input, return interpolate(input,
size, size,
std::vector<double>(size.has_value() ? size->size() : input.dim(), scale_factor), std::vector<double>(size.has_value() ? size->size() : input.dim(), scale_factor),
mode, mode,
align_corners); align_corners,
antialias);
} }
template <typename T> template <typename T>
@ -1419,4 +1661,4 @@ namespace sd {
} // namespace sd } // namespace sd
#endif #endif // __SD_CORE_TENSOR_HPP__

View File

@ -1,5 +1,5 @@
#ifndef __SD_TENSOR_GGML_HPP__ #ifndef __SD_CORE_TENSOR_GGML_HPP__
#define __SD_TENSOR_GGML_HPP__ #define __SD_CORE_TENSOR_GGML_HPP__
#include <array> #include <array>
#include <cstring> #include <cstring>
@ -8,8 +8,8 @@
#include <string> #include <string>
#include <type_traits> #include <type_traits>
#include "core/tensor.hpp"
#include "ggml.h" #include "ggml.h"
#include "tensor.hpp"
namespace sd { namespace sd {
@ -104,7 +104,7 @@ namespace sd {
throw std::invalid_argument("tensor file type does not match requested sd::Tensor type"); throw std::invalid_argument("tensor file type does not match requested sd::Tensor type");
} }
std::vector<int64_t> shape(4, 1); std::vector<int64_t> shape(n_dims, 1);
for (int i = 0; i < n_dims; ++i) { for (int i = 0; i < n_dims; ++i) {
int32_t dim = 1; int32_t dim = 1;
file.read(reinterpret_cast<char*>(&dim), sizeof(dim)); file.read(reinterpret_cast<char*>(&dim), sizeof(dim));
@ -124,4 +124,4 @@ namespace sd {
} // namespace sd } // namespace sd
#endif #endif // __SD_CORE_TENSOR_GGML_HPP__

View File

@ -1,8 +1,10 @@
#include "util.h" #include "core/util.h"
#include <algorithm> #include <algorithm>
#include <cctype>
#include <cmath> #include <cmath>
#include <codecvt> #include <codecvt>
#include <cstdarg> #include <cstdarg>
#include <exception>
#include <fstream> #include <fstream>
#include <locale> #include <locale>
#include <regex> #include <regex>
@ -11,7 +13,7 @@
#include <thread> #include <thread>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "preprocessing.hpp" #include "runtime/preprocessing.hpp"
#if defined(__APPLE__) && defined(__MACH__) #if defined(__APPLE__) && defined(__MACH__)
#include <sys/sysctl.h> #include <sys/sysctl.h>
@ -23,7 +25,6 @@
#include <unistd.h> #include <unistd.h>
#endif #endif
#include "ggml-cpu.h"
#include "ggml.h" #include "ggml.h"
#include "stable-diffusion.h" #include "stable-diffusion.h"
@ -111,7 +112,7 @@ private:
HANDLE hmapping_; HANDLE hmapping_;
}; };
std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) { std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename, bool writable) {
void* mapped_data = nullptr; void* mapped_data = nullptr;
size_t file_size = 0; size_t file_size = 0;
@ -119,10 +120,10 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
filename.c_str(), filename.c_str(),
GENERIC_READ, GENERIC_READ,
FILE_SHARE_READ, FILE_SHARE_READ,
NULL, nullptr,
OPEN_EXISTING, OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL, FILE_ATTRIBUTE_NORMAL,
NULL); nullptr);
if (file_handle == INVALID_HANDLE_VALUE) { if (file_handle == INVALID_HANDLE_VALUE) {
return nullptr; return nullptr;
@ -136,16 +137,20 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
file_size = static_cast<size_t>(size.QuadPart); file_size = static_cast<size_t>(size.QuadPart);
HANDLE mapping_handle = CreateFileMapping(file_handle, NULL, PAGE_READONLY, 0, 0, NULL); DWORD page_prot = writable ? PAGE_WRITECOPY : PAGE_READONLY;
if (mapping_handle == NULL) { HANDLE mapping_handle = CreateFileMapping(file_handle, nullptr, page_prot, 0, 0, nullptr);
if (mapping_handle == nullptr) {
CloseHandle(file_handle); CloseHandle(file_handle);
return nullptr; return nullptr;
} }
mapped_data = MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, file_size); DWORD view_access = writable ? FILE_MAP_COPY : FILE_MAP_READ;
if (mapped_data == NULL) { mapped_data = MapViewOfFile(mapping_handle, view_access, 0, 0, file_size);
if (mapped_data == nullptr) {
CloseHandle(mapping_handle); CloseHandle(mapping_handle);
CloseHandle(file_handle); CloseHandle(file_handle);
return nullptr; return nullptr;
@ -171,28 +176,85 @@ bool is_directory(const std::string& path) {
return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode)); return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
} }
class MmapWrapperImpl : public MmapWrapper { struct MmapFlags {
public: bool sequential;
MmapWrapperImpl(void* data, size_t size) bool populate;
: MmapWrapper(data, size) {} bool willneed;
bool dontneed;
~MmapWrapperImpl() override {
munmap(data_, size_);
}
}; };
std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) { static MmapFlags get_mmap_flags() {
MmapFlags result = {};
const char* SD_MMAP_FLAGS = std::getenv("SD_MMAP_FLAGS");
if (SD_MMAP_FLAGS && *SD_MMAP_FLAGS) {
std::stringstream ss(SD_MMAP_FLAGS);
std::string token;
while (std::getline(ss, token, ',')) {
std::string ntoken = trim(token);
std::transform(ntoken.begin(), ntoken.end(), ntoken.begin(), ::tolower);
if (ntoken == "sequential") {
result.sequential = true;
} else if (ntoken == "populate") {
result.populate = true;
} else if (ntoken == "willneed") {
result.willneed = true;
} else if (ntoken == "dontneed") {
result.dontneed = true;
}
}
}
return result;
}
class MmapWrapperImpl : public MmapWrapper {
public:
MmapWrapperImpl(void* data, size_t size, int fd)
: MmapWrapper(data, size), fd_(fd) {}
~MmapWrapperImpl() override {
#ifdef __linux__
auto cfg_flags = get_mmap_flags();
// Drop the kernel pagecache pages for this file. madvise(DONTNEED)
// alone only unmaps from the process address space; pagecache
// entries persist (`free` reports them as buff/cache and the OOM
// killer doesn't touch them, but they ARE counted against
// overcommit and can starve other allocations on tight-RAM
// systems). posix_fadvise(POSIX_FADV_DONTNEED) is the documented
// way to evict pagecache for a specific fd's pages.
if (cfg_flags.dontneed) {
madvise(data_, size_, MADV_DONTNEED);
posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
}
#endif
munmap(data_, size_);
close(fd_);
}
private:
int fd_;
};
std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename, bool writable) {
int file_descriptor = open(filename.c_str(), O_RDONLY); int file_descriptor = open(filename.c_str(), O_RDONLY);
if (file_descriptor == -1) { if (file_descriptor == -1) {
return nullptr; return nullptr;
} }
auto cfg_flags = get_mmap_flags();
int mmap_flags = MAP_PRIVATE; int mmap_flags = MAP_PRIVATE;
#ifdef __linux__ #ifdef __linux__
// performance flags used by llama.cpp // Sequential access hint helps the kernel read-ahead efficiently and
// posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL); // also encourages eviction of already-read pages (the kernel keeps
// mmap_flags |= MAP_POPULATE; // a smaller working set when this is set).
if (cfg_flags.sequential) {
posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL);
}
if (cfg_flags.populate) {
mmap_flags |= MAP_POPULATE;
}
#endif #endif
struct stat sb; struct stat sb;
@ -203,20 +265,27 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
size_t file_size = sb.st_size; size_t file_size = sb.st_size;
void* mapped_data = mmap(NULL, file_size, PROT_READ, mmap_flags, file_descriptor, 0); if (file_size == 0) {
close(file_descriptor);
return nullptr;
}
close(file_descriptor); int mmap_prot = PROT_READ | (writable ? PROT_WRITE : 0);
void* mapped_data = mmap(nullptr, file_size, mmap_prot, mmap_flags, file_descriptor, 0);
if (mapped_data == MAP_FAILED) { if (mapped_data == MAP_FAILED) {
close(file_descriptor);
return nullptr; return nullptr;
} }
#ifdef __linux__ #ifdef __linux__
// performance flags used by llama.cpp if (cfg_flags.willneed) {
// posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED); posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED);
}
#endif #endif
return std::make_unique<MmapWrapperImpl>(mapped_data, file_size); return std::make_unique<MmapWrapperImpl>(mapped_data, file_size, file_descriptor);
} }
#endif #endif
@ -337,6 +406,88 @@ std::vector<std::string> split_string(const std::string& str, char delimiter) {
return result; return result;
} }
// Splits `args` into key/value pairs.
//
// Entries are separated by ',' or ';'. Each entry is trimmed; empty entries
// are skipped silently. An entry without '=' is reported with LOG_WARN and
// dropped. Otherwise the text before the first '=' becomes the key and the
// text after it the value, both trimmed.
//
// `context` only labels the warning; a null pointer falls back to
// "key=value arg". A null or empty `args` yields an empty result.
KeyValueArgs parse_key_value_args(const char* args, const char* context) {
    KeyValueArgs result;
    if (args == nullptr || *args == '\0') {
        return result;
    }

    const std::string input(args);
    size_t token_begin = 0;
    while (token_begin <= input.size()) {
        // Locate the end of the current entry: next separator or end of input.
        size_t token_end = input.find_first_of(",;", token_begin);
        if (token_end == std::string::npos) {
            token_end = input.size();
        }

        const std::string entry = trim(input.substr(token_begin, token_end - token_begin));
        token_begin             = token_end + 1;

        if (entry.empty()) {
            continue;
        }

        const size_t separator = entry.find('=');
        if (separator == std::string::npos) {
            const char* log_context = context ? context : "key=value arg";
            LOG_WARN("ignoring malformed %s '%s'", log_context, entry.c_str());
            continue;
        }

        result.emplace_back(trim(entry.substr(0, separator)), trim(entry.substr(separator + 1)));
    }
    return result;
}
// std::string convenience overload: forwards the string's C representation to
// the `const char*` overload, so anything after an embedded NUL is not seen.
KeyValueArgs parse_key_value_args(const std::string& args, const char* context) {
    const char* raw_args = args.c_str();
    return parse_key_value_args(raw_args, context);
}
// Parses `text` as a float and rejects trailing garbage.
//
// std::stof does the conversion; whatever it leaves unconsumed must be empty
// after trim(). On success the result is stored in `value` and true is
// returned. On any failure (conversion error, out of range, leftover
// characters) `value` is left untouched and false is returned.
bool parse_strict_float(const std::string& text, float& value) {
    try {
        size_t used                     = 0;
        const float result              = std::stof(text, &used);
        const bool only_whitespace_left = trim(text.substr(used)).empty();
        if (only_whitespace_left) {
            value = result;
        }
        return only_whitespace_left;
    } catch (const std::exception&) {
        // std::invalid_argument / std::out_of_range from std::stof land here.
        return false;
    }
}
// Parses `text` as a base-10 int and rejects trailing garbage.
//
// std::stoi does the conversion; whatever it leaves unconsumed must be empty
// after trim(). On success the result is stored in `value` and true is
// returned. On any failure (conversion error, out of range, leftover
// characters) `value` is left untouched and false is returned.
bool parse_strict_int(const std::string& text, int& value) {
    try {
        size_t used                     = 0;
        const int result                = std::stoi(text, &used);
        const bool only_whitespace_left = trim(text.substr(used)).empty();
        if (only_whitespace_left) {
            value = result;
        }
        return only_whitespace_left;
    } catch (const std::exception&) {
        // std::invalid_argument / std::out_of_range from std::stoi land here.
        return false;
    }
}
// Parses a boolean spelled as 1/true/yes/on or 0/false/no/off.
//
// The input is trimmed and lower-cased before comparison, so matching is
// case-insensitive. Returns true and writes `value` on a match; returns false
// and leaves `value` untouched for anything else.
bool parse_strict_bool(const std::string& text, bool& value) {
    std::string normalized = trim(text);
    for (char& ch : normalized) {
        // Go through unsigned char: std::tolower is undefined for negative values.
        ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
    }

    for (const char* truthy : {"1", "true", "yes", "on"}) {
        if (normalized == truthy) {
            value = true;
            return true;
        }
    }
    for (const char* falsy : {"0", "false", "no", "off"}) {
        if (normalized == falsy) {
            value = false;
            return true;
        }
    }
    return false;
}
static std::string build_progress_bar(int step, int steps) { static std::string build_progress_bar(int step, int steps) {
std::string progress = " |"; std::string progress = " |";
int max_progress = 50; int max_progress = 50;
@ -495,26 +646,6 @@ sd_progress_cb_t sd_get_progress_callback() {
void* sd_get_progress_callback_data() { void* sd_get_progress_callback_data() {
return sd_progress_cb_data; return sd_progress_cb_data;
} }
// Builds a one-line summary of the CPU feature flags reported by the
// ggml_cpu_has_*() probes and returns it as a C string.
//
// The text lives in a function-local static buffer of 1024 bytes: the pointer
// stays valid after the call, but every call overwrites the same storage.
const char* sd_get_system_info() {
    static char buffer[1024];
    std::stringstream report;

    // Every entry has the same shape: label, flag value, " | " separator.
    const auto append_flag = [&report](const char* label, int supported) {
        report << label << supported << " | ";
    };

    report << "System Info: \n";
    append_flag(" SSE3 = ", ggml_cpu_has_sse3());
    append_flag(" AVX = ", ggml_cpu_has_avx());
    append_flag(" AVX2 = ", ggml_cpu_has_avx2());
    append_flag(" AVX512 = ", ggml_cpu_has_avx512());
    append_flag(" AVX512_VBMI = ", ggml_cpu_has_avx512_vbmi());
    append_flag(" AVX512_VNNI = ", ggml_cpu_has_avx512_vnni());
    append_flag(" FMA = ", ggml_cpu_has_fma());
    append_flag(" NEON = ", ggml_cpu_has_neon());
    append_flag(" ARM_FMA = ", ggml_cpu_has_arm_fma());
    append_flag(" F16C = ", ggml_cpu_has_f16c());
    append_flag(" FP16_VA = ", ggml_cpu_has_fp16_va());
    append_flag(" WASM_SIMD = ", ggml_cpu_has_wasm_simd());
    append_flag(" VSX = ", ggml_cpu_has_vsx());

    // snprintf truncates (and NUL-terminates) if the report ever outgrows the buffer.
    snprintf(buffer, sizeof(buffer), "%s", report.str().c_str());
    return buffer;
}
sd_image_t tensor_to_sd_image(const sd::Tensor<float>& tensor, int frame_index) { sd_image_t tensor_to_sd_image(const sd::Tensor<float>& tensor, int frame_index) {
const auto& shape = tensor.shape(); const auto& shape = tensor.shape();
@ -524,17 +655,7 @@ sd_image_t tensor_to_sd_image(const sd::Tensor<float>& tensor, int frame_index)
int channel = static_cast<int>(shape[shape.size() == 5 ? 3 : 2]); int channel = static_cast<int>(shape[shape.size() == 5 ? 3 : 2]);
uint8_t* data = (uint8_t*)malloc(static_cast<size_t>(width * height * channel)); uint8_t* data = (uint8_t*)malloc(static_cast<size_t>(width * height * channel));
GGML_ASSERT(data != nullptr); GGML_ASSERT(data != nullptr);
preprocessing_tensor_frame_to_sd_image(tensor, frame_index, data);
for (int iw = 0; iw < width; ++iw) {
for (int ih = 0; ih < height; ++ih) {
for (int ic = 0; ic < channel; ++ic) {
float value = shape.size() == 5 ? tensor.index(iw, ih, frame_index, ic, 0)
: tensor.index(iw, ih, ic, frame_index);
value = std::clamp(value, 0.0f, 1.0f);
data[(ih * width + iw) * channel + ic] = static_cast<uint8_t>(std::round(value * 255.0f));
}
}
}
return { return {
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(height), static_cast<uint32_t>(height),
@ -718,3 +839,136 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
return res; return res;
} }
// Returns the length in bytes (1-4) of the UTF-8 sequence introduced by the
// lead byte `c`. Bytes that are not a valid lead byte (continuation bytes
// 0x80-0xBF and 0xF8-0xFF) are reported as length 1 so that callers always
// make forward progress.
static size_t get_utf8_char_len(char c) {
    const unsigned char lead = static_cast<unsigned char>(c);
    if (lead < 0x80) {
        return 1;  // 0xxxxxxx: ASCII
    }
    if (lead >= 0xC0 && lead <= 0xDF) {
        return 2;  // 110xxxxx
    }
    if (lead >= 0xE0 && lead <= 0xEF) {
        return 3;  // 1110xxxx
    }
    if (lead >= 0xF0 && lead <= 0xF7) {
        return 4;  // 11110xxx
    }
    return 1;  // stray continuation byte or invalid lead byte
}
// True for the 52 ASCII letters only; locale-independent, unlike std::isalpha.
static bool is_ascii_alpha(char c) {
    const bool is_lower = 'a' <= c && c <= 'z';
    const bool is_upper = 'A' <= c && c <= 'Z';
    return is_lower || is_upper;
}
// Returns true when `needle` occurs in `text` starting exactly at byte offset
// `pos`. Any `pos` beyond the end of `text` yields false.
//
// The bounds check is written as a subtraction so it cannot wrap around: the
// naive `pos + needle.size() <= text.size()` overflows for very large `pos`
// (e.g. std::string::npos), passes the guard and makes compare() throw
// std::out_of_range.
static bool starts_with_at(const std::string& text, size_t pos, const std::string& needle) {
    if (pos > text.size()) {
        return false;
    }
    return needle.size() <= text.size() - pos &&
           text.compare(pos, needle.size(), needle) == 0;
}
// True when the byte at `pos` has an ASCII letter directly before and directly
// after it, i.e. an apostrophe at `pos` would sit inside a word ("don't")
// rather than act as a quotation mark. The byte at `pos` itself is not checked.
static bool is_word_internal_apostrophe(const std::string& text, size_t pos) {
    // Need a neighbour on both sides.
    if (pos == 0 || pos + 1 >= text.size()) {
        return false;
    }
    const char before = text[pos - 1];
    const char after  = text[pos + 1];
    return is_ascii_alpha(before) && is_ascii_alpha(after);
}
// Splits `text` into consecutive segments, flagging the ones that are quoted.
//
// Each returned pair is (segment text, is_quoted). Quoted segments include
// their opening and closing quote characters; concatenating all segments in
// order reproduces `text`. Empty segments are never emitted.
//
// Recognised quote pairs: ASCII '...' and "...", plus the UTF-8 encoded curly
// pairs U+2018/U+2019 and U+201C/U+201D. An ASCII apostrophe between two ASCII
// letters is treated as part of a word, neither opening nor closing a quote.
// An opening quote with no usable closing quote is left as plain text.
static std::vector<std::pair<std::string, bool>> split_quotation(const std::string& text) {
// (opening, closing) byte sequences, tried in this order at every position.
static const std::vector<std::pair<std::string, std::string>> quote_pairs = {
{"'", "'"},
{"\"", "\""},
{"\xE2\x80\x98", "\xE2\x80\x99"},
{"\xE2\x80\x9C", "\xE2\x80\x9D"},
};
std::vector<std::pair<std::string, bool>> result;
// Start of the not-yet-emitted unquoted run.
size_t segment_start = 0;
size_t i = 0;
// Emits text[begin, end) unless the range is empty.
auto push_segment = [&](size_t begin, size_t end, bool matched) {
if (end > begin) {
result.emplace_back(text.substr(begin, end - begin), matched);
}
};
while (i < text.size()) {
bool matched_quote = false;
for (const auto& quote_pair : quote_pairs) {
const std::string& open_quote = quote_pair.first;
const std::string& close_quote = quote_pair.second;
if (!starts_with_at(text, i, open_quote)) {
continue;
}
// An apostrophe inside a word (e.g. "don't") does not open a quotation.
if (open_quote == "'" && is_word_internal_apostrophe(text, i)) {
continue;
}
// Scan forward for the matching closing quote.
size_t search_pos = i + open_quote.size();
size_t close_pos = std::string::npos;
bool invalid = false;
while (search_pos < text.size()) {
// For asymmetric pairs, a second opening quote before the closing one
// invalidates this candidate (no nesting).
if (open_quote != close_quote && starts_with_at(text, search_pos, open_quote)) {
invalid = true;
break;
}
if (starts_with_at(text, search_pos, close_quote)) {
// Skip apostrophes inside words while looking for the closing '.
if (close_quote == "'" && is_word_internal_apostrophe(text, search_pos)) {
search_pos += close_quote.size();
continue;
}
close_pos = search_pos;
break;
}
// Advance by one whole UTF-8 character; fall back to a single byte when
// the sequence is truncated at the end of the text.
size_t char_len = get_utf8_char_len(text[search_pos]);
if (search_pos + char_len > text.size()) {
char_len = 1;
}
search_pos += char_len;
}
// No usable closing quote: try the next quote pair at this position.
if (invalid || close_pos == std::string::npos) {
continue;
}
// Flush the pending unquoted run, then emit the quoted span including
// both quote characters.
size_t quote_start = i;
push_segment(segment_start, quote_start, false);
i = close_pos + close_quote.size();
push_segment(quote_start, i, true);
segment_start = i;
matched_quote = true;
break;
}
if (!matched_quote) {
// Plain character: step over it (whole UTF-8 character when possible).
size_t char_len = get_utf8_char_len(text[i]);
if (i + char_len > text.size()) {
char_len = 1;
}
i += char_len;
}
}
// Emit whatever unquoted text remains after the last quotation.
push_segment(segment_start, text.size(), false);
return result;
}
// Refines weighted prompt chunks so that quoted text is emitted one UTF-8
// character at a time.
//
// Every (text, weight) input is run through split_quotation(). Unquoted
// segments are forwarded unchanged; quoted segments (quote characters
// included) are broken into one output entry per UTF-8 character. All entries
// derived from an input inherit that input's weight.
std::vector<std::pair<std::string, float>> split_quotation_attention(
    const std::vector<std::pair<std::string, float>>& parsed_attention) {
    std::vector<std::pair<std::string, float>> pieces;
    for (const auto& chunk : parsed_attention) {
        const float chunk_weight = chunk.second;
        for (const auto& segment : split_quotation(chunk.first)) {
            const std::string& segment_text = segment.first;
            const bool is_quoted            = segment.second;
            if (!is_quoted) {
                pieces.emplace_back(segment_text, chunk_weight);
                continue;
            }
            for (size_t offset = 0; offset < segment_text.size();) {
                size_t len = get_utf8_char_len(segment_text[offset]);
                // Truncated multi-byte sequence at the end: emit byte by byte.
                if (offset + len > segment_text.size()) {
                    len = 1;
                }
                pieces.emplace_back(segment_text.substr(offset, len), chunk_weight);
                offset += len;
            }
        }
    }
    return pieces;
}

View File

@ -1,13 +1,15 @@
#ifndef __UTIL_H__ #ifndef __SD_CORE_UTIL_H__
#define __UTIL_H__ #define __SD_CORE_UTIL_H__
#include <cstdint> #include <cstdint>
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility>
#include <vector> #include <vector>
#include "core/tensor.hpp"
#include "ggml-backend.h"
#include "stable-diffusion.h" #include "stable-diffusion.h"
#include "tensor.hpp"
#define SAFE_STR(s) ((s) ? (s) : "") #define SAFE_STR(s) ((s) ? (s) : "")
#define BOOL_STR(b) ((b) ? "true" : "false") #define BOOL_STR(b) ((b) ? "true" : "false")
@ -41,7 +43,7 @@ sd::Tensor<float> clip_preprocess(const sd::Tensor<float>& image, int target_wid
class MmapWrapper { class MmapWrapper {
public: public:
static std::unique_ptr<MmapWrapper> create(const std::string& filename); static std::unique_ptr<MmapWrapper> create(const std::string& filename, bool writable = false);
virtual ~MmapWrapper() = default; virtual ~MmapWrapper() = default;
@ -51,6 +53,7 @@ public:
MmapWrapper& operator=(MmapWrapper&&) = delete; MmapWrapper& operator=(MmapWrapper&&) = delete;
const uint8_t* data() const { return static_cast<uint8_t*>(data_); } const uint8_t* data() const { return static_cast<uint8_t*>(data_); }
uint8_t* writable_data() { return static_cast<uint8_t*>(data_); }
size_t size() const { return size_; } size_t size() const { return size_; }
bool copy_data(void* buf, size_t n, size_t offset) const; bool copy_data(void* buf, size_t n, size_t offset) const;
@ -63,6 +66,15 @@ protected:
std::string path_join(const std::string& p1, const std::string& p2); std::string path_join(const std::string& p1, const std::string& p2);
std::vector<std::string> split_string(const std::string& str, char delimiter); std::vector<std::string> split_string(const std::string& str, char delimiter);
using KeyValueArgs = std::vector<std::pair<std::string, std::string>>;
KeyValueArgs parse_key_value_args(const char* args, const char* context = "key=value arg");
KeyValueArgs parse_key_value_args(const std::string& args, const char* context = "key=value arg");
bool parse_strict_float(const std::string& text, float& value);
bool parse_strict_int(const std::string& text, int& value);
bool parse_strict_bool(const std::string& text, bool& value);
void pretty_progress(int step, int steps, float time); void pretty_progress(int step, int steps, float time);
void pretty_bytes_progress(int step, int steps, uint64_t bytes_processed, float elapsed_seconds); void pretty_bytes_progress(int step, int steps, uint64_t bytes_processed, float elapsed_seconds);
@ -71,6 +83,8 @@ void log_printf(sd_log_level_t level, const char* file, int line, const char* fo
std::string trim(const std::string& s); std::string trim(const std::string& s);
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text); std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);
std::vector<std::pair<std::string, float>> split_quotation_attention(
const std::vector<std::pair<std::string, float>>& parsed_attention);
sd_progress_cb_t sd_get_progress_callback(); sd_progress_cb_t sd_get_progress_callback();
void* sd_get_progress_callback_data(); void* sd_get_progress_callback_data();
@ -82,8 +96,11 @@ int sd_get_preview_interval();
bool sd_should_preview_denoised(); bool sd_should_preview_denoised();
bool sd_should_preview_noisy(); bool sd_should_preview_noisy();
// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc.
bool sd_backend_is(ggml_backend_t backend, const std::string& name);
#define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_ERROR(format, ...) log_printf(SD_LOG_ERROR, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_ERROR(format, ...) log_printf(SD_LOG_ERROR, __FILE__, __LINE__, format, ##__VA_ARGS__)
#endif // __UTIL_H__ #endif // __SD_CORE_UTIL_H__

View File

@ -1,519 +0,0 @@
#ifndef __DIFFUSION_MODEL_H__
#define __DIFFUSION_MODEL_H__
#include <optional>
#include "anima.hpp"
#include "flux.hpp"
#include "mmdit.hpp"
#include "qwen_image.hpp"
#include "tensor_ggml.hpp"
#include "unet.hpp"
#include "wan.hpp"
#include "z_image.hpp"
// Bundle of non-owning inputs for DiffusionModel::compute().
//
// All pointers are optional (null means "not provided") and must outlive the
// compute() call. Each model wrapper in this header reads only the subset it
// needs; the per-field notes list the wrappers that consume the field.
struct DiffusionParams {
    // Required by every wrapper (asserted non-null in compute()).
    const sd::Tensor<float>* x{nullptr};
    const sd::Tensor<float>* timesteps{nullptr};

    // Read by every wrapper.
    const sd::Tensor<float>* context{nullptr};

    const sd::Tensor<float>* c_concat{nullptr};    // UNet, Flux, Wan
    const sd::Tensor<float>* y{nullptr};           // UNet, MMDiT, Flux, Wan
    const sd::Tensor<int32_t>* t5_ids{nullptr};    // Anima
    const sd::Tensor<float>* t5_weights{nullptr};  // Anima
    const sd::Tensor<float>* guidance{nullptr};    // Flux

    const std::vector<sd::Tensor<float>>* ref_latents{nullptr};  // Flux, QwenImage, ZImage
    bool increase_ref_index{false};                              // Flux
    int num_video_frames{-1};                                    // UNet

    const std::vector<sd::Tensor<float>>* controls{nullptr};  // UNet
    float control_strength{0.f};                              // UNet

    const sd::Tensor<float>* vace_context{nullptr};  // Wan
    float vace_strength{1.f};                        // Wan

    const std::vector<int>* skip_layers{nullptr};  // MMDiT, Flux
};
// Dereferences an optional tensor pointer, substituting a shared
// default-constructed tensor when the pointer is null. The returned reference
// is either `*tensor` or a function-local static that lives for the rest of
// the program.
template <typename T>
static inline const sd::Tensor<T>& tensor_or_empty(const sd::Tensor<T>* tensor) {
    if (tensor != nullptr) {
        return *tensor;
    }
    static const sd::Tensor<T> kEmpty;
    return kEmpty;
}
// Abstract interface shared by all diffusion backbones in this header. Each
// concrete model wraps a runner object and forwards these calls to it.
struct DiffusionModel {
    // Polymorphic base: a virtual destructor makes destruction through a
    // DiffusionModel pointer well-defined.
    virtual ~DiffusionModel() = default;

    // Human-readable description of the underlying runner.
    virtual std::string get_desc() = 0;

    // Runs the model. `diffusion_params.x` and `.timesteps` are required by
    // every implementation in this header; the other fields are model-specific.
    virtual sd::Tensor<float> compute(int n_threads,
                                      const DiffusionParams& diffusion_params) = 0;

    // Buffer lifecycle, forwarded to the runner.
    virtual void alloc_params_buffer() = 0;
    virtual void free_params_buffer()  = 0;
    virtual void free_compute_buffer() = 0;

    // Adds the model's parameter tensors to `tensors`, keyed by name.
    virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) = 0;
    virtual size_t get_params_buffer_size()                                      = 0;

    // Optional hook; the default implementation ignores the adapter.
    virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}

    virtual int64_t get_adm_in_channels()                             = 0;
    virtual void set_flash_attention_enabled(bool enabled)            = 0;
    virtual void set_circular_axes(bool circular_x, bool circular_y) = 0;
};
struct UNetModel : public DiffusionModel {
UNetModelRunner unet;
UNetModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_SD1)
: unet(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version) {
}
std::string get_desc() override {
return unet.get_desc();
}
void alloc_params_buffer() override {
unet.alloc_params_buffer();
}
void free_params_buffer() override {
unet.free_params_buffer();
}
void free_compute_buffer() override {
unet.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
unet.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() override {
return unet.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
unet.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return unet.unet.adm_in_channels;
}
void set_flash_attention_enabled(bool enabled) {
unet.set_flash_attention_enabled(enabled);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
unet.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
static const std::vector<sd::Tensor<float>> empty_controls;
return unet.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
tensor_or_empty(diffusion_params.c_concat),
tensor_or_empty(diffusion_params.y),
diffusion_params.num_video_frames,
diffusion_params.controls ? *diffusion_params.controls : empty_controls,
diffusion_params.control_strength);
}
};
struct MMDiTModel : public DiffusionModel {
MMDiTRunner mmdit;
MMDiTModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {})
: mmdit(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model") {
}
std::string get_desc() override {
return mmdit.get_desc();
}
void alloc_params_buffer() override {
mmdit.alloc_params_buffer();
}
void free_params_buffer() override {
mmdit.free_params_buffer();
}
void free_compute_buffer() override {
mmdit.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
mmdit.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() override {
return mmdit.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
mmdit.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768 + 1280;
}
void set_flash_attention_enabled(bool enabled) {
mmdit.set_flash_attention_enabled(enabled);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
mmdit.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
static const std::vector<int> empty_skip_layers;
return mmdit.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
tensor_or_empty(diffusion_params.y),
diffusion_params.skip_layers ? *diffusion_params.skip_layers : empty_skip_layers);
}
};
struct FluxModel : public DiffusionModel {
Flux::FluxRunner flux;
FluxModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_FLUX,
bool use_mask = false)
: flux(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version, use_mask) {
}
std::string get_desc() override {
return flux.get_desc();
}
void alloc_params_buffer() override {
flux.alloc_params_buffer();
}
void free_params_buffer() override {
flux.free_params_buffer();
}
void free_compute_buffer() override {
flux.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
flux.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() override {
return flux.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
flux.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attention_enabled(bool enabled) {
flux.set_flash_attention_enabled(enabled);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
flux.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
static const std::vector<sd::Tensor<float>> empty_ref_latents;
static const std::vector<int> empty_skip_layers;
return flux.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
tensor_or_empty(diffusion_params.c_concat),
tensor_or_empty(diffusion_params.y),
tensor_or_empty(diffusion_params.guidance),
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
diffusion_params.increase_ref_index,
diffusion_params.skip_layers ? *diffusion_params.skip_layers : empty_skip_layers);
}
};
struct AnimaModel : public DiffusionModel {
std::string prefix;
Anima::AnimaRunner anima;
AnimaModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
: prefix(prefix), anima(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
}
std::string get_desc() override {
return anima.get_desc();
}
void alloc_params_buffer() override {
anima.alloc_params_buffer();
}
void free_params_buffer() override {
anima.free_params_buffer();
}
void free_compute_buffer() override {
anima.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
anima.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return anima.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
anima.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attention_enabled(bool enabled) {
anima.set_flash_attention_enabled(enabled);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
anima.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
return anima.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
tensor_or_empty(diffusion_params.t5_ids),
tensor_or_empty(diffusion_params.t5_weights));
}
};
struct WanModel : public DiffusionModel {
std::string prefix;
WAN::WanRunner wan;
WanModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_WAN2)
: prefix(prefix), wan(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
}
std::string get_desc() override {
return wan.get_desc();
}
void alloc_params_buffer() override {
wan.alloc_params_buffer();
}
void free_params_buffer() override {
wan.free_params_buffer();
}
void free_compute_buffer() override {
wan.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
wan.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return wan.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
wan.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attention_enabled(bool enabled) {
wan.set_flash_attention_enabled(enabled);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
wan.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
return wan.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
tensor_or_empty(diffusion_params.y),
tensor_or_empty(diffusion_params.c_concat),
sd::Tensor<float>(),
tensor_or_empty(diffusion_params.vace_context),
diffusion_params.vace_strength);
}
};
struct QwenImageModel : public DiffusionModel {
std::string prefix;
Qwen::QwenImageRunner qwen_image;
QwenImageModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_QWEN_IMAGE,
bool zero_cond_t = false)
: prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version, zero_cond_t) {
}
std::string get_desc() override {
return qwen_image.get_desc();
}
void alloc_params_buffer() override {
qwen_image.alloc_params_buffer();
}
void free_params_buffer() override {
qwen_image.free_params_buffer();
}
void free_compute_buffer() override {
qwen_image.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
qwen_image.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return qwen_image.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
qwen_image.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attention_enabled(bool enabled) {
qwen_image.set_flash_attention_enabled(enabled);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
qwen_image.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
static const std::vector<sd::Tensor<float>> empty_ref_latents;
return qwen_image.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
true);
}
};
struct ZImageModel : public DiffusionModel {
std::string prefix;
ZImage::ZImageRunner z_image;
ZImageModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_Z_IMAGE)
: prefix(prefix), z_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
}
std::string get_desc() override {
return z_image.get_desc();
}
void alloc_params_buffer() override {
z_image.alloc_params_buffer();
}
void free_params_buffer() override {
z_image.free_params_buffer();
}
void free_compute_buffer() override {
z_image.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
z_image.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return z_image.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
z_image.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attention_enabled(bool enabled) {
z_image.set_flash_attention_enabled(enabled);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
z_image.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
static const std::vector<sd::Tensor<float>> empty_ref_latents;
return z_image.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
true);
}
};
#endif

View File

@ -0,0 +1,73 @@
#ifndef __SD_EXTENSIONS_GENERATION_EXTENSION_H__
#define __SD_EXTENSIONS_GENERATION_EXTENSION_H__
#include <functional>
#include <map>
#include <memory>
#include <set>
#include <string>
#include "conditioning/conditioner.hpp"
#include "core/ggml_extend_backend.h"
#include "model_loader.h"
#include "stable-diffusion.h"
// Everything a GenerationExtension receives in init().
// Holds references and a raw pointer, so an instance must not outlive the
// objects it was built from.
struct GenerationExtensionInitContext {
// Context creation parameters (the PhotoMaker extension reads photo_maker_path from it).
const sd_ctx_params_t* params;
SDVersion version;
const String2TensorStorage& tensor_storage_map;
ModelLoader& model_loader;
// Thread count forwarded to loaders (e.g. LoraModel::load_from_file).
int n_threads;
// Called before the backends of a module are used; a false result makes the
// PhotoMaker extension abort its init().
std::function<bool(SDBackendModule)> ensure_backend_pair;
// Backend lookups per module; the PhotoMaker extension passes both results
// to its model constructors.
std::function<ggml_backend_t(SDBackendModule)> backend_for;
std::function<ggml_backend_t(SDBackendModule)> params_backend_for;
};
// Argument of GenerationExtension::collect_param_tensors().
// The maps are held by reference, so the extension adds to the caller's maps.
struct GenerationExtensionTensorContext {
// Name -> tensor map for the extension's parameters.
std::map<std::string, ggml_tensor*>& tensors;
// Name -> tensor map for tensors that may be memory-mapped
// (presumably a subset of `tensors` — TODO confirm against the caller).
std::map<std::string, ggml_tensor*>& mmap_able_tensors;
// Query: may the given backend module use mmap?
std::function<bool(SDBackendModule)> module_can_mmap;
};
// Argument of GenerationExtension::prepare_condition().
// Reference members alias caller-owned objects; `condition_params` and
// `tensors` are mutable, so an extension can modify them in place.
struct GenerationExtensionConditionContext {
// Non-owning pointer to the active conditioner.
Conditioner* conditioner;
ConditionerParams& condition_params;
// PhotoMaker parameters supplied with the request.
const sd_pm_params_t& pm_params;
std::map<std::string, ggml_tensor*>& tensors;
SDVersion version;
int n_threads;
int total_steps;
bool free_params_immediately;
};
// Base class for optional generation features (e.g. PhotoMaker).
// Only name() is pure virtual; every other hook has a default that does
// nothing, so an extension overrides just the stages it participates in.
struct GenerationExtension {
virtual ~GenerationExtension() = default;
// Short identifier of the extension.
virtual const char* name() const = 0;
// Whether the extension is active; disabled by default.
virtual bool is_enabled() const {
return false;
}
// One-time setup. The default succeeds without doing anything.
virtual bool init(const GenerationExtensionInitContext&) {
return true;
}
// Hook for adding the extension's parameter tensors to the context maps.
virtual void collect_param_tensors(GenerationExtensionTensorContext&) {}
// Hook for adding tensor names to an ignore set.
virtual void add_ignore_tensors(std::set<std::string>&) const {}
// Allocates the extension's parameter buffer. The default reports success.
virtual bool alloc_params_buffer() {
return true;
}
// Size in bytes of the extension's parameter buffer; 0 by default.
virtual size_t get_params_buffer_size() const {
return 0;
}
// Hook for clearing per-generation state.
virtual void reset_runtime_condition() {}
// Prepares extension-specific conditioning. The default returns false
// (presumably "nothing prepared" — TODO confirm how callers interpret it).
virtual bool prepare_condition(GenerationExtensionConditionContext&) {
return false;
}
// Lets the extension substitute the condition used at `step`.
// The default returns `condition` unchanged. The result is a reference, so
// it must refer to an object that outlives the call.
virtual const SDCondition& before_condition(int step,
const SDCondition& condition) const {
return condition;
}
};
std::shared_ptr<GenerationExtension> create_photomaker_extension();
#endif

View File

@ -0,0 +1,325 @@
#include "extensions/generation_extension.h"
#include <algorithm>
#include <cstring>
#include <tuple>
#include <utility>
#include "core/tensor_ggml.hpp"
#include "core/util.h"
#include "model/adapter/lora.hpp"
#include "model/adapter/pmid.hpp"
// Tokenizes `text` for PhotoMaker and marks where the class token sits.
//
// Steps:
//  1. Tokenize with the CLIP conditioner and strip a leading BOS / trailing EOS.
//  2. Drop every occurrence of `image_token` (the trigger word). When a token
//     precedes the trigger, that token (the "class" token) is repeated so that
//     it occupies `trigger_token_count` consecutive slots.
//  3. Pad with the tokenizer to the text model's `n_token` length.
//  4. Build a mask that is true for the `trigger_token_count` positions
//     starting at `class_idx + 1` in the padded sequence.
//
// Returns (tokens, weights, class_token_mask), all of the same length.
//
// NOTE(review): when the trigger is absent, or is the very first token,
// `class_idx` stays -1 and the mask covers positions [0, trigger_token_count).
// This matches the previous behaviour; confirm callers rule those cases out.
static std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
tokenize_photomaker_trigger(FrozenCLIPEmbedderWithCustomWords& clip_conditioner,
                            const std::string& text,
                            int trigger_token_count,
                            int32_t image_token) {
    auto tokens_and_weights           = clip_conditioner.tokenize(text);
    std::vector<int> source_tokens    = std::move(tokens_and_weights.first);
    std::vector<float> source_weights = std::move(tokens_and_weights.second);

    // Work on the bare prompt tokens; padding below re-frames the sequence.
    if (!source_tokens.empty() && source_tokens.front() == clip_conditioner.tokenizer.BOS_TOKEN_ID) {
        source_tokens.erase(source_tokens.begin());
        source_weights.erase(source_weights.begin());
    }
    if (!source_tokens.empty() && source_tokens.back() == clip_conditioner.tokenizer.EOS_TOKEN_ID) {
        source_tokens.pop_back();
        source_weights.pop_back();
    }

    std::vector<int> tokens;
    std::vector<float> weights;
    int32_t class_idx = -1;
    for (size_t i = 0; i < source_tokens.size(); i++) {
        const int token = source_tokens[i];
        if (token == image_token) {
            if (!tokens.empty()) {
                // The token just before the trigger is the class token: remember
                // its index and repeat it (count - 1) more times.
                class_idx                = static_cast<int32_t>(tokens.size()) - 1;
                const int class_token    = tokens.back();
                const float class_weight = weights.back();
                for (int j = 1; j < trigger_token_count; j++) {
                    tokens.push_back(class_token);
                    weights.push_back(class_weight);
                }
            }
            continue;  // the trigger token itself is never emitted
        }
        tokens.push_back(token);
        weights.push_back(source_weights[i]);
    }

    clip_conditioner.tokenizer.pad_tokens(tokens,
                                          &weights,
                                          nullptr,
                                          clip_conditioner.text_model->model.n_token,
                                          clip_conditioner.text_model->model.n_token,
                                          true);

    // Compare in a signed 64-bit domain: avoids the signed/unsigned mix of
    // `int i < tokens.size()` while keeping the class_idx == -1 arithmetic intact.
    const int64_t mask_begin = static_cast<int64_t>(class_idx) + 1;
    const int64_t mask_end   = mask_begin + trigger_token_count;
    std::vector<bool> class_token_mask;
    class_token_mask.reserve(tokens.size());
    for (size_t i = 0; i < tokens.size(); i++) {
        const int64_t position = static_cast<int64_t>(i);
        class_token_mask.push_back(mask_begin <= position && position < mask_end);
    }

    // Move the locals into the result instead of copying three vectors.
    return std::make_tuple(std::move(tokens), std::move(weights), std::move(class_token_mask));
}
// Computes the CLIP condition for a PhotoMaker prompt.
//
// `trigger_word` must map to exactly one token id (asserted). The prompt in
// `conditioner_params.text` is tokenized with tokenize_photomaker_trigger()
// and the resulting tokens/weights are fed to the conditioner.
//
// Returns (condition, class-token mask); the mask has one entry per token.
static std::tuple<SDCondition, std::vector<bool>>
get_photomaker_condition_with_trigger(FrozenCLIPEmbedderWithCustomWords& clip_conditioner,
                                      int n_threads,
                                      const ConditionerParams& conditioner_params,
                                      const std::string& trigger_word,
                                      int trigger_token_count) {
    auto trigger_ids = clip_conditioner.convert_token_to_id(trigger_word);
    GGML_ASSERT(trigger_ids.size() == 1);

    // tokenized = (tokens, weights, class-token mask)
    auto tokenized = tokenize_photomaker_trigger(clip_conditioner,
                                                 conditioner_params.text,
                                                 trigger_token_count,
                                                 trigger_ids[0]);

    auto condition = clip_conditioner.get_learned_condition_common(n_threads,
                                                                   std::get<0>(tokenized),
                                                                   std::get<1>(tokenized),
                                                                   conditioner_params.clip_skip,
                                                                   conditioner_params.width,
                                                                   conditioner_params.height,
                                                                   conditioner_params.zero_out_masked);
    return std::make_tuple(std::move(condition), std::get<2>(tokenized));
}
// Returns `prompt` with the first occurrence of the trigger token removed.
//
// The prompt is tokenized, the first token equal to the trigger's id is
// erased, and the remaining tokens are decoded back to text. Asserts that
// `trigger_word` maps to exactly one token id and that the prompt contains it.
static std::string remove_photomaker_trigger_from_prompt(FrozenCLIPEmbedderWithCustomWords& clip_conditioner,
                                                         const std::string& prompt,
                                                         const std::string& trigger_word) {
    auto trigger_ids = clip_conditioner.convert_token_to_id(trigger_word);
    GGML_ASSERT(trigger_ids.size() == 1);

    auto tokenized                 = clip_conditioner.tokenize(prompt);
    std::vector<int>& prompt_tokens = tokenized.first;

    auto trigger_pos = std::find(prompt_tokens.begin(), prompt_tokens.end(), trigger_ids[0]);
    GGML_ASSERT(trigger_pos != prompt_tokens.end());
    prompt_tokens.erase(trigger_pos);

    return clip_conditioner.decode(prompt_tokens);
}
// Generation extension that adds PhotoMaker identity conditioning.
//
// Flow implemented below:
//   - init() creates the ID encoder and its LoRA from `photo_maker_path` and sets `enabled`
//     once everything has loaded.
//   - prepare_condition() builds the per-request `id_condition` from the ID images and the
//     prompt, and strips the trigger word from the prompt.
//   - before_condition() substitutes `id_condition` for the regular condition on steps
//     after `start_merge_step`.
struct PhotoMakerExtension : public GenerationExtension {
    std::shared_ptr<PhotoMakerIDEncoder> pmid_model;  // created by init() when a model path is given
    std::shared_ptr<LoraModel> pmid_lora;             // "lora.model*" tensors loaded from the same file
    bool enabled = false;                             // true only after init() loaded everything
    std::string model_path;                           // copy of params->photo_maker_path ("" when unset)
    std::string trigger_word = "img";                 // prompt word that marks the identity position
    SDCondition id_condition;                         // per-request condition; cleared by reset_runtime_condition()
    int start_merge_step = -1;                        // -1 means "no ID condition for this request"

    const char* name() const override {
        return "photomaker";
    }

    bool is_enabled() const override {
        return enabled;
    }

    // Loads the PhotoMaker model.
    // Returns true when no model path is configured (extension stays disabled) and also when
    // only the final stacked-ID-embedding load fails (warning logged, extension stays disabled).
    // Returns false when the backend pair cannot be set up or the LoRA tensors fail to load.
    bool init(const GenerationExtensionInitContext& ctx) override {
        model_path = SAFE_STR(ctx.params->photo_maker_path);
        if (model_path.empty()) {
            return true;
        }
        if (!ctx.ensure_backend_pair(SDBackendModule::PHOTOMAKER)) {
            return false;
        }

        // The version is chosen from the path text: any "v2" substring selects PhotoMaker v2.
        PMVersion pm_version = std::strstr(model_path.c_str(), "v2") != nullptr ? PM_VERSION_2 : PM_VERSION_1;
        pmid_model = std::make_shared<PhotoMakerIDEncoder>(ctx.backend_for(SDBackendModule::PHOTOMAKER),
                                                           ctx.params_backend_for(SDBackendModule::PHOTOMAKER),
                                                           ctx.tensor_storage_map,
                                                           "pmid",
                                                           ctx.version,
                                                           pm_version);
        if (pm_version == PM_VERSION_2) {
            LOG_INFO("using PhotoMaker Version 2");
        }

        pmid_lora = std::make_shared<LoraModel>("pmid",
                                                ctx.backend_for(SDBackendModule::PHOTOMAKER),
                                                ctx.params_backend_for(SDBackendModule::PHOTOMAKER),
                                                model_path,
                                                "",
                                                ctx.version);
        // Only tensors whose name starts with "lora.model" are loaded into the LoRA model.
        auto lora_tensor_filter = [&](const std::string& tensor_name) {
            return starts_with(tensor_name, "lora.model");
        };
        if (!pmid_lora->load_from_file(ctx.n_threads, lora_tensor_filter)) {
            LOG_WARN("load photomaker lora tensors from %s failed", model_path.c_str());
            return false;
        }

        LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", model_path.c_str());
        if (!ctx.model_loader.init_from_file_and_convert_name(model_path, "pmid.")) {
            // Best effort: initialization still succeeds, but `enabled` stays false.
            LOG_WARN("loading stacked ID embedding from '%s' failed", model_path.c_str());
            return true;
        }
        enabled = true;
        return true;
    }

    // Registers the encoder's parameter tensors (prefix "pmid") in ctx.tensors, and also in
    // ctx.mmap_able_tensors when the PHOTOMAKER module can be mmap'ed.
    void collect_param_tensors(GenerationExtensionTensorContext& ctx) override {
        if (!enabled || pmid_model == nullptr) {
            return;
        }
        std::map<std::string, ggml_tensor*> temp;
        pmid_model->get_param_tensors(temp, "pmid");
        bool do_mmap = ctx.module_can_mmap(SDBackendModule::PHOTOMAKER);
        for (const auto& [key, tensor] : temp) {
            ctx.tensors[key] = tensor;
            if (do_mmap) {
                ctx.mmap_able_tensors[key] = tensor;
            }
        }
    }

    // Adds the "pmid.unet." prefix to the ignore set while the extension is enabled.
    void add_ignore_tensors(std::set<std::string>& ignore_tensors) const override {
        if (!enabled) {
            return;
        }
        ignore_tensors.insert("pmid.unet.");
    }

    // Allocates the encoder's parameter buffer; reports success when there is nothing to allocate.
    bool alloc_params_buffer() override {
        if (!enabled || pmid_model == nullptr) {
            return true;
        }
        return pmid_model->alloc_params_buffer();
    }

    // Size of the encoder's parameter buffer, or 0 when the extension is inactive.
    size_t get_params_buffer_size() const override {
        if (!enabled || pmid_model == nullptr) {
            return 0;
        }
        return pmid_model->get_params_buffer_size();
    }

    // Drops any condition prepared for a previous request.
    void reset_runtime_condition() override {
        id_condition = {};
        start_merge_step = -1;
    }

    // Builds the identity condition for the current request.
    //
    // On success: stores the condition in `id_condition`, computes `start_merge_step`, rewrites
    // ctx.condition_params.text without the trigger word and returns true.
    // On any failure: logs why, leaves the runtime condition reset and returns false, which
    // turns PhotoMaker off for this request only.
    bool prepare_condition(GenerationExtensionConditionContext& ctx) override {
        reset_runtime_condition();
        if (!enabled || pmid_model == nullptr || pmid_lora == nullptr) {
            return false;
        }

        // The LoRA is applied once, on the first request, before any request validation.
        if (!pmid_lora->applied) {
            int64_t t0 = ggml_time_ms();
            pmid_lora->apply(ctx.tensors, ctx.version, ctx.n_threads);
            int64_t t1 = ggml_time_ms();
            pmid_lora->applied = true;
            LOG_INFO("pmid_lora apply completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
            if (ctx.free_params_immediately) {
                pmid_lora->free_params_buffer();
            }
        }

        bool pmv2 = pmid_model->get_version() == PM_VERSION_2;
        if (ctx.pm_params.id_images_count <= 0 || ctx.pm_params.id_images == nullptr) {
            LOG_WARN("Provided PhotoMaker model file, but NO input ID images");
            LOG_WARN("Turn off PhotoMaker for this request");
            return false;
        }

        auto* clip_conditioner = dynamic_cast<FrozenCLIPEmbedderWithCustomWords*>(ctx.conditioner);
        if (clip_conditioner == nullptr) {
            LOG_WARN("PhotoMaker requires FrozenCLIPEmbedderWithCustomWords conditioner");
            LOG_WARN("Turn off PhotoMaker for this request");
            return false;
        }

        // Preprocess every ID image to 224x224 and concatenate them along dimension 3.
        int clip_image_size = 224;
        pmid_model->style_strength = ctx.pm_params.style_strength;
        sd::Tensor<float> id_image_tensor;
        for (int i = 0; i < ctx.pm_params.id_images_count; i++) {
            auto id_image = sd_image_to_tensor(ctx.pm_params.id_images[i]);
            auto processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size);
            if (id_image_tensor.empty()) {
                id_image_tensor = processed_id_image;
            } else {
                id_image_tensor = sd::ops::concat(id_image_tensor, processed_id_image, 3);
            }
        }

        int64_t t0 = ggml_time_ms();
        // PhotoMaker v2 uses two trigger tokens per ID image, v1 uses one.
        int trigger_token_count = pmv2 ? 2 * ctx.pm_params.id_images_count : ctx.pm_params.id_images_count;
        // Bind the tuple elements in place instead of copying the condition and the mask out of it.
        auto [prepared_id_condition, class_tokens_mask] =
            get_photomaker_condition_with_trigger(*clip_conditioner,
                                                  ctx.n_threads,
                                                  ctx.condition_params,
                                                  trigger_word,
                                                  trigger_token_count);
        // NOTE(review): this check only fires if the tokenizer helper yields an all-false mask when
        // the trigger word is absent - confirm; otherwise remove_photomaker_trigger_from_prompt()
        // below would hit its "trigger must be present" assertion instead.
        if (std::find(class_tokens_mask.begin(), class_tokens_mask.end(), true) == class_tokens_mask.end()) {
            LOG_WARN("PhotoMaker trigger word '%s' was not found in prompt", trigger_word.c_str());
            LOG_WARN("Turn off PhotoMaker for this request");
            return false;
        }

        sd::Tensor<float> id_embeds;
        if (pmv2 && ctx.pm_params.id_embed_path != nullptr) {
            try {
                id_embeds = sd::load_tensor_from_file_as_tensor<float>(ctx.pm_params.id_embed_path);
            } catch (const std::exception& e) {
                // Report the reason; the empty tensor is handled by the check right below.
                LOG_WARN("failed to load PhotoMaker ID embeds from '%s': %s",
                         ctx.pm_params.id_embed_path,
                         e.what());
                id_embeds = {};
            }
        }
        if (pmv2 && id_embeds.empty()) {
            LOG_WARN("Provided PhotoMaker images, but NO valid ID embeds file for PM v2");
            LOG_WARN("Turn off PhotoMaker for this request");
            return false;
        }
        if (pmv2 && ctx.pm_params.id_images_count != id_embeds.shape()[1]) {
            LOG_WARN("PhotoMaker image count (%d) does NOT match ID embeds (%d). You should run face_detect.py again.",
                     ctx.pm_params.id_images_count,
                     static_cast<int>(id_embeds.shape()[1]));
            LOG_WARN("Turn off PhotoMaker for this request");
            return false;
        }

        auto res = pmid_model->compute(ctx.n_threads,
                                       id_image_tensor,
                                       prepared_id_condition.c_crossattn,
                                       id_embeds,
                                       class_tokens_mask);
        if (res.empty()) {
            LOG_ERROR("Photomaker ID Stacking failed");
            LOG_WARN("Turn off PhotoMaker for this request");
            return false;
        }
        prepared_id_condition.c_crossattn = std::move(res);
        int64_t t1 = ggml_time_ms();

        id_condition = std::move(prepared_id_condition);
        // style_strength is used as a percentage of the total step count.
        start_merge_step = int(ctx.pm_params.style_strength / 100.f * ctx.total_steps);
        ctx.condition_params.text = remove_photomaker_trigger_from_prompt(*clip_conditioner,
                                                                          ctx.condition_params.text,
                                                                          trigger_word);
        LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
        LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);
        if (ctx.free_params_immediately) {
            pmid_model->free_params_buffer();
        }
        return true;
    }

    // Returns `id_condition` for steps strictly after `start_merge_step` when one has been
    // prepared; otherwise returns the caller's condition unchanged.
    const SDCondition& before_condition(int step,
                                        const SDCondition& condition) const override {
        if (!id_condition.empty() && start_merge_step != -1 && step > start_merge_step) {
            return id_condition;
        }
        return condition;
    }
};
// Factory for the PhotoMaker extension: returns a default-constructed PhotoMakerExtension
// behind the generic GenerationExtension interface.
std::shared_ptr<GenerationExtension> create_photomaker_extension() {
    std::shared_ptr<GenerationExtension> extension = std::make_shared<PhotoMakerExtension>();
    return extension;
}

View File

@ -1,349 +0,0 @@
#ifndef GITS_NOISE_INL
#define GITS_NOISE_INL
// Precomputed noise-level table GITS_NOISE_0_80.
// Each inner vector is one schedule; every schedule starts at 14.61464119 and ends at 0.02916753.
// NOTE(review): presumably the 0_80 suffix is a GITS coefficient of 0.80 and each row serves one
// step count - confirm against the code that indexes this table.
const std::vector<std::vector<float>> GITS_NOISE_0_80 = {
{ 14.61464119f, 7.49001646f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 6.77309084f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 3.07277966f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 2.05039096f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 2.05039096f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.19567990f, 1.98035145f, 0.86115354f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.19567990f, 1.98035145f, 0.86115354f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.88507891f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.07277966f, 1.84880662f, 0.83188516f, 0.02916753f }
};
// Precomputed noise-level table GITS_NOISE_0_85.
// Each inner vector is one schedule; every schedule starts at 14.61464119 and ends at 0.02916753.
// NOTE(review): presumably the 0_85 suffix is a GITS coefficient of 0.85 and each row serves one
// step count - confirm against the code that indexes this table.
const std::vector<std::vector<float>> GITS_NOISE_0_85 = {
{ 14.61464119f, 7.49001646f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 1.84880662f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 6.77309084f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.11996698f, 3.07277966f, 1.24153244f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.09240818f, 2.84484982f, 0.95350921f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.09240818f, 2.84484982f, 0.95350921f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.58536053f, 3.19567990f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 8.75849152f, 7.49001646f, 5.58536053f, 3.19567990f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 8.75849152f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 8.75849152f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.60512662f, 2.63833880f, 1.56271636f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.88507891f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f }
};
// Precomputed noise-level table GITS_NOISE_0_90.
// Each inner vector is one schedule; every schedule starts at 14.61464119 and ends at 0.02916753.
// NOTE(review): presumably the 0_90 suffix is a GITS coefficient of 0.90 and each row serves one
// step count - confirm against the code that indexes this table.
const std::vector<std::vector<float>> GITS_NOISE_0_90 = {
{ 14.61464119f, 6.77309084f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 3.07277966f, 0.95350921f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.54230714f, 0.89115214f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 2.54230714f, 0.89115214f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.09240818f, 3.07277966f, 1.61558151f, 0.69515091f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.11996698f, 4.86714602f, 3.07277966f, 1.61558151f, 0.69515091f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 2.95596409f, 1.61558151f, 0.69515091f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.24153244f, 0.57119018f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.24153244f, 0.57119018f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.24153244f, 0.57119018f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.75677586f, 2.84484982f, 1.84880662f, 1.08895338f, 0.52423614f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.75677586f, 2.84484982f, 1.84880662f, 1.08895338f, 0.52423614f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.45427561f, 3.32507086f, 2.45070267f, 1.61558151f, 0.95350921f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.45427561f, 3.32507086f, 2.45070267f, 1.61558151f, 0.95350921f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.86714602f, 3.91689563f, 3.07277966f, 2.27973175f, 1.56271636f, 0.95350921f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.86714602f, 3.91689563f, 3.07277966f, 2.27973175f, 1.56271636f, 0.95350921f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.86714602f, 3.91689563f, 3.07277966f, 2.27973175f, 1.56271636f, 0.95350921f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.60512662f, 2.95596409f, 2.19988537f, 1.51179266f, 0.89115214f, 0.43325692f, 0.02916753f }
};
// Precomputed noise-level table GITS_NOISE_0_95.
// Each inner vector is one schedule; every schedule starts at 14.61464119 and ends at 0.02916753.
// NOTE(review): presumably the 0_95 suffix is a GITS coefficient of 0.95 and each row serves one
// step count - confirm against the code that indexes this table.
const std::vector<std::vector<float>> GITS_NOISE_0_95 = {
{ 14.61464119f, 6.77309084f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 2.84484982f, 0.89115214f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.36326075f, 0.803307f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.95596409f, 1.56271636f, 0.64427125f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 2.95596409f, 1.56271636f, 0.64427125f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 3.07277966f, 1.91321158f, 1.08895338f, 0.50118381f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.07277966f, 1.91321158f, 1.08895338f, 0.50118381f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.07277966f, 1.91321158f, 1.08895338f, 0.50118381f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.41535246f, 0.803307f, 0.38853383f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.46139455f, 2.63833880f, 1.84880662f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.46139455f, 2.63833880f, 1.84880662f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 6.14220476f, 4.86714602f, 3.75677586f, 2.95596409f, 2.19988537f, 1.56271636f, 1.05362725f, 0.64427125f, 0.32104823f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 6.44769001f, 5.58536053f, 4.65472794f, 3.60512662f, 2.95596409f, 2.19988537f, 1.56271636f, 1.05362725f, 0.64427125f, 0.32104823f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.65472794f, 3.60512662f, 2.95596409f, 2.19988537f, 1.56271636f, 1.05362725f, 0.64427125f, 0.32104823f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.65472794f, 3.75677586f, 3.07277966f, 2.45070267f, 1.78698075f, 1.24153244f, 0.83188516f, 0.50118381f, 0.22545385f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.60512662f, 2.95596409f, 2.36326075f, 1.72759056f, 1.24153244f, 0.83188516f, 0.50118381f, 0.22545385f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.60512662f, 2.95596409f, 2.36326075f, 1.72759056f, 1.24153244f, 0.83188516f, 0.50118381f, 0.22545385f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.75677586f, 3.07277966f, 2.45070267f, 1.91321158f, 1.46270394f, 1.05362725f, 0.72133851f, 0.43325692f, 0.19894916f, 0.02916753f }
};
// Precomputed noise-level table GITS_NOISE_1_00.
// Each inner vector is one schedule; every schedule starts at 14.61464119 and ends at 0.02916753.
// NOTE(review): presumably the 1_00 suffix is a GITS coefficient of 1.00 and each row serves one
// step count - confirm against the code that indexes this table.
const std::vector<std::vector<float>> GITS_NOISE_1_00 = {
{ 14.61464119f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 0.95350921f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 2.36326075f, 0.803307f, 0.02916753f },
{ 14.61464119f, 7.11996698f, 3.07277966f, 1.56271636f, 0.59516323f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.41535246f, 0.57119018f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.86115354f, 0.38853383f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.86115354f, 0.38853383f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 3.07277966f, 1.98035145f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.07277966f, 1.98035145f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.27973175f, 1.51179266f, 0.95350921f, 0.54755926f, 0.25053367f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.36326075f, 1.61558151f, 1.08895338f, 0.72133851f, 0.41087446f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.36326075f, 1.61558151f, 1.08895338f, 0.72133851f, 0.41087446f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.12350607f, 1.56271636f, 1.08895338f, 0.72133851f, 0.41087446f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.19988537f, 1.61558151f, 1.162866f, 0.803307f, 0.50118381f, 0.27464288f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.75677586f, 3.07277966f, 2.45070267f, 1.84880662f, 1.36964464f, 1.01931262f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 6.14220476f, 5.09240818f, 4.26497746f, 3.46139455f, 2.84484982f, 2.19988537f, 1.67050016f, 1.24153244f, 0.92192322f, 0.64427125f, 0.43325692f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 6.14220476f, 5.09240818f, 4.26497746f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.12534678f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 5.09240818f, 4.26497746f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.12534678f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.26497746f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.12534678f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f }
};
// Precomputed noise-level table GITS_NOISE_1_05.
// Each inner vector is one schedule; every schedule starts at 14.61464119 and ends at 0.02916753.
// NOTE(review): presumably the 1_05 suffix is a GITS coefficient of 1.05 and each row serves one
// step count - confirm against the code that indexes this table.
const std::vector<std::vector<float>> GITS_NOISE_1_05 = {
{ 14.61464119f, 0.95350921f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 0.89115214f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 2.05039096f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 2.84484982f, 1.28281462f, 0.52423614f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.07277966f, 1.61558151f, 0.803307f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.56271636f, 0.803307f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.95350921f, 0.52423614f, 0.22545385f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 1.98035145f, 1.24153244f, 0.74807048f, 0.41087446f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.27973175f, 1.51179266f, 0.95350921f, 0.59516323f, 0.34370604f, 0.13792117f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.09240818f, 3.46139455f, 2.45070267f, 1.61558151f, 1.08895338f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.09240818f, 3.46139455f, 2.45070267f, 1.61558151f, 1.08895338f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.36326075f, 1.61558151f, 1.08895338f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.45070267f, 1.72759056f, 1.24153244f, 0.86115354f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.19988537f, 1.61558151f, 1.162866f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.19988537f, 1.67050016f, 1.28281462f, 0.95350921f, 0.72133851f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.36326075f, 1.84880662f, 1.41535246f, 1.08895338f, 0.83188516f, 0.61951244f, 0.45573691f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.20157266f, 0.95350921f, 0.74807048f, 0.57119018f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.30717278f, 7.11996698f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.20157266f, 0.95350921f, 0.74807048f, 0.57119018f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.30717278f, 7.11996698f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.45070267f, 1.98035145f, 1.61558151f, 1.32549286f, 1.08895338f, 0.86115354f, 0.69515091f, 0.54755926f, 0.41087446f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f }
};
// Precomputed noise-level table GITS_NOISE_1_10.
// Each inner vector is one schedule; every schedule starts at 14.61464119 and ends at 0.02916753.
// NOTE(review): presumably the 1_10 suffix is a GITS coefficient of 1.10 and each row serves one
// step count - confirm against the code that indexes this table.
const std::vector<std::vector<float>> GITS_NOISE_1_10 = {
{ 14.61464119f, 0.89115214f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 1.61558151f, 0.57119018f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 2.45070267f, 1.08895338f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 2.95596409f, 1.56271636f, 0.803307f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.07277966f, 1.61558151f, 0.89115214f, 0.4783645f, 0.19894916f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.07277966f, 1.84880662f, 1.08895338f, 0.64427125f, 0.34370604f, 0.13792117f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.95350921f, 0.54755926f, 0.27464288f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.95596409f, 1.91321158f, 1.24153244f, 0.803307f, 0.4783645f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.05039096f, 1.41535246f, 0.95350921f, 0.64427125f, 0.41087446f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.27973175f, 1.61558151f, 1.12534678f, 0.803307f, 0.54755926f, 0.36617002f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.32507086f, 2.45070267f, 1.72759056f, 1.24153244f, 0.89115214f, 0.64427125f, 0.45573691f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.09240818f, 3.60512662f, 2.84484982f, 2.05039096f, 1.51179266f, 1.08895338f, 0.803307f, 0.59516323f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.09240818f, 3.60512662f, 2.84484982f, 2.12350607f, 1.61558151f, 1.24153244f, 0.95350921f, 0.72133851f, 0.54755926f, 0.41087446f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.45070267f, 1.84880662f, 1.41535246f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.45070267f, 1.91321158f, 1.51179266f, 1.20157266f, 0.95350921f, 0.74807048f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.85520077f, 4.45427561f, 3.46139455f, 2.84484982f, 2.19988537f, 1.72759056f, 1.36964464f, 1.08895338f, 0.86115354f, 0.69515091f, 0.54755926f, 0.43325692f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.46139455f, 2.84484982f, 2.19988537f, 1.72759056f, 1.36964464f, 1.08895338f, 0.86115354f, 0.69515091f, 0.54755926f, 0.43325692f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.46139455f, 2.84484982f, 2.19988537f, 1.72759056f, 1.36964464f, 1.08895338f, 0.89115214f, 0.72133851f, 0.59516323f, 0.4783645f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f }
};
// Precomputed noise-level table GITS_NOISE_1_15.
// Each inner vector is one schedule; every schedule starts at 14.61464119 and ends at 0.02916753.
// NOTE(review): presumably the 1_15 suffix is a GITS coefficient of 1.15 and each row serves one
// step count - confirm against the code that indexes this table.
const std::vector<std::vector<float>> GITS_NOISE_1_15 = {
{ 14.61464119f, 0.83188516f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.59516323f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 1.56271636f, 0.52423614f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 1.91321158f, 0.83188516f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.24153244f, 0.59516323f, 0.25053367f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.51179266f, 0.803307f, 0.41087446f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.56271636f, 0.89115214f, 0.50118381f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.07277966f, 1.84880662f, 1.12534678f, 0.72133851f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.07277966f, 1.91321158f, 1.24153244f, 0.803307f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.95596409f, 1.91321158f, 1.24153244f, 0.803307f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.05039096f, 1.36964464f, 0.95350921f, 0.69515091f, 0.4783645f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.803307f, 0.59516323f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.803307f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.19988537f, 1.61558151f, 1.24153244f, 0.95350921f, 0.74807048f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.78698075f, 1.32549286f, 1.01931262f, 0.803307f, 0.64427125f, 0.50118381f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.78698075f, 1.32549286f, 1.01931262f, 0.803307f, 0.64427125f, 0.52423614f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.84880662f, 1.41535246f, 1.12534678f, 0.89115214f, 0.72133851f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.84880662f, 1.41535246f, 1.12534678f, 0.89115214f, 0.72133851f, 0.59516323f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
// Precomputed GITS noise table, variant "1_20" (presumably coefficient 1.20 --
// confirm against the code that indexes GITS_NOISE).
// 19 rows growing from 3 to 21 entries; every row starts at 14.61464119 and ends
// at 0.02916753. How a row is selected (e.g. by step count) is not visible here.
const std::vector<std::vector<float>> GITS_NOISE_1_20 = {
{ 14.61464119f, 0.803307f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.52423614f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 0.92192322f, 0.36617002f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.24153244f, 0.59516323f, 0.25053367f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.05039096f, 0.95350921f, 0.45573691f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.24153244f, 0.64427125f, 0.29807833f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.36964464f, 0.803307f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 0.95350921f, 0.59516323f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.67050016f, 1.08895338f, 0.74807048f, 0.50118381f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.84880662f, 1.24153244f, 0.83188516f, 0.59516323f, 0.41087446f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 3.07277966f, 1.98035145f, 1.36964464f, 0.95350921f, 0.69515091f, 0.50118381f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.46139455f, 2.36326075f, 1.56271636f, 1.08895338f, 0.803307f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.46139455f, 2.45070267f, 1.61558151f, 1.162866f, 0.86115354f, 0.64427125f, 0.50118381f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.41087446f, 0.34370604f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.19988537f, 1.61558151f, 1.20157266f, 0.92192322f, 0.72133851f, 0.57119018f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.19988537f, 1.61558151f, 1.24153244f, 0.95350921f, 0.74807048f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.19988537f, 1.61558151f, 1.24153244f, 0.95350921f, 0.74807048f, 0.59516323f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
// Precomputed GITS noise table, variant "1_25" (presumably coefficient 1.25 --
// confirm against the code that indexes GITS_NOISE).
// 19 rows growing from 3 to 21 entries; every row starts at 14.61464119 and ends
// at 0.02916753. How a row is selected (e.g. by step count) is not visible here.
const std::vector<std::vector<float>> GITS_NOISE_1_25 = {
{ 14.61464119f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.50118381f, 0.02916753f },
{ 14.61464119f, 2.05039096f, 0.803307f, 0.32104823f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 0.95350921f, 0.43325692f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.24153244f, 0.59516323f, 0.27464288f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.51179266f, 0.803307f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.36326075f, 1.24153244f, 0.72133851f, 0.41087446f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.36964464f, 0.83188516f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 0.98595673f, 0.64427125f, 0.43325692f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.67050016f, 1.08895338f, 0.74807048f, 0.52423614f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.72759056f, 1.162866f, 0.803307f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.84880662f, 1.24153244f, 0.86115354f, 0.64427125f, 0.4783645f, 0.36617002f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.84880662f, 1.28281462f, 0.92192322f, 0.69515091f, 0.52423614f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.91321158f, 1.32549286f, 0.95350921f, 0.72133851f, 0.54755926f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.91321158f, 1.32549286f, 0.95350921f, 0.72133851f, 0.57119018f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.91321158f, 1.32549286f, 0.95350921f, 0.74807048f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 3.07277966f, 2.05039096f, 1.41535246f, 1.05362725f, 0.803307f, 0.61951244f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 3.07277966f, 2.05039096f, 1.41535246f, 1.05362725f, 0.803307f, 0.64427125f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 3.07277966f, 2.05039096f, 1.46270394f, 1.08895338f, 0.83188516f, 0.66947293f, 0.54755926f, 0.45573691f, 0.38853383f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
// Precomputed GITS noise table, variant "1_30" (presumably coefficient 1.30 --
// confirm against the code that indexes GITS_NOISE).
// 19 rows growing from 3 to 21 entries; every row starts at 14.61464119 and ends
// at 0.02916753. How a row is selected (e.g. by step count) is not visible here.
const std::vector<std::vector<float>> GITS_NOISE_1_30 = {
{ 14.61464119f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 1.24153244f, 0.43325692f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.59516323f, 0.22545385f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.803307f, 0.36617002f, 0.13792117f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 1.01931262f, 0.52423614f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.36964464f, 0.74807048f, 0.41087446f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.56271636f, 0.89115214f, 0.54755926f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.61558151f, 0.95350921f, 0.61951244f, 0.41087446f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.36964464f, 0.83188516f, 0.54755926f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.41535246f, 0.92192322f, 0.64427125f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.6383388f, 1.56271636f, 1.01931262f, 0.72133851f, 0.50118381f, 0.36617002f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.05362725f, 0.74807048f, 0.54755926f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.08895338f, 0.77538133f, 0.57119018f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.59516323f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.72759056f, 1.162866f, 0.83188516f, 0.64427125f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.72759056f, 1.162866f, 0.83188516f, 0.64427125f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.78698075f, 1.24153244f, 0.92192322f, 0.72133851f, 0.57119018f, 0.45573691f, 0.38853383f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.78698075f, 1.24153244f, 0.92192322f, 0.72133851f, 0.57119018f, 0.4783645f, 0.41087446f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
// Precomputed GITS noise table, variant "1_35" (presumably coefficient 1.35 --
// confirm against the code that indexes GITS_NOISE).
// 19 rows growing from 3 to 21 entries; every row starts at 14.61464119 and ends
// at 0.02916753. How a row is selected (e.g. by step count) is not visible here.
const std::vector<std::vector<float>> GITS_NOISE_1_35 = {
{ 14.61464119f, 0.69515091f, 0.02916753f },
{ 14.61464119f, 0.95350921f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.57119018f, 0.19894916f, 0.02916753f },
{ 14.61464119f, 1.61558151f, 0.69515091f, 0.29807833f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.83188516f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.162866f, 0.64427125f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.36964464f, 0.803307f, 0.50118381f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.41535246f, 0.83188516f, 0.54755926f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 0.95350921f, 0.64427125f, 0.45573691f, 0.32104823f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 0.95350921f, 0.64427125f, 0.45573691f, 0.34370604f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.61558151f, 1.01931262f, 0.72133851f, 0.52423614f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.61558151f, 1.01931262f, 0.72133851f, 0.52423614f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.61558151f, 1.05362725f, 0.74807048f, 0.54755926f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.72759056f, 1.12534678f, 0.803307f, 0.59516323f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.72759056f, 1.12534678f, 0.803307f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.51179266f, 1.01931262f, 0.74807048f, 0.57119018f, 0.45573691f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.6383388f, 1.61558151f, 1.08895338f, 0.803307f, 0.61951244f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.6383388f, 1.61558151f, 1.08895338f, 0.803307f, 0.64427125f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.6383388f, 1.61558151f, 1.08895338f, 0.803307f, 0.64427125f, 0.52423614f, 0.45573691f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
// Precomputed GITS noise table, variant "1_40" (presumably coefficient 1.40 --
// confirm against the code that indexes GITS_NOISE).
// 19 rows growing from 3 to 21 entries; every row starts at 14.61464119 and ends
// at 0.02916753. How a row is selected (e.g. by step count) is not visible here.
const std::vector<std::vector<float>> GITS_NOISE_1_40 = {
{ 14.61464119f, 0.59516323f, 0.02916753f },
{ 14.61464119f, 0.95350921f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 1.08895338f, 0.43325692f, 0.13792117f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.64427125f, 0.27464288f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.61558151f, 0.803307f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.05039096f, 0.95350921f, 0.54755926f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.24153244f, 0.72133851f, 0.43325692f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.24153244f, 0.74807048f, 0.50118381f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.52423614f, 0.36617002f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.54755926f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.41535246f, 0.86115354f, 0.59516323f, 0.43325692f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.64427125f, 0.45573691f, 0.34370604f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.64427125f, 0.4783645f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 0.98595673f, 0.69515091f, 0.52423614f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 1.01931262f, 0.72133851f, 0.54755926f, 0.43325692f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.61558151f, 1.05362725f, 0.74807048f, 0.57119018f, 0.45573691f, 0.38853383f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.61951244f, 0.50118381f, 0.41087446f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.61951244f, 0.50118381f, 0.43325692f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.64427125f, 0.52423614f, 0.45573691f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
// Precomputed GITS noise table, variant "1_45" (presumably coefficient 1.45 --
// confirm against the code that indexes GITS_NOISE).
// 19 rows growing from 3 to 21 entries; every row starts at 14.61464119 and ends
// at 0.02916753. How a row is selected (e.g. by step count) is not visible here.
const std::vector<std::vector<float>> GITS_NOISE_1_45 = {
{ 14.61464119f, 0.59516323f, 0.02916753f },
{ 14.61464119f, 0.803307f, 0.25053367f, 0.02916753f },
{ 14.61464119f, 0.95350921f, 0.34370604f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.24153244f, 0.54755926f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.72133851f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.61558151f, 0.803307f, 0.45573691f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.91321158f, 0.95350921f, 0.57119018f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.19988537f, 1.08895338f, 0.64427125f, 0.41087446f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.24153244f, 0.74807048f, 0.50118381f, 0.34370604f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.24153244f, 0.74807048f, 0.50118381f, 0.36617002f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.54755926f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.57119018f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.83188516f, 0.59516323f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.83188516f, 0.59516323f, 0.45573691f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.69515091f, 0.52423614f, 0.41087446f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.69515091f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 0.98595673f, 0.72133851f, 0.54755926f, 0.45573691f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 1.01931262f, 0.74807048f, 0.57119018f, 0.4783645f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 1.01931262f, 0.74807048f, 0.59516323f, 0.50118381f, 0.43325692f, 0.38853383f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
// Precomputed GITS noise table, variant "1_50" (presumably coefficient 1.50 --
// confirm against the code that indexes GITS_NOISE).
// 19 rows growing from 3 to 21 entries; every row starts at 14.61464119 and ends
// at 0.02916753. How a row is selected (e.g. by step count) is not visible here.
const std::vector<std::vector<float>> GITS_NOISE_1_50 = {
{ 14.61464119f, 0.54755926f, 0.02916753f },
{ 14.61464119f, 0.803307f, 0.25053367f, 0.02916753f },
{ 14.61464119f, 0.86115354f, 0.32104823f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.24153244f, 0.54755926f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.72133851f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.61558151f, 0.803307f, 0.45573691f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.61558151f, 0.83188516f, 0.52423614f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.95350921f, 0.59516323f, 0.38853383f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.95350921f, 0.59516323f, 0.41087446f, 0.29807833f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.95350921f, 0.61951244f, 0.43325692f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.19988537f, 1.12534678f, 0.72133851f, 0.50118381f, 0.36617002f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.19988537f, 1.12534678f, 0.72133851f, 0.50118381f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.57119018f, 0.43325692f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.57119018f, 0.43325692f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.59516323f, 0.45573691f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.59516323f, 0.45573691f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.32549286f, 0.86115354f, 0.64427125f, 0.50118381f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.36964464f, 0.92192322f, 0.69515091f, 0.54755926f, 0.45573691f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.41535246f, 0.95350921f, 0.72133851f, 0.57119018f, 0.4783645f, 0.43325692f, 0.38853383f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
// Index of all GITS noise tables: 15 pointers, ordered by name suffix from
// "0_80" up to "1_50". The suffixes presumably encode a coefficient from 0.80
// to 1.50 in steps of 0.05 -- confirm against the lookup code that maps a
// coefficient to an index here.
// The entries point at the namespace-scope tables defined above, so they stay
// valid for the lifetime of the program.
const std::vector<const std::vector<std::vector<float>>*> GITS_NOISE = {
&GITS_NOISE_0_80,
&GITS_NOISE_0_85,
&GITS_NOISE_0_90,
&GITS_NOISE_0_95,
&GITS_NOISE_1_00,
&GITS_NOISE_1_05,
&GITS_NOISE_1_10,
&GITS_NOISE_1_15,
&GITS_NOISE_1_20,
&GITS_NOISE_1_25,
&GITS_NOISE_1_30,
&GITS_NOISE_1_35,
&GITS_NOISE_1_40,
&GITS_NOISE_1_45,
&GITS_NOISE_1_50
};
#endif // GITS_NOISE_INL

File diff suppressed because it is too large Load Diff

View File

@ -1,73 +0,0 @@
#ifndef __LTXV_HPP__
#define __LTXV_HPP__
#include "common_block.hpp"
namespace LTXV {
// 3D convolution whose temporal axis is padded by replicating edge frames
// instead of zero padding. The wrapped Conv3d pads only height and width
// (kernel_size / 2 each) and uses no padding along the temporal axis.
class CausalConv3d : public GGMLBlock {
protected:
    // Temporal extent of the convolution kernel, in frames.
    int time_kernel_size;

public:
    // Builds a cubic kernel (kernel_size in all three dimensions).
    // `dilation` is applied to the temporal axis only.
    CausalConv3d(int64_t in_channels,
                 int64_t out_channels,
                 int kernel_size                  = 3,
                 std::tuple<int, int, int> stride = {1, 1, 1},
                 int dilation                     = 1,
                 bool bias                        = true) {
        // Keep the full temporal kernel size: forward() derives the number of
        // replicated frames from it (kernel - 1 when causal, (kernel - 1) / 2 per
        // side otherwise). Storing kernel_size / 2 made the causal path prepend a
        // single frame for kernel_size == 3, so causal output was one frame
        // shorter than non-causal output.
        time_kernel_size = kernel_size;
        blocks["conv"]   = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels,
                                                                 out_channels,
                                                                 {kernel_size, kernel_size, kernel_size},
                                                                 stride,
                                                                 {0, kernel_size / 2, kernel_size / 2},
                                                                 {dilation, 1, 1},
                                                                 bias));
    }

    // Pads `x` along the temporal axis (dim 2) and runs the convolution.
    //   causal == true : prepend (time_kernel_size - 1) copies of the first frame.
    //   causal == false: prepend and append (time_kernel_size - 1) / 2 copies of
    //                    the first and last frame respectively.
    ggml_tensor* forward(GGMLRunnerContext* ctx,
                         ggml_tensor* x,
                         bool causal = true) {
        // x: [N*IC, ID, IH, IW]
        // result: [N*OC, OD, OH, OW]
        auto conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv"]);

        const int front_pad = causal ? time_kernel_size - 1 : (time_kernel_size - 1) / 2;
        const int back_pad  = causal ? 0 : (time_kernel_size - 1) / 2;

        // Nothing to replicate for a temporal kernel of size 1.
        if (front_pad > 0 || back_pad > 0) {
            // Swap the two outermost dims so one frame becomes a contiguous slab.
            auto h = ggml_cont(ctx, ggml_permute(ctx, x, 0, 1, 3, 2));  // [ID, N*IC, IH, IW]

            // Byte distance between consecutive frames of h.
            const int64_t frame_stride = h->nb[2] * h->ne[2];

            // Returns frame `frame_index` of h repeated `count` (>= 1) times along dim 2.
            auto repeat_frame = [&](int64_t frame_index, int count) {
                auto frame = ggml_view_3d(ctx, h, h->ne[0], h->ne[1], h->ne[2], h->nb[1], h->nb[2], frame_stride * frame_index);  // [N*IC, IH, IW]
                frame      = ggml_reshape_4d(ctx, frame, frame->ne[0], frame->ne[1], 1, frame->ne[2]);                             // [N*IC, 1, IH, IW]
                auto padded = frame;
                for (int i = 1; i < count; i++) {
                    padded = ggml_concat(ctx, padded, frame, 2);
                }
                return padded;
            };

            if (front_pad > 0) {
                x = ggml_concat(ctx, repeat_frame(0, front_pad), x, 2);
            }
            if (back_pad > 0) {
                x = ggml_concat(ctx, x, repeat_frame(h->ne[3] - 1, back_pad), 2);
            }
        }

        return conv->forward(ctx, x);
    }
};
};
#endif

View File

@ -1,24 +1,14 @@
#ifndef __MODEL_H__ #ifndef __MODEL_H__
#define __MODEL_H__ #define __MODEL_H__
#include <functional>
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <string> #include <string>
#include <tuple>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "core/ordered_map.hpp"
#include "ggml-backend.h" #include "ggml-backend.h"
#include "ggml.h" #include "ggml.h"
#include "gguf.h" #include "model_io/tensor_storage.h"
#include "json.hpp"
#include "ordered_map.hpp"
#include "zip.h"
#define SD_MAX_DIMS 5
enum SDVersion { enum SDVersion {
VERSION_SD1, VERSION_SD1,
@ -28,7 +18,8 @@ enum SDVersion {
VERSION_SD2, VERSION_SD2,
VERSION_SD2_INPAINT, VERSION_SD2_INPAINT,
VERSION_SD2_TINY_UNET, VERSION_SD2_TINY_UNET,
VERSION_SDXS, VERSION_SDXS_512_DS,
VERSION_SDXS_09,
VERSION_SDXL, VERSION_SDXL,
VERSION_SDXL_INPAINT, VERSION_SDXL_INPAINT,
VERSION_SDXL_PIX2PIX, VERSION_SDXL_PIX2PIX,
@ -48,20 +39,27 @@ enum SDVersion {
VERSION_ANIMA, VERSION_ANIMA,
VERSION_FLUX2, VERSION_FLUX2,
VERSION_FLUX2_KLEIN, VERSION_FLUX2_KLEIN,
VERSION_LTXAV,
VERSION_HIDREAM_O1,
VERSION_Z_IMAGE, VERSION_Z_IMAGE,
VERSION_OVIS_IMAGE, VERSION_OVIS_IMAGE,
VERSION_ERNIE_IMAGE,
VERSION_LENS,
VERSION_LONGCAT,
VERSION_PID,
VERSION_IDEOGRAM4,
VERSION_COUNT, VERSION_COUNT,
}; };
static inline bool sd_version_is_sd1(SDVersion version) { static inline bool sd_version_is_sd1(SDVersion version) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) { if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS_512_DS) {
return true; return true;
} }
return false; return false;
} }
static inline bool sd_version_is_sd2(SDVersion version) { static inline bool sd_version_is_sd2(SDVersion version) {
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) { if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_09) {
return true; return true;
} }
return false; return false;
@ -109,6 +107,13 @@ static inline bool sd_version_is_flux2(SDVersion version) {
return false; return false;
} }
// True only for the LTXAV model version.
static inline bool sd_version_is_ltxav(SDVersion version) {
    return version == VERSION_LTXAV;
}
static inline bool sd_version_is_wan(SDVersion version) { static inline bool sd_version_is_wan(SDVersion version) {
if (version == VERSION_WAN2 || version == VERSION_WAN2_2_I2V || version == VERSION_WAN2_2_TI2V) { if (version == VERSION_WAN2 || version == VERSION_WAN2_2_I2V || version == VERSION_WAN2_2_TI2V) {
return true; return true;
@ -137,6 +142,48 @@ static inline bool sd_version_is_z_image(SDVersion version) {
return false; return false;
} }
// True only for the LongCat model version.
static inline bool sd_version_is_longcat(SDVersion version) {
    return version == VERSION_LONGCAT;
}
// True only for the ERNIE image model version.
static inline bool sd_version_is_ernie_image(SDVersion version) {
    return version == VERSION_ERNIE_IMAGE;
}
// True only for the Lens model version.
static inline bool sd_version_is_lens(SDVersion version) {
    return version == VERSION_LENS;
}
// True only for the PID model version.
static inline bool sd_version_is_pid(SDVersion version) {
    return version == VERSION_PID;
}
// True only for the Ideogram4 model version.
static inline bool sd_version_is_ideogram4(SDVersion version) {
    return version == VERSION_IDEOGRAM4;
}
// True for every model family that shares the Flux2 VAE:
// Flux2, ERNIE image, Lens and Ideogram4.
static inline bool sd_version_uses_flux2_vae(SDVersion version) {
    return sd_version_is_flux2(version) ||
           sd_version_is_ernie_image(version) ||
           sd_version_is_lens(version) ||
           sd_version_is_ideogram4(version);
}
static inline bool sd_version_is_inpaint(SDVersion version) { static inline bool sd_version_is_inpaint(SDVersion version) {
if (version == VERSION_SD1_INPAINT || if (version == VERSION_SD1_INPAINT ||
version == VERSION_SD2_INPAINT || version == VERSION_SD2_INPAINT ||
@ -151,11 +198,18 @@ static inline bool sd_version_is_inpaint(SDVersion version) {
static inline bool sd_version_is_dit(SDVersion version) { static inline bool sd_version_is_dit(SDVersion version) {
if (sd_version_is_flux(version) || if (sd_version_is_flux(version) ||
sd_version_is_flux2(version) || sd_version_is_flux2(version) ||
sd_version_is_ltxav(version) ||
sd_version_is_sd3(version) || sd_version_is_sd3(version) ||
sd_version_is_wan(version) || sd_version_is_wan(version) ||
sd_version_is_qwen_image(version) || sd_version_is_qwen_image(version) ||
version == VERSION_HIDREAM_O1 ||
sd_version_is_anima(version) || sd_version_is_anima(version) ||
sd_version_is_z_image(version)) { sd_version_is_z_image(version) ||
sd_version_is_ernie_image(version) ||
sd_version_is_lens(version) ||
sd_version_is_longcat(version) ||
sd_version_is_pid(version) ||
sd_version_is_ideogram4(version)) {
return true; return true;
} }
return false; return false;
@ -178,168 +232,7 @@ enum PMVersion {
PM_VERSION_2, PM_VERSION_2,
}; };
// Describes where one tensor lives inside a model file and how it is laid out:
// its name, element type, shape, and location (file index, optional zip entry,
// byte offset).
struct TensorStorage {
    std::string name;
    ggml_type type          = GGML_TYPE_F32;
    ggml_type expected_type = GGML_TYPE_COUNT;
    // Flags for stored element formats. They change nbytes_to_read() and the
    // type name printed by to_string().
    bool is_f8_e4m3 = false;
    bool is_f8_e5m2 = false;
    bool is_f64     = false;
    bool is_i64     = false;
    int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};  // unused trailing dims stay 1
    int n_dims              = 0;
    size_t file_index       = 0;
    int index_in_zip        = -1;  // >= 0 means stored in a zip file
    uint64_t offset         = 0;   // offset in file

    TensorStorage() = default;

    // Copies the first `n_dims` entries of `ne`. A dimension count above
    // SD_MAX_DIMS is clamped so the copy (and later reverse_ne()) cannot write
    // past the fixed-size `ne` array.
    TensorStorage(std::string name, ggml_type type, const int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
        : name(std::move(name)), type(type), n_dims(n_dims), file_index(file_index), offset(offset) {
        if (this->n_dims > SD_MAX_DIMS) {
            this->n_dims = SD_MAX_DIMS;
        }
        for (int i = 0; i < this->n_dims; i++) {
            this->ne[i] = ne[i];
        }
    }

    // Total element count: product of all SD_MAX_DIMS extents.
    int64_t nelements() const {
        int64_t n = 1;
        for (int i = 0; i < SD_MAX_DIMS; i++) {
            n *= ne[i];
        }
        return n;
    }

    // Size in bytes for `type`, accounting for block-quantized types.
    int64_t nbytes() const {
        return nelements() * ggml_type_size(type) / ggml_blck_size(type);
    }

    // Number of bytes to read from the file: half of nbytes() for the f8
    // flags, double for the f64/i64 flags, nbytes() otherwise.
    int64_t nbytes_to_read() const {
        if (is_f8_e4m3 || is_f8_e5m2) {
            return nbytes() / 2;
        } else if (is_f64 || is_i64) {
            return nbytes() * 2;
        } else {
            return nbytes();
        }
    }

    // Turns a 2-D shape [a, b] into the 4-D shape [1, 1, a, b]; other ranks
    // are left untouched.
    void unsqueeze() {
        if (n_dims == 2) {
            n_dims = 4;
            ne[3]  = ne[1];
            ne[2]  = ne[0];
            ne[1]  = 1;
            ne[0]  = 1;
        }
    }

    // Splits this tensor into `n` equally sized pieces along ne[n_dims - 1].
    // Each piece covers nbytes_to_read() / n bytes and starts at consecutive
    // offsets. Returns an empty vector when n == 0 (previously a division by
    // zero).
    std::vector<TensorStorage> chunk(size_t n) {
        std::vector<TensorStorage> chunks;
        if (n == 0) {
            return chunks;
        }
        chunks.reserve(n);
        uint64_t chunk_size = nbytes_to_read() / n;
        // Temporarily reverse the shape so the split dimension sits at ne[0].
        reverse_ne();
        for (size_t i = 0; i < n; i++) {
            TensorStorage chunk_i = *this;
            chunk_i.ne[0]         = ne[0] / n;
            chunk_i.offset        = offset + i * chunk_size;
            chunk_i.reverse_ne();
            chunks.push_back(chunk_i);
        }
        reverse_ne();  // restore the original order
        return chunks;
    }

    // Reverses the order of the first n_dims extents in place.
    void reverse_ne() {
        int64_t new_ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
        for (int i = 0; i < n_dims; i++) {
            new_ne[i] = ne[n_dims - 1 - i];
        }
        for (int i = 0; i < n_dims; i++) {
            ne[i] = new_ne[i];
        }
    }

    // Formats as "name | type | n_dims [ne0, ne1, ...]", using the flag name
    // in place of the ggml type name when one of the format flags is set.
    std::string to_string() const {
        std::stringstream ss;
        const char* type_name = ggml_type_name(type);
        if (is_f8_e4m3) {
            type_name = "f8_e4m3";
        } else if (is_f8_e5m2) {
            type_name = "f8_e5m2";
        } else if (is_f64) {
            type_name = "f64";
        } else if (is_i64) {
            type_name = "i64";
        }
        ss << name << " | " << type_name << " | ";
        ss << n_dims << " [";
        for (int i = 0; i < SD_MAX_DIMS; i++) {
            ss << ne[i];
            if (i != SD_MAX_DIMS - 1) {
                ss << ", ";
            }
        }
        ss << "]";
        return ss.str();
    }
};
// Callback type taken by ModelLoader::load_tensors: receives a tensor
// descriptor plus a ggml_tensor** slot and returns a bool.
// NOTE(review): presumably the callback fills the slot with the destination
// tensor and returns false to abort loading — confirm against load_tensors.
typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
// Ordered map from a name string to its TensorStorage descriptor.
typedef OrderedMap<std::string, TensorStorage> String2TensorStorage;
// List of (string, ggml_type) pairs.
// NOTE(review): presumably name patterns mapped to weight types — confirm.
using TensorTypeRules = std::vector<std::pair<std::string, ggml_type>>;
// Collects TensorStorage descriptors from model files into a name-keyed map
// and exposes operations declared below for loading, inspecting and saving them.
// Most member functions are only declared here; their definitions are not part
// of this header, so the per-method notes below are hedged where they go beyond
// the signature.
class ModelLoader {
protected:
// Model version; VERSION_COUNT is the initial value.
SDVersion version_ = VERSION_COUNT;
// NOTE(review): presumably the files registered so far, indexed by
// TensorStorage::file_index — confirm.
std::vector<std::string> file_paths_;
// Tensor descriptors keyed by name.
String2TensorStorage tensor_storage_map;
void add_tensor_storage(const TensorStorage& tensor_storage);
// NOTE(review): `dir` and `prefix` are taken by value; left as-is because the
// out-of-line definition must match this signature.
bool parse_data_pkl(uint8_t* buffer,
size_t buffer_size,
zip_t* zip,
std::string dir,
size_t file_index,
const std::string prefix);
// Format-specific initialisers; each takes a path and an optional name prefix.
bool init_from_gguf_file(const std::string& file_path, const std::string& prefix = "");
bool init_from_safetensors_file(const std::string& file_path, const std::string& prefix = "");
bool init_from_ckpt_file(const std::string& file_path, const std::string& prefix = "");
bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = "");
public:
// NOTE(review): presumably dispatches to one of the format-specific
// initialisers above — confirm.
bool init_from_file(const std::string& file_path, const std::string& prefix = "");
void convert_tensors_name();
bool init_from_file_and_convert_name(const std::string& file_path,
const std::string& prefix = "",
SDVersion version = VERSION_COUNT);
SDVersion get_sd_version();
// Each *_wtype_stat function returns a ggml_type -> count map.
// NOTE(review): presumably counts of tensors per weight type, optionally
// restricted to one sub-model — confirm.
std::map<ggml_type, uint32_t> get_wtype_stat();
std::map<ggml_type, uint32_t> get_conditioner_wtype_stat();
std::map<ggml_type, uint32_t> get_diffusion_model_wtype_stat();
std::map<ggml_type, uint32_t> get_vae_wtype_stat();
// Returns a mutable reference to the internal map; callers can modify it.
String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
// Two overloads: one driven by a per-tensor callback, one by a name -> tensor
// map with an optional set of names to ignore.
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false);
bool load_tensors(std::map<std::string, ggml_tensor*>& tensors,
std::set<std::string> ignore_tensors = {},
int n_threads = 0,
bool use_mmap = false);
// Returns a copy of every key in tensor_storage_map, in the map's iteration
// order.
std::vector<std::string> get_tensor_names() const {
std::vector<std::string> names;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
names.push_back(name);
}
return names;
}
bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
~ModelLoader() = default;
};
#endif // __MODEL_H__

View File

@ -1,8 +1,9 @@
#ifndef __LORA_HPP__ #ifndef __SD_MODEL_ADAPTER_LORA_HPP__
#define __LORA_HPP__ #define __SD_MODEL_ADAPTER_LORA_HPP__
#include <mutex> #include <mutex>
#include "ggml_extend.hpp" #include "core/ggml_extend.hpp"
#include "model_loader.h"
#define LORA_GRAPH_BASE_SIZE 10240 #define LORA_GRAPH_BASE_SIZE 10240
@ -22,10 +23,11 @@ struct LoraModel : public GGMLRunner {
LoraModel(const std::string& lora_id, LoraModel(const std::string& lora_id,
ggml_backend_t backend, ggml_backend_t backend,
ggml_backend_t params_backend,
const std::string& file_path = "", const std::string& file_path = "",
std::string prefix = "", std::string prefix = "",
SDVersion version = VERSION_COUNT) SDVersion version = VERSION_COUNT)
: lora_id(lora_id), file_path(file_path), GGMLRunner(backend, false) { : lora_id(lora_id), file_path(file_path), GGMLRunner(backend, params_backend) {
prefix = "lora." + prefix; prefix = "lora." + prefix;
if (!model_loader.init_from_file_and_convert_name(file_path, prefix, version)) { if (!model_loader.init_from_file_and_convert_name(file_path, prefix, version)) {
load_failed = true; load_failed = true;
@ -85,7 +87,10 @@ struct LoraModel : public GGMLRunner {
lora_tensors[name] = real; lora_tensors[name] = real;
} }
alloc_params_buffer(); if (!alloc_params_buffer()) {
LOG_ERROR("lora model buffer allocation failed");
return false;
}
dry_run = false; dry_run = false;
model_loader.load_tensors(on_new_tensor_cb, n_threads); model_loader.load_tensors(on_new_tensor_cb, n_threads);
@ -129,7 +134,7 @@ struct LoraModel : public GGMLRunner {
} }
} }
ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
ggml_tensor* updown = nullptr; ggml_tensor* updown = nullptr;
int index = 0; int index = 0;
while (true) { while (true) {
@ -152,17 +157,17 @@ struct LoraModel : public GGMLRunner {
auto iter = lora_tensors.find(lora_up_name); auto iter = lora_tensors.find(lora_up_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
lora_up = ggml_ext_cast_f32(ctx, iter->second); lora_up = ggml_ext_cast_f32(ctx, backend, iter->second);
} }
iter = lora_tensors.find(lora_mid_name); iter = lora_tensors.find(lora_mid_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
lora_mid = ggml_ext_cast_f32(ctx, iter->second); lora_mid = ggml_ext_cast_f32(ctx, backend, iter->second);
} }
iter = lora_tensors.find(lora_down_name); iter = lora_tensors.find(lora_down_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
lora_down = ggml_ext_cast_f32(ctx, iter->second); lora_down = ggml_ext_cast_f32(ctx, backend, iter->second);
} }
if (lora_up == nullptr || lora_down == nullptr) { if (lora_up == nullptr || lora_down == nullptr) {
@ -208,7 +213,7 @@ struct LoraModel : public GGMLRunner {
return updown; return updown;
} }
ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
ggml_tensor* updown = nullptr; ggml_tensor* updown = nullptr;
int index = 0; int index = 0;
while (true) { while (true) {
@ -225,7 +230,7 @@ struct LoraModel : public GGMLRunner {
auto iter = lora_tensors.find(diff_name); auto iter = lora_tensors.find(diff_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
curr_updown = ggml_ext_cast_f32(ctx, iter->second); curr_updown = ggml_ext_cast_f32(ctx, backend, iter->second);
} else { } else {
break; break;
} }
@ -248,7 +253,7 @@ struct LoraModel : public GGMLRunner {
return updown; return updown;
} }
ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
ggml_tensor* updown = nullptr; ggml_tensor* updown = nullptr;
int index = 0; int index = 0;
while (true) { while (true) {
@ -276,33 +281,33 @@ struct LoraModel : public GGMLRunner {
auto iter = lora_tensors.find(hada_1_down_name); auto iter = lora_tensors.find(hada_1_down_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
hada_1_down = ggml_ext_cast_f32(ctx, iter->second); hada_1_down = ggml_ext_cast_f32(ctx, backend, iter->second);
} }
iter = lora_tensors.find(hada_1_up_name); iter = lora_tensors.find(hada_1_up_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
hada_1_up = ggml_ext_cast_f32(ctx, iter->second); hada_1_up = ggml_ext_cast_f32(ctx, backend, iter->second);
} }
iter = lora_tensors.find(hada_1_mid_name); iter = lora_tensors.find(hada_1_mid_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
hada_1_mid = ggml_ext_cast_f32(ctx, iter->second); hada_1_mid = ggml_ext_cast_f32(ctx, backend, iter->second);
hada_1_up = ggml_cont(ctx, ggml_transpose(ctx, hada_1_up)); hada_1_up = ggml_cont(ctx, ggml_transpose(ctx, hada_1_up));
} }
iter = lora_tensors.find(hada_2_down_name); iter = lora_tensors.find(hada_2_down_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
hada_2_down = ggml_ext_cast_f32(ctx, iter->second); hada_2_down = ggml_ext_cast_f32(ctx, backend, iter->second);
} }
iter = lora_tensors.find(hada_2_up_name); iter = lora_tensors.find(hada_2_up_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
hada_2_up = ggml_ext_cast_f32(ctx, iter->second); hada_2_up = ggml_ext_cast_f32(ctx, backend, iter->second);
} }
iter = lora_tensors.find(hada_2_mid_name); iter = lora_tensors.find(hada_2_mid_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
hada_2_mid = ggml_ext_cast_f32(ctx, iter->second); hada_2_mid = ggml_ext_cast_f32(ctx, backend, iter->second);
hada_2_up = ggml_cont(ctx, ggml_transpose(ctx, hada_2_up)); hada_2_up = ggml_cont(ctx, ggml_transpose(ctx, hada_2_up));
} }
@ -351,7 +356,7 @@ struct LoraModel : public GGMLRunner {
return updown; return updown;
} }
ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
ggml_tensor* updown = nullptr; ggml_tensor* updown = nullptr;
int index = 0; int index = 0;
while (true) { while (true) {
@ -378,24 +383,24 @@ struct LoraModel : public GGMLRunner {
auto iter = lora_tensors.find(lokr_w1_name); auto iter = lora_tensors.find(lokr_w1_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
lokr_w1 = ggml_ext_cast_f32(ctx, iter->second); lokr_w1 = ggml_ext_cast_f32(ctx, backend, iter->second);
} }
iter = lora_tensors.find(lokr_w2_name); iter = lora_tensors.find(lokr_w2_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
lokr_w2 = ggml_ext_cast_f32(ctx, iter->second); lokr_w2 = ggml_ext_cast_f32(ctx, backend, iter->second);
} }
int64_t rank = 1; int64_t rank = 1;
if (lokr_w1 == nullptr) { if (lokr_w1 == nullptr) {
iter = lora_tensors.find(lokr_w1_a_name); iter = lora_tensors.find(lokr_w1_a_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
lokr_w1_a = ggml_ext_cast_f32(ctx, iter->second); lokr_w1_a = ggml_ext_cast_f32(ctx, backend, iter->second);
} }
iter = lora_tensors.find(lokr_w1_b_name); iter = lora_tensors.find(lokr_w1_b_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
lokr_w1_b = ggml_ext_cast_f32(ctx, iter->second); lokr_w1_b = ggml_ext_cast_f32(ctx, backend, iter->second);
} }
if (lokr_w1_a == nullptr || lokr_w1_b == nullptr) { if (lokr_w1_a == nullptr || lokr_w1_b == nullptr) {
@ -410,12 +415,12 @@ struct LoraModel : public GGMLRunner {
if (lokr_w2 == nullptr) { if (lokr_w2 == nullptr) {
iter = lora_tensors.find(lokr_w2_a_name); iter = lora_tensors.find(lokr_w2_a_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
lokr_w2_a = ggml_ext_cast_f32(ctx, iter->second); lokr_w2_a = ggml_ext_cast_f32(ctx, backend, iter->second);
} }
iter = lora_tensors.find(lokr_w2_b_name); iter = lora_tensors.find(lokr_w2_b_name);
if (iter != lora_tensors.end()) { if (iter != lora_tensors.end()) {
lokr_w2_b = ggml_ext_cast_f32(ctx, iter->second); lokr_w2_b = ggml_ext_cast_f32(ctx, backend, iter->second);
} }
if (lokr_w2_a == nullptr || lokr_w2_b == nullptr) { if (lokr_w2_a == nullptr || lokr_w2_b == nullptr) {
@ -468,23 +473,23 @@ struct LoraModel : public GGMLRunner {
return updown; return updown;
} }
ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora_and_lokr = true) { ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_backend_t backend, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora_and_lokr = true) {
// lora // lora
ggml_tensor* diff = nullptr; ggml_tensor* diff = nullptr;
if (with_lora_and_lokr) { if (with_lora_and_lokr) {
diff = get_lora_weight_diff(model_tensor_name, ctx); diff = get_lora_weight_diff(model_tensor_name, ctx, backend);
} }
// diff // diff
if (diff == nullptr) { if (diff == nullptr) {
diff = get_raw_weight_diff(model_tensor_name, ctx); diff = get_raw_weight_diff(model_tensor_name, ctx, backend);
} }
// loha // loha
if (diff == nullptr) { if (diff == nullptr) {
diff = get_loha_weight_diff(model_tensor_name, ctx); diff = get_loha_weight_diff(model_tensor_name, ctx, backend);
} }
// lokr // lokr
if (diff == nullptr && with_lora_and_lokr) { if (diff == nullptr && with_lora_and_lokr) {
diff = get_lokr_weight_diff(model_tensor_name, ctx); diff = get_lokr_weight_diff(model_tensor_name, ctx, backend);
} }
if (diff != nullptr) { if (diff != nullptr) {
if (ggml_nelements(diff) < ggml_nelements(model_tensor)) { if (ggml_nelements(diff) < ggml_nelements(model_tensor)) {
@ -502,6 +507,7 @@ struct LoraModel : public GGMLRunner {
} }
ggml_tensor* get_out_diff(ggml_context* ctx, ggml_tensor* get_out_diff(ggml_context* ctx,
ggml_backend_t backend,
ggml_tensor* x, ggml_tensor* x,
WeightAdapter::ForwardParams forward_params, WeightAdapter::ForwardParams forward_params,
const std::string& model_tensor_name) { const std::string& model_tensor_name) {
@ -590,7 +596,7 @@ struct LoraModel : public GGMLRunner {
} }
scale_value *= multiplier; scale_value *= multiplier;
auto curr_out_diff = ggml_ext_lokr_forward(ctx, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value); auto curr_out_diff = ggml_ext_lokr_forward(ctx, backend, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value);
if (out_diff == nullptr) { if (out_diff == nullptr) {
out_diff = curr_out_diff; out_diff = curr_out_diff;
} else { } else {
@ -761,27 +767,27 @@ struct LoraModel : public GGMLRunner {
ggml_tensor* model_tensor = it.second; ggml_tensor* model_tensor = it.second;
// lora // lora
ggml_tensor* diff = get_weight_diff(model_tensor_name, compute_ctx, model_tensor); ggml_tensor* diff = get_weight_diff(model_tensor_name, runtime_backend, compute_ctx, model_tensor);
if (diff == nullptr) { if (diff == nullptr) {
continue; continue;
} }
ggml_tensor* original_tensor = model_tensor; ggml_tensor* original_tensor = model_tensor;
if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) { if (!sd_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
model_tensor = ggml_dup_tensor(compute_ctx, model_tensor); model_tensor = ggml_dup_tensor(compute_ctx, model_tensor);
set_backend_tensor_data(model_tensor, original_tensor->data); set_backend_tensor_data(model_tensor, original_tensor->data);
} }
ggml_tensor* final_tensor; ggml_tensor* final_tensor;
if (model_tensor->type != GGML_TYPE_F32 && model_tensor->type != GGML_TYPE_F16) { if (model_tensor->type != GGML_TYPE_F32 && model_tensor->type != GGML_TYPE_F16) {
final_tensor = ggml_ext_cast_f32(compute_ctx, model_tensor); final_tensor = ggml_ext_cast_f32(compute_ctx, runtime_backend, model_tensor);
final_tensor = ggml_add_inplace(compute_ctx, final_tensor, diff); final_tensor = ggml_add_inplace(compute_ctx, final_tensor, diff);
final_tensor = ggml_cpy(compute_ctx, final_tensor, model_tensor); final_tensor = ggml_cpy(compute_ctx, final_tensor, model_tensor);
} else { } else {
final_tensor = ggml_add_inplace(compute_ctx, model_tensor, diff); final_tensor = ggml_add_inplace(compute_ctx, model_tensor, diff);
} }
ggml_build_forward_expand(gf, final_tensor); ggml_build_forward_expand(gf, final_tensor);
if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) { if (!sd_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
original_tensor_to_final_tensor[original_tensor] = final_tensor; original_tensor_to_final_tensor[original_tensor] = final_tensor;
} }
} }
@ -841,34 +847,35 @@ public:
: lora_models(lora_models) { : lora_models(lora_models) {
} }
ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name, bool with_lora_and_lokr) { ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name, bool with_lora_and_lokr) {
for (auto& lora_model : lora_models) { for (auto& lora_model : lora_models) {
ggml_tensor* diff = lora_model->get_weight_diff(weight_name, ctx, weight, with_lora_and_lokr); ggml_tensor* diff = lora_model->get_weight_diff(weight_name, backend, ctx, weight, with_lora_and_lokr);
if (diff == nullptr) { if (diff == nullptr) {
continue; continue;
} }
if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) { if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
weight = ggml_ext_cast_f32(ctx, weight); weight = ggml_ext_cast_f32(ctx, backend, weight);
} }
weight = ggml_add(ctx, weight, diff); weight = ggml_add(ctx, weight, diff);
} }
return weight; return weight;
} }
ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) override { ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name) override {
return patch_weight(ctx, weight, weight_name, true); return patch_weight(ctx, backend, weight, weight_name, true);
} }
ggml_tensor* forward_with_lora(ggml_context* ctx, ggml_tensor* forward_with_lora(ggml_context* ctx,
ggml_backend_t backend,
ggml_tensor* x, ggml_tensor* x,
ggml_tensor* w, ggml_tensor* w,
ggml_tensor* b, ggml_tensor* b,
const std::string& prefix, const std::string& prefix,
WeightAdapter::ForwardParams forward_params) override { WeightAdapter::ForwardParams forward_params) override {
w = patch_weight(ctx, w, prefix + "weight", false); w = patch_weight(ctx, backend, w, prefix + "weight", false);
if (b) { if (b) {
b = patch_weight(ctx, b, prefix + "bias", false); b = patch_weight(ctx, backend, b, prefix + "bias", false);
} }
ggml_tensor* out; ggml_tensor* out;
if (forward_params.op_type == ForwardParams::op_type_t::OP_LINEAR) { if (forward_params.op_type == ForwardParams::op_type_t::OP_LINEAR) {
@ -890,7 +897,7 @@ public:
forward_params.conv2d.scale); forward_params.conv2d.scale);
} }
for (auto& lora_model : lora_models) { for (auto& lora_model : lora_models) {
ggml_tensor* out_diff = lora_model->get_out_diff(ctx, x, forward_params, prefix + "weight"); ggml_tensor* out_diff = lora_model->get_out_diff(ctx, backend, x, forward_params, prefix + "weight");
if (out_diff == nullptr) { if (out_diff == nullptr) {
continue; continue;
} }
@ -908,4 +915,4 @@ public:
} }
}; };
#endif // __LORA_HPP__ #endif // __SD_MODEL_ADAPTER_LORA_HPP__

View File

@ -1,10 +1,12 @@
#ifndef __PMI_HPP__ #ifndef __SD_MODEL_ADAPTER_PMID_HPP__
#define __PMI_HPP__ #define __SD_MODEL_ADAPTER_PMID_HPP__
#include "ggml_extend.hpp" #include "core/ggml_extend.hpp"
#include "clip.hpp" #include "model/adapter/lora.hpp"
#include "lora.hpp" #include "model/common/block.hpp"
#include "model/te/clip.hpp"
#include "model_loader.h"
struct FuseBlock : public GGMLBlock { struct FuseBlock : public GGMLBlock {
// network hparams // network hparams
@ -411,13 +413,13 @@ public:
public: public:
PhotoMakerIDEncoder(ggml_backend_t backend, PhotoMakerIDEncoder(ggml_backend_t backend,
bool offload_params_to_cpu, ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map, const String2TensorStorage& tensor_storage_map,
const std::string prefix, const std::string prefix,
SDVersion version = VERSION_SDXL, SDVersion version = VERSION_SDXL,
PMVersion pm_v = PM_VERSION_1, PMVersion pm_v = PM_VERSION_1,
float sty = 20.f) float sty = 20.f)
: GGMLRunner(backend, offload_params_to_cpu), : GGMLRunner(backend, params_backend),
version(version), version(version),
pm_version(pm_v), pm_version(pm_v),
style_strength(sty) { style_strength(sty) {
@ -568,11 +570,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
bool applied = false; bool applied = false;
PhotoMakerIDEmbed(ggml_backend_t backend, PhotoMakerIDEmbed(ggml_backend_t backend,
bool offload_params_to_cpu, ggml_backend_t params_backend,
ModelLoader* ml, ModelLoader* ml,
const std::string& file_path = "", const std::string& file_path = "",
const std::string& prefix = "") const std::string& prefix = "")
: file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) { : file_path(file_path), GGMLRunner(backend, params_backend), model_loader(ml) {
if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) { if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) {
load_failed = true; load_failed = true;
} }
@ -615,7 +617,10 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
}; };
model_loader->load_tensors(on_new_tensor_cb, n_threads); model_loader->load_tensors(on_new_tensor_cb, n_threads);
alloc_params_buffer(); if (!alloc_params_buffer()) {
LOG_ERROR("PhotoMaker ID embeds buffer allocation failed");
return false;
}
dry_run = false; dry_run = false;
model_loader->load_tensors(on_new_tensor_cb, n_threads); model_loader->load_tensors(on_new_tensor_cb, n_threads);
@ -633,4 +638,4 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
} }
}; };
#endif // __PMI_HPP__ #endif // __SD_MODEL_ADAPTER_PMID_HPP__

View File

@ -1,7 +1,9 @@
#ifndef __COMMON_BLOCK_HPP__ #ifndef __SD_MODEL_COMMON_BLOCK_HPP__
#define __COMMON_BLOCK_HPP__ #define __SD_MODEL_COMMON_BLOCK_HPP__
#include "ggml_extend.hpp" #include "core/ggml_extend.hpp"
#include "core/util.h"
#include "ggml-backend.h"
class DownSampleBlock : public GGMLBlock { class DownSampleBlock : public GGMLBlock {
protected: protected:
@ -225,6 +227,37 @@ public:
} }
}; };
// Two-layer perceptron block: Linear "fc1" -> GELU -> Linear "fc2".
// Fixed choices carried over from the reference implementation: the activation
// is always nn.GELU(approximate="tanh"), there is no norm layer, and conv mode
// is never used.
struct Mlp : public GGMLBlock {
public:
    // hidden_features / out_features of -1 mean "same as in_features".
    Mlp(int64_t in_features,
        int64_t hidden_features = -1,
        int64_t out_features    = -1,
        bool bias               = true) {
        const int64_t hidden_dim = (hidden_features == -1) ? in_features : hidden_features;
        const int64_t output_dim = (out_features == -1) ? in_features : out_features;

        blocks["fc1"] = std::shared_ptr<GGMLBlock>(new Linear(in_features, hidden_dim, bias));
        blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_dim, output_dim, bias));
    }

    // x: [N, n_token, in_features]
    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
        auto first_layer  = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
        auto second_layer = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);

        ggml_tensor* hidden = first_layer->forward(ctx, x);
        hidden              = ggml_ext_gelu(ctx->ggml_ctx, hidden, true);
        return second_layer->forward(ctx, hidden);
    }
};
class FeedForward : public GGMLBlock { class FeedForward : public GGMLBlock {
public: public:
enum class Activation { enum class Activation {
@ -248,9 +281,6 @@ public:
float scale = 1.f; float scale = 1.f;
if (precision_fix) { if (precision_fix) {
scale = 1.f / 128.f; scale = 1.f / 128.f;
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
} }
// The purpose of the scale here is to prevent NaN issues in certain situations. // The purpose of the scale here is to prevent NaN issues in certain situations.
// For example, when using Vulkan without enabling force_prec_f32, // For example, when using Vulkan without enabling force_prec_f32,
@ -264,6 +294,9 @@ public:
auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]); auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]); auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
if (sd_backend_is(ctx->backend, "Vulkan")) {
net_2->set_force_prec_f32(true);
}
x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim] x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out] x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]
@ -277,6 +310,7 @@ protected:
int64_t context_dim; int64_t context_dim;
int64_t n_head; int64_t n_head;
int64_t d_head; int64_t d_head;
bool xtra_dim = false;
public: public:
CrossAttention(int64_t query_dim, CrossAttention(int64_t query_dim,
@ -288,7 +322,11 @@ public:
query_dim(query_dim), query_dim(query_dim),
context_dim(context_dim) { context_dim(context_dim) {
int64_t inner_dim = d_head * n_head; int64_t inner_dim = d_head * n_head;
if (context_dim == 320 && d_head == 320) {
// LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
xtra_dim = true;
context_dim = 1024;
}
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false)); blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false)); blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false)); blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
@ -313,10 +351,16 @@ public:
int64_t n_context = context->ne[1]; int64_t n_context = context->ne[1];
int64_t inner_dim = d_head * n_head; int64_t inner_dim = d_head * n_head;
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim] auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
if (xtra_dim) {
// LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
context->ne[0] = 1024; // patch dim
}
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim] auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim] auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
if (xtra_dim) {
context->ne[0] = 320; // reset dim to orig
}
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim] x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim] x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
@ -590,4 +634,4 @@ public:
} }
}; };
#endif // __COMMON_BLOCK_HPP__ #endif // __SD_MODEL_COMMON_BLOCK_HPP__

View File

@ -1,12 +1,17 @@
#ifndef __ROPE_HPP__ #ifndef __SD_MODEL_COMMON_ROPE_HPP__
#define __ROPE_HPP__ #define __SD_MODEL_COMMON_ROPE_HPP__
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <vector> #include <vector>
#include "ggml_extend.hpp" #include "core/ggml_extend.hpp"
namespace Rope { namespace Rope {
// Selects the layout of the buffer returned by embed_nd.
enum class EmbedNDLayout {
// Default: embed_nd returns flatten(emb), the per-position embedding rows
// flattened into one buffer.
Matrix,
// embed_nd returns a buffer of bs * pos_len * head_dim * 2 floats instead:
// the first half holds, per position, each value read from emb[pos][4 * i]
// written twice in a row; the second half holds emb[pos][4 * i + 2] in the same
// arrangement (the code names these cos_val and sin_val).
ErnieImage,
};
template <class T> template <class T>
__STATIC_INLINE__ std::vector<T> linspace(T start, T end, int num) { __STATIC_INLINE__ std::vector<T> linspace(T start, T end, int num) {
std::vector<T> result(num); std::vector<T> result(num);
@ -106,6 +111,16 @@ namespace Rope {
return txt_ids; return txt_ids;
} }
// Builds bs * context_len id rows of `axes_dim_num` floats each.
// Every row is zero except columns 1 and 2, which both hold the token's index
// within its own sequence (row % context_len).
// NOTE(review): columns 1 and 2 are written unconditionally, so callers must
// pass axes_dim_num >= 3.
__STATIC_INLINE__ std::vector<std::vector<float>> gen_longcat_txt_ids(int bs, int context_len, int axes_dim_num) {
    const int row_count = bs * context_len;
    std::vector<std::vector<float>> rows(row_count, std::vector<float>(axes_dim_num, 0.0f));
    for (int row = 0; row < row_count; ++row) {
        std::vector<float>& id = rows[row];
        const float position   = static_cast<float>(row % context_len);
        id[1]                  = position;
        id[2]                  = position;
    }
    return rows;
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_img_ids(int h, __STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_img_ids(int h,
int w, int w,
int patch_size, int patch_size,
@ -117,7 +132,6 @@ namespace Rope {
bool scale_rope = false) { bool scale_rope = false) {
int h_len = (h + (patch_size / 2)) / patch_size; int h_len = (h + (patch_size / 2)) / patch_size;
int w_len = (w + (patch_size / 2)) / patch_size; int w_len = (w + (patch_size / 2)) / patch_size;
std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(axes_dim_num, 0.0)); std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(axes_dim_num, 0.0));
int h_start = h_offset; int h_start = h_offset;
@ -130,7 +144,6 @@ namespace Rope {
std::vector<float> row_ids = linspace<float>(1.f * h_start, 1.f * h_start + h_len - 1, h_len); std::vector<float> row_ids = linspace<float>(1.f * h_start, 1.f * h_start + h_len - 1, h_len);
std::vector<float> col_ids = linspace<float>(1.f * w_start, 1.f * w_start + w_len - 1, w_len); std::vector<float> col_ids = linspace<float>(1.f * w_start, 1.f * w_start + w_len - 1, w_len);
for (int i = 0; i < h_len; ++i) { for (int i = 0; i < h_len; ++i) {
for (int j = 0; j < w_len; ++j) { for (int j = 0; j < w_len; ++j) {
img_ids[i * w_len + j][0] = 1.f * index; img_ids[i * w_len + j][0] = 1.f * index;
@ -169,7 +182,8 @@ namespace Rope {
int bs, int bs,
const std::vector<float>& axis_thetas, const std::vector<float>& axis_thetas,
const std::vector<int>& axes_dim, const std::vector<int>& axes_dim,
const std::vector<std::vector<int>>& wrap_dims = {}) { const std::vector<std::vector<int>>& wrap_dims = {},
EmbedNDLayout layout = EmbedNDLayout::Matrix) {
std::vector<std::vector<float>> trans_ids = transpose(ids); std::vector<std::vector<float>> trans_ids = transpose(ids);
size_t pos_len = ids.size() / bs; size_t pos_len = ids.size() / bs;
size_t num_axes = axes_dim.size(); size_t num_axes = axes_dim.size();
@ -204,6 +218,24 @@ namespace Rope {
offset += rope_emb[0].size(); offset += rope_emb[0].size();
} }
if (layout == EmbedNDLayout::ErnieImage) {
int head_dim = emb_dim * 2;
std::vector<float> ernie_emb(bs * pos_len * head_dim * 2, 0.0f);
for (size_t pos_idx = 0; pos_idx < bs * pos_len; ++pos_idx) {
for (int i = 0; i < emb_dim; ++i) {
float cos_val = emb[pos_idx][4 * i];
float sin_val = emb[pos_idx][4 * i + 2];
size_t cos_offset = pos_idx * head_dim + 2 * i;
size_t sin_offset = bs * pos_len * head_dim + cos_offset;
ernie_emb[cos_offset] = cos_val;
ernie_emb[cos_offset + 1] = cos_val;
ernie_emb[sin_offset] = sin_val;
ernie_emb[sin_offset + 1] = sin_val;
}
}
return ernie_emb;
}
return flatten(emb); return flatten(emb);
} }
@ -211,22 +243,112 @@ namespace Rope {
int bs, int bs,
float theta, float theta,
const std::vector<int>& axes_dim, const std::vector<int>& axes_dim,
const std::vector<std::vector<int>>& wrap_dims = {}) { const std::vector<std::vector<int>>& wrap_dims = {},
EmbedNDLayout layout = EmbedNDLayout::Matrix) {
std::vector<float> axis_thetas(axes_dim.size(), theta); std::vector<float> axis_thetas(axes_dim.size(), theta);
return embed_nd(ids, bs, axis_thetas, axes_dim, wrap_dims); return embed_nd(ids, bs, axis_thetas, axes_dim, wrap_dims, layout);
}
// Builds a rope embedding in which three position axes share one head_dim by
// interleaving frequency slots.
//
// rope() is evaluated over the full head_dim for each of the first three id
// columns. The result starts as axis 0's embedding; then, for axis 1 and axis 2,
// the 4-float groups at frequency indices axis, axis + 3, axis + 6, ... below
// min(mrope_section[axis] * 3, head_dim / 2) are overwritten with that axis'
// values. The merged rows are returned flattened.
//
// Requires bs > 0, an even head_dim and at least three mrope_section entries.
__STATIC_INLINE__ std::vector<float> embed_interleaved_mrope(const std::vector<std::vector<float>>& ids,
                                                             int bs,
                                                             float theta,
                                                             int head_dim,
                                                             const std::vector<int>& mrope_section) {
    GGML_ASSERT(bs > 0);
    GGML_ASSERT(head_dim % 2 == 0);
    GGML_ASSERT(mrope_section.size() >= 3);

    std::vector<std::vector<float>> ids_by_axis = transpose(ids);
    const size_t total_positions                = bs * (ids.size() / bs);
    const int half_dim                          = head_dim / 2;

    std::vector<std::vector<std::vector<float>>> per_axis;
    per_axis.reserve(3);
    for (int a = 0; a < 3; ++a) {
        per_axis.push_back(rope(ids_by_axis[a], head_dim, theta));
    }

    std::vector<std::vector<float>> merged = per_axis[0];
    for (int a = 1; a < 3; ++a) {
        const int limit    = std::min<int>(mrope_section[a] * 3, half_dim);
        const auto& source = per_axis[a];
        for (size_t p = 0; p < total_positions; ++p) {
            for (int slot = a; slot < limit; slot += 3) {
                // Each frequency slot occupies four consecutive floats.
                const int base = 4 * slot;
                for (int k = 0; k < 4; ++k) {
                    merged[p][base + k] = source[p][base + k];
                }
            }
        }
    }

    return flatten(merged);
}
// Builds a 2-D rope embedding for a height x width grid with the x and y
// frequency groups interleaved.
//
// Positions along each axis are spread over [0, scale]: index / (size - 1) * scale,
// or 0 when that axis has a single entry. Each axis gets dim / 2 rope channels.
// When both reference grid sizes are positive and dim / 2 > 2, theta is scaled
// per axis by (size / ref_size) ^ (dim_axis / (dim_axis - 2)).
//
// Output: for every grid position (row-major), dim / 2 groups of 4 floats; even
// groups come from the x embedding and odd groups from the y embedding.
//
// `dim` must be a multiple of 4. This is checked with GGML_ASSERT, like the
// other helpers in this namespace, so the check also holds in NDEBUG builds.
__STATIC_INLINE__ std::vector<float> embed_2d_interleaved(int height,
                                                          int width,
                                                          int dim,
                                                          float theta    = 10000.f,
                                                          float scale    = 16.f,
                                                          int ref_grid_h = 0,
                                                          int ref_grid_w = 0) {
    GGML_ASSERT(dim % 4 == 0);

    int half_dim      = dim / 2;
    int dim_axis      = dim / 2;
    int axis_half_dim = dim_axis / 2;

    // Optional per-axis theta rescaling relative to a reference grid.
    float h_ntk = 1.f;
    float w_ntk = 1.f;
    if (ref_grid_h > 0 && ref_grid_w > 0 && dim_axis > 2) {
        float power = static_cast<float>(dim_axis) / static_cast<float>(dim_axis - 2);
        h_ntk       = std::pow(static_cast<float>(height) / static_cast<float>(ref_grid_h), power);
        w_ntk       = std::pow(static_cast<float>(width) / static_cast<float>(ref_grid_w), power);
    }

    // Per-position coordinates in row-major order.
    std::vector<float> x_pos;
    std::vector<float> y_pos;
    x_pos.reserve(static_cast<size_t>(height) * width);
    y_pos.reserve(static_cast<size_t>(height) * width);
    for (int iy = 0; iy < height; ++iy) {
        float y = height == 1 ? 0.f : scale * static_cast<float>(iy) / static_cast<float>(height - 1);
        for (int ix = 0; ix < width; ++ix) {
            float x = width == 1 ? 0.f : scale * static_cast<float>(ix) / static_cast<float>(width - 1);
            x_pos.push_back(x);
            y_pos.push_back(y);
        }
    }

    auto x_emb = rope(x_pos, dim_axis, theta * w_ntk);
    auto y_emb = rope(y_pos, dim_axis, theta * h_ntk);

    std::vector<float> out(static_cast<size_t>(height) * width * half_dim * 4);
    for (int pos = 0; pos < height * width; ++pos) {
        for (int i = 0; i < axis_half_dim; ++i) {
            // Group 2*i takes the x axis, group 2*i + 1 takes the y axis.
            int jx        = 2 * i;
            int jy        = 2 * i + 1;
            size_t base_x = static_cast<size_t>(pos) * half_dim * 4 + static_cast<size_t>(jx) * 4;
            size_t base_y = static_cast<size_t>(pos) * half_dim * 4 + static_cast<size_t>(jy) * 4;
            size_t axis   = static_cast<size_t>(i) * 4;
            for (int k = 0; k < 4; ++k) {
                out[base_x + k] = x_emb[pos][axis + k];
                out[base_y + k] = y_emb[pos][axis + k];
            }
        }
    }
    return out;
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size, __STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size,
int bs, int bs,
int axes_dim_num, int axes_dim_num,
int start_index,
const std::vector<ggml_tensor*>& ref_latents, const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index, bool increase_ref_index,
float ref_index_scale, float ref_index_scale,
bool scale_rope) { bool scale_rope,
int base_offset = 0) {
std::vector<std::vector<float>> ids; std::vector<std::vector<float>> ids;
int curr_h_offset = 0; int curr_h_offset = 0;
int curr_w_offset = 0; int curr_w_offset = 0;
int index = 1; int index = start_index;
for (ggml_tensor* ref : ref_latents) { for (ggml_tensor* ref : ref_latents) {
int h_offset = 0; int h_offset = 0;
int w_offset = 0; int w_offset = 0;
@ -245,8 +367,8 @@ namespace Rope {
bs, bs,
axes_dim_num, axes_dim_num,
static_cast<int>(index * ref_index_scale), static_cast<int>(index * ref_index_scale),
h_offset, h_offset + base_offset,
w_offset, w_offset + base_offset,
scale_rope); scale_rope);
ids = concat_ids(ids, ref_ids, bs); ids = concat_ids(ids, ref_ids, bs);
@ -269,13 +391,17 @@ namespace Rope {
std::set<int> txt_arange_dims, std::set<int> txt_arange_dims,
const std::vector<ggml_tensor*>& ref_latents, const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index, bool increase_ref_index,
float ref_index_scale) { float ref_index_scale,
auto txt_ids = gen_flux_txt_ids(bs, context_len, axes_dim_num, txt_arange_dims); bool is_longcat) {
auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num); int x_index = is_longcat ? 1 : 0;
auto txt_ids = is_longcat ? gen_longcat_txt_ids(bs, context_len, axes_dim_num) : gen_flux_txt_ids(bs, context_len, axes_dim_num, txt_arange_dims);
int offset = is_longcat ? context_len : 0;
auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, x_index, offset, offset);
auto ids = concat_ids(txt_ids, img_ids, bs); auto ids = concat_ids(txt_ids, img_ids, bs);
if (ref_latents.size() > 0) { if (ref_latents.size() > 0) {
auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, ref_index_scale, false); auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, x_index + 1, ref_latents, increase_ref_index, ref_index_scale, false, offset);
ids = concat_ids(ids, refs_ids, bs); ids = concat_ids(ids, refs_ids, bs);
} }
return ids; return ids;
@ -294,7 +420,8 @@ namespace Rope {
int theta, int theta,
bool circular_h, bool circular_h,
bool circular_w, bool circular_w,
const std::vector<int>& axes_dim) { const std::vector<int>& axes_dim,
bool is_longcat) {
std::vector<std::vector<float>> ids = gen_flux_ids(h, std::vector<std::vector<float>> ids = gen_flux_ids(h,
w, w,
patch_size, patch_size,
@ -304,7 +431,8 @@ namespace Rope {
txt_arange_dims, txt_arange_dims,
ref_latents, ref_latents,
increase_ref_index, increase_ref_index,
ref_index_scale); ref_index_scale,
is_longcat);
std::vector<std::vector<int>> wrap_dims; std::vector<std::vector<int>> wrap_dims;
if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) { if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) {
int h_len = (h + (patch_size / 2)) / patch_size; int h_len = (h + (patch_size / 2)) / patch_size;
@ -369,7 +497,7 @@ namespace Rope {
auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, 0, 0, 0, true); auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, 0, 0, 0, true);
auto ids = concat_ids(txt_ids_repeated, img_ids, bs); auto ids = concat_ids(txt_ids_repeated, img_ids, bs);
if (ref_latents.size() > 0) { if (ref_latents.size() > 0) {
auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, 1.f, true); auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, 1, ref_latents, increase_ref_index, 1.f, true);
ids = concat_ids(ids, refs_ids, bs); ids = concat_ids(ids, refs_ids, bs);
} }
return ids; return ids;
@ -437,6 +565,120 @@ namespace Rope {
return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims); return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
} }
// Builds the 3-axis position ids for a "lens" sequence: image tokens first, then
// text tokens, concatenated per batch element via concat_ids().
//
// Image ids come from gen_flux_img_ids() with patch size 1 and 3 axes.
// Text ids use the same scalar on all three axes, taken from
// linspace(start, start + context_len, context_len), where start is
// max(h / 2, w / 2) when scale_rope is set and 0 otherwise.
__STATIC_INLINE__ std::vector<std::vector<float>> gen_lens_ids(int h,
                                                               int w,
                                                               int bs,
                                                               int context_len,
                                                               bool scale_rope = true) {
    auto image_ids = gen_flux_img_ids(h, w, 1, bs, 3, 0, 0, 0, scale_rope);

    int text_start = 0;
    if (scale_rope) {
        text_start = std::max(h / 2, w / 2);
    }
    auto text_positions = linspace<float>(1.f * text_start, 1.f * context_len + text_start, context_len);

    // Replicate the text positions for every batch element.
    std::vector<std::vector<float>> text_ids(bs * context_len, std::vector<float>(3));
    const size_t per_batch = text_positions.size();
    for (int batch = 0; batch < bs; ++batch) {
        for (size_t token = 0; token < per_batch; ++token) {
            const float position                = text_positions[token];
            text_ids[batch * per_batch + token] = {position, position, position};
        }
    }
    return concat_ids(image_ids, text_ids, bs);
}
// Produces the flattened positional-embedding table for a "lens" sequence by
// running embed_nd() over the ids from gen_lens_ids() (scale_rope always on).
//
// When circular_h / circular_w is requested (and bs > 0, axes_dim has >= 3 axes),
// a per-axis wrap table is built: the first h * w positions (the image tokens)
// get wrap length h on axis 1 and/or w on axis 2; every other entry stays 0.
__STATIC_INLINE__ std::vector<float> gen_lens_pe(int h,
                                                 int w,
                                                 int bs,
                                                 int context_len,
                                                 int theta,
                                                 bool circular_h,
                                                 bool circular_w,
                                                 const std::vector<int>& axes_dim) {
    std::vector<std::vector<float>> ids = gen_lens_ids(h, w, bs, context_len, true);

    std::vector<std::vector<int>> wrap_dims;
    const bool wants_wrap = circular_h || circular_w;
    if (wants_wrap && bs > 0 && axes_dim.size() >= 3) {
        const size_t tokens_per_batch = ids.size() / bs;
        wrap_dims.assign(axes_dim.size(), std::vector<int>(tokens_per_batch, 0));

        const size_t image_tokens = static_cast<size_t>(h) * static_cast<size_t>(w);
        if (circular_h) {
            for (size_t token = 0; token < image_tokens; ++token) {
                wrap_dims[1][token] = h;
            }
        }
        if (circular_w) {
            for (size_t token = 0; token < image_tokens; ++token) {
                wrap_dims[2][token] = w;
            }
        }
    }
    return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
}
// Builds 3-axis position ids for an ERNIE-Image sequence: image tokens first,
// then text tokens, joined per batch element via concat_ids().
//
// Image token (row, col) on the (h / patch_size) x (w / patch_size) grid gets
//   [context_len, row_coord, col_coord]
// where the coordinates come from linspace(0, len - 1, len).
// Text token j gets [j, 0, 0].
__STATIC_INLINE__ std::vector<std::vector<float>> gen_ernie_image_ids(int h,
                                                                      int w,
                                                                      int patch_size,
                                                                      int bs,
                                                                      int context_len) {
    const int grid_rows = h / patch_size;
    const int grid_cols = w / patch_size;

    // Ids for a single image, row-major over the patch grid.
    std::vector<std::vector<float>> single_image(grid_rows * grid_cols, std::vector<float>(3, 0.0f));
    std::vector<float> row_coords = linspace<float>(0.f, static_cast<float>(grid_rows - 1), grid_rows);
    std::vector<float> col_coords = linspace<float>(0.f, static_cast<float>(grid_cols - 1), grid_cols);
    for (int row = 0; row < grid_rows; ++row) {
        for (int col = 0; col < grid_cols; ++col) {
            std::vector<float>& id = single_image[row * grid_cols + col];
            id[0]                  = static_cast<float>(context_len);
            id[1]                  = row_coords[row];
            id[2]                  = col_coords[col];
        }
    }

    // Same image ids for every batch element.
    std::vector<std::vector<float>> batched_image(bs * single_image.size(), std::vector<float>(3, 0.0f));
    const int tokens_per_image = static_cast<int>(single_image.size());
    for (int batch = 0; batch < bs; ++batch) {
        for (int token = 0; token < tokens_per_image; ++token) {
            batched_image[batch * single_image.size() + token] = single_image[token];
        }
    }

    // Text ids only advance along axis 0.
    std::vector<std::vector<float>> text_ids(bs * context_len, std::vector<float>(3, 0.0f));
    for (int batch = 0; batch < bs; ++batch) {
        for (int token = 0; token < context_len; ++token) {
            text_ids[batch * context_len + token][0] = static_cast<float>(token);
        }
    }
    return concat_ids(batched_image, text_ids, bs);
}
// Produces the flattened positional-embedding table for an ERNIE-Image sequence
// by running embed_nd() with the ErnieImage layout over gen_ernie_image_ids().
//
// When circular_h / circular_w is requested (and bs > 0, axes_dim has >= 3 axes,
// and the patch grid is non-empty), the image tokens - the first
// (h / patch_size) * (w / patch_size) positions - get the grid height as wrap
// length on axis 1 and/or the grid width on axis 2; all other entries stay 0.
__STATIC_INLINE__ std::vector<float> gen_ernie_image_pe(int h,
                                                        int w,
                                                        int patch_size,
                                                        int bs,
                                                        int context_len,
                                                        int theta,
                                                        bool circular_h,
                                                        bool circular_w,
                                                        const std::vector<int>& axes_dim) {
    std::vector<std::vector<float>> ids = gen_ernie_image_ids(h, w, patch_size, bs, context_len);

    std::vector<std::vector<int>> wrap_dims;
    const bool wants_wrap = circular_h || circular_w;
    if (wants_wrap && bs > 0 && axes_dim.size() >= 3) {
        const int grid_rows = h / patch_size;
        const int grid_cols = w / patch_size;
        if (grid_rows > 0 && grid_cols > 0) {
            const size_t tokens_per_batch = ids.size() / bs;
            wrap_dims.assign(axes_dim.size(), std::vector<int>(tokens_per_batch, 0));

            const size_t image_tokens = static_cast<size_t>(grid_rows) * static_cast<size_t>(grid_cols);
            if (circular_h) {
                for (size_t token = 0; token < image_tokens; ++token) {
                    wrap_dims[1][token] = grid_rows;
                }
            }
            if (circular_w) {
                for (size_t token = 0; token < image_tokens; ++token) {
                    wrap_dims[2][token] = grid_cols;
                }
            }
        }
    }
    return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims, EmbedNDLayout::ErnieImage);
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_vid_ids(int t, __STATIC_INLINE__ std::vector<std::vector<float>> gen_vid_ids(int t,
int h, int h,
int w, int w,
@ -660,4 +902,4 @@ namespace Rope {
} }
}; // namespace Rope }; // namespace Rope
#endif // __ROPE_HPP__ #endif // __SD_MODEL_COMMON_ROPE_HPP__

View File

@ -1,18 +1,61 @@
#ifndef __ANIMA_HPP__ #ifndef __SD_MODEL_DIFFUSION_ANIMA_HPP__
#define __ANIMA_HPP__ #define __SD_MODEL_DIFFUSION_ANIMA_HPP__
#include <algorithm>
#include <cmath> #include <cmath>
#include <memory> #include <memory>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "common_block.hpp" #include "model/common/block.hpp"
#include "flux.hpp" #include "model/common/rope.hpp"
#include "rope.hpp" #include "model/diffusion/flux.hpp"
#include "model/diffusion/model.hpp"
namespace Anima { namespace Anima {
constexpr int ANIMA_GRAPH_SIZE = 65536; constexpr int ANIMA_GRAPH_SIZE = 65536;
// Hyper-parameters of the Anima network. Every field has a built-in default;
// detect_from_weights() currently overrides only num_layers.
struct AnimaConfig {
    int64_t in_channels    = 16;
    int64_t out_channels   = 16;
    int64_t hidden_size    = 2048;
    int64_t text_embed_dim = 1024;
    int64_t num_heads      = 16;
    int64_t head_dim       = 128;
    int patch_size         = 2;
    int64_t num_layers     = 28;
    std::vector<int> axes_dim = {44, 42, 42};
    int theta              = 10000;

    // Derives the config from tensor names.
    //
    // Scans every tensor name for "<prefix>.blocks.<index>." (or "blocks.<index>."
    // when prefix is empty) and sets num_layers to the largest index + 1.
    // If no such name exists the defaults are returned untouched and nothing is logged.
    static AnimaConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
        AnimaConfig config;

        std::string block_marker = "blocks.";
        if (!prefix.empty()) {
            block_marker = prefix + ".blocks.";
        }

        int64_t layer_count = 0;
        for (const auto& entry : tensor_storage_map) {
            const std::string& tensor_name = entry.first;

            const size_t marker_pos = tensor_name.find(block_marker);
            if (marker_pos == std::string::npos) {
                continue;
            }
            // The block index is the text between the marker and the next '.'.
            const size_t index_begin = marker_pos + block_marker.size();
            const size_t index_end   = tensor_name.find('.', index_begin);
            if (index_end == std::string::npos) {
                continue;
            }
            const int64_t layer_id = atoll(tensor_name.substr(index_begin, index_end - index_begin).c_str());
            if (layer_id + 1 > layer_count) {
                layer_count = layer_id + 1;
            }
        }

        if (layer_count <= 0) {
            return config;
        }
        config.num_layers = layer_count;
        LOG_DEBUG("anima: num_layers = %" PRId64 ", hidden_size = %" PRId64 ", num_heads = %" PRId64 ", head_dim = %" PRId64,
                  config.num_layers,
                  config.hidden_size,
                  config.num_heads,
                  config.head_dim);
        return config;
    }
};
__STATIC_INLINE__ ggml_tensor* apply_gate(ggml_context* ctx, __STATIC_INLINE__ ggml_tensor* apply_gate(ggml_context* ctx,
ggml_tensor* x, ggml_tensor* x,
ggml_tensor* gate) { ggml_tensor* gate) {
@ -417,31 +460,22 @@ namespace Anima {
struct AnimaNet : public GGMLBlock { struct AnimaNet : public GGMLBlock {
public: public:
int64_t in_channels = 16; AnimaConfig config;
int64_t out_channels = 16;
int64_t hidden_size = 2048;
int64_t text_embed_dim = 1024;
int64_t num_heads = 16;
int64_t head_dim = 128;
int patch_size = 2;
int64_t num_layers = 28;
std::vector<int> axes_dim = {44, 42, 42};
int theta = 10000;
public: public:
AnimaNet() = default; AnimaNet() = default;
explicit AnimaNet(int64_t num_layers) explicit AnimaNet(AnimaConfig config)
: num_layers(num_layers) { : config(config) {
blocks["x_embedder"] = std::make_shared<XEmbedder>((in_channels + 1) * patch_size * patch_size, hidden_size); blocks["x_embedder"] = std::make_shared<XEmbedder>((config.in_channels + 1) * config.patch_size * config.patch_size, config.hidden_size);
blocks["t_embedder"] = std::make_shared<TimestepEmbedder>(hidden_size, hidden_size * 3); blocks["t_embedder"] = std::make_shared<TimestepEmbedder>(config.hidden_size, config.hidden_size * 3);
blocks["t_embedding_norm"] = std::make_shared<RMSNorm>(hidden_size, 1e-6f); blocks["t_embedding_norm"] = std::make_shared<RMSNorm>(config.hidden_size, 1e-6f);
for (int i = 0; i < num_layers; i++) { for (int i = 0; i < config.num_layers; i++) {
blocks["blocks." + std::to_string(i)] = std::make_shared<TransformerBlock>(hidden_size, blocks["blocks." + std::to_string(i)] = std::make_shared<TransformerBlock>(config.hidden_size,
text_embed_dim, config.text_embed_dim,
num_heads, config.num_heads,
head_dim); config.head_dim);
} }
blocks["final_layer"] = std::make_shared<FinalLayer>(hidden_size, patch_size, out_channels); blocks["final_layer"] = std::make_shared<FinalLayer>(config.hidden_size, config.patch_size, config.out_channels);
blocks["llm_adapter"] = std::make_shared<LLMAdapter>(1024, 1024, 1024, 6, 16); blocks["llm_adapter"] = std::make_shared<LLMAdapter>(1024, 1024, 1024, 6, 16);
} }
@ -468,11 +502,11 @@ namespace Anima {
auto padding_mask = ggml_ext_zeros(ctx->ggml_ctx, x->ne[0], x->ne[1], 1, x->ne[3]); auto padding_mask = ggml_ext_zeros(ctx->ggml_ctx, x->ne[0], x->ne[1], 1, x->ne[3]);
x = ggml_concat(ctx->ggml_ctx, x, padding_mask, 2); // [N, C + 1, H, W] x = ggml_concat(ctx->ggml_ctx, x, padding_mask, 2); // [N, C + 1, H, W]
x = DiT::pad_and_patchify(ctx, x, patch_size, patch_size); // [N, h*w, (C+1)*ph*pw] x = DiT::pad_and_patchify(ctx, x, config.patch_size, config.patch_size); // [N, h*w, (C+1)*ph*pw]
x = x_embedder->forward(ctx, x); x = x_embedder->forward(ctx, x);
auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(hidden_size)); auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(config.hidden_size));
auto temb = t_embedder->forward(ctx, timestep_proj); auto temb = t_embedder->forward(ctx, timestep_proj);
auto embedded_timestep = t_embedding_norm->forward(ctx, timestep_proj); auto embedded_timestep = t_embedding_norm->forward(ctx, timestep_proj);
@ -499,53 +533,40 @@ namespace Anima {
encoder_hidden_states = adapted_context; encoder_hidden_states = adapted_context;
} }
for (int i = 0; i < num_layers; i++) { sd::ggml_graph_cut::mark_graph_cut(x, "anima.prelude", "x");
sd::ggml_graph_cut::mark_graph_cut(embedded_timestep, "anima.prelude", "embedded_timestep");
sd::ggml_graph_cut::mark_graph_cut(temb, "anima.prelude", "temb");
sd::ggml_graph_cut::mark_graph_cut(encoder_hidden_states, "anima.prelude", "context");
for (int i = 0; i < config.num_layers; i++) {
auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["blocks." + std::to_string(i)]); auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["blocks." + std::to_string(i)]);
x = block->forward(ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe); x = block->forward(ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe);
sd::ggml_graph_cut::mark_graph_cut(x, "anima.blocks." + std::to_string(i), "x");
} }
x = final_layer->forward(ctx, x, embedded_timestep, temb); // [N, h*w, ph*pw*C] x = final_layer->forward(ctx, x, embedded_timestep, temb); // [N, h*w, ph*pw*C]
x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, false); // [N, C, H, W] x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, config.patch_size, config.patch_size, false); // [N, C, H, W]
return x; return x;
} }
}; };
struct AnimaRunner : public GGMLRunner { struct AnimaRunner : public DiffusionModelRunner {
public: public:
std::vector<float> image_pe_vec; std::vector<float> image_pe_vec;
std::vector<float> adapter_q_pe_vec; std::vector<float> adapter_q_pe_vec;
std::vector<float> adapter_k_pe_vec; std::vector<float> adapter_k_pe_vec;
AnimaConfig config;
AnimaNet net; AnimaNet net;
AnimaRunner(ggml_backend_t backend, AnimaRunner(ggml_backend_t backend,
bool offload_params_to_cpu, ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {}, const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model") const std::string prefix = "model.diffusion_model")
: GGMLRunner(backend, offload_params_to_cpu) { : DiffusionModelRunner(backend, params_backend, prefix),
int64_t num_layers = 0; config(AnimaConfig::detect_from_weights(tensor_storage_map, prefix + ".net")) {
std::string layer_tag = prefix + ".net.blocks."; net = AnimaNet(config);
for (const auto& kv : tensor_storage_map) {
const std::string& tensor_name = kv.first;
size_t pos = tensor_name.find(layer_tag);
if (pos == std::string::npos) {
continue;
}
size_t start = pos + layer_tag.size();
size_t end = tensor_name.find('.', start);
if (end == std::string::npos) {
continue;
}
int64_t layer_id = atoll(tensor_name.substr(start, end - start).c_str());
num_layers = std::max(num_layers, layer_id + 1);
}
if (num_layers <= 0) {
num_layers = 28;
}
LOG_INFO("anima net layers: %" PRId64, num_layers);
net = AnimaNet(num_layers);
net.init(params_ctx, tensor_storage_map, prefix + ".net"); net.init(params_ctx, tensor_storage_map, prefix + ".net");
} }
@ -553,7 +574,7 @@ namespace Anima {
return "anima"; return "anima";
} }
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
net.get_param_tensors(tensors, prefix + ".net"); net.get_param_tensors(tensors, prefix + ".net");
} }
@ -592,7 +613,8 @@ namespace Anima {
{}, {},
empty_ref_latents, empty_ref_latents,
false, false,
1.0f); 1.0f,
false);
std::vector<float> axis_thetas = { std::vector<float> axis_thetas = {
static_cast<float>(theta) * calc_ntk_factor(t_extrapolation_ratio, axes_dim[0]), static_cast<float>(theta) * calc_ntk_factor(t_extrapolation_ratio, axes_dim[0]),
@ -615,22 +637,22 @@ namespace Anima {
GGML_ASSERT(x->ne[3] == 1); GGML_ASSERT(x->ne[3] == 1);
ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE); ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE);
int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size; int64_t pad_h = (config.patch_size - x->ne[1] % config.patch_size) % config.patch_size;
int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size; int64_t pad_w = (config.patch_size - x->ne[0] % config.patch_size) % config.patch_size;
int64_t h_pad = x->ne[1] + pad_h; int64_t h_pad = x->ne[1] + pad_h;
int64_t w_pad = x->ne[0] + pad_w; int64_t w_pad = x->ne[0] + pad_w;
image_pe_vec = gen_anima_image_pe_vec(1, image_pe_vec = gen_anima_image_pe_vec(1,
static_cast<int>(h_pad), static_cast<int>(h_pad),
static_cast<int>(w_pad), static_cast<int>(w_pad),
static_cast<int>(net.patch_size), static_cast<int>(config.patch_size),
net.theta, config.theta,
net.axes_dim, config.axes_dim,
4.0f, 4.0f,
4.0f, 4.0f,
1.0f); 1.0f);
int64_t image_pos_len = static_cast<int64_t>(image_pe_vec.size()) / (2 * 2 * (net.head_dim / 2)); int64_t image_pos_len = static_cast<int64_t>(image_pe_vec.size()) / (2 * 2 * (config.head_dim / 2));
auto image_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, net.head_dim / 2, image_pos_len); auto image_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, image_pos_len);
set_backend_tensor_data(image_pe, image_pe_vec.data()); set_backend_tensor_data(image_pe, image_pe_vec.data());
ggml_tensor* adapter_q_pe = nullptr; ggml_tensor* adapter_q_pe = nullptr;
@ -677,7 +699,20 @@ namespace Anima {
}; };
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim()); return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
} }
// Generic DiffusionModelRunner entry point: unpacks DiffusionParams and forwards
// to the Anima-specific compute() overload.
//
// diffusion_params.x and diffusion_params.timesteps are required; context and the
// Anima extras (t5_ids, t5_weights) are passed through tensor_or_empty().
// The Anima extra block itself must be present, since its members are read here.
sd::Tensor<float> compute(int n_threads,
                          const DiffusionParams& diffusion_params) override {
    GGML_ASSERT(diffusion_params.x != nullptr);
    GGML_ASSERT(diffusion_params.timesteps != nullptr);
    const auto* extra = diffusion_extra_as<AnimaDiffusionExtra>(diffusion_params);
    // Fail loudly instead of dereferencing a null pointer below.
    GGML_ASSERT(extra != nullptr);
    return compute(n_threads,
                   *diffusion_params.x,
                   *diffusion_params.timesteps,
                   tensor_or_empty(diffusion_params.context),
                   tensor_or_empty(extra->t5_ids),
                   tensor_or_empty(extra->t5_weights));
}
}; };
} // namespace Anima } // namespace Anima
#endif // __ANIMA_HPP__ #endif // __SD_MODEL_DIFFUSION_ANIMA_HPP__

View File

@ -1,8 +1,8 @@
#ifndef __CONTROL_HPP__ #ifndef __SD_MODEL_DIFFUSION_CONTROL_HPP__
#define __CONTROL_HPP__ #define __SD_MODEL_DIFFUSION_CONTROL_HPP__
#include "common_block.hpp" #include "model/common/block.hpp"
#include "model.h" #include "model_loader.h"
#define CONTROL_NET_GRAPH_SIZE 1536 #define CONTROL_NET_GRAPH_SIZE 1536
@ -319,10 +319,10 @@ struct ControlNet : public GGMLRunner {
bool guided_hint_cached = false; bool guided_hint_cached = false;
ControlNet(ggml_backend_t backend, ControlNet(ggml_backend_t backend,
bool offload_params_to_cpu, ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {}, const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_SD1) SDVersion version = VERSION_SD1)
: GGMLRunner(backend, offload_params_to_cpu), control_net(version) { : GGMLRunner(backend, params_backend), control_net(version) {
control_net.init(params_ctx, tensor_storage_map, ""); control_net.init(params_ctx, tensor_storage_map, "");
} }
@ -457,7 +457,11 @@ struct ControlNet : public GGMLRunner {
bool load_from_file(const std::string& file_path, int n_threads) { bool load_from_file(const std::string& file_path, int n_threads) {
LOG_INFO("loading control net from '%s'", file_path.c_str()); LOG_INFO("loading control net from '%s'", file_path.c_str());
alloc_params_buffer(); if (!alloc_params_buffer()) {
LOG_ERROR("control net model buffer allocation failed");
return false;
}
std::map<std::string, ggml_tensor*> tensors; std::map<std::string, ggml_tensor*> tensors;
control_net.get_param_tensors(tensors); control_net.get_param_tensors(tensors);
std::set<std::string> ignore_tensors; std::set<std::string> ignore_tensors;
@ -480,4 +484,4 @@ struct ControlNet : public GGMLRunner {
} }
}; };
#endif // __CONTROL_HPP__ #endif // __SD_MODEL_DIFFUSION_CONTROL_HPP__

View File

@ -1,7 +1,7 @@
#ifndef __COMMON_DIT_HPP__ #ifndef __SD_MODEL_DIFFUSION_DIT_HPP__
#define __COMMON_DIT_HPP__ #define __SD_MODEL_DIFFUSION_DIT_HPP__
#include "ggml_extend.hpp" #include "core/ggml_extend.hpp"
namespace DiT { namespace DiT {
inline ggml_tensor* patchify(ggml_context* ctx, inline ggml_tensor* patchify(ggml_context* ctx,
@ -103,6 +103,64 @@ namespace DiT {
x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W] x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
return x; return x;
} }
// 3D (temporal + spatial) patchify.
// Splits x into non-overlapping pt x ph x pw patches and flattens each patch,
// together with its channels, into one token.
//   pt, ph, pw : patch extent along T, H and W; each must divide the matching extent exactly.
//   N          : batch size folded into the outermost dimension of x (x->ne[3] == N * C).
// Shape comments list dimensions outermost-first, i.e. in the reverse of ggml's ne[] order.
// The reshape/permute pairs below peel one patch axis at a time; their order is significant.
inline ggml_tensor* patchify(ggml_context* ctx,
ggml_tensor* x,
int pt,
int ph,
int pw,
int64_t N = 1) {
// x: [N*C, T, H, W]
// return: [N, t_len*h_len*w_len, C*pt*ph*pw]
int64_t C = x->ne[3] / N;
int64_t T = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
// Number of patches along each axis.
int64_t t_len = T / pt;
int64_t h_len = H / ph;
int64_t w_len = W / pw;
// Reject inputs whose outer dim is not a multiple of N or whose extents are not patch-aligned.
GGML_ASSERT(C * N == x->ne[3]);
GGML_ASSERT(t_len * pt == T && h_len * ph == H && w_len * pw == W);
x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt, t_len * C * N); // [N*C*t_len, pt, h_len*ph, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len, h_len*ph, pt, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph, h_len * t_len * C * N); // [N*C*t_len*h_len, ph, pt, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, pt, ph, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw, w_len, ph * pt, h_len * t_len * C * N); // [N*C*t_len*h_len, pt*ph, w_len, pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, w_len, pt*ph, pw]
x = ggml_reshape_4d(ctx, x, pw * ph * pt, w_len * h_len * t_len, C, N); // [N, C, t_len*h_len*w_len, pt*ph*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N, t_len*h_len*w_len, C, pt*ph*pw]
x = ggml_reshape_4d(ctx, x, pw * ph * pt * C, w_len * h_len * t_len, N, 1); // [N, t_len*h_len*w_len, C*pt*ph*pw]
return x;
}
// 3D (temporal + spatial) unpatchify: turns a token sequence back into a
// [N*C, T, H, W] volume.
//   t_len, h_len, w_len : number of patches along T, H and W.
//   pt, ph, pw          : patch extent along T, H and W.
// The channel count C is derived from the token width: x->ne[0] == C * pt * ph * pw.
// Shape comments list dimensions outermost-first, i.e. in the reverse of ggml's ne[] order.
// Note the token layout expected here keeps C innermost ([..., pt*ph*pw, C]).
// The reshape/permute pairs below restore one axis at a time; their order is significant.
inline ggml_tensor* unpatchify(ggml_context* ctx,
ggml_tensor* x,
int64_t t_len,
int64_t h_len,
int64_t w_len,
int pt,
int ph,
int pw) {
// x: [N, t_len*h_len*w_len, pt*ph*pw*C]
// return: [N*C, t_len*pt, h_len*ph, w_len*pw]
int64_t N = x->ne[3];
int64_t C = x->ne[0] / pt / ph / pw;
// Token width must be an exact multiple of the patch volume.
GGML_ASSERT(C * pt * ph * pw == x->ne[0]);
x = ggml_reshape_4d(ctx, x, C, pw * ph * pt, w_len * h_len * t_len, N); // [N, t_len*h_len*w_len, pt*ph*pw, C]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3)); // [N, C, t_len*h_len*w_len, pt*ph*pw]
x = ggml_reshape_4d(ctx, x, pw, ph * pt, w_len, h_len * t_len * C * N); // [N*C*t_len*h_len, w_len, pt*ph, pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, pt*ph, w_len, pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, ph, pt, h_len * t_len * C * N); // [N*C*t_len*h_len, pt, ph, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len*h_len, ph, pt, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph * h_len, t_len * C * N); // [N*C*t_len, h_len*ph, pt, w_len*pw]
x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3)); // [N*C*t_len, pt, h_len*ph, w_len*pw]
x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt * t_len, C * N); // [N*C, t_len*pt, h_len*ph, w_len*pw]
return x;
}
} // namespace DiT } // namespace DiT
#endif // __COMMON_DIT_HPP__ #endif // __SD_MODEL_DIFFUSION_DIT_HPP__

View File

@ -0,0 +1,458 @@
#ifndef __SD_MODEL_DIFFUSION_ERNIE_IMAGE_HPP__
#define __SD_MODEL_DIFFUSION_ERNIE_IMAGE_HPP__
#include <memory>
#include <vector>
#include "model/common/rope.hpp"
#include "model/diffusion/dit.hpp"
#include "model/diffusion/flux.hpp"
#include "model/diffusion/model.hpp"
#include "model/diffusion/qwen_image.hpp"
namespace ErnieImage {
constexpr int ERNIE_IMAGE_GRAPH_SIZE = 40960;
// Hyper-parameters of the ERNIE-Image diffusion transformer. Every field has a
// built-in default; detect_from_weights() overrides those it can read from tensors.
struct ErnieImageConfig {
    int64_t hidden_size       = 4096;
    int64_t num_heads         = 32;
    int64_t num_layers        = 36;
    int64_t ffn_hidden_size   = 12288;
    int64_t in_channels       = 128;
    int64_t out_channels      = 128;
    int patch_size            = 1;
    int64_t text_in_dim       = 3072;
    int theta                 = 256;
    std::vector<int> axes_dim = {32, 48, 48};
    int axes_dim_sum          = 128;  // recomputed as the sum of axes_dim by detect_from_weights()
    float eps                 = 1e-6f;

    // Derives the config from the tensors whose names start with `prefix`.
    //
    // Read from tensor shapes:
    //   x_embedder.proj.weight (4D)            -> patch_size, in_channels, hidden_size
    //   text_proj.weight (2D)                  -> text_in_dim
    //   layers.0.self_attention.norm_q.weight  -> head dim, hence num_heads
    //   layers.0.mlp.gate_proj.weight (2D)     -> ffn_hidden_size
    //   final_linear.weight (2D)               -> out_channels (= rows / patch_size^2)
    // num_layers is the largest "layers.<i>" index + 1, falling back to 36 when none is found.
    //
    // Values that depend on more than one tensor (num_heads, out_channels) are
    // resolved only after the whole map has been scanned, so the result does not
    // depend on the order in which the map yields its entries.
    static ErnieImageConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
        ErnieImageConfig config;
        config.num_layers         = 0;
        int64_t detected_head_dim = 0;
        bool has_final_linear     = false;
        int64_t final_out_dim     = 0;

        for (const auto& [name, tensor_storage] : tensor_storage_map) {
            if (!starts_with(name, prefix)) {
                continue;
            }
            if (ends_with(name, "x_embedder.proj.weight") && tensor_storage.n_dims == 4) {
                config.patch_size  = static_cast<int>(tensor_storage.ne[0]);
                config.in_channels = tensor_storage.ne[2];
                config.hidden_size = tensor_storage.ne[3];
            } else if (ends_with(name, "text_proj.weight") && tensor_storage.n_dims == 2) {
                config.text_in_dim = tensor_storage.ne[0];
            } else if (ends_with(name, "layers.0.self_attention.norm_q.weight")) {
                detected_head_dim = tensor_storage.ne[0];
            } else if (ends_with(name, "layers.0.mlp.gate_proj.weight") && tensor_storage.n_dims == 2) {
                config.ffn_hidden_size = tensor_storage.ne[1];
            } else if (ends_with(name, "final_linear.weight") && tensor_storage.n_dims == 2) {
                // Only remember the raw size here: patch_size may not have been
                // detected yet (e.g. a name-sorted map visits "final_linear"
                // before "x_embedder").
                has_final_linear = true;
                final_out_dim    = tensor_storage.ne[1];
            }

            // Track the highest transformer layer index seen.
            size_t pos = name.find("layers.");
            if (pos != std::string::npos) {
                auto items = split_string(name.substr(pos), '.');
                if (items.size() > 1) {
                    int block_index = atoi(items[1].c_str());
                    if (block_index + 1 > config.num_layers) {
                        config.num_layers = block_index + 1;
                    }
                }
            }
        }

        if (config.num_layers == 0) {
            config.num_layers = 36;
        }
        if (detected_head_dim > 0) {
            config.num_heads = config.hidden_size / detected_head_dim;
        }
        if (has_final_linear) {
            const int64_t patch_area = static_cast<int64_t>(config.patch_size) * config.patch_size;
            // Keep the default rather than dividing by zero on a malformed patch size.
            if (patch_area > 0) {
                config.out_channels = final_out_dim / patch_area;
            }
        }

        config.axes_dim_sum = 0;
        for (int axis_dim : config.axes_dim) {
            config.axes_dim_sum += axis_dim;
        }

        LOG_DEBUG("ernie_image: num_layers = %" PRId64 ", hidden_size = %" PRId64 ", num_heads = %" PRId64 ", ffn_hidden_size = %" PRId64 ", in_channels = %" PRId64 ", out_channels = %" PRId64,
                  config.num_layers,
                  config.hidden_size,
                  config.num_heads,
                  config.ffn_hidden_size,
                  config.in_channels,
                  config.out_channels);
        return config;
    }
};
// Timestep embedding with the two halves of ggml_ext_timestep_embedding() swapped:
// the second half of each row is placed first, the first half second.
// NOTE(review): the helper is assumed to emit [cos | sin], which makes this
// [sin | cos] - confirm against ggml_ext_timestep_embedding().
__STATIC_INLINE__ ggml_tensor* timestep_embedding_sin_cos(ggml_context* ctx,
                                                          ggml_tensor* timesteps,
                                                          int dim,
                                                          int max_period = 10000) {
    ggml_tensor* embedding = ggml_ext_timestep_embedding(ctx, timesteps, dim, max_period, 1.0f);

    const int64_t half_dim  = dim / 2;
    const int64_t rows      = embedding->ne[1];
    const size_t row_stride = embedding->nb[1];

    // Two views over the same rows: elements [0, half_dim) and [half_dim, 2 * half_dim).
    ggml_tensor* first_half  = ggml_view_2d(ctx, embedding, half_dim, rows, row_stride, 0);
    ggml_tensor* second_half = ggml_view_2d(ctx, embedding, half_dim, rows, row_stride, half_dim * embedding->nb[0]);

    return ggml_concat(ctx, second_half, first_half, 0);
}
// Applies a rotate-half style rotary embedding to the leading rot_dim features of x:
//   out = x_rot * cos + concat(-x2, x1) * sin
// where x1 / x2 are the first / second half of the rotated slice.
// Features beyond rot_dim (when rot_dim < head_dim) are appended unchanged.
__STATIC_INLINE__ ggml_tensor* apply_rotary_emb(ggml_context* ctx, ggml_tensor* x, ggml_tensor* pe) {
// x: [N, S, heads, head_dim]
// pe: [2, S, 1, head_dim], stored as ggml [head_dim, 1, S, 2].
int64_t head_dim = x->ne[0];
int64_t heads = x->ne[1];
int64_t S = x->ne[2];
int64_t N = x->ne[3];
// The innermost extent of pe decides how many features get rotated.
int64_t rot_dim = pe->ne[0];
GGML_ASSERT(rot_dim <= head_dim);
GGML_ASSERT(rot_dim % 2 == 0);
GGML_ASSERT(pe->ne[1] == 1 && pe->ne[2] == S && pe->ne[3] == 2);
x = ggml_cont(ctx, x);
// Split features into the rotated part and the optional pass-through tail.
auto x_rot = ggml_ext_slice(ctx, x, 0, 0, rot_dim, false);
auto x_pass = rot_dim < head_dim ? ggml_ext_slice(ctx, x, 0, rot_dim, head_dim, false) : nullptr;
int64_t half = rot_dim / 2;
// Views over the first and second half of the rotated features (same strides, different offset).
auto x1 = ggml_view_4d(ctx, x_rot, half, heads, S, N, x_rot->nb[1], x_rot->nb[2], x_rot->nb[3], 0);
auto x2 = ggml_view_4d(ctx, x_rot, half, heads, S, N, x_rot->nb[1], x_rot->nb[2], x_rot->nb[3], half * x_rot->nb[0]);
x1 = ggml_cont(ctx, x1);
x2 = ggml_cont(ctx, x2);
// rotate_half(x) = [-x2, x1]
auto rotated = ggml_concat(ctx, ggml_neg(ctx, x2), x1, 0);
// Index 0 / 1 along pe's outermost ggml dimension are used as the cos / sin tables.
auto cos_emb = ggml_ext_slice(ctx, pe, 3, 0, 1, false);
auto sin_emb = ggml_ext_slice(ctx, pe, 3, 1, 2, false);
auto out = ggml_add(ctx, ggml_mul(ctx, x_rot, cos_emb), ggml_mul(ctx, rotated, sin_emb));
if (x_pass != nullptr) {
// Re-attach the features that were not rotated.
out = ggml_concat(ctx, out, x_pass, 0);
}
return out;
}
struct ErnieImageAttention : public GGMLBlock {
int64_t num_heads;
int64_t head_dim;
ErnieImageAttention(int64_t query_dim,
int64_t heads,
int64_t dim_head,
float eps = 1e-6f)
: num_heads(heads), head_dim(dim_head) {
int64_t inner_dim = heads * dim_head;
blocks["to_q"] = std::make_shared<Linear>(query_dim, inner_dim, false);
blocks["to_k"] = std::make_shared<Linear>(query_dim, inner_dim, false);
blocks["to_v"] = std::make_shared<Linear>(query_dim, inner_dim, false);
blocks["norm_q"] = std::make_shared<RMSNorm>(dim_head, eps);
blocks["norm_k"] = std::make_shared<RMSNorm>(dim_head, eps);
blocks["to_out.0"] = std::make_shared<Linear>(inner_dim, query_dim, false);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* pe,
ggml_tensor* attention_mask = nullptr) {
// x: [N, S, hidden_size]
// pe: [S, head_dim/2, 2, 2], generated in image-token-first order.
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
auto to_k = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
auto to_v = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
auto norm_q = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
auto norm_k = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
int64_t S = x->ne[1];
int64_t N = x->ne[2];
auto q = to_q->forward(ctx, x);
auto k = to_k->forward(ctx, x);
auto v = to_v->forward(ctx, x);
q = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim, num_heads, S, N); // [N, S, heads, head_dim]
k = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_heads, S, N); // [N, S, heads, head_dim]
v = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_heads, S, N); // [N, S, heads, head_dim]
q = norm_q->forward(ctx, q);
k = norm_k->forward(ctx, k);
q = apply_rotary_emb(ctx->ggml_ctx, q, pe);
k = apply_rotary_emb(ctx->ggml_ctx, k, pe);
q = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, q, 0, 2, 1, 3)); // [N, heads, S, head_dim]
q = ggml_reshape_3d(ctx->ggml_ctx, q, q->ne[0], q->ne[1], q->ne[2] * q->ne[3]);
k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 0, 2, 1, 3)); // [N, heads, S, head_dim]
k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled); // [N, S, hidden_size]
x = to_out_0->forward(ctx, x);
return x;
}
};
// Gated feed-forward block: linear_fc2(up_proj(x) * gelu(gate_proj(x))).
struct ErnieImageFeedForward : public GGMLBlock {
public:
    ErnieImageFeedForward(int64_t hidden_size, int64_t ffn_hidden_size) {
        // Two parallel expansions (gate / up) followed by one contraction.
        blocks["gate_proj"]  = std::make_shared<Linear>(hidden_size, ffn_hidden_size, false);
        blocks["up_proj"]    = std::make_shared<Linear>(hidden_size, ffn_hidden_size, false);
        blocks["linear_fc2"] = std::make_shared<Linear>(ffn_hidden_size, hidden_size, false);
    }

    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
        auto gate_layer = std::dynamic_pointer_cast<Linear>(blocks["gate_proj"]);
        auto up_layer   = std::dynamic_pointer_cast<Linear>(blocks["up_proj"]);
        auto down_layer = std::dynamic_pointer_cast<Linear>(blocks["linear_fc2"]);

        // The gate branch is activated first, then it scales the up branch.
        auto activated_gate = ggml_ext_gelu(ctx->ggml_ctx, gate_layer->forward(ctx, x));
        auto expanded       = up_layer->forward(ctx, x);
        auto gated          = ggml_mul(ctx->ggml_ctx, expanded, activated_gate);
        return down_layer->forward(ctx, gated);
    }
};
struct ErnieImageSharedAdaLNBlock : public GGMLBlock {
public:
ErnieImageSharedAdaLNBlock(int64_t hidden_size,
int64_t num_heads,
int64_t ffn_hidden_size,
float eps = 1e-6f) {
blocks["adaLN_sa_ln"] = std::make_shared<RMSNorm>(hidden_size, eps);
blocks["self_attention"] = std::make_shared<ErnieImageAttention>(hidden_size,
num_heads,
hidden_size / num_heads,
eps);
blocks["adaLN_mlp_ln"] = std::make_shared<RMSNorm>(hidden_size, eps);
blocks["mlp"] = std::make_shared<ErnieImageFeedForward>(hidden_size, ffn_hidden_size);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* pe,
const std::vector<ggml_tensor*>& temb,
ggml_tensor* attention_mask = nullptr) {
// x: [N, image_tokens + text_tokens, hidden_size]
auto adaLN_sa_ln = std::dynamic_pointer_cast<RMSNorm>(blocks["adaLN_sa_ln"]);
auto self_attention = std::dynamic_pointer_cast<ErnieImageAttention>(blocks["self_attention"]);
auto adaLN_mlp_ln = std::dynamic_pointer_cast<RMSNorm>(blocks["adaLN_mlp_ln"]);
auto mlp = std::dynamic_pointer_cast<ErnieImageFeedForward>(blocks["mlp"]);
auto shift_msa = temb[0];
auto scale_msa = temb[1];
auto gate_msa = temb[2];
auto shift_mlp = temb[3];
auto scale_mlp = temb[4];
auto gate_mlp = temb[5];
auto residual = x;
x = adaLN_sa_ln->forward(ctx, x);
x = Flux::modulate(ctx->ggml_ctx, x, shift_msa, scale_msa, true);
auto attn_out = self_attention->forward(ctx, x, pe, attention_mask);
x = ggml_add(ctx->ggml_ctx, residual, ggml_mul(ctx->ggml_ctx, attn_out, gate_msa));
residual = x;
x = adaLN_mlp_ln->forward(ctx, x);
x = Flux::modulate(ctx->ggml_ctx, x, shift_mlp, scale_mlp, true);
x = ggml_add(ctx->ggml_ctx, residual, ggml_mul(ctx->ggml_ctx, mlp->forward(ctx, x), gate_mlp));
return x;
}
};
// Final adaptive norm: LayerNorm on x, modulated by a (scale, shift) pair that
// is projected from the conditioning tensor.
struct ErnieImageAdaLNContinuous : public GGMLBlock {
public:
    ErnieImageAdaLNContinuous(int64_t hidden_size, float eps = 1e-6f) {
        blocks["norm"]   = std::make_shared<LayerNorm>(hidden_size, eps, false);
        blocks["linear"] = std::make_shared<Linear>(hidden_size, hidden_size * 2, true);
    }

    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* conditioning) {
        auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
        auto projection = std::dynamic_pointer_cast<Linear>(blocks["linear"]);

        // The projection output is split in two along dim 0: first half is the
        // scale, second half is the shift.
        auto projected   = projection->forward(ctx, conditioning);
        auto scale_shift = ggml_ext_chunk(ctx->ggml_ctx, projected, 2, 0);

        auto normed = layer_norm->forward(ctx, x);
        return Flux::modulate(ctx->ggml_ctx, normed, scale_shift[1], scale_shift[0]);
    }
};
// ErnieImage diffusion transformer: patch-embeds the image, concatenates the
// image tokens with the (optionally projected) text tokens, runs the shared-AdaLN
// layers, and unpatchifies the image tokens back to an image-shaped tensor.
class ErnieImageModel : public GGMLBlock {
public:
    ErnieImageConfig config;

    ErnieImageModel() = default;

    ErnieImageModel(ErnieImageConfig config)
        : config(config) {
        // Non-overlapping patch embedding: kernel == stride == patch_size.
        blocks["x_embedder.proj"] = std::make_shared<Conv2d>(config.in_channels,
                                                             config.hidden_size,
                                                             std::pair<int, int>{config.patch_size, config.patch_size},
                                                             std::pair<int, int>{config.patch_size, config.patch_size},
                                                             std::pair<int, int>{0, 0},
                                                             std::pair<int, int>{1, 1},
                                                             true);
        // The text projection only exists when the text width differs from hidden_size.
        if (config.text_in_dim != config.hidden_size) {
            blocks["text_proj"] = std::make_shared<Linear>(config.text_in_dim, config.hidden_size, false);
        }
        blocks["time_embedding"]     = std::make_shared<Qwen::TimestepEmbedding>(config.hidden_size, config.hidden_size);
        blocks["adaLN_modulation.1"] = std::make_shared<Linear>(config.hidden_size, 6 * config.hidden_size, true);
        for (int i = 0; i < config.num_layers; i++) {
            blocks["layers." + std::to_string(i)] = std::make_shared<ErnieImageSharedAdaLNBlock>(config.hidden_size,
                                                                                                 config.num_heads,
                                                                                                 config.ffn_hidden_size,
                                                                                                 config.eps);
        }
        blocks["final_norm"]   = std::make_shared<ErnieImageAdaLNContinuous>(config.hidden_size, config.eps);
        blocks["final_linear"] = std::make_shared<Linear>(config.hidden_size,
                                                          config.patch_size * config.patch_size * config.out_channels,
                                                          true);
    }

    // Builds the forward graph.
    //   x:        [N, C, H, W]; H and W must be multiples of config.patch_size
    //   timestep: passed to timestep_embedding_sin_cos
    //   context:  [N, text_tokens, 3072]; must not be null
    //   pe:       [image_tokens + text_tokens, head_dim/2, 2, 2]
    // Returns the unpatchified output, [N, out_channels, H, W].
    ggml_tensor* forward(GGMLRunnerContext* ctx,
                         ggml_tensor* x,
                         ggml_tensor* timestep,
                         ggml_tensor* context,
                         ggml_tensor* pe) {
        GGML_ASSERT(context != nullptr);
        GGML_ASSERT(x->ne[1] % config.patch_size == 0 && x->ne[0] % config.patch_size == 0);

        int64_t W     = x->ne[0];
        int64_t H     = x->ne[1];
        int64_t Hp    = H / config.patch_size;
        int64_t Wp    = W / config.patch_size;
        int64_t n_img = Hp * Wp;
        int64_t N     = x->ne[3];

        auto x_embedder_proj = std::dynamic_pointer_cast<Conv2d>(blocks["x_embedder.proj"]);
        auto time_embedding  = std::dynamic_pointer_cast<Qwen::TimestepEmbedding>(blocks["time_embedding"]);
        auto adaLN_mod       = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
        auto final_norm      = std::dynamic_pointer_cast<ErnieImageAdaLNContinuous>(blocks["final_norm"]);
        auto final_linear    = std::dynamic_pointer_cast<Linear>(blocks["final_linear"]);

        auto img = x_embedder_proj->forward(ctx, x);                                             // [N, hidden_size, Hp, Wp]
        img      = ggml_reshape_3d(ctx->ggml_ctx, img, img->ne[0] * img->ne[1], img->ne[2], N);  // [N, hidden_size, image_tokens]
        img      = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3));  // [N, image_tokens, hidden_size]

        auto txt = context;
        // "text_proj" is registered by the constructor only under this same
        // condition. Checking it first avoids indexing `blocks` with a key that
        // was never registered, which would otherwise happen on every forward
        // call of a model without the projection.
        if (config.text_in_dim != config.hidden_size) {
            auto text_proj = std::dynamic_pointer_cast<Linear>(blocks["text_proj"]);
            if (text_proj) {
                txt = text_proj->forward(ctx, txt);
            }
        }

        // Image tokens come first, text tokens second.
        auto hidden_states = ggml_concat(ctx->ggml_ctx, img, txt, 1);  // [N, image_tokens + text_tokens, hidden_size]

        auto sample     = timestep_embedding_sin_cos(ctx->ggml_ctx, timestep, static_cast<int>(config.hidden_size));
        auto c          = time_embedding->forward(ctx, sample);                       // [N, hidden_size]
        auto mod_params = adaLN_mod->forward(ctx, ggml_silu(ctx->ggml_ctx, c));       // [N, 6 * hidden_size]

        sd::ggml_graph_cut::mark_graph_cut(hidden_states, "ernie_image.prelude", "hidden_states");
        // sd::ggml_graph_cut::mark_graph_cut(mod_params, "ernie_image.prelude", "mod_params");

        // One set of six modulation tensors is computed here and reused by every layer.
        auto chunks = ggml_ext_chunk(ctx->ggml_ctx, mod_params, 6, 0);
        std::vector<ggml_tensor*> temb;
        temb.reserve(6);
        for (auto chunk : chunks) {
            temb.push_back(ggml_reshape_3d(ctx->ggml_ctx, chunk, chunk->ne[0], 1, chunk->ne[1]));  // [N, 1, hidden_size]
        }

        for (int i = 0; i < config.num_layers; i++) {
            auto layer    = std::dynamic_pointer_cast<ErnieImageSharedAdaLNBlock>(blocks["layers." + std::to_string(i)]);
            hidden_states = layer->forward(ctx, hidden_states, pe, temb);
            sd::ggml_graph_cut::mark_graph_cut(hidden_states, "ernie_image.layers." + std::to_string(i), "hidden_states");
        }

        hidden_states = final_norm->forward(ctx, hidden_states, c);
        hidden_states = final_linear->forward(ctx, hidden_states);  // [N, image_tokens + text_tokens, p*p*out_channels]

        // Keep only the leading image tokens; the trailing text tokens are dropped.
        auto patches = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, 0, n_img);  // [N, image_tokens, p*p*out_channels]
        auto out     = DiT::unpatchify(ctx->ggml_ctx,
                                       patches,
                                       Hp,
                                       Wp,
                                       config.patch_size,
                                       config.patch_size,
                                       false);  // [N, out_channels, H, W]
        return out;
    }
};
// Runner that owns an ErnieImageModel, builds its compute graph for a given
// input, and exposes the DiffusionModelRunner compute entry points.
struct ErnieImageRunner : public DiffusionModelRunner {
    ErnieImageConfig config;
    ErnieImageModel ernie_image;
    std::vector<float> pe_vec;  // host-side positional-embedding data, refilled by build_graph

    ErnieImageRunner(ggml_backend_t backend,
                     ggml_backend_t params_backend,
                     const String2TensorStorage& tensor_storage_map = {},
                     const std::string prefix                       = "")
        : DiffusionModelRunner(backend, params_backend, prefix),
          config(ErnieImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
        // The model is built from the detected config, then its parameters are set up.
        ernie_image = ErnieImageModel(config);
        ernie_image.init(params_ctx, tensor_storage_map, prefix);
    }

    std::string get_desc() override {
        return "ernie_image";
    }

    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
        ernie_image.get_param_tensors(tensors, prefix);
    }

    // Creates the graph for one forward pass. Requires x->ne[3] == 1 and a
    // non-empty context tensor.
    ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
                             const sd::Tensor<float>& timesteps_tensor,
                             const sd::Tensor<float>& context_tensor) {
        ggml_cgraph* graph = new_graph_custom(ERNIE_IMAGE_GRAPH_SIZE);

        ggml_tensor* x_in  = make_input(x_tensor);
        ggml_tensor* t_in  = make_input(timesteps_tensor);
        GGML_ASSERT(x_in->ne[3] == 1);
        GGML_ASSERT(!context_tensor.empty());
        ggml_tensor* ctx_in = make_input(context_tensor);

        const int height      = static_cast<int>(x_in->ne[1]);
        const int width       = static_cast<int>(x_in->ne[0]);
        const int batch       = static_cast<int>(x_in->ne[3]);
        const int text_tokens = static_cast<int>(ctx_in->ne[1]);

        pe_vec = Rope::gen_ernie_image_pe(height,
                                          width,
                                          config.patch_size,
                                          batch,
                                          text_tokens,
                                          config.theta,
                                          circular_y_enabled,
                                          circular_x_enabled,
                                          config.axes_dim);

        // Number of positions is derived from the generated buffer size.
        const int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
        auto pe_tensor    = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, config.axes_dim_sum, 1, pos_len, 2);
        set_backend_tensor_data(pe_tensor, pe_vec.data());

        auto runner_ctx     = get_context();
        ggml_tensor* result = ernie_image.forward(&runner_ctx, x_in, t_in, ctx_in, pe_tensor);
        ggml_build_forward_expand(graph, result);
        return graph;
    }

    sd::Tensor<float> compute(int n_threads,
                              const sd::Tensor<float>& x,
                              const sd::Tensor<float>& timesteps,
                              const sd::Tensor<float>& context) {
        // Graph construction is deferred to the base runner through this callback.
        auto make_graph = [&]() -> ggml_cgraph* {
            return build_graph(x, timesteps, context);
        };
        auto raw_output = GGMLRunner::compute<float>(make_graph, n_threads, false);
        return restore_trailing_singleton_dims(raw_output, x.dim());
    }

    sd::Tensor<float> compute(int n_threads,
                              const DiffusionParams& diffusion_params) override {
        GGML_ASSERT(diffusion_params.x != nullptr);
        GGML_ASSERT(diffusion_params.timesteps != nullptr);
        return compute(n_threads,
                       *diffusion_params.x,
                       *diffusion_params.timesteps,
                       tensor_or_empty(diffusion_params.context));
    }
};
} // namespace ErnieImage
#endif // __SD_MODEL_DIFFUSION_ERNIE_IMAGE_HPP__

View File

@ -1,17 +1,167 @@
#ifndef __FLUX_HPP__ #ifndef __SD_MODEL_DIFFUSION_FLUX_HPP__
#define __FLUX_HPP__ #define __SD_MODEL_DIFFUSION_FLUX_HPP__
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "common_dit.hpp" #include "model/common/rope.hpp"
#include "model.h" #include "model/diffusion/dit.hpp"
#include "rope.hpp" #include "model/diffusion/model.hpp"
#include "model_loader.h"
#define FLUX_GRAPH_SIZE 10240 #define FLUX_GRAPH_SIZE 10240
namespace Flux { namespace Flux {
// Settings for the Chroma Radiance (VERSION_CHROMA_RADIANCE) NeRF head.
struct ChromaRadianceConfig {
// Passed to the NerfEmbedder, NerfGLUBlock and NerfFinalLayerConv constructors.
int64_t nerf_hidden_size = 64;
// Passed to each NerfGLUBlock constructor.
int nerf_mlp_ratio = 4;
// Number of "nerf_blocks.<i>" entries that are created and run.
int nerf_depth = 4;
// Passed to the NerfEmbedder constructor.
int nerf_max_freqs = 8;
// Set by weight detection when a tensor name contains "__x0__".
bool use_x0 = false;
// Set by weight detection when the img_in_patch kernel is half of patch_size.
bool fake_patch_size_x2 = false;
};
// Architecture settings for the Flux family. Defaults are plain Flux values;
// detect_from_weights() adjusts them per SDVersion and from tensor names/shapes.
struct FluxConfig {
    SDVersion version           = VERSION_FLUX;
    bool is_chroma              = false;  // true when a distilled_guidance_layer weight is present
    int patch_size              = 2;
    int64_t in_channels         = 64;
    int64_t out_channels        = 64;
    int64_t vec_in_dim          = 768;   // 0 disables the vector_in embedder
    int64_t context_in_dim      = 4096;  // overwritten from txt_in.weight when present
    int64_t hidden_size         = 3072;  // overwritten from txt_in.weight when present
    float mlp_ratio             = 4.0f;
    int num_heads               = 24;  // recomputed as hidden_size / head_dim when head_dim is detected
    int depth                   = 19;  // number of double_blocks
    int depth_single_blocks     = 38;  // number of single_blocks
    std::vector<int> axes_dim   = {16, 56, 56};
    int axes_dim_sum            = 128;  // sum of axes_dim; recomputed by detect_from_weights
    int theta                   = 10000;
    bool qkv_bias               = true;
    bool guidance_embed         = true;  // true when a guidance_in weight is present
    int64_t in_dim              = 64;
    bool disable_bias           = false;
    bool share_modulation       = false;
    bool semantic_txt_norm      = false;
    bool use_yak_mlp            = false;
    bool use_mlp_silu_act       = false;
    float ref_index_scale       = 1.f;
    ChromaRadianceConfig chroma_radiance_params;

    // Returns the integer that directly follows the first occurrence of `marker`
    // in `name` (e.g. "x.double_blocks.7.y" with marker "double_blocks." -> 7),
    // or -1 when `marker` does not occur. Non-numeric text parses as 0 (atoi).
    static int parse_block_index(const std::string& name, const std::string& marker) {
        const size_t marker_pos = name.find(marker);
        if (marker_pos == std::string::npos) {
            return -1;
        }
        const size_t index_begin = marker_pos + marker.size();
        const size_t index_end   = name.find('.', index_begin);
        // substr takes a COUNT, not an end position.
        const size_t count = (index_end == std::string::npos) ? std::string::npos : index_end - index_begin;
        return atoi(name.substr(index_begin, count).c_str());
    }

    // Builds a config for `version`, then refines it by scanning every tensor
    // whose name starts with `prefix`:
    //   - block depths come from the highest double_blocks./single_blocks. index,
    //   - context_in_dim / hidden_size come from txt_in.weight,
    //   - num_heads comes from the key_norm scale length,
    //   - Chroma / guidance / radiance flags come from marker tensor names.
    static FluxConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
                                          const std::string& prefix,
                                          SDVersion version = VERSION_FLUX) {
        FluxConfig config;
        config.version             = version;
        // These three are detected below, so start from "absent".
        config.guidance_embed      = false;
        config.depth               = 0;
        config.depth_single_blocks = 0;

        if (version == VERSION_FLUX_FILL) {
            config.in_channels = 384;
        } else if (version == VERSION_FLUX_CONTROLS) {
            config.in_channels = 128;
        } else if (version == VERSION_FLEX_2) {
            config.in_channels = 196;
        } else if (version == VERSION_CHROMA_RADIANCE) {
            config.in_channels = 3;
            config.patch_size  = 16;
        } else if (version == VERSION_OVIS_IMAGE) {
            config.semantic_txt_norm = true;
            config.use_yak_mlp       = true;
            config.vec_in_dim        = 0;
        } else if (sd_version_is_flux2(version)) {
            config.in_channels      = 128;
            config.patch_size       = 1;
            config.out_channels     = 128;
            config.mlp_ratio        = 3.f;
            config.theta            = 2000;
            config.axes_dim         = {32, 32, 32, 32};
            config.vec_in_dim       = 0;
            config.qkv_bias         = false;
            config.disable_bias     = true;
            config.share_modulation = true;
            config.ref_index_scale  = 10.f;
            config.use_mlp_silu_act = true;
        } else if (sd_version_is_longcat(version)) {
            config.context_in_dim = 3584;
            config.vec_in_dim     = 0;
        }

        int64_t head_dim                   = 0;
        int64_t actual_radiance_patch_size = -1;
        for (const auto& [name, tensor_storage] : tensor_storage_map) {
            if (!starts_with(name, prefix)) {
                continue;
            }
            if (name.find("guidance_in.in_layer.weight") != std::string::npos) {
                config.guidance_embed = true;
            }
            if (name.find("__x0__") != std::string::npos) {
                LOG_DEBUG("using x0 prediction");
                config.chroma_radiance_params.use_x0 = true;
            }
            if (name.find("__32x32__") != std::string::npos) {
                LOG_DEBUG("using patch size 32");
                config.patch_size = 32;
            }
            if (name.find("img_in_patch.weight") != std::string::npos) {
                actual_radiance_patch_size = tensor_storage.ne[0];
                LOG_DEBUG("actual radiance patch size: %" PRId64, actual_radiance_patch_size);
            }
            if (name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
                config.is_chroma = true;
            }

            // Depth is the highest block index seen, plus one.
            const int double_index = parse_block_index(name, "double_blocks.");
            if (double_index >= 0 && double_index + 1 > config.depth) {
                config.depth = double_index + 1;
            }
            const int single_index = parse_block_index(name, "single_blocks.");
            if (single_index >= 0 && single_index + 1 > config.depth_single_blocks) {
                config.depth_single_blocks = single_index + 1;
            }

            if (ends_with(name, "txt_in.weight")) {
                config.context_in_dim = tensor_storage.ne[0];
                config.hidden_size    = tensor_storage.ne[1];
            }
            if (ends_with(name, "single_blocks.0.norm.key_norm.scale")) {
                head_dim = tensor_storage.ne[0];
            }
            if (ends_with(name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
                head_dim = tensor_storage.ne[0];
            }
        }

        // A stored patch kernel that differs from patch_size is only accepted
        // when it is exactly half of it ("fake x2" patch size).
        if (actual_radiance_patch_size > 0 && actual_radiance_patch_size != config.patch_size) {
            GGML_ASSERT(config.patch_size == 2 * actual_radiance_patch_size);
            LOG_DEBUG("using fake x2 patch size");
            config.chroma_radiance_params.fake_patch_size_x2 = true;
        }
        if (head_dim > 0) {
            config.num_heads = static_cast<int>(config.hidden_size / head_dim);
        }

        config.axes_dim_sum = 0;
        for (int axis_dim : config.axes_dim) {
            config.axes_dim_sum += axis_dim;
        }

        LOG_DEBUG("flux: depth = %d, depth_single_blocks = %d, guidance_embed = %s, context_in_dim = %" PRId64 ", hidden_size = %" PRId64 ", num_heads = %d",
                  config.depth,
                  config.depth_single_blocks,
                  config.guidance_embed ? "true" : "false",
                  config.context_in_dim,
                  config.hidden_size,
                  config.num_heads);
        return config;
    }
};
struct MLPEmbedder : public UnaryBlock { struct MLPEmbedder : public UnaryBlock {
public: public:
MLPEmbedder(int64_t in_dim, int64_t hidden_dim, bool bias = true) { MLPEmbedder(int64_t in_dim, int64_t hidden_dim, bool bias = true) {
@ -446,7 +596,6 @@ namespace Flux {
if (use_yak_mlp || use_mlp_silu_act) { if (use_yak_mlp || use_mlp_silu_act) {
mlp_mult_factor = 2; mlp_mult_factor = 2;
} }
blocks["linear1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias)); blocks["linear1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
blocks["linear2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size + mlp_hidden_dim, hidden_size, mlp_proj_bias)); blocks["linear2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size + mlp_hidden_dim, hidden_size, mlp_proj_bias));
blocks["norm"] = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim)); blocks["norm"] = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim));
@ -723,127 +872,90 @@ namespace Flux {
} }
}; };
// Settings for the Chroma Radiance (VERSION_CHROMA_RADIANCE) NeRF head.
struct ChromaRadianceParams {
// Passed to the NerfEmbedder, NerfGLUBlock and NerfFinalLayerConv constructors.
int64_t nerf_hidden_size = 64;
// Passed to each NerfGLUBlock constructor.
int nerf_mlp_ratio = 4;
// Number of "nerf_blocks.<i>" entries that are created and run.
int nerf_depth = 4;
// Passed to the NerfEmbedder constructor.
int nerf_max_freqs = 8;
// When true, the output goes through _apply_x0_residual.
bool use_x0 = false;
// When true, the img_in_patch kernel and stride are patch_size / 2.
bool fake_patch_size_x2 = false;
};
// Architecture settings consumed by the Flux block constructor and forward pass.
struct FluxParams {
SDVersion version = VERSION_FLUX;
// When true, a "distilled_guidance_layer" replaces time_in/vector_in/guidance_in.
bool is_chroma = false;
// Patch edge length used for padding/patchifying and the radiance conv kernel.
int patch_size = 2;
int64_t in_channels = 64;
int64_t out_channels = 64;
// vector_in is only created and applied when this is > 0.
int64_t vec_in_dim = 768;
// Input width of txt_in (and of txt_norm when semantic_txt_norm is set).
int64_t context_in_dim = 4096;
int64_t hidden_size = 3072;
float mlp_ratio = 4.0f;
int num_heads = 24;
// Number of "double_blocks.<i>" entries.
int depth = 19;
// Number of "single_blocks.<i>" entries.
int depth_single_blocks = 38;
std::vector<int> axes_dim = {16, 56, 56};
int axes_dim_sum = 128;
int theta = 10000;
bool qkv_bias = true;
// When true, "guidance_in" is created and forward requires a guidance tensor.
bool guidance_embed = true;
// Passed to the ChromaApproximator constructor.
int64_t in_dim = 64;
// Negated and passed as the bias flag of most sub-blocks.
bool disable_bias = false;
// When true, shared double/single stream Modulation blocks are created.
bool share_modulation = false;
// When true, txt goes through an RMSNorm ("txt_norm") before txt_in.
bool semantic_txt_norm = false;
bool use_yak_mlp = false;
bool use_mlp_silu_act = false;
float ref_index_scale = 1.f;
ChromaRadianceParams chroma_radiance_params;
};
struct Flux : public GGMLBlock { struct Flux : public GGMLBlock {
public: public:
FluxParams params; FluxConfig config;
Flux() {} Flux() {}
Flux(FluxParams params) Flux(FluxConfig config)
: params(params) { : config(config) {
if (params.version == VERSION_CHROMA_RADIANCE) { if (config.version == VERSION_CHROMA_RADIANCE) {
std::pair<int, int> kernel_size = {params.patch_size, params.patch_size}; std::pair<int, int> kernel_size = {config.patch_size, config.patch_size};
if (params.chroma_radiance_params.fake_patch_size_x2) { if (config.chroma_radiance_params.fake_patch_size_x2) {
kernel_size = {params.patch_size / 2, params.patch_size / 2}; kernel_size = {config.patch_size / 2, config.patch_size / 2};
} }
std::pair<int, int> stride = kernel_size; std::pair<int, int> stride = kernel_size;
blocks["img_in_patch"] = std::make_shared<Conv2d>(params.in_channels, blocks["img_in_patch"] = std::make_shared<Conv2d>(config.in_channels,
params.hidden_size, config.hidden_size,
kernel_size, kernel_size,
stride); stride);
} else { } else {
blocks["img_in"] = std::make_shared<Linear>(params.in_channels, params.hidden_size, !params.disable_bias); blocks["img_in"] = std::make_shared<Linear>(config.in_channels, config.hidden_size, !config.disable_bias);
} }
if (params.is_chroma) { if (config.is_chroma) {
blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(params.in_dim, params.hidden_size); blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(config.in_dim, config.hidden_size);
} else { } else {
blocks["time_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size, !params.disable_bias); blocks["time_in"] = std::make_shared<MLPEmbedder>(256, config.hidden_size, !config.disable_bias);
if (params.vec_in_dim > 0) { if (config.vec_in_dim > 0) {
blocks["vector_in"] = std::make_shared<MLPEmbedder>(params.vec_in_dim, params.hidden_size, !params.disable_bias); blocks["vector_in"] = std::make_shared<MLPEmbedder>(config.vec_in_dim, config.hidden_size, !config.disable_bias);
} }
if (params.guidance_embed) { if (config.guidance_embed) {
blocks["guidance_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size, !params.disable_bias); blocks["guidance_in"] = std::make_shared<MLPEmbedder>(256, config.hidden_size, !config.disable_bias);
} }
} }
if (params.semantic_txt_norm) { if (config.semantic_txt_norm) {
blocks["txt_norm"] = std::make_shared<RMSNorm>(params.context_in_dim); blocks["txt_norm"] = std::make_shared<RMSNorm>(config.context_in_dim);
} }
blocks["txt_in"] = std::make_shared<Linear>(params.context_in_dim, params.hidden_size, !params.disable_bias); blocks["txt_in"] = std::make_shared<Linear>(config.context_in_dim, config.hidden_size, !config.disable_bias);
for (int i = 0; i < params.depth; i++) { for (int i = 0; i < config.depth; i++) {
blocks["double_blocks." + std::to_string(i)] = std::make_shared<DoubleStreamBlock>(params.hidden_size, blocks["double_blocks." + std::to_string(i)] = std::make_shared<DoubleStreamBlock>(config.hidden_size,
params.num_heads, config.num_heads,
params.mlp_ratio, config.mlp_ratio,
i, i,
params.qkv_bias, config.qkv_bias,
params.is_chroma, config.is_chroma,
params.share_modulation, config.share_modulation,
!params.disable_bias, !config.disable_bias,
params.use_yak_mlp, config.use_yak_mlp,
params.use_mlp_silu_act); config.use_mlp_silu_act);
} }
for (int i = 0; i < params.depth_single_blocks; i++) { for (int i = 0; i < config.depth_single_blocks; i++) {
blocks["single_blocks." + std::to_string(i)] = std::make_shared<SingleStreamBlock>(params.hidden_size, blocks["single_blocks." + std::to_string(i)] = std::make_shared<SingleStreamBlock>(config.hidden_size,
params.num_heads, config.num_heads,
params.mlp_ratio, config.mlp_ratio,
i, i,
0.f, 0.f,
params.is_chroma, config.is_chroma,
params.share_modulation, config.share_modulation,
!params.disable_bias, !config.disable_bias,
params.use_yak_mlp, config.use_yak_mlp,
params.use_mlp_silu_act); config.use_mlp_silu_act);
} }
if (params.version == VERSION_CHROMA_RADIANCE) { if (config.version == VERSION_CHROMA_RADIANCE) {
blocks["nerf_image_embedder"] = std::make_shared<NerfEmbedder>(params.in_channels, blocks["nerf_image_embedder"] = std::make_shared<NerfEmbedder>(config.in_channels,
params.chroma_radiance_params.nerf_hidden_size, config.chroma_radiance_params.nerf_hidden_size,
params.chroma_radiance_params.nerf_max_freqs); config.chroma_radiance_params.nerf_max_freqs);
for (int i = 0; i < params.chroma_radiance_params.nerf_depth; i++) { for (int i = 0; i < config.chroma_radiance_params.nerf_depth; i++) {
blocks["nerf_blocks." + std::to_string(i)] = std::make_shared<NerfGLUBlock>(params.hidden_size, blocks["nerf_blocks." + std::to_string(i)] = std::make_shared<NerfGLUBlock>(config.hidden_size,
params.chroma_radiance_params.nerf_hidden_size, config.chroma_radiance_params.nerf_hidden_size,
params.chroma_radiance_params.nerf_mlp_ratio); config.chroma_radiance_params.nerf_mlp_ratio);
} }
blocks["nerf_final_layer_conv"] = std::make_shared<NerfFinalLayerConv>(params.chroma_radiance_params.nerf_hidden_size, blocks["nerf_final_layer_conv"] = std::make_shared<NerfFinalLayerConv>(config.chroma_radiance_params.nerf_hidden_size,
params.in_channels); config.in_channels);
} else { } else {
blocks["final_layer"] = std::make_shared<LastLayer>(params.hidden_size, 1, params.out_channels, params.is_chroma, !params.disable_bias); blocks["final_layer"] = std::make_shared<LastLayer>(config.hidden_size, 1, config.out_channels, config.is_chroma, !config.disable_bias);
} }
if (params.share_modulation) { if (config.share_modulation) {
blocks["double_stream_modulation_img"] = std::make_shared<Modulation>(params.hidden_size, true, !params.disable_bias); blocks["double_stream_modulation_img"] = std::make_shared<Modulation>(config.hidden_size, true, !config.disable_bias);
blocks["double_stream_modulation_txt"] = std::make_shared<Modulation>(params.hidden_size, true, !params.disable_bias); blocks["double_stream_modulation_txt"] = std::make_shared<Modulation>(config.hidden_size, true, !config.disable_bias);
blocks["single_stream_modulation"] = std::make_shared<Modulation>(params.hidden_size, false, !params.disable_bias); blocks["single_stream_modulation"] = std::make_shared<Modulation>(config.hidden_size, false, !config.disable_bias);
} }
} }
@ -866,7 +978,7 @@ namespace Flux {
ggml_tensor* vec; ggml_tensor* vec;
ggml_tensor* txt_img_mask = nullptr; ggml_tensor* txt_img_mask = nullptr;
if (params.is_chroma) { if (config.is_chroma) {
int64_t mod_index_length = 344; int64_t mod_index_length = 344;
auto approx = std::dynamic_pointer_cast<ChromaApproximator>(blocks["distilled_guidance_layer"]); auto approx = std::dynamic_pointer_cast<ChromaApproximator>(blocks["distilled_guidance_layer"]);
auto distill_timestep = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 16, 10000, 1000.f); auto distill_timestep = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 16, 10000, 1000.f);
@ -894,7 +1006,7 @@ namespace Flux {
} else { } else {
auto time_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]); auto time_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
vec = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f)); vec = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f));
if (params.guidance_embed) { if (config.guidance_embed) {
GGML_ASSERT(guidance != nullptr); GGML_ASSERT(guidance != nullptr);
auto guidance_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["guidance_in"]); auto guidance_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["guidance_in"]);
// bf16 and fp16 result is different // bf16 and fp16 result is different
@ -902,7 +1014,7 @@ namespace Flux {
vec = ggml_add(ctx->ggml_ctx, vec, guidance_in->forward(ctx, g_in)); vec = ggml_add(ctx->ggml_ctx, vec, guidance_in->forward(ctx, g_in));
} }
if (params.vec_in_dim > 0) { if (config.vec_in_dim > 0) {
auto vector_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["vector_in"]); auto vector_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["vector_in"]);
vec = ggml_add(ctx->ggml_ctx, vec, vector_in->forward(ctx, y)); vec = ggml_add(ctx->ggml_ctx, vec, vector_in->forward(ctx, y));
} }
@ -911,7 +1023,7 @@ namespace Flux {
std::vector<ModulationOut> ds_img_mods; std::vector<ModulationOut> ds_img_mods;
std::vector<ModulationOut> ds_txt_mods; std::vector<ModulationOut> ds_txt_mods;
std::vector<ModulationOut> ss_mods; std::vector<ModulationOut> ss_mods;
if (params.share_modulation) { if (config.share_modulation) {
auto double_stream_modulation_img = std::dynamic_pointer_cast<Modulation>(blocks["double_stream_modulation_img"]); auto double_stream_modulation_img = std::dynamic_pointer_cast<Modulation>(blocks["double_stream_modulation_img"]);
auto double_stream_modulation_txt = std::dynamic_pointer_cast<Modulation>(blocks["double_stream_modulation_txt"]); auto double_stream_modulation_txt = std::dynamic_pointer_cast<Modulation>(blocks["double_stream_modulation_txt"]);
auto single_stream_modulation = std::dynamic_pointer_cast<Modulation>(blocks["single_stream_modulation"]); auto single_stream_modulation = std::dynamic_pointer_cast<Modulation>(blocks["single_stream_modulation"]);
@ -921,15 +1033,18 @@ namespace Flux {
ss_mods = single_stream_modulation->forward(ctx, vec); ss_mods = single_stream_modulation->forward(ctx, vec);
} }
if (params.semantic_txt_norm) { if (config.semantic_txt_norm) {
auto semantic_txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm"]); auto semantic_txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm"]);
txt = semantic_txt_norm->forward(ctx, txt); txt = semantic_txt_norm->forward(ctx, txt);
} }
txt = txt_in->forward(ctx, txt); txt = txt_in->forward(ctx, txt);
sd::ggml_graph_cut::mark_graph_cut(img, "flux.prelude", "img");
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt");
sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec");
for (int i = 0; i < params.depth; i++) { for (int i = 0; i < config.depth; i++) {
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) { if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
continue; continue;
} }
@ -939,16 +1054,19 @@ namespace Flux {
auto img_txt = block->forward(ctx, img, txt, vec, pe, txt_img_mask, ds_img_mods, ds_txt_mods); auto img_txt = block->forward(ctx, img, txt, vec, pe, txt_img_mask, ds_img_mods, ds_txt_mods);
img = img_txt.first; // [N, n_img_token, hidden_size] img = img_txt.first; // [N, n_img_token, hidden_size]
txt = img_txt.second; // [N, n_txt_token, hidden_size] txt = img_txt.second; // [N, n_txt_token, hidden_size]
sd::ggml_graph_cut::mark_graph_cut(img, "flux.double_blocks." + std::to_string(i), "img");
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.double_blocks." + std::to_string(i), "txt");
} }
auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_img_token, hidden_size] auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_img_token, hidden_size]
for (int i = 0; i < params.depth_single_blocks; i++) { for (int i = 0; i < config.depth_single_blocks; i++) {
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + params.depth) != skip_layers.end()) { if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + config.depth) != skip_layers.end()) {
continue; continue;
} }
auto block = std::dynamic_pointer_cast<SingleStreamBlock>(blocks["single_blocks." + std::to_string(i)]); auto block = std::dynamic_pointer_cast<SingleStreamBlock>(blocks["single_blocks." + std::to_string(i)]);
txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask, ss_mods); txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask, ss_mods);
sd::ggml_graph_cut::mark_graph_cut(txt_img, "flux.single_blocks." + std::to_string(i), "txt_img");
} }
img = ggml_view_3d(ctx->ggml_ctx, img = ggml_view_3d(ctx->ggml_ctx,
@ -993,14 +1111,14 @@ namespace Flux {
int64_t W = x->ne[0]; int64_t W = x->ne[0];
int64_t H = x->ne[1]; int64_t H = x->ne[1];
int64_t C = x->ne[2]; int64_t C = x->ne[2];
int patch_size = params.patch_size; int patch_size = config.patch_size;
int pad_h = (patch_size - H % patch_size) % patch_size; int pad_h = (patch_size - H % patch_size) % patch_size;
int pad_w = (patch_size - W % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size;
auto img = DiT::pad_to_patch_size(ctx, x, params.patch_size, params.patch_size); auto img = DiT::pad_to_patch_size(ctx, x, config.patch_size, config.patch_size);
auto orig_img = img; auto orig_img = img;
if (params.chroma_radiance_params.fake_patch_size_x2) { if (config.chroma_radiance_params.fake_patch_size_x2) {
// It's supposed to be using GGML_SCALE_MODE_NEAREST, but this seems more stable // It's supposed to be using GGML_SCALE_MODE_NEAREST, but this seems more stable
// Maybe the implementation of nearest-neighbor interpolation in ggml behaves differently than the one in PyTorch? // Maybe the implementation of nearest-neighbor interpolation in ggml behaves differently than the one in PyTorch?
// img = F.interpolate(img, size=(H//2, W//2), mode="nearest") // img = F.interpolate(img, size=(H//2, W//2), mode="nearest")
@ -1031,7 +1149,7 @@ namespace Flux {
auto nerf_hidden = ggml_reshape_2d(ctx->ggml_ctx, out, out->ne[0], out->ne[1] * out->ne[2]); // [N*num_patches, hidden_size] auto nerf_hidden = ggml_reshape_2d(ctx->ggml_ctx, out, out->ne[0], out->ne[1] * out->ne[2]); // [N*num_patches, hidden_size]
auto img_dct = nerf_image_embedder->forward(ctx, nerf_pixels, dct); // [N*num_patches, patch_size*patch_size, nerf_hidden_size] auto img_dct = nerf_image_embedder->forward(ctx, nerf_pixels, dct); // [N*num_patches, patch_size*patch_size, nerf_hidden_size]
for (int i = 0; i < params.chroma_radiance_params.nerf_depth; i++) { for (int i = 0; i < config.chroma_radiance_params.nerf_depth; i++) {
auto block = std::dynamic_pointer_cast<NerfGLUBlock>(blocks["nerf_blocks." + std::to_string(i)]); auto block = std::dynamic_pointer_cast<NerfGLUBlock>(blocks["nerf_blocks." + std::to_string(i)]);
img_dct = block->forward(ctx, img_dct, nerf_hidden); img_dct = block->forward(ctx, img_dct, nerf_hidden);
@ -1043,7 +1161,7 @@ namespace Flux {
out = nerf_final_layer_conv->forward(ctx, img_dct); // [N, C, H, W] out = nerf_final_layer_conv->forward(ctx, img_dct); // [N, C, H, W]
if (params.chroma_radiance_params.use_x0) { if (config.chroma_radiance_params.use_x0) {
out = _apply_x0_residual(ctx, out, orig_img, timestep); out = _apply_x0_residual(ctx, out, orig_img, timestep);
} }
@ -1067,14 +1185,14 @@ namespace Flux {
int64_t W = x->ne[0]; int64_t W = x->ne[0];
int64_t H = x->ne[1]; int64_t H = x->ne[1];
int64_t C = x->ne[2]; int64_t C = x->ne[2];
int patch_size = params.patch_size; int patch_size = config.patch_size;
int pad_h = (patch_size - H % patch_size) % patch_size; int pad_h = (patch_size - H % patch_size) % patch_size;
int pad_w = (patch_size - W % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size;
auto img = DiT::pad_and_patchify(ctx, x, patch_size, patch_size); auto img = DiT::pad_and_patchify(ctx, x, patch_size, patch_size);
int64_t img_tokens = img->ne[1]; int64_t img_tokens = img->ne[1];
if (params.version == VERSION_FLUX_FILL) { if (config.version == VERSION_FLUX_FILL) {
GGML_ASSERT(c_concat != nullptr); GGML_ASSERT(c_concat != nullptr);
ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0); ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
@ -1083,7 +1201,7 @@ namespace Flux {
mask = DiT::pad_and_patchify(ctx, mask, patch_size, patch_size); mask = DiT::pad_and_patchify(ctx, mask, patch_size, patch_size);
img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, masked, mask, 0), 0); img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, masked, mask, 0), 0);
} else if (params.version == VERSION_FLEX_2) { } else if (config.version == VERSION_FLEX_2) {
GGML_ASSERT(c_concat != nullptr); GGML_ASSERT(c_concat != nullptr);
ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0); ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
@ -1094,7 +1212,7 @@ namespace Flux {
control = DiT::pad_and_patchify(ctx, control, patch_size, patch_size); control = DiT::pad_and_patchify(ctx, control, patch_size, patch_size);
img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, ggml_concat(ctx->ggml_ctx, masked, mask, 0), control, 0), 0); img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, ggml_concat(ctx->ggml_ctx, masked, mask, 0), control, 0), 0);
} else if (params.version == VERSION_FLUX_CONTROLS) { } else if (config.version == VERSION_FLUX_CONTROLS) {
GGML_ASSERT(c_concat != nullptr); GGML_ASSERT(c_concat != nullptr);
auto control = DiT::pad_and_patchify(ctx, c_concat, patch_size, patch_size); auto control = DiT::pad_and_patchify(ctx, c_concat, patch_size, patch_size);
@ -1141,7 +1259,7 @@ namespace Flux {
// pe: (L, d_head/2, 2, 2) // pe: (L, d_head/2, 2, 2)
// return: (N, C, H, W) // return: (N, C, H, W)
if (params.version == VERSION_CHROMA_RADIANCE) { if (config.version == VERSION_CHROMA_RADIANCE) {
return forward_chroma_radiance(ctx, return forward_chroma_radiance(ctx,
x, x,
timestep, timestep,
@ -1171,9 +1289,9 @@ namespace Flux {
} }
}; };
struct FluxRunner : public GGMLRunner { struct FluxRunner : public DiffusionModelRunner {
public: public:
FluxParams flux_params; FluxConfig config;
Flux flux; Flux flux;
std::vector<float> pe_vec; std::vector<float> pe_vec;
std::vector<float> mod_index_arange_vec; std::vector<float> mod_index_arange_vec;
@ -1183,116 +1301,20 @@ namespace Flux {
bool use_mask = false; bool use_mask = false;
FluxRunner(ggml_backend_t backend, FluxRunner(ggml_backend_t backend,
bool offload_params_to_cpu, ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {}, const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "", const std::string prefix = "",
SDVersion version = VERSION_FLUX, SDVersion version = VERSION_FLUX,
bool use_mask = false) bool use_mask = false)
: GGMLRunner(backend, offload_params_to_cpu), version(version), use_mask(use_mask) { : DiffusionModelRunner(backend, params_backend, prefix),
flux_params.version = version; config(FluxConfig::detect_from_weights(tensor_storage_map, prefix, version)),
flux_params.guidance_embed = false; version(version),
flux_params.depth = 0; use_mask(use_mask) {
flux_params.depth_single_blocks = 0; if (config.is_chroma) {
if (version == VERSION_FLUX_FILL) {
flux_params.in_channels = 384;
} else if (version == VERSION_FLUX_CONTROLS) {
flux_params.in_channels = 128;
} else if (version == VERSION_FLEX_2) {
flux_params.in_channels = 196;
} else if (version == VERSION_CHROMA_RADIANCE) {
flux_params.in_channels = 3;
flux_params.patch_size = 16;
} else if (version == VERSION_OVIS_IMAGE) {
flux_params.semantic_txt_norm = true;
flux_params.use_yak_mlp = true;
flux_params.vec_in_dim = 0;
} else if (sd_version_is_flux2(version)) {
flux_params.in_channels = 128;
flux_params.patch_size = 1;
flux_params.out_channels = 128;
flux_params.mlp_ratio = 3.f;
flux_params.theta = 2000;
flux_params.axes_dim = {32, 32, 32, 32};
flux_params.vec_in_dim = 0;
flux_params.qkv_bias = false;
flux_params.disable_bias = true;
flux_params.share_modulation = true;
flux_params.ref_index_scale = 10.f;
flux_params.use_mlp_silu_act = true;
}
int64_t head_dim = 0;
int64_t actual_radiance_patch_size = -1;
for (auto pair : tensor_storage_map) {
std::string tensor_name = pair.first;
if (!starts_with(tensor_name, prefix))
continue;
if (tensor_name.find("guidance_in.in_layer.weight") != std::string::npos) {
flux_params.guidance_embed = true;
}
if (tensor_name.find("__x0__") != std::string::npos) {
LOG_DEBUG("using x0 prediction");
flux_params.chroma_radiance_params.use_x0 = true;
}
if (tensor_name.find("__32x32__") != std::string::npos) {
LOG_DEBUG("using patch size 32");
flux_params.patch_size = 32;
}
if (tensor_name.find("img_in_patch.weight") != std::string::npos) {
actual_radiance_patch_size = pair.second.ne[0];
LOG_DEBUG("actual radiance patch size: %d", actual_radiance_patch_size);
}
if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
// Chroma
flux_params.is_chroma = true;
}
size_t db = tensor_name.find("double_blocks.");
if (db != std::string::npos) {
tensor_name = tensor_name.substr(db); // remove prefix
int block_depth = atoi(tensor_name.substr(14, tensor_name.find(".", 14)).c_str());
if (block_depth + 1 > flux_params.depth) {
flux_params.depth = block_depth + 1;
}
}
size_t sb = tensor_name.find("single_blocks.");
if (sb != std::string::npos) {
tensor_name = tensor_name.substr(sb); // remove prefix
int block_depth = atoi(tensor_name.substr(14, tensor_name.find(".", 14)).c_str());
if (block_depth + 1 > flux_params.depth_single_blocks) {
flux_params.depth_single_blocks = block_depth + 1;
}
}
if (ends_with(tensor_name, "txt_in.weight")) {
flux_params.context_in_dim = pair.second.ne[0];
flux_params.hidden_size = pair.second.ne[1];
}
if (ends_with(tensor_name, "single_blocks.0.norm.key_norm.scale")) {
head_dim = pair.second.ne[0];
}
if (ends_with(tensor_name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
head_dim = pair.second.ne[0];
}
}
if (actual_radiance_patch_size > 0 && actual_radiance_patch_size != flux_params.patch_size) {
GGML_ASSERT(flux_params.patch_size == 2 * actual_radiance_patch_size);
LOG_DEBUG("using fake x2 patch size");
flux_params.chroma_radiance_params.fake_patch_size_x2 = true;
}
flux_params.num_heads = static_cast<int>(flux_params.hidden_size / head_dim);
LOG_INFO("flux: depth = %d, depth_single_blocks = %d, guidance_embed = %s, context_in_dim = %" PRId64
", hidden_size = %" PRId64 ", num_heads = %d",
flux_params.depth,
flux_params.depth_single_blocks,
flux_params.guidance_embed ? "true" : "false",
flux_params.context_in_dim,
flux_params.hidden_size,
flux_params.num_heads);
if (flux_params.is_chroma) {
LOG_INFO("Using pruned modulation (Chroma)"); LOG_INFO("Using pruned modulation (Chroma)");
} }
flux = Flux(flux_params); flux = Flux(config);
flux.init(params_ctx, tensor_storage_map, prefix); flux.init(params_ctx, tensor_storage_map, prefix);
} }
@ -1300,7 +1322,7 @@ namespace Flux {
return "flux"; return "flux";
} }
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) { void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
flux.get_param_tensors(tensors, prefix); flux.get_param_tensors(tensors, prefix);
} }
@ -1368,10 +1390,10 @@ namespace Flux {
ggml_tensor* context = make_optional_input(context_tensor); ggml_tensor* context = make_optional_input(context_tensor);
ggml_tensor* c_concat = make_optional_input(c_concat_tensor); ggml_tensor* c_concat = make_optional_input(c_concat_tensor);
ggml_tensor* y = make_optional_input(y_tensor); ggml_tensor* y = make_optional_input(y_tensor);
if (flux_params.guidance_embed || flux_params.is_chroma) { if (config.guidance_embed || config.is_chroma) {
if (!guidance_tensor.empty()) { if (!guidance_tensor.empty()) {
this->guidance_tensor = guidance_tensor; this->guidance_tensor = guidance_tensor;
if (flux_params.is_chroma) { if (config.is_chroma) {
this->guidance_tensor.fill_(0.f); this->guidance_tensor.fill_(0.f);
} }
} }
@ -1389,7 +1411,7 @@ namespace Flux {
ggml_tensor* mod_index_arange = nullptr; ggml_tensor* mod_index_arange = nullptr;
ggml_tensor* dct = nullptr; // for chroma radiance ggml_tensor* dct = nullptr; // for chroma radiance
if (flux_params.is_chroma) { if (config.is_chroma) {
if (!use_mask) { if (!use_mask) {
y = nullptr; y = nullptr;
} }
@ -1406,31 +1428,31 @@ namespace Flux {
} else if (version == VERSION_OVIS_IMAGE) { } else if (version == VERSION_OVIS_IMAGE) {
txt_arange_dims = {1, 2}; txt_arange_dims = {1, 2};
} }
pe_vec = Rope::gen_flux_pe(static_cast<int>(x->ne[1]), pe_vec = Rope::gen_flux_pe(static_cast<int>(x->ne[1]),
static_cast<int>(x->ne[0]), static_cast<int>(x->ne[0]),
flux_params.patch_size, config.patch_size,
static_cast<int>(x->ne[3]), static_cast<int>(x->ne[3]),
static_cast<int>(context->ne[1]), static_cast<int>(context->ne[1]),
txt_arange_dims, txt_arange_dims,
ref_latents, ref_latents,
increase_ref_index, increase_ref_index,
flux_params.ref_index_scale, config.ref_index_scale,
flux_params.theta, config.theta,
circular_y_enabled, circular_y_enabled,
circular_x_enabled, circular_x_enabled,
flux_params.axes_dim); config.axes_dim,
int pos_len = static_cast<int>(pe_vec.size() / flux_params.axes_dim_sum / 2); sd_version_is_longcat(version));
int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
// LOG_DEBUG("pos_len %d", pos_len); // LOG_DEBUG("pos_len %d", pos_len);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len); auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
// pe->data = pe_vec.data(); // pe->data = pe_vec.data();
// print_ggml_tensor(pe); // print_ggml_tensor(pe);
// pe->data = nullptr; // pe->data = nullptr;
set_backend_tensor_data(pe, pe_vec.data()); set_backend_tensor_data(pe, pe_vec.data());
if (version == VERSION_CHROMA_RADIANCE) { if (version == VERSION_CHROMA_RADIANCE) {
int patch_size = flux_params.patch_size; int patch_size = config.patch_size;
int nerf_max_freqs = flux_params.chroma_radiance_params.nerf_max_freqs; int nerf_max_freqs = config.chroma_radiance_params.nerf_max_freqs;
dct_vec = fetch_dct_pos(patch_size, nerf_max_freqs); dct_vec = fetch_dct_pos(patch_size, nerf_max_freqs);
dct = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, nerf_max_freqs * nerf_max_freqs, patch_size * patch_size); dct = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, nerf_max_freqs * nerf_max_freqs, patch_size * patch_size);
// dct->data = dct_vec.data(); // dct->data = dct_vec.data();
@ -1482,6 +1504,25 @@ namespace Flux {
return result; return result;
} }
// Adapter overload of compute(): unpacks a DiffusionParams bundle and forwards
// its fields to the tensor-argument compute() overload of this runner.
//
// n_threads        - passed through unchanged to the forwarded call.
// diffusion_params - bundle of inputs; `x` and `timesteps` are required
//                    (asserted non-null below), the other fields are optional.
// Returns the result of the forwarded compute() call.
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
// Required inputs: both are dereferenced unconditionally in the call below.
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
// NOTE(review): `extra` is dereferenced below without a null check; presumably
// diffusion_extra_as never returns nullptr here - confirm against its definition.
const auto* extra = diffusion_extra_as<FluxDiffusionExtra>(diffusion_params);
// Shared empty fallbacks, used when the corresponding optional pointer is null.
static const std::vector<sd::Tensor<float>> empty_ref_latents;
static const std::vector<int> empty_skip_layers;
// Optional tensor inputs go through tensor_or_empty (presumably yielding an
// empty tensor when the input is absent - verify against the helper).
return compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
tensor_or_empty(diffusion_params.c_concat),
tensor_or_empty(diffusion_params.y),
tensor_or_empty(extra->guidance),
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
diffusion_params.increase_ref_index,
extra->skip_layers ? *extra->skip_layers : empty_skip_layers);
}
void test() { void test() {
ggml_init_params params; ggml_init_params params;
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB
@ -1539,7 +1580,7 @@ namespace Flux {
static void load_from_file_and_test(const std::string& file_path) { static void load_from_file_and_test(const std::string& file_path) {
// ggml_backend_t backend = ggml_backend_cuda_init(0); // ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init(); ggml_backend_t backend = sd_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_COUNT; ggml_type model_data_type = GGML_TYPE_COUNT;
ModelLoader model_loader; ModelLoader model_loader;
@ -1558,13 +1599,17 @@ namespace Flux {
} }
std::shared_ptr<FluxRunner> flux = std::make_shared<FluxRunner>(backend, std::shared_ptr<FluxRunner> flux = std::make_shared<FluxRunner>(backend,
false, backend,
tensor_storage_map, tensor_storage_map,
"model.diffusion_model", "model.diffusion_model",
VERSION_FLUX2, VERSION_FLUX2,
false); false);
flux->alloc_params_buffer(); if (!flux->alloc_params_buffer()) {
LOG_ERROR("flux model allocation failed");
return;
}
std::map<std::string, ggml_tensor*> tensors; std::map<std::string, ggml_tensor*> tensors;
flux->get_param_tensors(tensors, "model.diffusion_model"); flux->get_param_tensors(tensors, "model.diffusion_model");
@ -1582,4 +1627,4 @@ namespace Flux {
} // namespace Flux } // namespace Flux
#endif // __FLUX_HPP__ #endif // __SD_MODEL_DIFFUSION_FLUX_HPP__

Some files were not shown because too many files have changed in this diff Show More