Compare commits

...

71 Commits

Author SHA1 Message Date
leejet
11ab095230
fix: resolve embedding loading issue when calling generate_image multiple times (#1078) 2025-12-12 23:08:12 +08:00
Wagner Bruna
a3a88fc9b2
fix: avoid crash loading LoRAs with bf16 weights (#1077) 2025-12-12 22:36:54 +08:00
leejet
8823dc48bc
feat: align the spatial size to the corresponding multiple (#1073) 2025-12-10 23:15:08 +08:00
Pedrito
1ac5a616de
feat: support custom upscale tile size (#896) 2025-12-10 22:25:19 +08:00
leejet
d939f6e86a
refactor: optimize the handling of LoRA models (#1070) 2025-12-10 00:26:07 +08:00
Wagner Bruna
e72aea796e
feat: embed version string and git commit hash (#1008) 2025-12-09 22:38:54 +08:00
wuhei
a908436729
docs: update download link for Stable Diffusion v1.5 (#1063) 2025-12-09 22:06:16 +08:00
stduhpf
583a02e29e
feat: add Flux.2 VAE proj matrix for previews (#1017) 2025-12-09 22:00:45 +08:00
leejet
96c3e64057
refactor: optimize the handling of embedding (#1068)
* optimize the handling of embedding

* support case-insensitive embedding names
2025-12-08 23:59:04 +08:00
Weiqi Gao
0392273e10
chore: add compute kernels to Windows CUDA build (#1062)
* Fix syntax for CUDA architecture definitions

* Extend CUDA support to GTX 10 Series to RTX 50 Series

* update cuda installer step version to install cuda 12.8.1

* Remove unsupported compute capability
2025-12-07 22:12:50 +08:00
leejet
bf1a388b44 docs: update logo 2025-12-07 15:09:32 +08:00
leejet
c9005337a8 docs: update logo 2025-12-07 14:56:21 +08:00
leejet
2f0bd31a84
feat: add ovis image support (#1057) 2025-12-07 12:32:56 +08:00
leejet
bfbb929790
feat: do not convert bf16 to f32 (#1055) 2025-12-06 23:55:51 +08:00
leejet
689e44c9a8
fix: correct ggml_ext_silu_act (#1056) 2025-12-06 23:55:28 +08:00
leejet
985aedda32
refactor: optimize the handling of pred type (#1048) 2025-12-04 23:31:55 +08:00
leejet
3f3610b5cd
chore: optimize lora log (#1047) 2025-12-04 22:44:58 +08:00
Wagner Bruna
118683de8a
fix: correct preview method selection (#1038) 2025-12-04 22:43:16 +08:00
stduhpf
bcc9c0d0b3
feat: handle ggml compute failures without crashing the program (#1003)
* Feat: handle compute failures more gracefully

* fix Unreachable code after return

Co-authored-by: idostyle <idostyl3@googlemail.com>

* adjust z_image.hpp

---------

Co-authored-by: idostyle <idostyl3@googlemail.com>
Co-authored-by: leejet <leejet714@gmail.com>
2025-12-04 22:04:27 +08:00
leejet
5865b5e703
refactor: split SDParams to SDCliParams/SDContextParams/SDGenerationParams (#1032) 2025-12-03 22:31:46 +08:00
stduhpf
edf2cb3846
fix: fix CosXL not being detected (#989) 2025-12-03 22:25:02 +08:00
Wagner Bruna
99e17232a4
fix: prevent NaN issues with Z-Image on certain ROCm setups (#1034) 2025-12-03 22:19:34 +08:00
leejet
710169df5c docs: update news 2025-12-01 22:46:15 +08:00
Wagner Bruna
e4c50f1de5
chore: add sd_ prefix to a few functions (#967) 2025-12-01 22:43:52 +08:00
rmatif
0743a1b3b5
fix: fix vae tiling for flux2 (#1025) 2025-12-01 22:41:56 +08:00
leejet
34a6fd4e60
feat: add z-image support (#1020)
* add z-image support

* use flux_latent_rgb_proj for z-image

* fix qwen3 rope type

* add support for qwen3 4b gguf

* add support for diffusers format lora

* fix nan issue that occurs when using CUDA with k-quants weights

* add z-image docs
2025-12-01 22:39:43 +08:00
leejet
3c1187ce83 docs: correct the time of adding flux2 support 2025-11-30 12:40:56 +08:00
leejet
20eb674100
fix: avoid crash when the lora file is not found using immediately mode (#1022) 2025-11-30 12:19:37 +08:00
leejet
bc80225336
fix: make the immediate LoRA apply mode work better when using Vulkan (#1021) 2025-11-30 12:08:25 +08:00
leejet
ab7e8d285e docs: update news 2025-11-30 11:51:23 +08:00
Wagner Bruna
673dbdda17
fix: add missing line cleanup for s/it progress display (#891) 2025-11-30 11:45:30 +08:00
Wagner Bruna
0249509a30
refactor: add user data pointer to the image preview callback (#1001) 2025-11-30 11:34:17 +08:00
leejet
52b67c538b
feat: add flux2 support (#1016)
* add flux2 support

* rename qwenvl to llm

* add Flux2FlowDenoiser

* update docs
2025-11-30 11:32:56 +08:00
leejet
20345888a3
refactor: optimize the handling of sample method (#999) 2025-11-22 14:00:25 +08:00
akleine
490c51d963
feat: report success/failure when saving PNG/JPG output (#912) 2025-11-22 13:57:44 +08:00
Wagner Bruna
45c46779af
feat: add LCM scheduler (#983) 2025-11-22 13:53:31 +08:00
leejet
869d023416
refactor: optimize the handling of scheduler (#998) 2025-11-22 12:48:53 +08:00
akleine
e9bc3b6c06
fix: check the PhotoMaker id_embeds tensor ONLY in PhotoMaker V2 mode (#987) 2025-11-22 12:47:40 +08:00
Wagner Bruna
b542894fb9
fix: avoid crash on default video preview path (#997)
Co-authored-by: masamaru-san
2025-11-22 12:46:27 +08:00
leejet
5498cc0d67
feat: add Wan2.1-I2V-1.3B(SkyReels) support (#988) 2025-11-19 23:56:46 +08:00
stduhpf
aa2b8e0ca5
fix: patch 1x1 conv weights at runtime (#986) 2025-11-19 23:27:23 +08:00
rmatif
a14e2b321d
feat: add easycache support (#940) 2025-11-19 23:19:32 +08:00
leejet
28ffb6c13d
fix: resolve issue with concat multiple LoRA output diffs at runtime (#985) 2025-11-17 22:56:07 +08:00
leejet
b88cc32346
fix: avoid using same type but diff instances for rng and sampler_rng (#982) 2025-11-16 23:37:14 +08:00
leejet
f532972d60
fix: avoid precision issues on vulkan backend (#980) 2025-11-16 20:57:08 +08:00
leejet
d5b05f70c6
feat: support independent sampler rng (#978) 2025-11-16 17:11:02 +08:00
akleine
6d6dc1b8ed
fix: make PhotoMakerV2 more robust by image count check (#970) 2025-11-16 17:10:48 +08:00
Wagner Bruna
199e675cc7
feat: support for --tensor-type-rules on generation modes (#932) 2025-11-16 17:07:32 +08:00
leejet
742a7333c3
feat: add cpu rng (#977) 2025-11-16 14:48:15 +08:00
Wagner Bruna
e8eb3791c8
fix: typo in --lora-apply-mode help (#972) 2025-11-16 14:48:00 +08:00
Wagner Bruna
aa44e06890
fix: avoid crash with LoRAs and type override (#974) 2025-11-16 14:47:36 +08:00
Daniele
6448430dbb
feat: add break pseudo token support (#422)
---------

Co-authored-by: Urs Ganse <urs.ganse@helsinki.fi>
2025-11-16 14:45:20 +08:00
leejet
347710f68f
feat: support applying LoRA at runtime (#969) 2025-11-13 21:48:44 +08:00
lcy
59ebdf0bb5
chrore: enable Windows ROCm(HIP) build release (#956)
* build: fix missing commit sha in macOS and Ubuntu build zip name

The build workflows for macOS and Ubuntu incorrectly check for the
"main" branch instead of "master" when retrieving the commit hash for
naming the build artifacts.

* build: correct Vulkan SDK installation condition in build workflow

* build: Enable Windows ROCm(HIP) build release

Refer to the build workflow of llama.cpp to add a Windows ROCm (HIP)
build release to the workflow.
Since there are many differences between the HIP build and other
builds, this commit add a separate "windows-latest-cmake-hip" job,
instead of enabling the ROCm matrix entry in the existing Windows
build job.

Main differences include:

- Install ROCm SDK from AMD official installer.
- Add a cache step for ROCm installation and a ccache step for build
  processing, since the HIP build takes much longer time than other
  builds.
- Include the ROCm/HIP artifact in the release assets.
2025-11-12 00:28:55 +08:00
Flavio Bizzarri
4ffcbcaed7
fix: specify enum modifier in sd_set_preview_callback signature (#959) 2025-11-12 00:27:23 +08:00
leejet
694f0d9235
refactor: optimize the logic for name conversion and the processing of the LoRA model (#955) 2025-11-10 00:12:20 +08:00
stduhpf
8ecdf053ac
feat: add image preview support (#522) 2025-11-10 00:12:02 +08:00
leejet
ee89afc878
fix: resolve issue with pmid (#957) 2025-11-09 22:47:53 +08:00
akleine
d2d3944f50
feat: add support for SD2.x with TINY U-Nets (#939) 2025-11-09 22:47:37 +08:00
akleine
0fa3e1a383
fix: prevent core dump in PM V2 in case of incomplete cmd line (#950) 2025-11-09 22:36:43 +08:00
leejet
c2d8ffc22c
fix: compatibility for models with modified tensor shapes (#951) 2025-11-07 23:04:41 +08:00
stduhpf
fb748bb8a4
fix: TAE encoding (#935) 2025-11-07 22:58:59 +08:00
leejet
8f6c5c217b
refactor: simplify the model loading logic (#933)
* remove String2GGMLType

* remove preprocess_tensor

* fix clip init

* simplify the logic for reading weights
2025-11-03 21:21:34 +08:00
leejet
6103d86e2c
refactor: introduce GGMLRunnerContext (#928)
* introduce GGMLRunnerContext

* add Flash Attention enable control through GGMLRunnerContext

* add conv2d_direct enable control through GGMLRunnerContext
2025-11-02 02:11:04 +08:00
stduhpf
c42826b77c
fix: resolve multiple inpainting issues (#926)
* Fix inpainting masked image being broken by side effect

* Fix unet inpainting concat not being set

* Fix Flex.2 inpaint mode crash (+ use scale factor)
2025-11-02 02:10:32 +08:00
Wagner Bruna
945d9a9ee3
docs: add Koboldcpp as an available UI (#930) 2025-11-02 02:03:01 +08:00
Wagner Bruna
353e708844
docs: update ggml and llama.cpp URLs (#931) 2025-11-02 02:02:44 +08:00
leejet
dd75fc081c
refactor: unify the naming style of ggml extension functions (#921) 2025-10-28 23:26:48 +08:00
stduhpf
77eb95f8e4
docs: fix taesd direct download link (#917) 2025-10-28 23:26:23 +08:00
Wagner Bruna
8a45d0ff7f
chore: clean up stb includes (#919) 2025-10-28 23:25:45 +08:00
leejet
9e28be6479
feat: add chroma radiance support (#910)
* add chroma radiance support

* fix ci

* simply generate_init_latent

* workaround: avoid ggml cuda error

* format code

* add chroma radiance doc
2025-10-25 23:56:14 +08:00
69 changed files with 499852 additions and 6392 deletions

View File

@ -65,7 +65,7 @@ jobs:
- name: Get commit hash - name: Get commit hash
id: commit id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2 uses: pr-mpt/actions-commit-hash@v2
- name: Fetch system info - name: Fetch system info
@ -118,7 +118,7 @@ jobs:
- name: Get commit hash - name: Get commit hash
id: commit id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2 uses: pr-mpt/actions-commit-hash@v2
- name: Fetch system info - name: Fetch system info
@ -163,9 +163,7 @@ jobs:
- build: "avx512" - build: "avx512"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON" defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "cuda12" - build: "cuda12"
defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;86;80;75" defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120'"
# - build: "rocm5.5"
# defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
- build: 'vulkan' - build: 'vulkan'
defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON" defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
steps: steps:
@ -178,28 +176,15 @@ jobs:
- name: Install cuda-toolkit - name: Install cuda-toolkit
id: cuda-toolkit id: cuda-toolkit
if: ${{ matrix.build == 'cuda12' }} if: ${{ matrix.build == 'cuda12' }}
uses: Jimver/cuda-toolkit@v0.2.19 uses: Jimver/cuda-toolkit@v0.2.22
with: with:
cuda: "12.6.2" cuda: "12.8.1"
method: "network" method: "network"
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]' sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
- name: Install rocm-toolkit
id: rocm-toolkit
if: ${{ matrix.build == 'rocm5.5' }}
uses: Cyberhan123/rocm-toolkit@v0.1.0
with:
rocm: "5.5.0"
- name: Install Ninja
id: install-ninja
if: ${{ matrix.build == 'rocm5.5' }}
uses: urkle/action-get-ninja@v1
with:
version: 1.11.1
- name: Install Vulkan SDK - name: Install Vulkan SDK
id: get_vulkan id: get_vulkan
if: ${{ matrix.build == 'vulkan' }} https://sdk.lunarg.com/sdk/download/1.4.328.1/windows/vulkansdk-windows-X64-1.4.328.1.exe if: ${{ matrix.build == 'vulkan' }}
run: | run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe" curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
@ -277,6 +262,104 @@ jobs:
path: | path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
windows-latest-cmake-hip:
runs-on: windows-2022
env:
HIPSDK_INSTALLER_VERSION: "25.Q3"
GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
- name: Cache ROCm Installation
id: cache-rocm
uses: actions/cache@v4
with:
path: C:\Program Files\AMD\ROCm
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64
evict-old-files: 1d
- name: Install ROCm
if: steps.cache-rocm.outputs.cache-hit != 'true'
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
$completed = $proc.WaitForExit(600000)
if (-not $completed) {
Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
$proc.Kill()
exit 1
}
if ($proc.ExitCode -ne 0) {
Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
exit 1
}
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
run: |
# Find and test ROCm installation
$clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
if (-not $clangPath) {
Write-Error "ROCm installation not found"
exit 1
}
& $clangPath.FullName --version
# Set HIP_PATH environment variable for later steps
echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV
- name: Build
run: |
mkdir build
cd build
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake .. `
-G "Unix Makefiles" `
-DSD_HIPBLAS=ON `
-DSD_BUILD_SHARED_LIBS=ON `
-DGGML_NATIVE=OFF `
-DCMAKE_C_COMPILER=clang `
-DCMAKE_CXX_COMPILER=clang++ `
-DCMAKE_BUILD_TYPE=Release `
-DGPU_TARGETS="${{ env.GPU_TARGETS }}"
cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Pack artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
md "build\bin\rocblas\library\"
md "build\bin\hipblaslt\library"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
release: release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -286,6 +369,7 @@ jobs:
- ubuntu-latest-cmake - ubuntu-latest-cmake
- macOS-latest-cmake - macOS-latest-cmake
- windows-latest-cmake - windows-latest-cmake
- windows-latest-cmake-hip
steps: steps:
- name: Clone - name: Clone

1
.gitignore vendored
View File

@ -12,3 +12,4 @@ test/
output*.png output*.png
models* models*
*.log *.log
preview.png

View File

@ -87,6 +87,38 @@ file(GLOB SD_LIB_SOURCES
"*.hpp" "*.hpp"
) )
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
if(GIT_EXE)
execute_process(COMMAND ${GIT_EXE} describe --tags --abbrev=7 --dirty=+
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE SDCPP_BUILD_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE SDCPP_BUILD_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
endif()
if(NOT SDCPP_BUILD_VERSION)
set(SDCPP_BUILD_VERSION unknown)
endif()
message(STATUS "stable-diffusion.cpp version ${SDCPP_BUILD_VERSION}")
if(NOT SDCPP_BUILD_COMMIT)
set(SDCPP_BUILD_COMMIT unknown)
endif()
message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
set_property(
SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp
APPEND PROPERTY COMPILE_DEFINITIONS
SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
)
if(SD_BUILD_SHARED_LIBS) if(SD_BUILD_SHARED_LIBS)
message("-- Build shared library") message("-- Build shared library")
message(${SD_LIB_SOURCES}) message(${SD_LIB_SOURCES})

View File

@ -1,5 +1,5 @@
<p align="center"> <p align="center">
<img src="./assets/cat_with_sd_cpp_42.png" width="360x"> <img src="./assets/logo.png" width="360x">
</p> </p>
# stable-diffusion.cpp # stable-diffusion.cpp
@ -15,6 +15,12 @@ API and command-line option may change frequently.***
## 🔥Important News ## 🔥Important News
* **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image**
👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020)
* **2025/11/30** 🚀 stable-diffusion.cpp now supports **FLUX.2-dev**
👉 Details: [PR #1016](https://github.com/leejet/stable-diffusion.cpp/pull/1016)
* **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509** * **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**
👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877) 👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)
@ -29,17 +35,21 @@ API and command-line option may change frequently.***
## Features ## Features
- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp) - Plain C/C++ implementation based on [ggml](https://github.com/ggml-org/ggml), working in the same way as [llama.cpp](https://github.com/ggml-org/llama.cpp)
- Super lightweight and without external dependencies - Super lightweight and without external dependencies
- Supported models - Supported models
- Image Models - Image Models
- SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo) - SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
- SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) - SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
- [some SD1.x and SDXL distilled models](./docs/distilled_sd.md) - [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
- [SD3/SD3.5](./docs/sd3.md) - [SD3/SD3.5](./docs/sd3.md)
- [Flux-dev/Flux-schnell](./docs/flux.md) - [FlUX.1-dev/FlUX.1-schnell](./docs/flux.md)
- [FLUX.2-dev](./docs/flux2.md)
- [Chroma](./docs/chroma.md) - [Chroma](./docs/chroma.md)
- [Chroma1-Radiance](./docs/chroma_radiance.md)
- [Qwen Image](./docs/qwen_image.md) - [Qwen Image](./docs/qwen_image.md)
- [Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- Image Edit Models - Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md) - [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md) - [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
@ -80,7 +90,9 @@ API and command-line option may change frequently.***
- [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457) - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
- `DPM++ 2S a` - `DPM++ 2S a`
- [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952) - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`) - Cross-platform reproducibility
- `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
- `--rng cpu`, consistent with the `comfyui RNG`
- Embedds generation parameters into png output as webui-compatible text string - Embedds generation parameters into png output as webui-compatible text string
## Quick Start ## Quick Start
@ -93,7 +105,7 @@ API and command-line option may change frequently.***
### Download model weights ### Download model weights
- download weights(.ckpt or .safetensors or .gguf). For example - download weights(.ckpt or .safetensors or .gguf). For example
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5 - Stable Diffusion v1.5 from https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5
```sh ```sh
curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
@ -115,12 +127,15 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
- [SD1.x/SD2.x/SDXL](./docs/sd.md) - [SD1.x/SD2.x/SDXL](./docs/sd.md)
- [SD3/SD3.5](./docs/sd3.md) - [SD3/SD3.5](./docs/sd3.md)
- [Flux-dev/Flux-schnell](./docs/flux.md) - [FlUX.1-dev/FlUX.1-schnell](./docs/flux.md)
- [FLUX.2-dev](./docs/flux2.md)
- [FLUX.1-Kontext-dev](./docs/kontext.md) - [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Chroma](./docs/chroma.md) - [Chroma](./docs/chroma.md)
- [🔥Qwen Image](./docs/qwen_image.md) - [🔥Qwen Image](./docs/qwen_image.md)
- [🔥Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md) - [🔥Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
- [🔥Wan2.1/Wan2.2](./docs/wan.md) - [🔥Wan2.1/Wan2.2](./docs/wan.md)
- [🔥Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- [LoRA](./docs/lora.md) - [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md) - [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md) - [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
@ -151,6 +166,7 @@ These projects use `stable-diffusion.cpp` as a backend for their image generatio
- [sd.cpp-webui](https://github.com/daniandtheweb/sd.cpp-webui) - [sd.cpp-webui](https://github.com/daniandtheweb/sd.cpp-webui)
- [LocalAI](https://github.com/mudler/LocalAI) - [LocalAI](https://github.com/mudler/LocalAI)
- [Neural-Pixel](https://github.com/Luiz-Alcantara/Neural-Pixel) - [Neural-Pixel](https://github.com/Luiz-Alcantara/Neural-Pixel)
- [KoboldCpp](https://github.com/LostRuins/koboldcpp)
## Contributors ## Contributors
@ -164,7 +180,7 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
## References ## References
- [ggml](https://github.com/ggerganov/ggml) - [ggml](https://github.com/ggml-org/ggml)
- [diffusers](https://github.com/huggingface/diffusers) - [diffusers](https://github.com/huggingface/diffusers)
- [stable-diffusion](https://github.com/CompVis/stable-diffusion) - [stable-diffusion](https://github.com/CompVis/stable-diffusion)
- [sd3-ref](https://github.com/Stability-AI/sd3-ref) - [sd3-ref](https://github.com/Stability-AI/sd3-ref)

Binary file not shown.

After

Width:  |  Height:  |  Size: 477 KiB

BIN
assets/flux2/example.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 556 KiB

BIN
assets/logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 401 KiB

BIN
assets/z_image/bf16.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

BIN
assets/z_image/q2_K.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

BIN
assets/z_image/q3_K.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

BIN
assets/z_image/q4_0.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

BIN
assets/z_image/q4_K.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

BIN
assets/z_image/q5_0.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

BIN
assets/z_image/q6_K.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

BIN
assets/z_image/q8_0.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

250
clip.hpp
View File

@ -3,34 +3,10 @@
#include "ggml_extend.hpp" #include "ggml_extend.hpp"
#include "model.h" #include "model.h"
#include "tokenize_util.h"
/*================================================== CLIPTokenizer ===================================================*/ /*================================================== CLIPTokenizer ===================================================*/
__STATIC_INLINE__ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
std::regex re("<lora:([^:]+):([^>]+)>");
std::smatch matches;
std::unordered_map<std::string, float> filename2multiplier;
while (std::regex_search(text, matches, re)) {
std::string filename = matches[1].str();
float multiplier = std::stof(matches[2].str());
text = std::regex_replace(text, re, "", std::regex_constants::format_first_only);
if (multiplier == 0.f) {
continue;
}
if (filename2multiplier.find(filename) == filename2multiplier.end()) {
filename2multiplier[filename] = multiplier;
} else {
filename2multiplier[filename] += multiplier;
}
}
return std::make_pair(filename2multiplier, text);
}
__STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() { __STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
std::vector<std::pair<int, std::u32string>> byte_unicode_pairs; std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
std::set<int> byte_set; std::set<int> byte_set;
@ -72,6 +48,8 @@ private:
int encoder_len; int encoder_len;
int bpe_len; int bpe_len;
std::vector<std::string> special_tokens;
public: public:
const std::string UNK_TOKEN = "<|endoftext|>"; const std::string UNK_TOKEN = "<|endoftext|>";
const std::string BOS_TOKEN = "<|startoftext|>"; const std::string BOS_TOKEN = "<|startoftext|>";
@ -117,6 +95,15 @@ private:
return pairs; return pairs;
} }
bool is_special_token(const std::string& token) {
for (auto& special_token : special_tokens) {
if (special_token == token) {
return true;
}
}
return false;
}
public: public:
CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "") CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "")
: PAD_TOKEN_ID(pad_token_id) { : PAD_TOKEN_ID(pad_token_id) {
@ -125,6 +112,8 @@ public:
} else { } else {
load_from_merges(ModelLoader::load_merges()); load_from_merges(ModelLoader::load_merges());
} }
add_special_token("<|startoftext|>");
add_special_token("<|endoftext|>");
} }
void load_from_merges(const std::string& merges_utf8_str) { void load_from_merges(const std::string& merges_utf8_str) {
@ -201,6 +190,10 @@ public:
} }
} }
void add_special_token(const std::string& token) {
special_tokens.push_back(token);
}
std::u32string bpe(const std::u32string& token) { std::u32string bpe(const std::u32string& token) {
std::vector<std::u32string> word; std::vector<std::u32string> word;
@ -379,25 +372,54 @@ public:
return trim(text); return trim(text);
} }
std::vector<std::string> token_split(const std::string& text) {
std::regex pat(R"('s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
std::regex::icase);
std::sregex_iterator iter(text.begin(), text.end(), pat);
std::sregex_iterator end;
std::vector<std::string> result;
for (; iter != end; ++iter) {
result.emplace_back(iter->str());
}
return result;
}
std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) { std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) {
std::string original_text = text; std::string original_text = text;
std::vector<int32_t> bpe_tokens; std::vector<int32_t> bpe_tokens;
text = whitespace_clean(text); text = whitespace_clean(text);
std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); }); std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });
std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
std::regex::icase);
std::smatch matches;
std::string str = text; std::string str = text;
std::vector<std::string> token_strs; std::vector<std::string> token_strs;
while (std::regex_search(str, matches, pat)) {
bool skip = on_new_token_cb(str, bpe_tokens); auto splited_texts = split_with_special_tokens(text, special_tokens);
if (skip) {
for (auto& splited_text : splited_texts) {
LOG_DEBUG("token %s", splited_text.c_str());
if (is_special_token(splited_text)) {
LOG_DEBUG("special %s", splited_text.c_str());
bool skip = on_new_token_cb(splited_text, bpe_tokens);
if (skip) {
token_strs.push_back(splited_text);
continue;
}
continue; continue;
} }
for (auto& token : matches) {
std::string token_str = token.str(); auto tokens = token_split(splited_text);
for (auto& token : tokens) {
if (on_new_token_cb != nullptr) {
bool skip = on_new_token_cb(token, bpe_tokens);
if (skip) {
token_strs.push_back(token);
continue;
}
}
std::string token_str = token;
std::u32string utf32_token; std::u32string utf32_token;
for (int i = 0; i < token_str.length(); i++) { for (int i = 0; i < token_str.length(); i++) {
unsigned char b = token_str[i]; unsigned char b = token_str[i];
@ -417,14 +439,13 @@ public:
bpe_tokens.push_back(encoder[bpe_str]); bpe_tokens.push_back(encoder[bpe_str]);
token_strs.push_back(utf32_to_utf8(bpe_str)); token_strs.push_back(utf32_to_utf8(bpe_str));
} }
str = matches.suffix();
} }
std::stringstream ss; // std::stringstream ss;
ss << "["; // ss << "[";
for (auto token : token_strs) { // for (auto token : token_strs) {
ss << "\"" << token << "\", "; // ss << "\"" << token << "\", ";
} // }
ss << "]"; // ss << "]";
// LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str()); // LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
// printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str()); // printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
return bpe_tokens; return bpe_tokens;
@ -451,16 +472,16 @@ public:
} }
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, n_token, d_model] // x: [N, n_token, d_model]
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]); auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]); auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
x = fc1->forward(ctx, x); x = fc1->forward(ctx, x);
if (use_gelu) { if (use_gelu) {
x = ggml_gelu_inplace(ctx, x); x = ggml_gelu_inplace(ctx->ggml_ctx, x);
} else { } else {
x = ggml_gelu_quick_inplace(ctx, x); x = ggml_gelu_quick_inplace(ctx->ggml_ctx, x);
} }
x = fc2->forward(ctx, x); x = fc2->forward(ctx, x);
return x; return x;
@ -476,11 +497,12 @@ protected:
public: public:
CLIPLayer(int64_t d_model, CLIPLayer(int64_t d_model,
int64_t n_head, int64_t n_head,
int64_t intermediate_size) int64_t intermediate_size,
bool proj_in = false)
: d_model(d_model), : d_model(d_model),
n_head(n_head), n_head(n_head),
intermediate_size(intermediate_size) { intermediate_size(intermediate_size) {
blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true)); blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true, proj_in));
blocks["layer_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model)); blocks["layer_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
blocks["layer_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model)); blocks["layer_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
@ -488,15 +510,15 @@ public:
blocks["mlp"] = std::shared_ptr<GGMLBlock>(new CLIPMLP(d_model, intermediate_size)); blocks["mlp"] = std::shared_ptr<GGMLBlock>(new CLIPMLP(d_model, intermediate_size));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, bool mask = true) { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, bool mask = true) {
// x: [N, n_token, d_model] // x: [N, n_token, d_model]
auto self_attn = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]); auto self_attn = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]);
auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]); auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]);
auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm2"]); auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm2"]);
auto mlp = std::dynamic_pointer_cast<CLIPMLP>(blocks["mlp"]); auto mlp = std::dynamic_pointer_cast<CLIPMLP>(blocks["mlp"]);
x = ggml_add(ctx, x, self_attn->forward(ctx, backend, layer_norm1->forward(ctx, x), mask)); x = ggml_add(ctx->ggml_ctx, x, self_attn->forward(ctx, layer_norm1->forward(ctx, x), mask));
x = ggml_add(ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x))); x = ggml_add(ctx->ggml_ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
return x; return x;
} }
}; };
@ -509,16 +531,16 @@ public:
CLIPEncoder(int64_t n_layer, CLIPEncoder(int64_t n_layer,
int64_t d_model, int64_t d_model,
int64_t n_head, int64_t n_head,
int64_t intermediate_size) int64_t intermediate_size,
bool proj_in = false)
: n_layer(n_layer) { : n_layer(n_layer) {
for (int i = 0; i < n_layer; i++) { for (int i = 0; i < n_layer; i++) {
std::string name = "layers." + std::to_string(i); std::string name = "layers." + std::to_string(i);
blocks[name] = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size)); blocks[name] = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size, proj_in));
} }
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
int clip_skip = -1, int clip_skip = -1,
bool mask = true) { bool mask = true) {
@ -536,7 +558,7 @@ public:
} }
std::string name = "layers." + std::to_string(i); std::string name = "layers." + std::to_string(i);
auto layer = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]); auto layer = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
x = layer->forward(ctx, backend, x, mask); // [N, n_token, d_model] x = layer->forward(ctx, x, mask); // [N, n_token, d_model]
// LOG_DEBUG("layer %d", i); // LOG_DEBUG("layer %d", i);
} }
return x; return x;
@ -550,10 +572,10 @@ protected:
int64_t num_positions; int64_t num_positions;
bool force_clip_f32; bool force_clip_f32;
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type token_wtype = GGML_TYPE_F32; enum ggml_type token_wtype = GGML_TYPE_F32;
if (!force_clip_f32) { if (!force_clip_f32) {
token_wtype = get_type(prefix + "token_embedding.weight", tensor_types, GGML_TYPE_F32); token_wtype = get_type(prefix + "token_embedding.weight", tensor_storage_map, GGML_TYPE_F32);
if (!support_get_rows(token_wtype)) { if (!support_get_rows(token_wtype)) {
token_wtype = GGML_TYPE_F32; token_wtype = GGML_TYPE_F32;
} }
@ -578,7 +600,7 @@ public:
return params["token_embedding.weight"]; return params["token_embedding.weight"];
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids, struct ggml_tensor* input_ids,
struct ggml_tensor* custom_embed_weight) { struct ggml_tensor* custom_embed_weight) {
// input_ids: [N, n_token] // input_ids: [N, n_token]
@ -586,12 +608,12 @@ public:
auto position_embed_weight = params["position_embedding.weight"]; auto position_embed_weight = params["position_embedding.weight"];
GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]); GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]);
input_ids = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]); input_ids = ggml_reshape_3d(ctx->ggml_ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
auto token_embedding = ggml_get_rows(ctx, custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids); auto token_embedding = ggml_get_rows(ctx->ggml_ctx, custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids);
token_embedding = ggml_reshape_3d(ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]); token_embedding = ggml_reshape_3d(ctx->ggml_ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);
// token_embedding + position_embedding // token_embedding + position_embedding
auto x = ggml_add(ctx, auto x = ggml_add(ctx->ggml_ctx,
token_embedding, token_embedding,
position_embed_weight); // [N, n_token, embed_dim] position_embed_weight); // [N, n_token, embed_dim]
return x; return x;
@ -606,7 +628,8 @@ protected:
int64_t image_size; int64_t image_size;
int64_t num_patches; int64_t num_patches;
int64_t num_positions; int64_t num_positions;
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override {
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type patch_wtype = GGML_TYPE_F16; enum ggml_type patch_wtype = GGML_TYPE_F16;
enum ggml_type class_wtype = GGML_TYPE_F32; enum ggml_type class_wtype = GGML_TYPE_F32;
enum ggml_type position_wtype = GGML_TYPE_F32; enum ggml_type position_wtype = GGML_TYPE_F32;
@ -629,7 +652,7 @@ public:
num_positions = num_patches + 1; num_positions = num_patches + 1;
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* pixel_values) {
// pixel_values: [N, num_channels, image_size, image_size] // pixel_values: [N, num_channels, image_size, image_size]
// return: [N, num_positions, embed_dim] // return: [N, num_positions, embed_dim]
GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels); GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels);
@ -641,18 +664,18 @@ public:
// concat(patch_embedding, class_embedding) + position_embedding // concat(patch_embedding, class_embedding) + position_embedding
struct ggml_tensor* patch_embedding; struct ggml_tensor* patch_embedding;
int64_t N = pixel_values->ne[3]; int64_t N = pixel_values->ne[3];
patch_embedding = ggml_nn_conv_2d(ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size] patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
patch_embedding = ggml_reshape_3d(ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches] patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches]
patch_embedding = ggml_cont(ctx, ggml_permute(ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim] patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim]
patch_embedding = ggml_reshape_4d(ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1] patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1]
struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, N); struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N);
class_embedding = ggml_repeat(ctx, class_embed_weight, class_embedding); // [N, embed_dim] class_embedding = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding); // [N, embed_dim]
class_embedding = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1] class_embedding = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1] struct ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim] x = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
x = ggml_add(ctx, x, position_embed_weight); x = ggml_add(ctx->ggml_ctx, x, position_embed_weight);
return x; // [N, num_positions, embed_dim] return x; // [N, num_positions, embed_dim]
} }
}; };
@ -669,7 +692,7 @@ enum CLIPVersion {
class CLIPTextModel : public GGMLBlock { class CLIPTextModel : public GGMLBlock {
protected: protected:
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
if (version == OPEN_CLIP_VIT_BIGG_14) { if (version == OPEN_CLIP_VIT_BIGG_14) {
enum ggml_type wtype = GGML_TYPE_F32; enum ggml_type wtype = GGML_TYPE_F32;
params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size); params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
@ -690,7 +713,8 @@ public:
CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14, CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool with_final_ln = true, bool with_final_ln = true,
bool force_clip_f32 = false) bool force_clip_f32 = false,
bool proj_in = false)
: version(version), with_final_ln(with_final_ln) { : version(version), with_final_ln(with_final_ln) {
if (version == OPEN_CLIP_VIT_H_14) { if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1024; hidden_size = 1024;
@ -705,7 +729,7 @@ public:
} }
blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32)); blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size)); blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size)); blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
} }
@ -714,8 +738,7 @@ public:
return embeddings->get_token_embed_weight(); return embeddings->get_token_embed_weight();
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* input_ids, struct ggml_tensor* input_ids,
struct ggml_tensor* tkn_embeddings, struct ggml_tensor* tkn_embeddings,
size_t max_token_idx = 0, size_t max_token_idx = 0,
@ -727,16 +750,16 @@ public:
auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]); auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]);
auto x = embeddings->forward(ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size] auto x = embeddings->forward(ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size]
x = encoder->forward(ctx, backend, x, return_pooled ? -1 : clip_skip, true); x = encoder->forward(ctx, x, return_pooled ? -1 : clip_skip, true);
if (return_pooled || with_final_ln) { if (return_pooled || with_final_ln) {
x = final_layer_norm->forward(ctx, x); x = final_layer_norm->forward(ctx, x);
} }
if (return_pooled) { if (return_pooled) {
auto text_projection = params["text_projection"]; auto text_projection = params["text_projection"];
ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx); ggml_tensor* pooled = ggml_view_1d(ctx->ggml_ctx, x, hidden_size, x->nb[1] * max_token_idx);
if (text_projection != nullptr) { if (text_projection != nullptr) {
pooled = ggml_nn_linear(ctx, pooled, text_projection, nullptr); pooled = ggml_ext_linear(ctx->ggml_ctx, pooled, text_projection, nullptr);
} else { } else {
LOG_DEBUG("identity projection"); LOG_DEBUG("identity projection");
} }
@ -760,7 +783,7 @@ public:
int32_t n_layer = 24; int32_t n_layer = 24;
public: public:
CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14) { CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool proj_in = false) {
if (version == OPEN_CLIP_VIT_H_14) { if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1280; hidden_size = 1280;
intermediate_size = 5120; intermediate_size = 5120;
@ -775,12 +798,11 @@ public:
blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPVisionEmbeddings(hidden_size, num_channels, patch_size, image_size)); blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPVisionEmbeddings(hidden_size, num_channels, patch_size, image_size));
blocks["pre_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size)); blocks["pre_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size)); blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size)); blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* pixel_values, struct ggml_tensor* pixel_values,
bool return_pooled = true, bool return_pooled = true,
int clip_skip = -1) { int clip_skip = -1) {
@ -792,14 +814,14 @@ public:
auto x = embeddings->forward(ctx, pixel_values); // [N, num_positions, embed_dim] auto x = embeddings->forward(ctx, pixel_values); // [N, num_positions, embed_dim]
x = pre_layernorm->forward(ctx, x); x = pre_layernorm->forward(ctx, x);
x = encoder->forward(ctx, backend, x, clip_skip, false); x = encoder->forward(ctx, x, clip_skip, false);
// print_ggml_tensor(x, true, "ClipVisionModel x: "); // print_ggml_tensor(x, true, "ClipVisionModel x: ");
auto last_hidden_state = x; auto last_hidden_state = x;
x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size] x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size]
GGML_ASSERT(x->ne[3] == 1); GGML_ASSERT(x->ne[3] == 1);
if (return_pooled) { if (return_pooled) {
ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0)); ggml_tensor* pooled = ggml_cont(ctx->ggml_ctx, ggml_view_2d(ctx->ggml_ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
return pooled; // [N, hidden_size] return pooled; // [N, hidden_size]
} else { } else {
// return x; // [N, n_token, hidden_size] // return x; // [N, n_token, hidden_size]
@ -814,8 +836,8 @@ protected:
int64_t out_features; int64_t out_features;
bool transpose_weight; bool transpose_weight;
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32); enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
if (transpose_weight) { if (transpose_weight) {
params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features); params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
} else { } else {
@ -831,12 +853,12 @@ public:
out_features(out_features), out_features(out_features),
transpose_weight(transpose_weight) {} transpose_weight(transpose_weight) {}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
struct ggml_tensor* w = params["weight"]; struct ggml_tensor* w = params["weight"];
if (transpose_weight) { if (transpose_weight) {
w = ggml_cont(ctx, ggml_transpose(ctx, w)); w = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, w));
} }
return ggml_nn_linear(ctx, x, w, nullptr); return ggml_ext_linear(ctx->ggml_ctx, x, w, nullptr);
} }
}; };
@ -848,7 +870,8 @@ public:
public: public:
CLIPVisionModelProjection(CLIPVersion version = OPENAI_CLIP_VIT_L_14, CLIPVisionModelProjection(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool transpose_proj_w = false) { bool transpose_proj_w = false,
bool proj_in = false) {
if (version == OPEN_CLIP_VIT_H_14) { if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1280; hidden_size = 1280;
projection_dim = 1024; projection_dim = 1024;
@ -856,12 +879,11 @@ public:
hidden_size = 1664; hidden_size = 1664;
} }
blocks["vision_model"] = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version)); blocks["vision_model"] = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version, proj_in));
blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w)); blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* pixel_values, struct ggml_tensor* pixel_values,
bool return_pooled = true, bool return_pooled = true,
int clip_skip = -1) { int clip_skip = -1) {
@ -870,7 +892,7 @@ public:
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]); auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]); auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
auto x = vision_model->forward(ctx, backend, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size] auto x = vision_model->forward(ctx, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size]
if (return_pooled) { if (return_pooled) {
x = visual_projection->forward(ctx, x); // [N, projection_dim] x = visual_projection->forward(ctx, x); // [N, projection_dim]
@ -885,13 +907,24 @@ struct CLIPTextModelRunner : public GGMLRunner {
CLIPTextModelRunner(ggml_backend_t backend, CLIPTextModelRunner(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types, const String2TensorStorage& tensor_storage_map,
const std::string prefix, const std::string prefix,
CLIPVersion version = OPENAI_CLIP_VIT_L_14, CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool with_final_ln = true, bool with_final_ln = true,
bool force_clip_f32 = false) bool force_clip_f32 = false)
: GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) { : GGMLRunner(backend, offload_params_to_cpu) {
model.init(params_ctx, tensor_types, prefix); bool proj_in = false;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
if (contains(name, "self_attn.in_proj")) {
proj_in = true;
break;
}
}
model = CLIPTextModel(version, with_final_ln, force_clip_f32, proj_in);
model.init(params_ctx, tensor_storage_map, prefix);
} }
std::string get_desc() override { std::string get_desc() override {
@ -902,8 +935,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
model.get_param_tensors(tensors, prefix); model.get_param_tensors(tensors, prefix);
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* input_ids, struct ggml_tensor* input_ids,
struct ggml_tensor* embeddings, struct ggml_tensor* embeddings,
size_t max_token_idx = 0, size_t max_token_idx = 0,
@ -913,10 +945,10 @@ struct CLIPTextModelRunner : public GGMLRunner {
size_t n_token = input_ids->ne[0]; size_t n_token = input_ids->ne[0];
if (input_ids->ne[0] > model.n_token) { if (input_ids->ne[0] > model.n_token) {
GGML_ASSERT(input_ids->ne[0] % model.n_token == 0); GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token); input_ids = ggml_reshape_2d(ctx->ggml_ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
} }
return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip); return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
} }
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
@ -925,7 +957,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
size_t max_token_idx = 0, size_t max_token_idx = 0,
bool return_pooled = false, bool return_pooled = false,
int clip_skip = -1) { int clip_skip = -1) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); struct ggml_cgraph* gf = new_graph_custom(2048);
input_ids = to_backend(input_ids); input_ids = to_backend(input_ids);
@ -943,14 +975,16 @@ struct CLIPTextModelRunner : public GGMLRunner {
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1); embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
} }
struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip); auto runner_ctx = get_context();
struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
ggml_build_forward_expand(gf, hidden_states); ggml_build_forward_expand(gf, hidden_states);
return gf; return gf;
} }
void compute(const int n_threads, bool compute(const int n_threads,
struct ggml_tensor* input_ids, struct ggml_tensor* input_ids,
int num_custom_embeddings, int num_custom_embeddings,
void* custom_embeddings_data, void* custom_embeddings_data,
@ -962,7 +996,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip); return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
}; };
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
} }
}; };

View File

@ -23,12 +23,12 @@ public:
} }
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w] // x: [N, channels, h, w]
if (vae_downsample) { if (vae_downsample) {
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]); auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
x = ggml_pad(ctx, x, 1, 1, 0, 0); x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0);
x = conv->forward(ctx, x); x = conv->forward(ctx, x);
} else { } else {
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]); auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
@ -52,12 +52,12 @@ public:
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w] // x: [N, channels, h, w]
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]); auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2] x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2] x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
return x; return x;
} }
}; };
@ -121,7 +121,7 @@ public:
} }
} }
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) { virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) {
// For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml // For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
// [N, c, t, h, w] => [N, c, t, h * w] // [N, c, t, h, w] => [N, c, t, h * w]
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w] // x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
@ -137,32 +137,32 @@ public:
// in_layers // in_layers
auto h = in_layers_0->forward(ctx, x); auto h = in_layers_0->forward(ctx, x);
h = ggml_silu_inplace(ctx, h); h = ggml_silu_inplace(ctx->ggml_ctx, h);
h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w] h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
// emb_layers // emb_layers
if (!skip_t_emb) { if (!skip_t_emb) {
auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]); auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
auto emb_out = ggml_silu(ctx, emb); auto emb_out = ggml_silu(ctx->ggml_ctx, emb);
emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels] emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels]
if (dims == 2) { if (dims == 2) {
emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1] emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
} else { } else {
emb_out = ggml_reshape_4d(ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1] emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
if (exchange_temb_dims) { if (exchange_temb_dims) {
// emb_out = rearrange(emb_out, "b t c ... -> b c t ...") // emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
emb_out = ggml_cont(ctx, ggml_permute(ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1] emb_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
} }
} }
h = ggml_add(ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w] h = ggml_add(ctx->ggml_ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
} }
// out_layers // out_layers
h = out_layers_0->forward(ctx, h); h = out_layers_0->forward(ctx, h);
h = ggml_silu_inplace(ctx, h); h = ggml_silu_inplace(ctx->ggml_ctx, h);
// dropout, skip for inference // dropout, skip for inference
h = out_layers_3->forward(ctx, h); h = out_layers_3->forward(ctx, h);
@ -172,7 +172,7 @@ public:
x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w] x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
} }
h = ggml_add(ctx, h, x); h = ggml_add(ctx->ggml_ctx, h, x);
return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w] return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
} }
}; };
@ -182,35 +182,25 @@ protected:
int64_t dim_in; int64_t dim_in;
int64_t dim_out; int64_t dim_out;
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") override {
enum ggml_type wtype = get_type(prefix + "proj.weight", tensor_types, GGML_TYPE_F32);
enum ggml_type bias_wtype = GGML_TYPE_F32;
params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
params["proj.bias"] = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
}
public: public:
GEGLU(int64_t dim_in, int64_t dim_out) GEGLU(int64_t dim_in, int64_t dim_out)
: dim_in(dim_in), dim_out(dim_out) {} : dim_in(dim_in), dim_out(dim_out) {
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out * 2));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [ne3, ne2, ne1, dim_in] // x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out] // return: [ne3, ne2, ne1, dim_out]
struct ggml_tensor* w = params["proj.weight"]; auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
struct ggml_tensor* b = params["proj.bias"];
auto x_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0); // [dim_out, dim_in] x = proj->forward(ctx, x); // [ne3, ne2, ne1, dim_out*2]
auto x_b = ggml_view_1d(ctx, b, b->ne[0] / 2, 0); // [dim_out, dim_in] auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0);
auto gate_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2); // [dim_out, ] x = x_vec[0]; // [ne3, ne2, ne1, dim_out]
auto gate_b = ggml_view_1d(ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2); // [dim_out, ] auto gate = x_vec[1]; // [ne3, ne2, ne1, dim_out]
auto x_in = x; gate = ggml_gelu_inplace(ctx->ggml_ctx, gate);
x = ggml_nn_linear(ctx, x_in, x_w, x_b); // [ne3, ne2, ne1, dim_out]
auto gate = ggml_nn_linear(ctx, x_in, gate_w, gate_b); // [ne3, ne2, ne1, dim_out]
gate = ggml_gelu_inplace(ctx, gate); x = ggml_mul(ctx->ggml_ctx, x, gate); // [ne3, ne2, ne1, dim_out]
x = ggml_mul(ctx, x, gate); // [ne3, ne2, ne1, dim_out]
return x; return x;
} }
@ -222,13 +212,13 @@ public:
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias)); blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [ne3, ne2, ne1, dim_in] // x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out] // return: [ne3, ne2, ne1, dim_out]
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]); auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
x = proj->forward(ctx, x); x = proj->forward(ctx, x);
x = ggml_gelu_inplace(ctx, x); x = ggml_gelu_inplace(ctx->ggml_ctx, x);
return x; return x;
} }
}; };
@ -252,17 +242,21 @@ public:
} }
// net_1 is nn.Dropout(), skip for inference // net_1 is nn.Dropout(), skip for inference
float scale = 1.f; bool force_prec_f32 = false;
float scale = 1.f;
if (precision_fix) { if (precision_fix) {
scale = 1.f / 128.f; scale = 1.f / 128.f;
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
} }
// The purpose of the scale here is to prevent NaN issues in certain situations. // The purpose of the scale here is to prevent NaN issues in certain situations.
// For example, when using Vulkan without enabling force_prec_f32, // For example, when using Vulkan without enabling force_prec_f32,
// or when using CUDA but the weights are k-quants. // or when using CUDA but the weights are k-quants.
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, false, scale)); blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim] // x: [ne3, ne2, ne1, dim]
// return: [ne3, ne2, ne1, dim_out] // return: [ne3, ne2, ne1, dim_out]
@ -281,19 +275,16 @@ protected:
int64_t context_dim; int64_t context_dim;
int64_t n_head; int64_t n_head;
int64_t d_head; int64_t d_head;
bool flash_attn;
public: public:
CrossAttention(int64_t query_dim, CrossAttention(int64_t query_dim,
int64_t context_dim, int64_t context_dim,
int64_t n_head, int64_t n_head,
int64_t d_head, int64_t d_head)
bool flash_attn = false)
: n_head(n_head), : n_head(n_head),
d_head(d_head), d_head(d_head),
query_dim(query_dim), query_dim(query_dim),
context_dim(context_dim), context_dim(context_dim) {
flash_attn(flash_attn) {
int64_t inner_dim = d_head * n_head; int64_t inner_dim = d_head * n_head;
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false)); blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
@ -304,8 +295,7 @@ public:
// to_out_1 is nn.Dropout(), skip for inference // to_out_1 is nn.Dropout(), skip for inference
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* context) { struct ggml_tensor* context) {
// x: [N, n_token, query_dim] // x: [N, n_token, query_dim]
@ -325,7 +315,7 @@ public:
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim] auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim] auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, nullptr, false, false, flash_attn); // [N, n_token, inner_dim] x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim] x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
return x; return x;
@ -343,16 +333,15 @@ public:
int64_t n_head, int64_t n_head,
int64_t d_head, int64_t d_head,
int64_t context_dim, int64_t context_dim,
bool ff_in = false, bool ff_in = false)
bool flash_attn = false)
: n_head(n_head), d_head(d_head), ff_in(ff_in) { : n_head(n_head), d_head(d_head), ff_in(ff_in) {
// disable_self_attn is always False // disable_self_attn is always False
// disable_temporal_crossattention is always False // disable_temporal_crossattention is always False
// switch_temporal_ca_to_sa is always False // switch_temporal_ca_to_sa is always False
// inner_dim is always None or equal to dim // inner_dim is always None or equal to dim
// gated_ff is always True // gated_ff is always True
blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head, flash_attn)); blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head));
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head, flash_attn)); blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head));
blocks["ff"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim)); blocks["ff"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim)); blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim)); blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
@ -364,8 +353,7 @@ public:
} }
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* context) { struct ggml_tensor* context) {
// x: [N, n_token, query_dim] // x: [N, n_token, query_dim]
@ -387,21 +375,21 @@ public:
x = norm_in->forward(ctx, x); x = norm_in->forward(ctx, x);
x = ff_in->forward(ctx, x); x = ff_in->forward(ctx, x);
// self.is_res is always True // self.is_res is always True
x = ggml_add(ctx, x, x_skip); x = ggml_add(ctx->ggml_ctx, x, x_skip);
} }
auto r = x; auto r = x;
x = norm1->forward(ctx, x); x = norm1->forward(ctx, x);
x = attn1->forward(ctx, backend, x, x); // self-attention x = attn1->forward(ctx, x, x); // self-attention
x = ggml_add(ctx, x, r); x = ggml_add(ctx->ggml_ctx, x, r);
r = x; r = x;
x = norm2->forward(ctx, x); x = norm2->forward(ctx, x);
x = attn2->forward(ctx, backend, x, context); // cross-attention x = attn2->forward(ctx, x, context); // cross-attention
x = ggml_add(ctx, x, r); x = ggml_add(ctx->ggml_ctx, x, r);
r = x; r = x;
x = norm3->forward(ctx, x); x = norm3->forward(ctx, x);
x = ff->forward(ctx, x); x = ff->forward(ctx, x);
x = ggml_add(ctx, x, r); x = ggml_add(ctx->ggml_ctx, x, r);
return x; return x;
} }
@ -414,6 +402,23 @@ protected:
int64_t d_head; int64_t d_head;
int64_t depth = 1; // 1 int64_t depth = 1; // 1
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2 int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
bool use_linear = false;
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
if (iter != tensor_storage_map.end()) {
int64_t inner_dim = n_head * d_head;
if (iter->second.n_dims == 4 && use_linear) {
use_linear = false;
blocks["proj_in"] = std::make_shared<Conv2d>(in_channels, inner_dim, std::pair{1, 1});
blocks["proj_out"] = std::make_shared<Conv2d>(inner_dim, in_channels, std::pair{1, 1});
} else if (iter->second.n_dims == 2 && !use_linear) {
use_linear = true;
blocks["proj_in"] = std::make_shared<Linear>(in_channels, inner_dim);
blocks["proj_out"] = std::make_shared<Linear>(inner_dim, in_channels);
}
}
}
public: public:
SpatialTransformer(int64_t in_channels, SpatialTransformer(int64_t in_channels,
@ -421,35 +426,42 @@ public:
int64_t d_head, int64_t d_head,
int64_t depth, int64_t depth,
int64_t context_dim, int64_t context_dim,
bool flash_attn = false) bool use_linear)
: in_channels(in_channels), : in_channels(in_channels),
n_head(n_head), n_head(n_head),
d_head(d_head), d_head(d_head),
depth(depth), depth(depth),
context_dim(context_dim) { context_dim(context_dim),
// We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False use_linear(use_linear) {
// disable_self_attn is always False // disable_self_attn is always False
int64_t inner_dim = n_head * d_head; // in_channels int64_t inner_dim = n_head * d_head; // in_channels
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels)); blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1})); if (use_linear) {
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, inner_dim));
} else {
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
}
for (int i = 0; i < depth; i++) { for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i); std::string name = "transformer_blocks." + std::to_string(i);
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn)); blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false));
} }
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1})); if (use_linear) {
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, in_channels));
} else {
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
}
} }
virtual struct ggml_tensor* forward(struct ggml_context* ctx, virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* context) { struct ggml_tensor* context) {
// x: [N, in_channels, h, w] // x: [N, in_channels, h, w]
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)] // context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]); auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
auto proj_in = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]); auto proj_in = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_in"]);
auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]); auto proj_out = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_out"]);
auto x_in = x; auto x_in = x;
int64_t n = x->ne[3]; int64_t n = x->ne[3];
@ -458,32 +470,45 @@ public:
int64_t inner_dim = n_head * d_head; int64_t inner_dim = n_head * d_head;
x = norm->forward(ctx, x); x = norm->forward(ctx, x);
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w] if (use_linear) {
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim] x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim] x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
} else {
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
}
for (int i = 0; i < depth; i++) { for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i); std::string name = "transformer_blocks." + std::to_string(i);
auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]); auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]);
x = transformer_block->forward(ctx, backend, x, context); x = transformer_block->forward(ctx, x, context);
} }
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w] if (use_linear) {
x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w] // proj_out
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
// proj_out x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
x = proj_out->forward(ctx, x); // [N, in_channels, h, w] x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
} else {
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
x = ggml_add(ctx, x, x_in); // proj_out
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
}
x = ggml_add(ctx->ggml_ctx, x, x_in);
return x; return x;
} }
}; };
class AlphaBlender : public GGMLBlock { class AlphaBlender : public GGMLBlock {
protected: protected:
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") override { void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix // Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
enum ggml_type wtype = GGML_TYPE_F32; enum ggml_type wtype = GGML_TYPE_F32;
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
@ -492,7 +517,7 @@ protected:
float get_alpha() { float get_alpha() {
// image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,] // image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
// so learned_with_images is same as learned // so learned_with_images is same as learned
float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]); float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
return sigmoid(alpha); return sigmoid(alpha);
} }
@ -503,14 +528,14 @@ public:
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern // since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x_spatial, struct ggml_tensor* x_spatial,
struct ggml_tensor* x_temporal) { struct ggml_tensor* x_temporal) {
// image_only_indicator is always tensor([0.]) // image_only_indicator is always tensor([0.])
float alpha = get_alpha(); float alpha = get_alpha();
auto x = ggml_add(ctx, auto x = ggml_add(ctx->ggml_ctx,
ggml_scale(ctx, x_spatial, alpha), ggml_scale(ctx->ggml_ctx, x_spatial, alpha),
ggml_scale(ctx, x_temporal, 1.0f - alpha)); ggml_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
return x; return x;
} }
}; };
@ -528,7 +553,7 @@ public:
blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender()); blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* emb, struct ggml_tensor* emb,
int num_video_frames) { int num_video_frames) {
@ -546,18 +571,18 @@ public:
int64_t H = x->ne[1]; int64_t H = x->ne[1];
int64_t W = x->ne[0]; int64_t W = x->ne[0];
x = ggml_reshape_4d(ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w) x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w) x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
auto x_mix = x; auto x_mix = x;
emb = ggml_reshape_4d(ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ... emb = ggml_reshape_4d(ctx->ggml_ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ...
x = time_stack->forward(ctx, x, emb); // b t c (h w) x = time_stack->forward(ctx, x, emb); // b t c (h w)
x = time_mixer->forward(ctx, x_mix, x); // b t c (h w) x = time_mixer->forward(ctx, x_mix, x); // b t c (h w)
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w) x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
x = ggml_reshape_4d(ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
return x; return x;
} }

View File

@ -2,7 +2,7 @@
#define __CONDITIONER_HPP__ #define __CONDITIONER_HPP__
#include "clip.hpp" #include "clip.hpp"
#include "qwenvl.hpp" #include "llm.hpp"
#include "t5.hpp" #include "t5.hpp"
struct SDCondition { struct SDCondition {
@ -34,6 +34,7 @@ struct Conditioner {
virtual void free_params_buffer() = 0; virtual void free_params_buffer() = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0; virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
virtual size_t get_params_buffer_size() = 0; virtual size_t get_params_buffer_size() = 0;
virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx, virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
int n_threads, int n_threads,
const ConditionerParams& conditioner_params) { const ConditionerParams& conditioner_params) {
@ -55,27 +56,33 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
std::shared_ptr<CLIPTextModelRunner> text_model2; std::shared_ptr<CLIPTextModelRunner> text_model2;
std::string trigger_word = "img"; // should be user settable std::string trigger_word = "img"; // should be user settable
std::string embd_dir; std::map<std::string, std::string> embedding_map;
int32_t num_custom_embeddings = 0; int32_t num_custom_embeddings = 0;
int32_t num_custom_embeddings_2 = 0; int32_t num_custom_embeddings_2 = 0;
std::vector<uint8_t> token_embed_custom; std::vector<uint8_t> token_embed_custom;
std::vector<std::string> readed_embeddings; std::map<std::string, std::pair<int, int>> embedding_pos_map;
FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend, FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types, const String2TensorStorage& tensor_storage_map,
const std::string& embd_dir, const std::map<std::string, std::string>& orig_embedding_map,
SDVersion version = VERSION_SD1, SDVersion version = VERSION_SD1,
PMVersion pv = PM_VERSION_1) PMVersion pv = PM_VERSION_1)
: version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) { : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) {
bool force_clip_f32 = embd_dir.size() > 0; for (const auto& kv : orig_embedding_map) {
std::string name = kv.first;
std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); });
embedding_map[name] = kv.second;
tokenizer.add_special_token(name);
}
bool force_clip_f32 = !embedding_map.empty();
if (sd_version_is_sd1(version)) { if (sd_version_is_sd1(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32); text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
} else if (sd_version_is_sd2(version)) { } else if (sd_version_is_sd2(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32); text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
} else if (sd_version_is_sdxl(version)) { } else if (sd_version_is_sdxl(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32); text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32); text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
} }
} }
@ -108,15 +115,25 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
return buffer_size; return buffer_size;
} }
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
text_model->set_weight_adapter(adapter);
if (sd_version_is_sdxl(version)) {
text_model2->set_weight_adapter(adapter);
}
}
bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) { bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
// the order matters
ModelLoader model_loader; ModelLoader model_loader;
if (!model_loader.init_from_file(embd_path)) { if (!model_loader.init_from_file_and_convert_name(embd_path)) {
LOG_ERROR("embedding '%s' failed", embd_name.c_str()); LOG_ERROR("embedding '%s' failed", embd_name.c_str());
return false; return false;
} }
if (std::find(readed_embeddings.begin(), readed_embeddings.end(), embd_name) != readed_embeddings.end()) { auto iter = embedding_pos_map.find(embd_name);
if (iter != embedding_pos_map.end()) {
LOG_DEBUG("embedding already read in: %s", embd_name.c_str()); LOG_DEBUG("embedding already read in: %s", embd_name.c_str());
for (int i = iter->second.first; i < iter->second.second; i++) {
bpe_tokens.push_back(text_model->model.vocab_size + i);
}
return true; return true;
} }
struct ggml_init_params params; struct ggml_init_params params;
@ -147,7 +164,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
return true; return true;
}; };
model_loader.load_tensors(on_load, 1); model_loader.load_tensors(on_load, 1);
readed_embeddings.push_back(embd_name); int pos_start = num_custom_embeddings;
if (embd) { if (embd) {
int64_t hidden_size = text_model->model.hidden_size; int64_t hidden_size = text_model->model.hidden_size;
token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd)); token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
@ -174,6 +191,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
} }
LOG_DEBUG("embedding '%s' applied, custom embeddings: %i (text model 2)", embd_name.c_str(), num_custom_embeddings_2); LOG_DEBUG("embedding '%s' applied, custom embeddings: %i (text model 2)", embd_name.c_str(), num_custom_embeddings_2);
} }
int pos_end = num_custom_embeddings;
if (pos_end == pos_start) {
return false;
}
embedding_pos_map[embd_name] = std::pair{pos_start, pos_end};
return true; return true;
} }
@ -188,25 +210,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
std::vector<int> convert_token_to_id(std::string text) { std::vector<int> convert_token_to_id(std::string text) {
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool { auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
size_t word_end = str.find(","); auto iter = embedding_map.find(str);
std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end); if (iter == embedding_map.end()) {
embd_name = trim(embd_name); return false;
std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
} }
if (embd_path.size() == 0) { std::string embedding_path = iter->second;
embd_path = get_full_path(embd_dir, embd_name + ".safetensors"); if (load_embedding(str, embedding_path, bpe_tokens)) {
} return true;
if (embd_path.size() > 0) {
if (load_embedding(embd_name, embd_path, bpe_tokens)) {
if (word_end != std::string::npos) {
str = str.substr(word_end);
} else {
str = "";
}
return true;
}
} }
return false; return false;
}; };
@ -237,25 +247,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
} }
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool { auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
size_t word_end = str.find(","); auto iter = embedding_map.find(str);
std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end); if (iter == embedding_map.end()) {
embd_name = trim(embd_name); return false;
std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
} }
if (embd_path.size() == 0) { std::string embedding_path = iter->second;
embd_path = get_full_path(embd_dir, embd_name + ".safetensors"); if (load_embedding(str, embedding_path, bpe_tokens)) {
} return true;
if (embd_path.size() > 0) {
if (load_embedding(embd_name, embd_path, bpe_tokens)) {
if (word_end != std::string::npos) {
str = str.substr(word_end);
} else {
str = "";
}
return true;
}
} }
return false; return false;
}; };
@ -270,13 +268,30 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
const std::string& curr_text = item.first; const std::string& curr_text = item.first;
float curr_weight = item.second; float curr_weight = item.second;
// printf(" %s: %f \n", curr_text.c_str(), curr_weight); // printf(" %s: %f \n", curr_text.c_str(), curr_weight);
int32_t clean_index = 0;
if (curr_text == "BREAK" && curr_weight == -1.0f) {
// Pad token array up to chunk size at this point.
// TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
// Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
int padding_size = 75 - (tokens_acc % 75);
for (int j = 0; j < padding_size; j++) {
clean_input_ids.push_back(tokenizer.EOS_TOKEN_ID);
clean_index++;
}
// After padding, continue to the next iteration to process the following text as a new segment
tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
weights.insert(weights.end(), padding_size, curr_weight);
continue;
}
// Regular token, process normally
std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb); std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
int32_t clean_index = 0;
for (uint32_t i = 0; i < curr_tokens.size(); i++) { for (uint32_t i = 0; i < curr_tokens.size(); i++) {
int token_id = curr_tokens[i]; int token_id = curr_tokens[i];
if (token_id == image_token) if (token_id == image_token) {
class_token_index.push_back(clean_index - 1); class_token_index.push_back(clean_index - 1);
else { } else {
clean_input_ids.push_back(token_id); clean_input_ids.push_back(token_id);
clean_index++; clean_index++;
} }
@ -351,25 +366,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
} }
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool { auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
size_t word_end = str.find(","); auto iter = embedding_map.find(str);
std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end); if (iter == embedding_map.end()) {
embd_name = trim(embd_name); return false;
std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
} }
if (embd_path.size() == 0) { std::string embedding_path = iter->second;
embd_path = get_full_path(embd_dir, embd_name + ".safetensors"); if (load_embedding(str, embedding_path, bpe_tokens)) {
} return true;
if (embd_path.size() > 0) {
if (load_embedding(embd_name, embd_path, bpe_tokens)) {
if (word_end != std::string::npos) {
str = str.substr(word_end);
} else {
str = "";
}
return true;
}
} }
return false; return false;
}; };
@ -379,6 +382,22 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
for (const auto& item : parsed_attention) { for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first; const std::string& curr_text = item.first;
float curr_weight = item.second; float curr_weight = item.second;
if (curr_text == "BREAK" && curr_weight == -1.0f) {
// Pad token array up to chunk size at this point.
// TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
// Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
size_t current_size = tokens.size();
size_t padding_size = (75 - (current_size % 75)) % 75; // Ensure no negative padding
if (padding_size > 0) {
LOG_DEBUG("BREAK token encountered, padding current chunk by %zu tokens.", padding_size);
tokens.insert(tokens.end(), padding_size, tokenizer.EOS_TOKEN_ID);
weights.insert(weights.end(), padding_size, 1.0f);
}
continue; // Skip to the next item after handling BREAK
}
std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb); std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
weights.insert(weights.end(), curr_tokens.size(), curr_weight); weights.insert(weights.end(), curr_tokens.size(), curr_weight);
@ -462,7 +481,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
clip_skip, clip_skip,
&chunk_hidden_states2, work_ctx); &chunk_hidden_states2, work_ctx);
// concat // concat
chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0); chunk_hidden_states = ggml_ext_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
if (chunk_idx == 0) { if (chunk_idx == 0) {
text_model2->compute(n_threads, text_model2->compute(n_threads,
@ -484,18 +503,18 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
ggml_tensor* result = ggml_dup_tensor(work_ctx, chunk_hidden_states); ggml_tensor* result = ggml_dup_tensor(work_ctx, chunk_hidden_states);
{ {
float original_mean = ggml_tensor_mean(chunk_hidden_states); float original_mean = ggml_ext_tensor_mean(chunk_hidden_states);
for (int i2 = 0; i2 < chunk_hidden_states->ne[2]; i2++) { for (int i2 = 0; i2 < chunk_hidden_states->ne[2]; i2++) {
for (int i1 = 0; i1 < chunk_hidden_states->ne[1]; i1++) { for (int i1 = 0; i1 < chunk_hidden_states->ne[1]; i1++) {
for (int i0 = 0; i0 < chunk_hidden_states->ne[0]; i0++) { for (int i0 = 0; i0 < chunk_hidden_states->ne[0]; i0++) {
float value = ggml_tensor_get_f32(chunk_hidden_states, i0, i1, i2); float value = ggml_ext_tensor_get_f32(chunk_hidden_states, i0, i1, i2);
value *= chunk_weights[i1]; value *= chunk_weights[i1];
ggml_tensor_set_f32(result, value, i0, i1, i2); ggml_ext_tensor_set_f32(result, value, i0, i1, i2);
} }
} }
} }
float new_mean = ggml_tensor_mean(result); float new_mean = ggml_ext_tensor_mean(result);
ggml_tensor_scale(result, (original_mean / new_mean)); ggml_ext_tensor_scale_inplace(result, (original_mean / new_mean));
} }
if (zero_out_masked) { if (zero_out_masked) {
float* vec = (float*)result->data; float* vec = (float*)result->data;
@ -623,9 +642,21 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
FrozenCLIPVisionEmbedder(ggml_backend_t backend, FrozenCLIPVisionEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}) const String2TensorStorage& tensor_storage_map = {})
: vision_model(OPEN_CLIP_VIT_H_14), GGMLRunner(backend, offload_params_to_cpu) { : GGMLRunner(backend, offload_params_to_cpu) {
vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer"); std::string prefix = "cond_stage_model.transformer";
bool proj_in = false;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
if (contains(name, "self_attn.in_proj")) {
proj_in = true;
break;
}
}
vision_model = CLIPVisionModelProjection(OPEN_CLIP_VIT_H_14, false, proj_in);
vision_model.init(params_ctx, tensor_storage_map, prefix);
} }
std::string get_desc() override { std::string get_desc() override {
@ -641,14 +672,16 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
pixel_values = to_backend(pixel_values); pixel_values = to_backend(pixel_values);
struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, runtime_backend, pixel_values, return_pooled, clip_skip); auto runner_ctx = get_context();
struct ggml_tensor* hidden_states = vision_model.forward(&runner_ctx, pixel_values, return_pooled, clip_skip);
ggml_build_forward_expand(gf, hidden_states); ggml_build_forward_expand(gf, hidden_states);
return gf; return gf;
} }
void compute(const int n_threads, bool compute(const int n_threads,
ggml_tensor* pixel_values, ggml_tensor* pixel_values,
bool return_pooled, bool return_pooled,
int clip_skip, int clip_skip,
@ -657,7 +690,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(pixel_values, return_pooled, clip_skip); return build_graph(pixel_values, return_pooled, clip_skip);
}; };
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
} }
}; };
@ -671,12 +704,12 @@ struct SD3CLIPEmbedder : public Conditioner {
SD3CLIPEmbedder(ggml_backend_t backend, SD3CLIPEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}) const String2TensorStorage& tensor_storage_map = {})
: clip_g_tokenizer(0) { : clip_g_tokenizer(0) {
bool use_clip_l = false; bool use_clip_l = false;
bool use_clip_g = false; bool use_clip_g = false;
bool use_t5 = false; bool use_t5 = false;
for (auto pair : tensor_types) { for (auto pair : tensor_storage_map) {
if (pair.first.find("text_encoders.clip_l") != std::string::npos) { if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
use_clip_l = true; use_clip_l = true;
} else if (pair.first.find("text_encoders.clip_g") != std::string::npos) { } else if (pair.first.find("text_encoders.clip_g") != std::string::npos) {
@ -690,13 +723,13 @@ struct SD3CLIPEmbedder : public Conditioner {
return; return;
} }
if (use_clip_l) { if (use_clip_l) {
clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
} }
if (use_clip_g) { if (use_clip_g) {
clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
} }
if (use_t5) { if (use_t5) {
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
} }
} }
@ -750,6 +783,18 @@ struct SD3CLIPEmbedder : public Conditioner {
return buffer_size; return buffer_size;
} }
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
if (clip_l) {
clip_l->set_weight_adapter(adapter);
}
if (clip_g) {
clip_g->set_weight_adapter(adapter);
}
if (t5) {
t5->set_weight_adapter(adapter);
}
}
std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text, std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
size_t max_length = 0, size_t max_length = 0,
bool padding = false) { bool padding = false) {
@ -874,18 +919,18 @@ struct SD3CLIPEmbedder : public Conditioner {
work_ctx); work_ctx);
{ {
auto tensor = chunk_hidden_states_l; auto tensor = chunk_hidden_states_l;
float original_mean = ggml_tensor_mean(tensor); float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) { for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
float value = ggml_tensor_get_f32(tensor, i0, i1, i2); float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1]; value *= chunk_weights[i1];
ggml_tensor_set_f32(tensor, value, i0, i1, i2); ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
} }
} }
} }
float new_mean = ggml_tensor_mean(tensor); float new_mean = ggml_ext_tensor_mean(tensor);
ggml_tensor_scale(tensor, (original_mean / new_mean)); ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
} }
if (chunk_idx == 0) { if (chunk_idx == 0) {
@ -932,18 +977,18 @@ struct SD3CLIPEmbedder : public Conditioner {
{ {
auto tensor = chunk_hidden_states_g; auto tensor = chunk_hidden_states_g;
float original_mean = ggml_tensor_mean(tensor); float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) { for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
float value = ggml_tensor_get_f32(tensor, i0, i1, i2); float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1]; value *= chunk_weights[i1];
ggml_tensor_set_f32(tensor, value, i0, i1, i2); ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
} }
} }
} }
float new_mean = ggml_tensor_mean(tensor); float new_mean = ggml_ext_tensor_mean(tensor);
ggml_tensor_scale(tensor, (original_mean / new_mean)); ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
} }
if (chunk_idx == 0) { if (chunk_idx == 0) {
@ -984,18 +1029,18 @@ struct SD3CLIPEmbedder : public Conditioner {
work_ctx); work_ctx);
{ {
auto tensor = chunk_hidden_states_t5; auto tensor = chunk_hidden_states_t5;
float original_mean = ggml_tensor_mean(tensor); float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) { for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
float value = ggml_tensor_get_f32(tensor, i0, i1, i2); float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1]; value *= chunk_weights[i1];
ggml_tensor_set_f32(tensor, value, i0, i1, i2); ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
} }
} }
} }
float new_mean = ggml_tensor_mean(tensor); float new_mean = ggml_ext_tensor_mean(tensor);
ggml_tensor_scale(tensor, (original_mean / new_mean)); ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
} }
} else { } else {
chunk_hidden_states_t5 = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len); chunk_hidden_states_t5 = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len);
@ -1013,19 +1058,19 @@ struct SD3CLIPEmbedder : public Conditioner {
for (int i0 = 0; i0 < chunk_hidden_states_lg_pad->ne[0]; i0++) { for (int i0 = 0; i0 < chunk_hidden_states_lg_pad->ne[0]; i0++) {
float value = 0.f; float value = 0.f;
if (i0 < chunk_hidden_states_l->ne[0]) { if (i0 < chunk_hidden_states_l->ne[0]) {
value = ggml_tensor_get_f32(chunk_hidden_states_l, i0, i1, i2); value = ggml_ext_tensor_get_f32(chunk_hidden_states_l, i0, i1, i2);
} else if (i0 < chunk_hidden_states_l->ne[0] + chunk_hidden_states_g->ne[0]) { } else if (i0 < chunk_hidden_states_l->ne[0] + chunk_hidden_states_g->ne[0]) {
value = ggml_tensor_get_f32(chunk_hidden_states_g, i0 - chunk_hidden_states_l->ne[0], i1, i2); value = ggml_ext_tensor_get_f32(chunk_hidden_states_g, i0 - chunk_hidden_states_l->ne[0], i1, i2);
} }
ggml_tensor_set_f32(chunk_hidden_states_lg_pad, value, i0, i1, i2); ggml_ext_tensor_set_f32(chunk_hidden_states_lg_pad, value, i0, i1, i2);
} }
} }
} }
chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states_lg_pad, chunk_hidden_states_t5, 1); // [n_token*2, 4096] chunk_hidden_states = ggml_ext_tensor_concat(work_ctx, chunk_hidden_states_lg_pad, chunk_hidden_states_t5, 1); // [n_token*2, 4096]
if (chunk_idx == 0) { if (chunk_idx == 0) {
pooled = ggml_tensor_concat(work_ctx, pooled_l, pooled_g, 0); // [768 + 1280] pooled = ggml_ext_tensor_concat(work_ctx, pooled_l, pooled_g, 0); // [768 + 1280]
} }
int64_t t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
@ -1080,10 +1125,10 @@ struct FluxCLIPEmbedder : public Conditioner {
FluxCLIPEmbedder(ggml_backend_t backend, FluxCLIPEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}) { const String2TensorStorage& tensor_storage_map = {}) {
bool use_clip_l = false; bool use_clip_l = false;
bool use_t5 = false; bool use_t5 = false;
for (auto pair : tensor_types) { for (auto pair : tensor_storage_map) {
if (pair.first.find("text_encoders.clip_l") != std::string::npos) { if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
use_clip_l = true; use_clip_l = true;
} else if (pair.first.find("text_encoders.t5xxl") != std::string::npos) { } else if (pair.first.find("text_encoders.t5xxl") != std::string::npos) {
@ -1097,12 +1142,12 @@ struct FluxCLIPEmbedder : public Conditioner {
} }
if (use_clip_l) { if (use_clip_l) {
clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
} else { } else {
LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded."); LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded.");
} }
if (use_t5) { if (use_t5) {
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
} else { } else {
LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded."); LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded.");
} }
@ -1146,6 +1191,15 @@ struct FluxCLIPEmbedder : public Conditioner {
return buffer_size; return buffer_size;
} }
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {
if (clip_l) {
clip_l->set_weight_adapter(adapter);
}
if (t5) {
t5->set_weight_adapter(adapter);
}
}
std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text, std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
size_t max_length = 0, size_t max_length = 0,
bool padding = false) { bool padding = false) {
@ -1269,18 +1323,18 @@ struct FluxCLIPEmbedder : public Conditioner {
work_ctx); work_ctx);
{ {
auto tensor = chunk_hidden_states; auto tensor = chunk_hidden_states;
float original_mean = ggml_tensor_mean(tensor); float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) { for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
float value = ggml_tensor_get_f32(tensor, i0, i1, i2); float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1]; value *= chunk_weights[i1];
ggml_tensor_set_f32(tensor, value, i0, i1, i2); ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
} }
} }
} }
float new_mean = ggml_tensor_mean(tensor); float new_mean = ggml_ext_tensor_mean(tensor);
ggml_tensor_scale(tensor, (original_mean / new_mean)); ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
} }
} else { } else {
chunk_hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len); chunk_hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len);
@ -1340,13 +1394,13 @@ struct T5CLIPEmbedder : public Conditioner {
T5CLIPEmbedder(ggml_backend_t backend, T5CLIPEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}, const String2TensorStorage& tensor_storage_map = {},
bool use_mask = false, bool use_mask = false,
int mask_pad = 1, int mask_pad = 1,
bool is_umt5 = false) bool is_umt5 = false)
: use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) { : use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) {
bool use_t5 = false; bool use_t5 = false;
for (auto pair : tensor_types) { for (auto pair : tensor_storage_map) {
if (pair.first.find("text_encoders.t5xxl") != std::string::npos) { if (pair.first.find("text_encoders.t5xxl") != std::string::npos) {
use_t5 = true; use_t5 = true;
} }
@ -1356,7 +1410,7 @@ struct T5CLIPEmbedder : public Conditioner {
LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!"); LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
return; return;
} else { } else {
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5); t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5);
} }
} }
@ -1386,6 +1440,12 @@ struct T5CLIPEmbedder : public Conditioner {
return buffer_size; return buffer_size;
} }
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
if (t5) {
t5->set_weight_adapter(adapter);
}
}
std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text, std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
size_t max_length = 0, size_t max_length = 0,
bool padding = false) { bool padding = false) {
@ -1483,18 +1543,18 @@ struct T5CLIPEmbedder : public Conditioner {
work_ctx); work_ctx);
{ {
auto tensor = chunk_hidden_states; auto tensor = chunk_hidden_states;
float original_mean = ggml_tensor_mean(tensor); float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) { for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
float value = ggml_tensor_get_f32(tensor, i0, i1, i2); float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1]; value *= chunk_weights[i1];
ggml_tensor_set_f32(tensor, value, i0, i1, i2); ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
} }
} }
} }
float new_mean = ggml_tensor_mean(tensor); float new_mean = ggml_ext_tensor_mean(tensor);
ggml_tensor_scale(tensor, (original_mean / new_mean)); ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
} }
int64_t t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
@ -1505,7 +1565,7 @@ struct T5CLIPEmbedder : public Conditioner {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
if (chunk_mask[i1] < 0.f) { if (chunk_mask[i1] < 0.f) {
ggml_tensor_set_f32(tensor, 0.f, i0, i1, i2); ggml_ext_tensor_set_f32(tensor, 0.f, i0, i1, i2);
} }
} }
} }
@ -1541,55 +1601,74 @@ struct T5CLIPEmbedder : public Conditioner {
} }
}; };
struct Qwen2_5_VLCLIPEmbedder : public Conditioner { struct LLMEmbedder : public Conditioner {
Qwen::Qwen2Tokenizer tokenizer; SDVersion version;
std::shared_ptr<Qwen::Qwen2_5_VLRunner> qwenvl; std::shared_ptr<LLM::BPETokenizer> tokenizer;
std::shared_ptr<LLM::LLMRunner> llm;
Qwen2_5_VLCLIPEmbedder(ggml_backend_t backend, LLMEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}, const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "", SDVersion version = VERSION_QWEN_IMAGE,
bool enable_vision = false) { const std::string prefix = "",
qwenvl = std::make_shared<Qwen::Qwen2_5_VLRunner>(backend, bool enable_vision = false)
offload_params_to_cpu, : version(version) {
tensor_types, LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL;
"text_encoders.qwen2vl", if (sd_version_is_flux2(version)) {
enable_vision); arch = LLM::LLMArch::MISTRAL_SMALL_3_2;
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE) {
arch = LLM::LLMArch::QWEN3;
}
if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2) {
tokenizer = std::make_shared<LLM::MistralTokenizer>();
} else {
tokenizer = std::make_shared<LLM::Qwen2Tokenizer>();
}
llm = std::make_shared<LLM::LLMRunner>(arch,
backend,
offload_params_to_cpu,
tensor_storage_map,
"text_encoders.llm",
enable_vision);
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override { void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
qwenvl->get_param_tensors(tensors, "text_encoders.qwen2vl"); llm->get_param_tensors(tensors, "text_encoders.llm");
} }
void alloc_params_buffer() override { void alloc_params_buffer() override {
qwenvl->alloc_params_buffer(); llm->alloc_params_buffer();
} }
void free_params_buffer() override { void free_params_buffer() override {
qwenvl->free_params_buffer(); llm->free_params_buffer();
} }
size_t get_params_buffer_size() override { size_t get_params_buffer_size() override {
size_t buffer_size = 0; size_t buffer_size = 0;
buffer_size += qwenvl->get_params_buffer_size(); buffer_size += llm->get_params_buffer_size();
return buffer_size; return buffer_size;
} }
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
if (llm) {
llm->set_weight_adapter(adapter);
}
}
std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text, std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
size_t max_length = 0, std::pair<int, int> attn_range,
size_t system_prompt_length = 0, size_t max_length = 0,
bool padding = false) { bool padding = false) {
std::vector<std::pair<std::string, float>> parsed_attention; std::vector<std::pair<std::string, float>> parsed_attention;
if (system_prompt_length > 0) { parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
parsed_attention.emplace_back(text.substr(0, system_prompt_length), 1.f); if (attn_range.second - attn_range.first > 0) {
auto new_parsed_attention = parse_prompt_attention(text.substr(system_prompt_length, text.size() - system_prompt_length)); auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
parsed_attention.insert(parsed_attention.end(), parsed_attention.insert(parsed_attention.end(),
new_parsed_attention.begin(), new_parsed_attention.begin(),
new_parsed_attention.end()); new_parsed_attention.end());
} else {
parsed_attention = parse_prompt_attention(text);
} }
parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
{ {
std::stringstream ss; std::stringstream ss;
ss << "["; ss << "[";
@ -1605,12 +1684,12 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
for (const auto& item : parsed_attention) { for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first; const std::string& curr_text = item.first;
float curr_weight = item.second; float curr_weight = item.second;
std::vector<int> curr_tokens = tokenizer.tokenize(curr_text, nullptr); std::vector<int> curr_tokens = tokenizer->tokenize(curr_text, nullptr);
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
weights.insert(weights.end(), curr_tokens.size(), curr_weight); weights.insert(weights.end(), curr_tokens.size(), curr_weight);
} }
tokenizer.pad_tokens(tokens, weights, max_length, padding); tokenizer->pad_tokens(tokens, weights, max_length, padding);
// for (int i = 0; i < tokens.size(); i++) { // for (int i = 0; i < tokens.size(); i++) {
// std::cout << tokens[i] << ":" << weights[i] << ", " << i << std::endl; // std::cout << tokens[i] << ":" << weights[i] << ", " << i << std::endl;
@ -1625,9 +1704,11 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
const ConditionerParams& conditioner_params) override { const ConditionerParams& conditioner_params) override {
std::string prompt; std::string prompt;
std::vector<std::pair<int, ggml_tensor*>> image_embeds; std::vector<std::pair<int, ggml_tensor*>> image_embeds;
size_t system_prompt_length = 0; std::pair<int, int> prompt_attn_range;
int prompt_template_encode_start_idx = 34; int prompt_template_encode_start_idx = 34;
if (qwenvl->enable_vision && conditioner_params.ref_images.size() > 0) { int max_length = 0;
std::set<int> out_layers;
if (llm->enable_vision && conditioner_params.ref_images.size() > 0) {
LOG_INFO("QwenImageEditPlusPipeline"); LOG_INFO("QwenImageEditPlusPipeline");
prompt_template_encode_start_idx = 64; prompt_template_encode_start_idx = 64;
int image_embed_idx = 64 + 6; int image_embed_idx = 64 + 6;
@ -1639,7 +1720,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
for (int i = 0; i < conditioner_params.ref_images.size(); i++) { for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]); sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);
double factor = qwenvl->params.vision.patch_size * qwenvl->params.vision.spatial_merge_size; double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
int height = image.height; int height = image.height;
int width = image.width; int width = image.width;
int h_bar = static_cast<int>(std::round(height / factor)) * factor; int h_bar = static_cast<int>(std::round(height / factor)) * factor;
@ -1664,12 +1745,12 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
image.data = nullptr; image.data = nullptr;
ggml_tensor* image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1); ggml_tensor* image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
sd_image_f32_to_tensor(resized_image, image_tensor, false); sd_image_f32_to_ggml_tensor(resized_image, image_tensor, false);
free(resized_image.data); free(resized_image.data);
resized_image.data = nullptr; resized_image.data = nullptr;
ggml_tensor* image_embed = nullptr; ggml_tensor* image_embed = nullptr;
qwenvl->encode_image(n_threads, image_tensor, &image_embed, work_ctx); llm->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
image_embeds.emplace_back(image_embed_idx, image_embed); image_embeds.emplace_back(image_embed_idx, image_embed);
image_embed_idx += 1 + image_embed->ne[1] + 6; image_embed_idx += 1 + image_embed->ne[1] + 6;
@ -1683,17 +1764,70 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
} }
prompt = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n"; prompt = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
system_prompt_length = prompt.size();
prompt += img_prompt; prompt += img_prompt;
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text; prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n"; prompt += "<|im_end|>\n<|im_start|>assistant\n";
} else if (sd_version_is_flux2(version)) {
prompt_template_encode_start_idx = 0;
out_layers = {10, 20, 30};
prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "[/INST]";
} else if (sd_version_is_z_image(version)) {
prompt_template_encode_start_idx = 0;
out_layers = {35}; // -2
prompt = "<|im_start|>user\n";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n";
} else if (sd_version_is_flux2(version)) {
prompt_template_encode_start_idx = 0;
out_layers = {10, 20, 30};
prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
prompt_attn_range.first = prompt.size();
prompt += conditioner_params.text;
prompt_attn_range.second = prompt.size();
prompt += "[/INST]";
} else if (version == VERSION_OVIS_IMAGE) {
prompt_template_encode_start_idx = 28;
max_length = prompt_template_encode_start_idx + 256;
prompt = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background:";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += " " + conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
} else { } else {
prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n" + conditioner_params.text + "<|im_end|>\n<|im_start|>assistant\n"; prompt_template_encode_start_idx = 34;
prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n";
} }
auto tokens_and_weights = tokenize(prompt, 0, system_prompt_length, false); auto tokens_and_weights = tokenize(prompt, prompt_attn_range, max_length, max_length > 0);
auto& tokens = std::get<0>(tokens_and_weights); auto& tokens = std::get<0>(tokens_and_weights);
auto& weights = std::get<1>(tokens_and_weights); auto& weights = std::get<1>(tokens_and_weights);
@ -1702,40 +1836,58 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
qwenvl->compute(n_threads, llm->compute(n_threads,
input_ids, input_ids,
image_embeds, image_embeds,
&hidden_states, out_layers,
work_ctx); &hidden_states,
work_ctx);
{ {
auto tensor = hidden_states; auto tensor = hidden_states;
float original_mean = ggml_tensor_mean(tensor); float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) { for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) { for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
float value = ggml_tensor_get_f32(tensor, i0, i1, i2); float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= weights[i1]; value *= weights[i1];
ggml_tensor_set_f32(tensor, value, i0, i1, i2); ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
} }
} }
} }
float new_mean = ggml_tensor_mean(tensor); float new_mean = ggml_ext_tensor_mean(tensor);
ggml_tensor_scale(tensor, (original_mean / new_mean)); ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
} }
GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx); GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx);
int64_t min_length = 0;
if (sd_version_is_flux2(version)) {
min_length = 512;
}
int64_t zero_pad_len = 0;
if (min_length > 0) {
if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) {
zero_pad_len = min_length - hidden_states->ne[1] + prompt_template_encode_start_idx;
}
}
ggml_tensor* new_hidden_states = ggml_new_tensor_3d(work_ctx, ggml_tensor* new_hidden_states = ggml_new_tensor_3d(work_ctx,
GGML_TYPE_F32, GGML_TYPE_F32,
hidden_states->ne[0], hidden_states->ne[0],
hidden_states->ne[1] - prompt_template_encode_start_idx, hidden_states->ne[1] - prompt_template_encode_start_idx + zero_pad_len,
hidden_states->ne[2]); hidden_states->ne[2]);
ggml_tensor_iter(new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { ggml_ext_tensor_iter(new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
float value = ggml_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3); float value = 0.f;
ggml_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3); if (i1 + prompt_template_encode_start_idx < hidden_states->ne[1]) {
value = ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3);
}
ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
}); });
// print_ggml_tensor(new_hidden_states);
int64_t t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
return {new_hidden_states, nullptr, nullptr}; return {new_hidden_states, nullptr, nullptr};

View File

@ -27,6 +27,7 @@ protected:
int num_heads = 8; int num_heads = 8;
int num_head_channels = -1; // channels // num_heads int num_head_channels = -1; // channels // num_heads
int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
bool use_linear_projection = false;
public: public:
int model_channels = 320; int model_channels = 320;
@ -82,7 +83,7 @@ public:
int64_t d_head, int64_t d_head,
int64_t depth, int64_t depth,
int64_t context_dim) -> SpatialTransformer* { int64_t context_dim) -> SpatialTransformer* {
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim); return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
}; };
auto make_zero_conv = [&](int64_t channels) { auto make_zero_conv = [&](int64_t channels) {
@ -165,7 +166,7 @@ public:
} }
struct ggml_tensor* resblock_forward(std::string name, struct ggml_tensor* resblock_forward(std::string name,
struct ggml_context* ctx, GGMLRunnerContext* ctx,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* emb) { struct ggml_tensor* emb) {
auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]); auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
@ -173,15 +174,14 @@ public:
} }
struct ggml_tensor* attention_layer_forward(std::string name, struct ggml_tensor* attention_layer_forward(std::string name,
struct ggml_context* ctx, GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* context) { struct ggml_tensor* context) {
auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]); auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
return block->forward(ctx, backend, x, context); return block->forward(ctx, x, context);
} }
struct ggml_tensor* input_hint_block_forward(struct ggml_context* ctx, struct ggml_tensor* input_hint_block_forward(GGMLRunnerContext* ctx,
struct ggml_tensor* hint, struct ggml_tensor* hint,
struct ggml_tensor* emb, struct ggml_tensor* emb,
struct ggml_tensor* context) { struct ggml_tensor* context) {
@ -193,14 +193,13 @@ public:
h = block->forward(ctx, h); h = block->forward(ctx, h);
} else { } else {
h = ggml_silu_inplace(ctx, h); h = ggml_silu_inplace(ctx->ggml_ctx, h);
} }
} }
return h; return h;
} }
std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx, std::vector<struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* hint, struct ggml_tensor* hint,
struct ggml_tensor* guided_hint, struct ggml_tensor* guided_hint,
@ -213,13 +212,13 @@ public:
// y: [N, adm_in_channels] or [1, adm_in_channels] // y: [N, adm_in_channels] or [1, adm_in_channels]
if (context != nullptr) { if (context != nullptr) {
if (context->ne[2] != x->ne[3]) { if (context->ne[2] != x->ne[3]) {
context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3])); context = ggml_repeat(ctx->ggml_ctx, context, ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
} }
} }
if (y != nullptr) { if (y != nullptr) {
if (y->ne[1] != x->ne[3]) { if (y->ne[1] != x->ne[3]) {
y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3])); y = ggml_repeat(ctx->ggml_ctx, y, ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
} }
} }
@ -230,10 +229,10 @@ public:
auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]); auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);
auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels); // [N, model_channels] auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, model_channels); // [N, model_channels]
auto emb = time_embed_0->forward(ctx, t_emb); auto emb = time_embed_0->forward(ctx, t_emb);
emb = ggml_silu_inplace(ctx, emb); emb = ggml_silu_inplace(ctx->ggml_ctx, emb);
emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim] emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim]
// SDXL/SVD // SDXL/SVD
@ -242,10 +241,10 @@ public:
auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]); auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);
auto label_emb = label_embed_0->forward(ctx, y); auto label_emb = label_embed_0->forward(ctx, y);
label_emb = ggml_silu_inplace(ctx, label_emb); label_emb = ggml_silu_inplace(ctx->ggml_ctx, label_emb);
label_emb = label_embed_2->forward(ctx, label_emb); // [N, time_embed_dim] label_emb = label_embed_2->forward(ctx, label_emb); // [N, time_embed_dim]
emb = ggml_add(ctx, emb, label_emb); // [N, time_embed_dim] emb = ggml_add(ctx->ggml_ctx, emb, label_emb); // [N, time_embed_dim]
} }
std::vector<struct ggml_tensor*> outs; std::vector<struct ggml_tensor*> outs;
@ -259,7 +258,7 @@ public:
// input block 0 // input block 0
auto h = input_blocks_0_0->forward(ctx, x); auto h = input_blocks_0_0->forward(ctx, x);
h = ggml_add(ctx, h, guided_hint); h = ggml_add(ctx->ggml_ctx, h, guided_hint);
outs.push_back(zero_convs_0->forward(ctx, h)); outs.push_back(zero_convs_0->forward(ctx, h));
// input block 1-11 // input block 1-11
@ -274,7 +273,7 @@ public:
h = resblock_forward(name, ctx, h, emb); // [N, mult*model_channels, h, w] h = resblock_forward(name, ctx, h, emb); // [N, mult*model_channels, h, w]
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) { if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1"; std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
h = attention_layer_forward(name, ctx, backend, h, context); // [N, mult*model_channels, h, w] h = attention_layer_forward(name, ctx, h, context); // [N, mult*model_channels, h, w]
} }
auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]); auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
@ -298,9 +297,9 @@ public:
// [N, 4*model_channels, h/8, w/8] // [N, 4*model_channels, h/8, w/8]
// middle_block // middle_block
h = resblock_forward("middle_block.0", ctx, h, emb); // [N, 4*model_channels, h/8, w/8] h = resblock_forward("middle_block.0", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
h = attention_layer_forward("middle_block.1", ctx, backend, h, context); // [N, 4*model_channels, h/8, w/8] h = attention_layer_forward("middle_block.1", ctx, h, context); // [N, 4*model_channels, h/8, w/8]
h = resblock_forward("middle_block.2", ctx, h, emb); // [N, 4*model_channels, h/8, w/8] h = resblock_forward("middle_block.2", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
// out // out
outs.push_back(middle_block_out->forward(ctx, h)); outs.push_back(middle_block_out->forward(ctx, h));
@ -320,21 +319,10 @@ struct ControlNet : public GGMLRunner {
ControlNet(ggml_backend_t backend, ControlNet(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}, const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_SD1) SDVersion version = VERSION_SD1)
: GGMLRunner(backend, offload_params_to_cpu), control_net(version) { : GGMLRunner(backend, offload_params_to_cpu), control_net(version) {
control_net.init(params_ctx, tensor_types, ""); control_net.init(params_ctx, tensor_storage_map, "");
}
void enable_conv2d_direct() {
std::vector<GGMLBlock*> blocks;
control_net.get_all_blocks(blocks);
for (auto block : blocks) {
if (block->get_desc() == "Conv2d") {
auto conv_block = (Conv2d*)block;
conv_block->enable_direct();
}
}
} }
~ControlNet() override { ~ControlNet() override {
@ -392,7 +380,7 @@ struct ControlNet : public GGMLRunner {
struct ggml_tensor* timesteps, struct ggml_tensor* timesteps,
struct ggml_tensor* context, struct ggml_tensor* context,
struct ggml_tensor* y = nullptr) { struct ggml_tensor* y = nullptr) {
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false); struct ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE);
x = to_backend(x); x = to_backend(x);
if (guided_hint_cached) { if (guided_hint_cached) {
@ -404,8 +392,9 @@ struct ControlNet : public GGMLRunner {
y = to_backend(y); y = to_backend(y);
timesteps = to_backend(timesteps); timesteps = to_backend(timesteps);
auto outs = control_net.forward(compute_ctx, auto runner_ctx = get_context();
runtime_backend,
auto outs = control_net.forward(&runner_ctx,
x, x,
hint, hint,
guided_hint_cached ? guided_hint : nullptr, guided_hint_cached ? guided_hint : nullptr,
@ -425,7 +414,7 @@ struct ControlNet : public GGMLRunner {
return gf; return gf;
} }
void compute(int n_threads, bool compute(int n_threads,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* hint, struct ggml_tensor* hint,
struct ggml_tensor* timesteps, struct ggml_tensor* timesteps,
@ -441,8 +430,12 @@ struct ControlNet : public GGMLRunner {
return build_graph(x, hint, timesteps, context, y); return build_graph(x, hint, timesteps, context, y);
}; };
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); bool res = GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
guided_hint_cached = true; if (res) {
// cache guided_hint
guided_hint_cached = true;
}
return res;
} }
bool load_from_file(const std::string& file_path, int n_threads) { bool load_from_file(const std::string& file_path, int n_threads) {
@ -453,7 +446,7 @@ struct ControlNet : public GGMLRunner {
std::set<std::string> ignore_tensors; std::set<std::string> ignore_tensors;
ModelLoader model_loader; ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) { if (!model_loader.init_from_file_and_convert_name(file_path)) {
LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str()); LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
return false; return false;
} }

View File

@ -11,14 +11,13 @@
#define TIMESTEPS 1000 #define TIMESTEPS 1000
#define FLUX_TIMESTEPS 1000 #define FLUX_TIMESTEPS 1000
struct SigmaSchedule { struct SigmaScheduler {
int version = 0;
typedef std::function<float(float)> t_to_sigma_t; typedef std::function<float(float)> t_to_sigma_t;
virtual std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) = 0; virtual std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) = 0;
}; };
struct DiscreteSchedule : SigmaSchedule { struct DiscreteScheduler : SigmaScheduler {
std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
std::vector<float> result; std::vector<float> result;
@ -42,7 +41,7 @@ struct DiscreteSchedule : SigmaSchedule {
} }
}; };
struct ExponentialSchedule : SigmaSchedule { struct ExponentialScheduler : SigmaScheduler {
std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
std::vector<float> sigmas; std::vector<float> sigmas;
@ -149,7 +148,10 @@ std::vector<float> log_linear_interpolation(std::vector<float> sigma_in,
/* /*
https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/howto.html https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/howto.html
*/ */
struct AYSSchedule : SigmaSchedule { struct AYSScheduler : SigmaScheduler {
SDVersion version;
explicit AYSScheduler(SDVersion version)
: version(version) {}
std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
const std::vector<float> noise_levels[] = { const std::vector<float> noise_levels[] = {
/* SD1.5 */ /* SD1.5 */
@ -169,19 +171,19 @@ struct AYSSchedule : SigmaSchedule {
std::vector<float> results(n + 1); std::vector<float> results(n + 1);
if (sd_version_is_sd2((SDVersion)version)) { if (sd_version_is_sd2((SDVersion)version)) {
LOG_WARN("AYS not designed for SD2.X models"); LOG_WARN("AYS_SCHEDULER not designed for SD2.X models");
} /* fallthrough */ } /* fallthrough */
else if (sd_version_is_sd1((SDVersion)version)) { else if (sd_version_is_sd1((SDVersion)version)) {
LOG_INFO("AYS using SD1.5 noise levels"); LOG_INFO("AYS_SCHEDULER using SD1.5 noise levels");
inputs = noise_levels[0]; inputs = noise_levels[0];
} else if (sd_version_is_sdxl((SDVersion)version)) { } else if (sd_version_is_sdxl((SDVersion)version)) {
LOG_INFO("AYS using SDXL noise levels"); LOG_INFO("AYS_SCHEDULER using SDXL noise levels");
inputs = noise_levels[1]; inputs = noise_levels[1];
} else if (version == VERSION_SVD) { } else if (version == VERSION_SVD) {
LOG_INFO("AYS using SVD noise levels"); LOG_INFO("AYS_SCHEDULER using SVD noise levels");
inputs = noise_levels[2]; inputs = noise_levels[2];
} else { } else {
LOG_ERROR("Version not compatible with AYS scheduler"); LOG_ERROR("Version not compatible with AYS_SCHEDULER scheduler");
return results; return results;
} }
@ -203,7 +205,7 @@ struct AYSSchedule : SigmaSchedule {
/* /*
* GITS Scheduler: https://github.com/zju-pi/diff-sampler/tree/main/gits-main * GITS Scheduler: https://github.com/zju-pi/diff-sampler/tree/main/gits-main
*/ */
struct GITSSchedule : SigmaSchedule { struct GITSScheduler : SigmaScheduler {
std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
if (sigma_max <= 0.0f) { if (sigma_max <= 0.0f) {
return std::vector<float>{}; return std::vector<float>{};
@ -232,7 +234,7 @@ struct GITSSchedule : SigmaSchedule {
} }
}; };
struct SGMUniformSchedule : SigmaSchedule { struct SGMUniformScheduler : SigmaScheduler {
std::vector<float> get_sigmas(uint32_t n, float sigma_min_in, float sigma_max_in, t_to_sigma_t t_to_sigma_func) override { std::vector<float> get_sigmas(uint32_t n, float sigma_min_in, float sigma_max_in, t_to_sigma_t t_to_sigma_func) override {
std::vector<float> result; std::vector<float> result;
if (n == 0) { if (n == 0) {
@ -251,7 +253,24 @@ struct SGMUniformSchedule : SigmaSchedule {
} }
}; };
struct KarrasSchedule : SigmaSchedule { struct LCMScheduler : SigmaScheduler {
std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
std::vector<float> result;
result.reserve(n + 1);
const int original_steps = 50;
const int k = TIMESTEPS / original_steps;
for (int i = 0; i < n; i++) {
// the rounding ensures we match the training schedule of the LCM model
int index = (i * original_steps) / n;
int timestep = (original_steps - index) * k - 1;
result.push_back(t_to_sigma(timestep));
}
result.push_back(0.0f);
return result;
}
};
struct KarrasScheduler : SigmaScheduler {
std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
// These *COULD* be function arguments here, // These *COULD* be function arguments here,
// but does anybody ever bother to touch them? // but does anybody ever bother to touch them?
@ -270,7 +289,7 @@ struct KarrasSchedule : SigmaSchedule {
} }
}; };
struct SimpleSchedule : SigmaSchedule { struct SimpleScheduler : SigmaScheduler {
std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
std::vector<float> result_sigmas; std::vector<float> result_sigmas;
@ -299,8 +318,8 @@ struct SimpleSchedule : SigmaSchedule {
} }
}; };
// Close to Beta Schedule, but increadably simple in code. // Close to Beta Scheduler, but increadably simple in code.
struct SmoothStepSchedule : SigmaSchedule { struct SmoothStepScheduler : SigmaScheduler {
static constexpr float smoothstep(float x) { static constexpr float smoothstep(float x) {
return x * x * (3.0f - 2.0f * x); return x * x * (3.0f - 2.0f * x);
} }
@ -329,7 +348,6 @@ struct SmoothStepSchedule : SigmaSchedule {
}; };
struct Denoiser { struct Denoiser {
std::shared_ptr<SigmaSchedule> scheduler = std::make_shared<DiscreteSchedule>();
virtual float sigma_min() = 0; virtual float sigma_min() = 0;
virtual float sigma_max() = 0; virtual float sigma_max() = 0;
virtual float sigma_to_t(float sigma) = 0; virtual float sigma_to_t(float sigma) = 0;
@ -338,8 +356,51 @@ struct Denoiser {
virtual ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) = 0; virtual ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) = 0;
virtual ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) = 0; virtual ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) = 0;
virtual std::vector<float> get_sigmas(uint32_t n) { virtual std::vector<float> get_sigmas(uint32_t n, int /*image_seq_len*/, scheduler_t scheduler_type, SDVersion version) {
auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1); auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1);
std::shared_ptr<SigmaScheduler> scheduler;
switch (scheduler_type) {
case DISCRETE_SCHEDULER:
LOG_INFO("get_sigmas with discrete scheduler");
scheduler = std::make_shared<DiscreteScheduler>();
break;
case KARRAS_SCHEDULER:
LOG_INFO("get_sigmas with Karras scheduler");
scheduler = std::make_shared<KarrasScheduler>();
break;
case EXPONENTIAL_SCHEDULER:
LOG_INFO("get_sigmas exponential scheduler");
scheduler = std::make_shared<ExponentialScheduler>();
break;
case AYS_SCHEDULER:
LOG_INFO("get_sigmas with Align-Your-Steps scheduler");
scheduler = std::make_shared<AYSScheduler>(version);
break;
case GITS_SCHEDULER:
LOG_INFO("get_sigmas with GITS scheduler");
scheduler = std::make_shared<GITSScheduler>();
break;
case SGM_UNIFORM_SCHEDULER:
LOG_INFO("get_sigmas with SGM Uniform scheduler");
scheduler = std::make_shared<SGMUniformScheduler>();
break;
case SIMPLE_SCHEDULER:
LOG_INFO("get_sigmas with Simple scheduler");
scheduler = std::make_shared<SimpleScheduler>();
break;
case SMOOTHSTEP_SCHEDULER:
LOG_INFO("get_sigmas with SmoothStep scheduler");
scheduler = std::make_shared<SmoothStepScheduler>();
break;
case LCM_SCHEDULER:
LOG_INFO("get_sigmas with LCM scheduler");
scheduler = std::make_shared<LCMScheduler>();
break;
default:
LOG_INFO("get_sigmas with discrete scheduler (default)");
scheduler = std::make_shared<DiscreteScheduler>();
break;
}
return scheduler->get_sigmas(n, sigma_min(), sigma_max(), bound_t_to_sigma); return scheduler->get_sigmas(n, sigma_min(), sigma_max(), bound_t_to_sigma);
} }
}; };
@ -401,8 +462,8 @@ struct CompVisDenoiser : public Denoiser {
// this function will modify noise/latent // this function will modify noise/latent
ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override { ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override {
ggml_tensor_scale(noise, sigma); ggml_ext_tensor_scale_inplace(noise, sigma);
ggml_tensor_add(latent, noise); ggml_ext_tensor_add_inplace(latent, noise);
return latent; return latent;
} }
@ -426,7 +487,6 @@ struct EDMVDenoiser : public CompVisVDenoiser {
EDMVDenoiser(float min_sigma = 0.002, float max_sigma = 120.0) EDMVDenoiser(float min_sigma = 0.002, float max_sigma = 120.0)
: min_sigma(min_sigma), max_sigma(max_sigma) { : min_sigma(min_sigma), max_sigma(max_sigma) {
scheduler = std::make_shared<ExponentialSchedule>();
} }
float t_to_sigma(float t) override { float t_to_sigma(float t) override {
@ -496,14 +556,14 @@ struct DiscreteFlowDenoiser : public Denoiser {
// this function will modify noise/latent // this function will modify noise/latent
ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override { ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override {
ggml_tensor_scale(noise, sigma); ggml_ext_tensor_scale_inplace(noise, sigma);
ggml_tensor_scale(latent, 1.0f - sigma); ggml_ext_tensor_scale_inplace(latent, 1.0f - sigma);
ggml_tensor_add(latent, noise); ggml_ext_tensor_add_inplace(latent, noise);
return latent; return latent;
} }
ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override { ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override {
ggml_tensor_scale(latent, 1.0f / (1.0f - sigma)); ggml_ext_tensor_scale_inplace(latent, 1.0f / (1.0f - sigma));
return latent; return latent;
} }
}; };
@ -522,10 +582,14 @@ struct FluxFlowDenoiser : public Denoiser {
set_parameters(shift); set_parameters(shift);
} }
void set_parameters(float shift = 1.15f) { void set_shift(float shift) {
this->shift = shift; this->shift = shift;
for (int i = 1; i < TIMESTEPS + 1; i++) { }
sigmas[i - 1] = t_to_sigma(i / TIMESTEPS * TIMESTEPS);
void set_parameters(float shift) {
set_shift(shift);
for (int i = 0; i < TIMESTEPS; i++) {
sigmas[i] = t_to_sigma(i);
} }
} }
@ -555,22 +619,54 @@ struct FluxFlowDenoiser : public Denoiser {
// this function will modify noise/latent // this function will modify noise/latent
ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override { ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override {
ggml_tensor_scale(noise, sigma); ggml_ext_tensor_scale_inplace(noise, sigma);
ggml_tensor_scale(latent, 1.0f - sigma); ggml_ext_tensor_scale_inplace(latent, 1.0f - sigma);
ggml_tensor_add(latent, noise); ggml_ext_tensor_add_inplace(latent, noise);
return latent; return latent;
} }
ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override { ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override {
ggml_tensor_scale(latent, 1.0f / (1.0f - sigma)); ggml_ext_tensor_scale_inplace(latent, 1.0f / (1.0f - sigma));
return latent; return latent;
} }
}; };
struct Flux2FlowDenoiser : public FluxFlowDenoiser {
Flux2FlowDenoiser() = default;
float compute_empirical_mu(uint32_t n, int image_seq_len) {
const float a1 = 8.73809524e-05f;
const float b1 = 1.89833333f;
const float a2 = 0.00016927f;
const float b2 = 0.45666666f;
if (image_seq_len > 4300) {
float mu = a2 * image_seq_len + b2;
return mu;
}
float m_200 = a2 * image_seq_len + b2;
float m_10 = a1 * image_seq_len + b1;
float a = (m_200 - m_10) / 190.0f;
float b = m_200 - 200.0f * a;
float mu = a * n + b;
return mu;
}
std::vector<float> get_sigmas(uint32_t n, int image_seq_len, scheduler_t scheduler_type, SDVersion version) override {
float mu = compute_empirical_mu(n, image_seq_len);
LOG_DEBUG("Flux2FlowDenoiser: set shift to %.3f", mu);
set_shift(mu);
return Denoiser::get_sigmas(n, image_seq_len, scheduler_type, version);
}
};
typedef std::function<ggml_tensor*(ggml_tensor*, float, int)> denoise_cb_t; typedef std::function<ggml_tensor*(ggml_tensor*, float, int)> denoise_cb_t;
// k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t // k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t
static void sample_k_diffusion(sample_method_t method, static bool sample_k_diffusion(sample_method_t method,
denoise_cb_t model, denoise_cb_t model,
ggml_context* work_ctx, ggml_context* work_ctx,
ggml_tensor* x, ggml_tensor* x,
@ -580,7 +676,7 @@ static void sample_k_diffusion(sample_method_t method,
size_t steps = sigmas.size() - 1; size_t steps = sigmas.size() - 1;
// sample_euler_ancestral // sample_euler_ancestral
switch (method) { switch (method) {
case EULER_A: { case EULER_A_SAMPLE_METHOD: {
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
@ -589,6 +685,9 @@ static void sample_k_diffusion(sample_method_t method,
// denoise // denoise
ggml_tensor* denoised = model(x, sigma, i + 1); ggml_tensor* denoised = model(x, sigma, i + 1);
if (denoised == nullptr) {
return false;
}
// d = (x - denoised) / sigma // d = (x - denoised) / sigma
{ {
@ -620,7 +719,7 @@ static void sample_k_diffusion(sample_method_t method,
if (sigmas[i + 1] > 0) { if (sigmas[i + 1] > 0) {
// x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up // x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
ggml_tensor_set_f32_randn(noise, rng); ggml_ext_im_set_randn_f32(noise, rng);
// noise = load_tensor_from_file(work_ctx, "./rand" + std::to_string(i+1) + ".bin"); // noise = load_tensor_from_file(work_ctx, "./rand" + std::to_string(i+1) + ".bin");
{ {
float* vec_x = (float*)x->data; float* vec_x = (float*)x->data;
@ -633,7 +732,7 @@ static void sample_k_diffusion(sample_method_t method,
} }
} }
} break; } break;
case EULER: // Implemented without any sigma churn case EULER_SAMPLE_METHOD: // Implemented without any sigma churn
{ {
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
@ -642,6 +741,9 @@ static void sample_k_diffusion(sample_method_t method,
// denoise // denoise
ggml_tensor* denoised = model(x, sigma, i + 1); ggml_tensor* denoised = model(x, sigma, i + 1);
if (denoised == nullptr) {
return false;
}
// d = (x - denoised) / sigma // d = (x - denoised) / sigma
{ {
@ -666,13 +768,16 @@ static void sample_k_diffusion(sample_method_t method,
} }
} }
} break; } break;
case HEUN: { case HEUN_SAMPLE_METHOD: {
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
// denoise // denoise
ggml_tensor* denoised = model(x, sigmas[i], -(i + 1)); ggml_tensor* denoised = model(x, sigmas[i], -(i + 1));
if (denoised == nullptr) {
return false;
}
// d = (x - denoised) / sigma // d = (x - denoised) / sigma
{ {
@ -707,7 +812,10 @@ static void sample_k_diffusion(sample_method_t method,
} }
ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1); ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1);
float* vec_denoised = (float*)denoised->data; if (denoised == nullptr) {
return false;
}
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(x); j++) { for (int j = 0; j < ggml_nelements(x); j++) {
float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1]; float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1];
vec_d[j] = (vec_d[j] + d2) / 2; vec_d[j] = (vec_d[j] + d2) / 2;
@ -716,13 +824,16 @@ static void sample_k_diffusion(sample_method_t method,
} }
} }
} break; } break;
case DPM2: { case DPM2_SAMPLE_METHOD: {
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
// denoise // denoise
ggml_tensor* denoised = model(x, sigmas[i], i + 1); ggml_tensor* denoised = model(x, sigmas[i], i + 1);
if (denoised == nullptr) {
return false;
}
// d = (x - denoised) / sigma // d = (x - denoised) / sigma
{ {
@ -759,7 +870,10 @@ static void sample_k_diffusion(sample_method_t method,
} }
ggml_tensor* denoised = model(x2, sigma_mid, i + 1); ggml_tensor* denoised = model(x2, sigma_mid, i + 1);
float* vec_denoised = (float*)denoised->data; if (denoised == nullptr) {
return false;
}
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(x); j++) { for (int j = 0; j < ggml_nelements(x); j++) {
float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid; float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid;
vec_x[j] = vec_x[j] + d2 * dt_2; vec_x[j] = vec_x[j] + d2 * dt_2;
@ -768,13 +882,16 @@ static void sample_k_diffusion(sample_method_t method,
} }
} break; } break;
case DPMPP2S_A: { case DPMPP2S_A_SAMPLE_METHOD: {
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
// denoise // denoise
ggml_tensor* denoised = model(x, sigmas[i], i + 1); ggml_tensor* denoised = model(x, sigmas[i], i + 1);
if (denoised == nullptr) {
return false;
}
// get_ancestral_step // get_ancestral_step
float sigma_up = std::min(sigmas[i + 1], float sigma_up = std::min(sigmas[i + 1],
@ -811,6 +928,9 @@ static void sample_k_diffusion(sample_method_t method,
} }
ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1); ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1);
if (denoised == nullptr) {
return false;
}
// Second half-step // Second half-step
for (int j = 0; j < ggml_nelements(x); j++) { for (int j = 0; j < ggml_nelements(x); j++) {
@ -820,7 +940,7 @@ static void sample_k_diffusion(sample_method_t method,
// Noise addition // Noise addition
if (sigmas[i + 1] > 0) { if (sigmas[i + 1] > 0) {
ggml_tensor_set_f32_randn(noise, rng); ggml_ext_im_set_randn_f32(noise, rng);
{ {
float* vec_x = (float*)x->data; float* vec_x = (float*)x->data;
float* vec_noise = (float*)noise->data; float* vec_noise = (float*)noise->data;
@ -832,7 +952,7 @@ static void sample_k_diffusion(sample_method_t method,
} }
} }
} break; } break;
case DPMPP2M: // DPM++ (2M) from Karras et al (2022) case DPMPP2M_SAMPLE_METHOD: // DPM++ (2M) from Karras et al (2022)
{ {
struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);
@ -841,6 +961,9 @@ static void sample_k_diffusion(sample_method_t method,
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
// denoise // denoise
ggml_tensor* denoised = model(x, sigmas[i], i + 1); ggml_tensor* denoised = model(x, sigmas[i], i + 1);
if (denoised == nullptr) {
return false;
}
float t = t_fn(sigmas[i]); float t = t_fn(sigmas[i]);
float t_next = t_fn(sigmas[i + 1]); float t_next = t_fn(sigmas[i + 1]);
@ -871,7 +994,7 @@ static void sample_k_diffusion(sample_method_t method,
} }
} }
} break; } break;
case DPMPP2Mv2: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457 case DPMPP2Mv2_SAMPLE_METHOD: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457
{ {
struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);
@ -880,6 +1003,9 @@ static void sample_k_diffusion(sample_method_t method,
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
// denoise // denoise
ggml_tensor* denoised = model(x, sigmas[i], i + 1); ggml_tensor* denoised = model(x, sigmas[i], i + 1);
if (denoised == nullptr) {
return false;
}
float t = t_fn(sigmas[i]); float t = t_fn(sigmas[i]);
float t_next = t_fn(sigmas[i + 1]); float t_next = t_fn(sigmas[i + 1]);
@ -914,7 +1040,7 @@ static void sample_k_diffusion(sample_method_t method,
} }
} }
} break; } break;
case IPNDM: // iPNDM sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main case IPNDM_SAMPLE_METHOD: // iPNDM sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
{ {
int max_order = 4; int max_order = 4;
ggml_tensor* x_next = x; ggml_tensor* x_next = x;
@ -930,7 +1056,10 @@ static void sample_k_diffusion(sample_method_t method,
// Denoising step // Denoising step
ggml_tensor* denoised = model(x_cur, sigma, i + 1); ggml_tensor* denoised = model(x_cur, sigma, i + 1);
float* vec_denoised = (float*)denoised->data; if (denoised == nullptr) {
return false;
}
float* vec_denoised = (float*)denoised->data;
// d_cur = (x_cur - denoised) / sigma // d_cur = (x_cur - denoised) / sigma
struct ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x_cur); struct ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x_cur);
float* vec_d_cur = (float*)d_cur->data; float* vec_d_cur = (float*)d_cur->data;
@ -989,7 +1118,7 @@ static void sample_k_diffusion(sample_method_t method,
} }
} }
} break; } break;
case IPNDM_V: // iPNDM_v sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main case IPNDM_V_SAMPLE_METHOD: // iPNDM_v sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
{ {
int max_order = 4; int max_order = 4;
std::vector<ggml_tensor*> buffer_model; std::vector<ggml_tensor*> buffer_model;
@ -1063,7 +1192,7 @@ static void sample_k_diffusion(sample_method_t method,
d_cur = ggml_dup_tensor(work_ctx, x_next); d_cur = ggml_dup_tensor(work_ctx, x_next);
} }
} break; } break;
case LCM: // Latent Consistency Models case LCM_SAMPLE_METHOD: // Latent Consistency Models
{ {
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
@ -1073,6 +1202,9 @@ static void sample_k_diffusion(sample_method_t method,
// denoise // denoise
ggml_tensor* denoised = model(x, sigma, i + 1); ggml_tensor* denoised = model(x, sigma, i + 1);
if (denoised == nullptr) {
return false;
}
// x = denoised // x = denoised
{ {
@ -1085,7 +1217,7 @@ static void sample_k_diffusion(sample_method_t method,
if (sigmas[i + 1] > 0) { if (sigmas[i + 1] > 0) {
// x += sigmas[i + 1] * noise_sampler(sigmas[i], sigmas[i + 1]) // x += sigmas[i + 1] * noise_sampler(sigmas[i], sigmas[i + 1])
ggml_tensor_set_f32_randn(noise, rng); ggml_ext_im_set_randn_f32(noise, rng);
// noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin"); // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin");
{ {
float* vec_x = (float*)x->data; float* vec_x = (float*)x->data;
@ -1098,8 +1230,8 @@ static void sample_k_diffusion(sample_method_t method,
} }
} }
} break; } break;
case DDIM_TRAILING: // Denoising Diffusion Implicit Models case DDIM_TRAILING_SAMPLE_METHOD: // Denoising Diffusion Implicit Models
// with the "trailing" timestep spacing // with the "trailing" timestep spacing
{ {
// See J. Song et al., "Denoising Diffusion Implicit // See J. Song et al., "Denoising Diffusion Implicit
// Models", arXiv:2010.02502 [cs.LG] // Models", arXiv:2010.02502 [cs.LG]
@ -1109,7 +1241,7 @@ static void sample_k_diffusion(sample_method_t method,
// end beta) (which unfortunately k-diffusion's data // end beta) (which unfortunately k-diffusion's data
// structure hides from the denoiser), and the sigmas are // structure hides from the denoiser), and the sigmas are
// also needed to invert the behavior of CompVisDenoiser // also needed to invert the behavior of CompVisDenoiser
// (k-diffusion's LMSDiscreteScheduler) // (k-diffusion's LMSDiscreteSchedulerr)
float beta_start = 0.00085f; float beta_start = 0.00085f;
float beta_end = 0.0120f; float beta_end = 0.0120f;
std::vector<double> alphas_cumprod; std::vector<double> alphas_cumprod;
@ -1137,7 +1269,7 @@ static void sample_k_diffusion(sample_method_t method,
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
// The "trailing" DDIM timestep, see S. Lin et al., // The "trailing" DDIM timestep, see S. Lin et al.,
// "Common Diffusion Noise Schedules and Sample Steps // "Common Diffusion Noise Schedulers and Sample Steps
// are Flawed", arXiv:2305.08891 [cs], p. 4, Table // are Flawed", arXiv:2305.08891 [cs], p. 4, Table
// 2. Most variables below follow Diffusers naming // 2. Most variables below follow Diffusers naming
// //
@ -1276,7 +1408,7 @@ static void sample_k_diffusion(sample_method_t method,
} }
} }
if (eta > 0) { if (eta > 0) {
ggml_tensor_set_f32_randn(variance_noise, rng); ggml_ext_im_set_randn_f32(variance_noise, rng);
float* vec_variance_noise = float* vec_variance_noise =
(float*)variance_noise->data; (float*)variance_noise->data;
float* vec_x = (float*)x->data; float* vec_x = (float*)x->data;
@ -1292,8 +1424,8 @@ static void sample_k_diffusion(sample_method_t method,
// factor c_in. // factor c_in.
} }
} break; } break;
case TCD: // Strategic Stochastic Sampling (Algorithm 4) in case TCD_SAMPLE_METHOD: // Strategic Stochastic Sampling (Algorithm 4) in
// Trajectory Consistency Distillation // Trajectory Consistency Distillation
{ {
// See J. Zheng et al., "Trajectory Consistency // See J. Zheng et al., "Trajectory Consistency
// Distillation: Improved Latent Consistency Distillation // Distillation: Improved Latent Consistency Distillation
@ -1444,7 +1576,7 @@ static void sample_k_diffusion(sample_method_t method,
if (eta > 0 && i != steps - 1) { if (eta > 0 && i != steps - 1) {
// In this case, x is still pred_noised_sample, // In this case, x is still pred_noised_sample,
// continue in-place // continue in-place
ggml_tensor_set_f32_randn(noise, rng); ggml_ext_im_set_randn_f32(noise, rng);
float* vec_x = (float*)x->data; float* vec_x = (float*)x->data;
float* vec_noise = (float*)noise->data; float* vec_noise = (float*)noise->data;
for (int j = 0; j < ggml_nelements(x); j++) { for (int j = 0; j < ggml_nelements(x); j++) {
@ -1465,8 +1597,9 @@ static void sample_k_diffusion(sample_method_t method,
default: default:
LOG_ERROR("Attempting to sample with nonexisting sample method %i", method); LOG_ERROR("Attempting to sample with nonexisting sample method %i", method);
abort(); return false;
} }
return true;
} }
#endif // __DENOISER_HPP__ #endif // __DENOISER_HPP__

View File

@ -6,6 +6,7 @@
#include "qwen_image.hpp" #include "qwen_image.hpp"
#include "unet.hpp" #include "unet.hpp"
#include "wan.hpp" #include "wan.hpp"
#include "z_image.hpp"
struct DiffusionParams { struct DiffusionParams {
struct ggml_tensor* x = nullptr; struct ggml_tensor* x = nullptr;
@ -26,7 +27,7 @@ struct DiffusionParams {
struct DiffusionModel { struct DiffusionModel {
virtual std::string get_desc() = 0; virtual std::string get_desc() = 0;
virtual void compute(int n_threads, virtual bool compute(int n_threads,
DiffusionParams diffusion_params, DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr, struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) = 0; struct ggml_context* output_ctx = nullptr) = 0;
@ -35,7 +36,9 @@ struct DiffusionModel {
virtual void free_compute_buffer() = 0; virtual void free_compute_buffer() = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0; virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
virtual size_t get_params_buffer_size() = 0; virtual size_t get_params_buffer_size() = 0;
virtual int64_t get_adm_in_channels() = 0; virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter){};
virtual int64_t get_adm_in_channels() = 0;
virtual void set_flash_attn_enabled(bool enabled) = 0;
}; };
struct UNetModel : public DiffusionModel { struct UNetModel : public DiffusionModel {
@ -43,10 +46,9 @@ struct UNetModel : public DiffusionModel {
UNetModel(ggml_backend_t backend, UNetModel(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}, const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_SD1, SDVersion version = VERSION_SD1)
bool flash_attn = false) : unet(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version) {
: unet(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) {
} }
std::string get_desc() override { std::string get_desc() override {
@ -73,11 +75,19 @@ struct UNetModel : public DiffusionModel {
return unet.get_params_buffer_size(); return unet.get_params_buffer_size();
} }
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
unet.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override { int64_t get_adm_in_channels() override {
return unet.unet.adm_in_channels; return unet.unet.adm_in_channels;
} }
void compute(int n_threads, void set_flash_attn_enabled(bool enabled) {
unet.set_flash_attention_enabled(enabled);
}
bool compute(int n_threads,
DiffusionParams diffusion_params, DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr, struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override { struct ggml_context* output_ctx = nullptr) override {
@ -98,9 +108,8 @@ struct MMDiTModel : public DiffusionModel {
MMDiTModel(ggml_backend_t backend, MMDiTModel(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
bool flash_attn = false, const String2TensorStorage& tensor_storage_map = {})
const String2GGMLType& tensor_types = {}) : mmdit(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model") {
: mmdit(backend, offload_params_to_cpu, flash_attn, tensor_types, "model.diffusion_model") {
} }
std::string get_desc() override { std::string get_desc() override {
@ -127,11 +136,19 @@ struct MMDiTModel : public DiffusionModel {
return mmdit.get_params_buffer_size(); return mmdit.get_params_buffer_size();
} }
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
mmdit.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override { int64_t get_adm_in_channels() override {
return 768 + 1280; return 768 + 1280;
} }
void compute(int n_threads, void set_flash_attn_enabled(bool enabled) {
mmdit.set_flash_attention_enabled(enabled);
}
bool compute(int n_threads,
DiffusionParams diffusion_params, DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr, struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override { struct ggml_context* output_ctx = nullptr) override {
@ -151,11 +168,10 @@ struct FluxModel : public DiffusionModel {
FluxModel(ggml_backend_t backend, FluxModel(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}, const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_FLUX, SDVersion version = VERSION_FLUX,
bool flash_attn = false, bool use_mask = false)
bool use_mask = false) : flux(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version, use_mask) {
: flux(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
} }
std::string get_desc() override { std::string get_desc() override {
@ -182,11 +198,19 @@ struct FluxModel : public DiffusionModel {
return flux.get_params_buffer_size(); return flux.get_params_buffer_size();
} }
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
flux.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override { int64_t get_adm_in_channels() override {
return 768; return 768;
} }
void compute(int n_threads, void set_flash_attn_enabled(bool enabled) {
flux.set_flash_attention_enabled(enabled);
}
bool compute(int n_threads,
DiffusionParams diffusion_params, DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr, struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override { struct ggml_context* output_ctx = nullptr) override {
@ -211,11 +235,10 @@ struct WanModel : public DiffusionModel {
WanModel(ggml_backend_t backend, WanModel(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}, const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model", const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_WAN2, SDVersion version = VERSION_WAN2)
bool flash_attn = false) : prefix(prefix), wan(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
: prefix(prefix), wan(backend, offload_params_to_cpu, tensor_types, prefix, version, flash_attn) {
} }
std::string get_desc() override { std::string get_desc() override {
@ -242,11 +265,19 @@ struct WanModel : public DiffusionModel {
return wan.get_params_buffer_size(); return wan.get_params_buffer_size();
} }
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
wan.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override { int64_t get_adm_in_channels() override {
return 768; return 768;
} }
void compute(int n_threads, void set_flash_attn_enabled(bool enabled) {
wan.set_flash_attention_enabled(enabled);
}
bool compute(int n_threads,
DiffusionParams diffusion_params, DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr, struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override { struct ggml_context* output_ctx = nullptr) override {
@ -270,11 +301,10 @@ struct QwenImageModel : public DiffusionModel {
QwenImageModel(ggml_backend_t backend, QwenImageModel(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}, const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model", const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_QWEN_IMAGE, SDVersion version = VERSION_QWEN_IMAGE)
bool flash_attn = false) : prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
: prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_types, prefix, version, flash_attn) {
} }
std::string get_desc() override { std::string get_desc() override {
@ -301,11 +331,19 @@ struct QwenImageModel : public DiffusionModel {
return qwen_image.get_params_buffer_size(); return qwen_image.get_params_buffer_size();
} }
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
qwen_image.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override { int64_t get_adm_in_channels() override {
return 768; return 768;
} }
void compute(int n_threads, void set_flash_attn_enabled(bool enabled) {
qwen_image.set_flash_attention_enabled(enabled);
}
bool compute(int n_threads,
DiffusionParams diffusion_params, DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr, struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override { struct ggml_context* output_ctx = nullptr) override {
@ -320,4 +358,67 @@ struct QwenImageModel : public DiffusionModel {
} }
}; };
struct ZImageModel : public DiffusionModel {
std::string prefix;
ZImage::ZImageRunner z_image;
ZImageModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_Z_IMAGE)
: prefix(prefix), z_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
}
std::string get_desc() override {
return z_image.get_desc();
}
void alloc_params_buffer() override {
z_image.alloc_params_buffer();
}
void free_params_buffer() override {
z_image.free_params_buffer();
}
void free_compute_buffer() override {
z_image.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
z_image.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return z_image.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
z_image.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attn_enabled(bool enabled) {
z_image.set_flash_attention_enabled(enabled);
}
bool compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override {
return z_image.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.ref_latents,
true, // increase_ref_index
output,
output_ctx);
}
};
#endif #endif

View File

@ -157,7 +157,7 @@ ninja
## Build with SYCL ## Build with SYCL
Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before start. More details and steps can refer to [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux). Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before start. More details and steps can refer to [llama.cpp SYCL backend](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/SYCL.md#linux).
```shell ```shell
# Export relevant ENV variables # Export relevant ENV variables

21
docs/chroma_radiance.md Normal file
View File

@ -0,0 +1,21 @@
# How to Use
## Download weights
- Download Chroma1-Radiance
- safetensors: https://huggingface.co/lodestones/Chroma1-Radiance/tree/main
- gguf: https://huggingface.co/silveroxides/Chroma1-Radiance-GGUF/tree/main
- Download t5xxl
- safetensors: https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Examples
```
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Chroma1-Radiance-v0.4-Q8_0.gguf --t5xxl ..\..\ComfyUI\models\clip\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma radiance cpp'" --cfg-scale 4.0 --sampling-method euler -v
```
<img alt="Chroma1-Radiance" src="../assets/flux/chroma1-radiance.png" />

View File

@ -1,40 +1,66 @@
# Running distilled models: SSD1B and SD1.x with tiny U-Nets # Running distilled models: SSD1B and SDx.x with tiny U-Nets
## Preface ## Preface
This kind of models have a reduced U-Net part. These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B U-Net contains only one middle block and fewer attention layers in its up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
Unlike other SDXL models the U-Net of SSD1B has only one middle block and lesser attention layers in up and down blocks, resulting in relatively smaller files. Running these models saves more than 33% of the time. For more details, refer to Segmind's paper on https://arxiv.org/abs/2401.02677v1 . Similarly, SD1.x- and SD2.x-style models with a tiny U-Net consist of only 6 U-Net blocks, leading to very small files and time savings of up to 50%. For more information, see the paper: https://arxiv.org/pdf/2305.15798.pdf.
Unlike other SD 1.x models Tiny-UNet models consist of only 6 U-Net blocks, resulting in relatively smaller files (approximately 1 GB). Running these models saves almost 50% of the time. For more details, refer to the paper: https://arxiv.org/pdf/2305.15798.pdf .
## SSD1B ## SSD1B
Unfortunately not all of this models follow the standard model parameter naming mapping. Note that not all of these models follow the standard parameter naming conventions. However, several useful SSD-1B models are available online, such as:
Anyway there are some very useful SSD1B models available online, such as:
* https://huggingface.co/segmind/SSD-1B/resolve/main/SSD-1B-A1111.safetensors * https://huggingface.co/segmind/SSD-1B/resolve/main/SSD-1B-A1111.safetensors
* https://huggingface.co/hassenhamdi/SSD-1B-fp8_e4m3fn/resolve/main/SSD-1B_fp8_e4m3fn.safetensors * https://huggingface.co/hassenhamdi/SSD-1B-fp8_e4m3fn/resolve/main/SSD-1B_fp8_e4m3fn.safetensors
Also there are useful LORAs available: Useful LoRAs are also available:
* https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors * https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors
* https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors * https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors
You can use this files **out-of-the-box** - unlike models in next section. These files can be used out-of-the-box, unlike the models described in the next section.
## SD1.x with tiny U-Nets ## SD1.x, SD2.x with tiny U-Nets
There are some Tiny SD 1.x models available online, such as: These models require conversion before use. You will need a Python script provided by the diffusers team, available on GitHub:
* https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py
### SD2.x
NotaAI provides the following model online:
* https://huggingface.co/nota-ai/bk-sdm-v2-tiny
Creating a .safetensors file involves two steps. First, run this short Python script to download the model from Hugging Face:
```python
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("nota-ai/bk-sdm-v2-tiny",cache_dir="./")
```
Second, create the .safetensors file by running:
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path models--nota-ai--bk-sdm-v2-tiny/snapshots/68277af553777858cd47e133f92e4db47321bc74 \
--checkpoint_path bk-sdm-v2-tiny.safetensors --half --use_safetensors
```
This will generate the **file bk-sdm-v2-tiny.safetensors**, which is now ready for use with sd.cpp.
### SD1.x
Several Tiny SD 1.x models are available online, such as:
* https://huggingface.co/segmind/tiny-sd * https://huggingface.co/segmind/tiny-sd
* https://huggingface.co/segmind/portrait-finetuned * https://huggingface.co/segmind/portrait-finetuned
* https://huggingface.co/nota-ai/bk-sdm-tiny * https://huggingface.co/nota-ai/bk-sdm-tiny
These models need some conversion, for example because partially tensors are **non contiguous** stored. To create a usable checkpoint file, follow these **easy** steps: These models also require conversion, partly because some tensors are stored in a non-contiguous manner. To create a usable checkpoint file, follow these simple steps:
Download and prepare the model using Python:
### Download model from Hugging Face ##### Download the model using Python on your computer, for example this way:
Download the model using Python on your computer, for example this way:
```python ```python
import torch import torch
@ -46,35 +72,22 @@ for param in unet.parameters():
pipe.save_pretrained("segmindtiny-sd", safe_serialization=True) pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
``` ```
### Convert that to a ckpt file ##### Run the conversion script:
To convert the downloaded model to a checkpoint file, you need another Python script. Download the conversion script from here:
* https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py
### Run convert script
Now, run that conversion script:
```bash ```bash
python convert_diffusers_to_original_stable_diffusion.py \ python convert_diffusers_to_original_stable_diffusion.py \
--model_path ./segmindtiny-sd \ --model_path ./segmindtiny-sd \
--checkpoint_path ./segmind_tiny-sd.ckpt --half --checkpoint_path ./segmind_tiny-sd.ckpt --half
``` ```
The file **segmind_tiny-sd.ckpt** will be generated and is now ready to use with sd.cpp The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
You can follow a similar process for other models mentioned above from Hugging Face.
### Another ckpt file on the net ### Another available .ckpt file:
There is another model file available online:
* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
If you want to use that, you have to adjust some **non-contiguous tensors** first: To use this file, you must first adjust its non-contiguous tensors:
```python ```python
import torch import torch

View File

@ -15,7 +15,7 @@ You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB
You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), this way you don't have to do the conversion yourself. You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), this way you don't have to do the conversion yourself.
Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully developed. Therefore, we need to convert flux to gguf format here, which also saves VRAM. For example: For example:
``` ```
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0 .\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
``` ```

21
docs/flux2.md Normal file
View File

@ -0,0 +1,21 @@
# How to Use
## Download weights
- Download FLUX.2-dev
- gguf: https://huggingface.co/city96/FLUX.2-dev-gguf/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
- Download Mistral-Small-3.2-24B-Instruct-2506-GGUF
- gguf: https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/tree/main
## Examples
```
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux2-dev-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Mistral-Small-3.2-24B-Instruct-2506-Q4_K_M.gguf -r .\kontext_input.png -p "change 'flux.cpp' to 'flux2-dev.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu
```
<img alt="flux2 example" src="../assets/flux2/example.png" />

View File

@ -12,38 +12,15 @@ Here's a simple example:
`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model `../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
# Support matrix # Lora Apply Mode
> CUDA `get_rows` support is defined here: There are two ways to apply LoRA: **immediately** and **at_runtime**. You can specify it using the `--lora-apply-mode` parameter.
> [ggml-org/ggml/src/ggml-cuda/getrows.cu#L156](https://github.com/ggml-org/ggml/blob/7dee1d6a1e7611f238d09be96738388da97c88ed/src/ggml-cuda/getrows.cu#L156)
> Currently only the basic types + Q4/Q5/Q8 are implemented. K-quants are **not** supported.
NOTE: The other backends may have different support. By default, the mode is selected automatically:
* If the model weights contain any quantized parameters, the **at_runtime** mode is used;
* Otherwise, the **immediately** mode is used.
The **immediately** mode may have precision and compatibility issues with quantized parameters, but it usually offers faster inference speed and, in some cases, lower memory usage.
In contrast, the **at_runtime** mode provides better compatibility and higher precision, but inference may be slower and memory usage may be higher in some cases.
| Quant / Type | CUDA | Vulkan |
|--------------|------|--------|
| F32 | ✔️ | ✔️ |
| F16 | ✔️ | ✔️ |
| BF16 | ✔️ | ✔️ |
| I32 | ✔️ | ❌ |
| Q4_0 | ✔️ | ✔️ |
| Q4_1 | ✔️ | ✔️ |
| Q5_0 | ✔️ | ✔️ |
| Q5_1 | ✔️ | ✔️ |
| Q8_0 | ✔️ | ✔️ |
| Q2_K | ❌ | ❌ |
| Q3_K | ❌ | ❌ |
| Q4_K | ❌ | ❌ |
| Q5_K | ❌ | ❌ |
| Q6_K | ❌ | ❌ |
| Q8_K | ❌ | ❌ |
| IQ1_S | ❌ | ✔️ |
| IQ1_M | ❌ | ✔️ |
| IQ2_XXS | ❌ | ✔️ |
| IQ2_XS | ❌ | ✔️ |
| IQ2_S | ❌ | ✔️ |
| IQ3_XXS | ❌ | ✔️ |
| IQ3_S | ❌ | ✔️ |
| IQ4_XS | ❌ | ✔️ |
| IQ4_NL | ❌ | ✔️ |
| MXFP4 | ❌ | ✔️ |

19
docs/ovis_image.md Normal file
View File

@ -0,0 +1,19 @@
# How to Use
## Download weights
- Download Ovis-Image-7B
- safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/leejet/Ovis-Image-7B-GGUF
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
- Download Ovis 2.5
- safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/text_encoders
## Examples
```
.\bin\Release\sd.exe --diffusion-model ovis_image-Q4_0.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\ovis_2.5.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa
```
<img alt="ovis image example" src="../assets/ovis_image/example.png" />

View File

@ -40,7 +40,7 @@ Running PMV2 is now a two-step process:
``` ```
python face_detect.py input_image_dir python face_detect.py input_image_dir
``` ```
An ```id_embeds.safetensors``` file will be generated in ```input_images_dir``` An ```id_embeds.bin``` file will be generated in ```input_images_dir```
**Note: this step is only needed to run once; the same ```id_embeds``` can be reused** **Note: this step is only needed to run once; the same ```id_embeds``` can be reused**
@ -48,6 +48,6 @@ An ```id_embeds.safetensors``` file will be generated in ```input_images_dir```
You can download ```photomaker-v2.safetensors``` from [here](https://huggingface.co/bssrdf/PhotoMakerV2) You can download ```photomaker-v2.safetensors``` from [here](https://huggingface.co/bssrdf/PhotoMakerV2)
- All the command line parameters from Version 1 remain the same for Version 2 - All the command line parameters from Version 1 remain the same for Version 2 plus one extra pointing to a valid ```id_embeds``` file: --pm-id-embed-path [path_to__id_embeds.bin]

View File

@ -14,7 +14,7 @@
## Examples ## Examples
``` ```
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线 探索视觉生成基础模型的极限开创理解与生成一体化的未来。二、Qwen-Image的模型特色1、复杂文字渲染。支持中英渲染、自动布局 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3 .\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线 探索视觉生成基础模型的极限开创理解与生成一体化的未来。二、Qwen-Image的模型特色1、复杂文字渲染。支持中英渲染、自动布局 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
``` ```
<img alt="qwen example" src="../assets/qwen/example.png" /> <img alt="qwen example" src="../assets/qwen/example.png" />

View File

@ -20,7 +20,7 @@
### Qwen Image Edit ### Qwen Image Edit
``` ```
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453 .\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
``` ```
<img alt="qwen_image_edit" src="../assets/qwen/qwen_image_edit.png" /> <img alt="qwen_image_edit" src="../assets/qwen/qwen_image_edit.png" />
@ -29,7 +29,7 @@
### Qwen Image Edit 2509 ### Qwen Image Edit 2509
``` ```
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --qwen2vl_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'" .\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
``` ```
<img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" /> <img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />

View File

@ -7,7 +7,7 @@ You can use TAESD to accelerate the decoding of latent images by following these
Or curl Or curl
```bash ```bash
curl -L -O https://huggingface.co/madebyollin/taesd/blob/main/diffusion_pytorch_model.safetensors curl -L -O https://huggingface.co/madebyollin/taesd/resolve/main/diffusion_pytorch_model.safetensors
``` ```
- Specify the model path using the `--taesd PATH` parameter. example: - Specify the model path using the `--taesd PATH` parameter. example:

28
docs/z_image.md Normal file
View File

@ -0,0 +1,28 @@
# How to Use
You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or even less.
## Download weights
- Download Z-Image-Turbo
- safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/leejet/Z-Image-Turbo-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
- Download Qwen3 4b
- safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/tree/main
## Examples
```
.\bin\Release\sd.exe --diffusion-model z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
```
<img width="256" alt="z-image example" src="../assets/z_image/q3_K.png" />
## Comparison of Different Quantization Types
| bf16 | q8_0 | q6_K | q5_0 | q4_K | q4_0 | q3_K | q2_K|
|---|---|---|---|---|---|---|---|
| <img width="256" alt="bf16" src="../assets/z_image/bf16.png" /> | <img width="256" alt="q8_0" src="../assets/z_image/q8_0.png" /> | <img width="256" alt="q6_K" src="../assets/z_image/q6_K.png" /> | <img width="256" alt="q5_0" src="../assets/z_image/q5_0.png" /> | <img width="256" alt="q4_K" src="../assets/z_image/q4_K.png" /> | <img width="256" alt="q4_0" src="../assets/z_image/q4_0.png" /> | <img width="256" alt="q3_K" src="../assets/z_image/q3_K.png" /> | <img width="256" alt="q2_K" src="../assets/z_image/q2_K.png" /> |

265
easycache.hpp Normal file
View File

@ -0,0 +1,265 @@
#include <cmath>
#include <limits>
#include <unordered_map>
#include <vector>
#include "denoiser.hpp"
#include "ggml_extend.hpp"
// User-facing configuration for the EasyCache step-skipping heuristic.
// All values are read by EasyCacheState::init()/begin_step().
struct EasyCacheConfig {
    bool enabled = false;          // master switch; when false the state machine is inert
    float reuse_threshold = 0.2f;  // skip a step while the accumulated change-rate estimate stays below this
    float start_percent = 0.15f;   // fraction of the schedule after which caching may begin (converted to a sigma)
    float end_percent = 0.95f;     // fraction of the schedule after which caching stops (converted to a sigma)
};
// Per-condition cache entry: element-wise (output - input) residual from the
// last fully computed step, reused verbatim when a step is skipped.
struct EasyCacheCacheEntry {
    std::vector<float> diff;  // same element count as the model output tensor; empty means "no cache yet"
};
// Runtime state machine for EasyCache: during denoising it estimates how much
// the model output would change this step; if the accumulated estimate stays
// below config.reuse_threshold, the expensive model call is skipped and the
// cached (output - input) residual is re-applied instead.
//
// Usage contract (as visible from this header): call init() once, begin_step()
// per denoising step, before_condition() before each conditioned model call
// (returns true when the call was satisfied from cache), and after_condition()
// after an actual model evaluation. All tensor data is assumed to be f32 host
// memory (raw float* access below) — confirm at the call sites.
struct EasyCacheState {
    EasyCacheConfig config;
    Denoiser* denoiser = nullptr;  // non-owning; used only for t_to_sigma() — lifetime managed by caller
    // Active sigma window derived from config's start/end percents (sigma decreases over a run).
    float start_sigma = std::numeric_limits<float>::max();
    float end_sigma = 0.0f;
    bool initialized = false;        // set by init(); gates enabled()
    bool initial_step = true;        // true until the first condition is seen (fixes the anchor)
    bool skip_current_step = false;  // decision for the current step, made on the anchor condition
    bool step_active = false;        // current step lies inside the (end_sigma, start_sigma] window
    // The first condition observed; change-rate statistics are tracked for it only,
    // and its decision is applied to all other conditions of the same step.
    const SDCondition* anchor_condition = nullptr;
    std::unordered_map<const SDCondition*, EasyCacheCacheEntry> cache_diffs;  // keyed by condition identity (pointer)
    std::vector<float> prev_input;   // anchor input from the last computed step
    std::vector<float> prev_output;  // anchor output from the last computed step
    float output_prev_norm = 0.0f;   // mean |output| of the last computed anchor step
    bool has_prev_input = false;
    bool has_prev_output = false;
    bool has_output_prev_norm = false;
    bool has_relative_transformation_rate = false;
    float relative_transformation_rate = 0.0f;  // last measured (output change) / (input change)
    float cumulative_change_rate = 0.0f;        // estimated output change accumulated across skipped steps
    float last_input_change = 0.0f;             // mean |input delta| vs. prev_input for the current step
    bool has_last_input_change = false;
    int total_steps_skipped = 0;  // statistics counter; never read here — presumably for logging
    int current_step_index = -1;  // guards begin_step() against duplicate calls for the same step

    // Reset everything except config/denoiser/initialized/sigma window,
    // returning the state machine to its pre-run condition.
    void reset_runtime() {
        initial_step = true;
        skip_current_step = false;
        step_active = false;
        anchor_condition = nullptr;
        cache_diffs.clear();
        prev_input.clear();
        prev_output.clear();
        output_prev_norm = 0.0f;
        has_prev_input = false;
        has_prev_output = false;
        has_output_prev_norm = false;
        has_relative_transformation_rate = false;
        relative_transformation_rate = 0.0f;
        cumulative_change_rate = 0.0f;
        last_input_change = 0.0f;
        has_last_input_change = false;
        total_steps_skipped = 0;
        current_step_index = -1;
    }

    // Bind configuration and denoiser, reset runtime state, and precompute the
    // sigma window. A null denoiser (or cfg.enabled == false) leaves the state disabled.
    void init(const EasyCacheConfig& cfg, Denoiser* d) {
        config = cfg;
        denoiser = d;
        initialized = cfg.enabled && d != nullptr;
        reset_runtime();
        if (initialized) {
            start_sigma = percent_to_sigma(config.start_percent);
            end_sigma = percent_to_sigma(config.end_percent);
        }
    }

    // True when init() succeeded and the feature is switched on.
    bool enabled() const {
        return initialized && config.enabled;
    }

    // Map a schedule fraction (0 = run start, 1 = run end) to the corresponding
    // sigma via the denoiser's timestep-to-sigma curve. Endpoints are clamped so
    // 0 maps to +inf (always inside the window) and 1 maps to 0.
    float percent_to_sigma(float percent) const {
        if (!denoiser) {
            return 0.0f;
        }
        if (percent <= 0.0f) {
            return std::numeric_limits<float>::max();
        }
        if (percent >= 1.0f) {
            return 0.0f;
        }
        float t = (1.0f - percent) * (TIMESTEPS - 1);
        return denoiser->t_to_sigma(t);
    }

    // Mark the start of denoising step `step_index` at noise level `sigma`.
    // Activates caching only when sigma falls inside (end_sigma, start_sigma].
    // Idempotent for repeated calls with the same step index.
    void begin_step(int step_index, float sigma) {
        if (!enabled()) {
            return;
        }
        if (step_index == current_step_index) {
            return;
        }
        current_step_index = step_index;
        skip_current_step = false;
        has_last_input_change = false;
        step_active = false;
        if (sigma > start_sigma) {
            return;
        }
        // Negated form (rather than sigma <= end_sigma) also rejects NaN sigma — presumably intentional.
        if (!(sigma > end_sigma)) {
            return;
        }
        step_active = true;
    }

    // True while the current step is inside the caching window.
    bool step_is_active() const {
        return enabled() && step_active;
    }

    // True when the current step's model calls are being served from cache.
    bool is_step_skipped() const {
        return enabled() && step_active && skip_current_step;
    }

    // A usable cached residual exists for this condition.
    bool has_cache(const SDCondition* cond) const {
        auto it = cache_diffs.find(cond);
        return it != cache_diffs.end() && !it->second.diff.empty();
    }

    // Store diff = output - input for `cond` (element-wise over the whole tensor).
    void update_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
        EasyCacheCacheEntry& entry = cache_diffs[cond];
        size_t ne = static_cast<size_t>(ggml_nelements(output));
        entry.diff.resize(ne);
        float* out_data = (float*)output->data;
        float* in_data = (float*)input->data;
        for (size_t i = 0; i < ne; ++i) {
            entry.diff[i] = out_data[i] - in_data[i];
        }
    }

    // Reconstruct output = input + cached diff for `cond`; no-op if no cache exists.
    void apply_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
        auto it = cache_diffs.find(cond);
        if (it == cache_diffs.end() || it->second.diff.empty()) {
            return;
        }
        copy_ggml_tensor(output, input);
        float* out_data = (float*)output->data;
        const std::vector<float>& diff = it->second.diff;
        for (size_t i = 0; i < diff.size(); ++i) {
            out_data[i] += diff[i];
        }
    }

    // Called before a conditioned model evaluation. Returns true when `output`
    // was filled from cache and the model call can be skipped entirely.
    // Only the anchor condition drives the skip decision; other conditions
    // simply follow the decision already made for this step.
    bool before_condition(const SDCondition* cond,
                          ggml_tensor* input,
                          ggml_tensor* output,
                          float sigma,
                          int step_index) {
        if (!enabled() || step_index < 0) {
            return false;
        }
        if (step_index != current_step_index) {
            begin_step(step_index, sigma);
        }
        if (!step_active) {
            return false;
        }
        if (initial_step) {
            // First condition ever seen becomes the anchor for change-rate tracking.
            anchor_condition = cond;
            initial_step = false;
        }
        bool is_anchor = (cond == anchor_condition);
        if (skip_current_step) {
            // Anchor already decided to skip this step; serve from cache if possible.
            if (has_cache(cond)) {
                apply_cache(cond, input, output);
                return true;
            }
            return false;
        }
        if (!is_anchor) {
            return false;
        }
        if (!has_prev_input || !has_prev_output || !has_cache(cond)) {
            return false;  // need one fully computed step before estimating change rates
        }
        size_t ne = static_cast<size_t>(ggml_nelements(input));
        if (prev_input.size() != ne) {
            return false;  // shape changed since last step; cannot compare
        }
        float* input_data = (float*)input->data;
        // Mean absolute input delta vs. the previous computed step.
        last_input_change = 0.0f;
        for (size_t i = 0; i < ne; ++i) {
            last_input_change += std::fabs(input_data[i] - prev_input[i]);
        }
        if (ne > 0) {
            last_input_change /= static_cast<float>(ne);
        }
        has_last_input_change = true;
        if (has_output_prev_norm && has_relative_transformation_rate && last_input_change > 0.0f && output_prev_norm > 0.0f) {
            // Estimate relative output change from the input change via the last
            // measured transformation rate, normalized by the previous output magnitude.
            float approx_output_change_rate = (relative_transformation_rate * last_input_change) / output_prev_norm;
            cumulative_change_rate += approx_output_change_rate;
            if (cumulative_change_rate < config.reuse_threshold) {
                // Accumulated drift still small enough: reuse the cached residual.
                skip_current_step = true;
                total_steps_skipped++;
                apply_cache(cond, input, output);
                return true;
            } else {
                // Threshold exceeded: compute this step for real and restart accumulation.
                cumulative_change_rate = 0.0f;
            }
        }
        return false;
    }

    // Called after an actual (non-skipped) model evaluation. Refreshes the
    // per-condition residual cache and, for the anchor condition, the
    // statistics used by the next skip decision.
    void after_condition(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
        if (!step_is_active()) {
            return;
        }
        update_cache(cond, input, output);
        if (cond != anchor_condition) {
            return;
        }
        // Snapshot the anchor input for the next step's input-change measurement.
        size_t ne = static_cast<size_t>(ggml_nelements(input));
        float* in_data = (float*)input->data;
        prev_input.resize(ne);
        for (size_t i = 0; i < ne; ++i) {
            prev_input[i] = in_data[i];
        }
        has_prev_input = true;
        // Mean absolute output delta vs. the previous computed step (0 if none).
        float* out_data = (float*)output->data;
        float output_change = 0.0f;
        if (has_prev_output && prev_output.size() == ne) {
            for (size_t i = 0; i < ne; ++i) {
                output_change += std::fabs(out_data[i] - prev_output[i]);
            }
            if (ne > 0) {
                output_change /= static_cast<float>(ne);
            }
        }
        prev_output.resize(ne);
        for (size_t i = 0; i < ne; ++i) {
            prev_output[i] = out_data[i];
        }
        has_prev_output = true;
        // Mean |output| becomes the normalizer for the next change-rate estimate.
        float mean_abs = 0.0f;
        for (size_t i = 0; i < ne; ++i) {
            mean_abs += std::fabs(out_data[i]);
        }
        output_prev_norm = (ne > 0) ? (mean_abs / static_cast<float>(ne)) : 0.0f;
        has_output_prev_norm = output_prev_norm > 0.0f;
        // Measure how strongly input changes translate into output changes.
        if (has_last_input_change && last_input_change > 0.0f && output_change > 0.0f) {
            float rate = output_change / last_input_change;
            if (std::isfinite(rate)) {
                relative_transformation_rate = rate;
                has_relative_transformation_rate = true;
            }
        }
        cumulative_change_rate = 0.0f;
        has_last_input_change = false;
    }
};

View File

@ -27,11 +27,11 @@ public:
blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1})); blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
} }
struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* lrelu(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
return ggml_leaky_relu(ctx, x, 0.2f, true); return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true);
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [n, num_feat, h, w] // x: [n, num_feat, h, w]
// return: [n, num_feat, h, w] // return: [n, num_feat, h, w]
@ -42,16 +42,16 @@ public:
auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]); auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);
auto x1 = lrelu(ctx, conv1->forward(ctx, x)); auto x1 = lrelu(ctx, conv1->forward(ctx, x));
auto x_cat = ggml_concat(ctx, x, x1, 2); auto x_cat = ggml_concat(ctx->ggml_ctx, x, x1, 2);
auto x2 = lrelu(ctx, conv2->forward(ctx, x_cat)); auto x2 = lrelu(ctx, conv2->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x2, 2); x_cat = ggml_concat(ctx->ggml_ctx, x_cat, x2, 2);
auto x3 = lrelu(ctx, conv3->forward(ctx, x_cat)); auto x3 = lrelu(ctx, conv3->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x3, 2); x_cat = ggml_concat(ctx->ggml_ctx, x_cat, x3, 2);
auto x4 = lrelu(ctx, conv4->forward(ctx, x_cat)); auto x4 = lrelu(ctx, conv4->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x4, 2); x_cat = ggml_concat(ctx->ggml_ctx, x_cat, x4, 2);
auto x5 = conv5->forward(ctx, x_cat); auto x5 = conv5->forward(ctx, x_cat);
x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x); x5 = ggml_add(ctx->ggml_ctx, ggml_scale(ctx->ggml_ctx, x5, 0.2f), x);
return x5; return x5;
} }
}; };
@ -64,7 +64,7 @@ public:
blocks["rdb3"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch)); blocks["rdb3"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [n, num_feat, h, w] // x: [n, num_feat, h, w]
// return: [n, num_feat, h, w] // return: [n, num_feat, h, w]
@ -76,7 +76,7 @@ public:
out = rdb2->forward(ctx, out); out = rdb2->forward(ctx, out);
out = rdb3->forward(ctx, out); out = rdb3->forward(ctx, out);
out = ggml_add(ctx, ggml_scale(ctx, out, 0.2f), x); out = ggml_add(ctx->ggml_ctx, ggml_scale(ctx->ggml_ctx, out, 0.2f), x);
return out; return out;
} }
}; };
@ -112,11 +112,11 @@ public:
int get_scale() { return scale; } int get_scale() { return scale; }
int get_num_block() { return num_block; } int get_num_block() { return num_block; }
struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* lrelu(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
return ggml_leaky_relu(ctx, x, 0.2f, true); return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true);
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [n, num_in_ch, h, w] // x: [n, num_in_ch, h, w]
// return: [n, num_out_ch, h*scale, w*scale] // return: [n, num_out_ch, h*scale, w*scale]
auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]); auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]);
@ -133,14 +133,14 @@ public:
body_feat = block->forward(ctx, body_feat); body_feat = block->forward(ctx, body_feat);
} }
body_feat = conv_body->forward(ctx, body_feat); body_feat = conv_body->forward(ctx, body_feat);
feat = ggml_add(ctx, feat, body_feat); feat = ggml_add(ctx->ggml_ctx, feat, body_feat);
// upsample // upsample
if (scale >= 2) { if (scale >= 2) {
auto conv_up1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]); auto conv_up1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST))); feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
if (scale == 4) { if (scale == 4) {
auto conv_up2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]); auto conv_up2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST))); feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
} }
} }
// for all scales // for all scales
@ -156,22 +156,10 @@ struct ESRGAN : public GGMLRunner {
ESRGAN(ggml_backend_t backend, ESRGAN(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}) int tile_size = 128,
const String2TensorStorage& tensor_storage_map = {})
: GGMLRunner(backend, offload_params_to_cpu) { : GGMLRunner(backend, offload_params_to_cpu) {
// rrdb_net will be created in load_from_file this->tile_size = tile_size;
}
void enable_conv2d_direct() {
if (!rrdb_net)
return;
std::vector<GGMLBlock*> blocks;
rrdb_net->get_all_blocks(blocks);
for (auto block : blocks) {
if (block->get_desc() == "Conv2d") {
auto conv_block = (Conv2d*)block;
conv_block->enable_direct();
}
}
} }
std::string get_desc() override { std::string get_desc() override {
@ -182,7 +170,7 @@ struct ESRGAN : public GGMLRunner {
LOG_INFO("loading esrgan from '%s'", file_path.c_str()); LOG_INFO("loading esrgan from '%s'", file_path.c_str());
ModelLoader model_loader; ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) { if (!model_loader.init_from_file_and_convert_name(file_path)) {
LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str()); LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
return false; return false;
} }
@ -357,21 +345,23 @@ struct ESRGAN : public GGMLRunner {
if (!rrdb_net) if (!rrdb_net)
return nullptr; return nullptr;
constexpr int kGraphNodes = 1 << 16; // 65k constexpr int kGraphNodes = 1 << 16; // 65k
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, kGraphNodes, /*grads*/ false); struct ggml_cgraph* gf = new_graph_custom(kGraphNodes);
x = to_backend(x); x = to_backend(x);
struct ggml_tensor* out = rrdb_net->forward(compute_ctx, x);
auto runner_ctx = get_context();
struct ggml_tensor* out = rrdb_net->forward(&runner_ctx, x);
ggml_build_forward_expand(gf, out); ggml_build_forward_expand(gf, out);
return gf; return gf;
} }
void compute(const int n_threads, bool compute(const int n_threads,
struct ggml_tensor* x, struct ggml_tensor* x,
ggml_tensor** output, ggml_tensor** output,
ggml_context* output_ctx = nullptr) { ggml_context* output_ctx = nullptr) {
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x); return build_graph(x);
}; };
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
} }
}; };

View File

@ -3,14 +3,30 @@
``` ```
usage: ./bin/sd [options] usage: ./bin/sd [options]
Options: CLI Options:
-o, --output <string> path to write result image to (default: ./output.png)
--preview-path <string> path to write preview image to (default: ./preview.png)
--preview-interval <int> interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
every step)
--canny apply canny preprocessor (edge detection)
-v, --verbose print extra info
--color colors the logging tags according to level
--taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae)
--preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
-h, --help show this help message and exit
Context Options:
-m, --model <string> path to full model -m, --model <string> path to full model
--clip_l <string> path to the clip-l text encoder --clip_l <string> path to the clip-l text encoder
--clip_g <string> path to the clip-g text encoder --clip_g <string> path to the clip-g text encoder
--clip_vision <string> path to the clip-vision encoder --clip_vision <string> path to the clip-vision encoder
--t5xxl <string> path to the t5xxl text encoder --t5xxl <string> path to the t5xxl text encoder
--qwen2vl <string> path to the qwen2vl text encoder --llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
--qwen2vl_vision <string> path to the qwen2vl vit --llm_vision <string> path to the llm vit
--qwen2vl <string> alias of --llm. Deprecated.
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
--diffusion-model <string> path to the standalone diffusion model --diffusion-model <string> path to the standalone diffusion model
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model --high-noise-diffusion-model <string> path to the standalone high noise diffusion model
--vae <string> path to standalone vae model --vae <string> path to standalone vae model
@ -18,24 +34,52 @@ Options:
--control-net <string> path to control net model --control-net <string> path to control net model
--embd-dir <string> embeddings directory --embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory --lora-model-dir <string> lora model directory
-i, --init-img <string> path to the init image
--end-img <string> path to the end image, required by flf2v
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--photo-maker <string> path to PHOTOMAKER model --photo-maker <string> path to PHOTOMAKER model
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir --upscale-model <string> path to esrgan model.
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed -t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
--control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--vae-on-cpu keep vae in cpu (for low vram)
--diffusion-fa use flash attention in the diffusion model
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
--vae-conv-direct use ggml_conv2d_direct in the vae model
--chroma-disable-dit-mask disable dit mask for chroma
--chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
contain any quantized parameters, the at_runtime mode will be used; otherwise,
immediately will be used. The immediately mode may have precision and
compatibility issues with quantized parameters, but it usually offers faster inference
speed and, in some cases, lower memory usage. The at_runtime mode, on the
other hand, is exactly the opposite.
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
(overrides --vae-tile-size)
Generation Options:
-p, --prompt <string> the prompt to render
-n, --negative-prompt <string> the negative prompt (default: "")
-i, --init-img <string> path to the init image
--end-img <string> path to the end image, required by flf2v
--mask <string> path to the mask image --mask <string> path to the mask image
--control-image <string> path to control image, control net --control-image <string> path to control image, control net
--control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in --control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in
lexicographical (character) order. For example, if the control video path is lexicographical (character) order. For example, if the control video path is
`frames`, the directory contain images such as 00.png, 01.png, ... etc. `frames`, the directory contain images such as 00.png, 01.png, ... etc.
-o, --output <string> path to write result image to (default: ./output.png) --pm-id-images-dir <string> path to PHOTOMAKER input id images dir
-p, --prompt <string> the prompt to render --pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
-n, --negative-prompt <string> the negative prompt (default: "")
--upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
CPU physical cores
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
-H, --height <int> image height, in pixel space (default: 512) -H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512) -W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20) --steps <int> number of sample steps (default: 20)
@ -43,11 +87,11 @@ Options:
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified, --clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
will be 1 for SD1.x, 2 for SD2.x will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count <int> batch count -b, --batch-count <int> batch count
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
--video-frames <int> video frames (default: 1) --video-frames <int> video frames (default: 1)
--fps <int> fps (default: 24) --fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for --timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
NitroSD-Vibrant NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--cfg-scale <float> unconditional guidance scale: (default: 7.0) --cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) --img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5) --guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
@ -67,44 +111,18 @@ Options:
--pm-style-strength <float> --pm-style-strength <float>
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image --control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1 --moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--vace-strength <float> wan vace strength --vace-strength <float> wan vace strength
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
--control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--vae-on-cpu keep vae in cpu (for low vram)
--diffusion-fa use flash attention in the diffusion model
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
--vae-conv-direct use ggml_conv2d_direct in the vae model
--canny apply canny preprocessor (edge detection)
-v, --verbose print extra info
--color colors the logging tags according to level
--chroma-disable-dit-mask disable dit mask for chroma
--chroma-enable-t5-mask enable t5 mask for chroma
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images --disable-auto-resize-ref-image disable auto resize of ref images
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file
--rng RNG, one of [std_default, cuda], default: cuda
-s, --seed RNG seed (default: 42, use random seed for < 0) -s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise) tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default:
discrete
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
--high-noise-scheduler (high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times) -r, --ref-image reference image for Flux Kontext models (can be used multiple times)
-h, --help show this help message and exit
--easycache enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95)
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
(overrides --vae-tile-size)
``` ```

File diff suppressed because it is too large Load Diff

1151
flux.hpp

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

234
latent-preview.h Normal file
View File

@ -0,0 +1,234 @@
#pragma once

#include <cstddef>
#include <cstdint>

#include "ggml.h"
// RGB preview projection for Wan 2.1 latents: rows = 16 latent channels, cols = R,G,B.
const float wan_21_latent_rgb_proj[16][3] = {
    {0.015123f, -0.148418f, 0.479828f},
    {0.003652f, -0.010680f, -0.037142f},
    {0.212264f, 0.063033f, 0.016779f},
    {0.232999f, 0.406476f, 0.220125f},
    {-0.051864f, -0.082384f, -0.069396f},
    {0.085005f, -0.161492f, 0.010689f},
    {-0.245369f, -0.506846f, -0.117010f},
    {-0.151145f, 0.017721f, 0.007207f},
    {-0.293239f, -0.207936f, -0.421135f},
    {-0.187721f, 0.050783f, 0.177649f},
    {-0.013067f, 0.265964f, 0.166578f},
    {0.028327f, 0.109329f, 0.108642f},
    {-0.205343f, 0.043991f, 0.148914f},
    {0.014307f, -0.048647f, -0.007219f},
    {0.217150f, 0.053074f, 0.319923f},
    {0.155357f, 0.083156f, 0.064780f}};
// const (like the proj matrix above): gives internal linkage, avoiding duplicate-symbol
// link errors when this header is included from more than one translation unit.
const float wan_21_latent_rgb_bias[3] = {-0.270270f, -0.234976f, -0.456853f};
// RGB preview projection for Wan 2.2 latents: rows = 48 latent channels, cols = R,G,B.
const float wan_22_latent_rgb_proj[48][3] = {
    {0.017126f, -0.027230f, -0.019257f},
    {-0.113739f, -0.028715f, -0.022885f},
    {-0.000106f, 0.021494f, 0.004629f},
    {-0.013273f, -0.107137f, -0.033638f},
    {-0.000381f, 0.000279f, 0.025877f},
    {-0.014216f, -0.003975f, 0.040528f},
    {0.001638f, -0.000748f, 0.011022f},
    {0.029238f, -0.006697f, 0.035933f},
    {0.021641f, -0.015874f, 0.040531f},
    {-0.101984f, -0.070160f, -0.028855f},
    {0.033207f, -0.021068f, 0.002663f},
    {-0.104711f, 0.121673f, 0.102981f},
    {0.082647f, -0.004991f, 0.057237f},
    {-0.027375f, 0.031581f, 0.006868f},
    {-0.045434f, 0.029444f, 0.019287f},
    {-0.046572f, -0.012537f, 0.006675f},
    {0.074709f, 0.033690f, 0.025289f},
    {-0.008251f, -0.002745f, -0.006999f},
    {0.012685f, -0.061856f, -0.048658f},
    {0.042304f, -0.007039f, 0.000295f},
    {-0.007644f, -0.060843f, -0.033142f},
    {0.159909f, 0.045628f, 0.367541f},
    {0.095171f, 0.086438f, 0.010271f},
    {0.006812f, 0.019643f, 0.029637f},
    {0.003467f, -0.010705f, 0.014252f},
    {-0.099681f, -0.066272f, -0.006243f},
    {0.047357f, 0.037040f, 0.000185f},
    {-0.041797f, -0.089225f, -0.032257f},
    {0.008928f, 0.017028f, 0.018684f},
    {-0.042255f, 0.016045f, 0.006849f},
    {0.011268f, 0.036462f, 0.037387f},
    {0.011553f, -0.016375f, -0.048589f},
    {0.046266f, -0.027189f, 0.056979f},
    {0.009640f, -0.017576f, 0.030324f},
    {-0.045794f, -0.036083f, -0.010616f},
    {0.022418f, 0.039783f, -0.032939f},
    {-0.052714f, -0.015525f, 0.007438f},
    {0.193004f, 0.223541f, 0.264175f},
    {-0.059406f, -0.008188f, 0.022867f},
    {-0.156742f, -0.263791f, -0.007385f},
    {-0.015717f, 0.016570f, 0.033969f},
    {0.037969f, 0.109835f, 0.200449f},
    {-0.000782f, -0.009566f, -0.008058f},
    {0.010709f, 0.052960f, -0.044195f},
    {0.017271f, 0.045839f, 0.034569f},
    {0.009424f, 0.013088f, -0.001714f},
    {-0.024805f, -0.059378f, -0.033756f},
    {-0.078293f, 0.029070f, 0.026129f}};
// const: internal linkage prevents duplicate-symbol link errors when this header
// is included from multiple translation units (and matches the proj matrix above).
const float wan_22_latent_rgb_bias[3] = {0.013160f, -0.096492f, -0.071323f};
// RGB preview projection for Flux latents: rows = 16 latent channels, cols = R,G,B.
const float flux_latent_rgb_proj[16][3] = {
    {-0.041168f, 0.019917f, 0.097253f},
    {0.028096f, 0.026730f, 0.129576f},
    {0.065618f, -0.067950f, -0.014651f},
    {-0.012998f, -0.014762f, 0.081251f},
    {0.078567f, 0.059296f, -0.024687f},
    {-0.015987f, -0.003697f, 0.005012f},
    {0.033605f, 0.138999f, 0.068517f},
    {-0.024450f, -0.063567f, -0.030101f},
    {-0.040194f, -0.016710f, 0.127185f},
    {0.112681f, 0.088764f, -0.041940f},
    {-0.023498f, 0.093664f, 0.025543f},
    {0.082899f, 0.048320f, 0.007491f},
    {0.075712f, 0.074139f, 0.081965f},
    {-0.143501f, 0.018263f, -0.136138f},
    {-0.025767f, -0.082035f, -0.040023f},
    {-0.111849f, -0.055589f, -0.032361f}};
// const: internal linkage avoids duplicate-symbol link errors on multi-TU inclusion.
const float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};
// RGB preview projection for Flux.2 latents: rows = 32 latent channels, cols = R,G,B.
const float flux2_latent_rgb_proj[32][3] = {
    {0.000736f, -0.008385f, -0.019710f},
    {-0.001352f, -0.016392f, 0.020693f},
    {-0.006376f, 0.002428f, 0.036736f},
    {0.039384f, 0.074167f, 0.119789f},
    {0.007464f, -0.005705f, -0.004734f},
    {-0.004086f, 0.005287f, -0.000409f},
    {-0.032835f, 0.050802f, -0.028120f},
    {-0.003158f, -0.000835f, 0.000406f},
    {-0.112840f, -0.084337f, -0.023083f},
    {0.001462f, -0.006656f, 0.000549f},
    {-0.009980f, -0.007480f, 0.009702f},
    {0.032540f, 0.000214f, -0.061388f},
    {0.011023f, 0.000694f, 0.007143f},
    {-0.001468f, -0.006723f, -0.001678f},
    {-0.005921f, -0.010320f, -0.003907f},
    {-0.028434f, 0.027584f, 0.018457f},
    {0.014349f, 0.011523f, 0.000441f},
    {0.009874f, 0.003081f, 0.001507f},
    {0.002218f, 0.005712f, 0.001563f},
    {0.053010f, -0.019844f, 0.008683f},
    {-0.002507f, 0.005384f, 0.000938f},
    {-0.002177f, -0.011366f, 0.003559f},
    {-0.000261f, 0.015121f, -0.003240f},
    {-0.003944f, -0.002083f, 0.005043f},
    {-0.009138f, 0.011336f, 0.003781f},
    {0.011429f, 0.003985f, -0.003855f},
    {0.010518f, -0.005586f, 0.010131f},
    {0.007883f, 0.002912f, -0.001473f},
    {-0.003318f, -0.003160f, 0.003684f},
    {-0.034560f, -0.008740f, 0.012996f},
    {0.000166f, 0.001079f, -0.012153f},
    {0.017772f, 0.000937f, -0.011953f}};
// const: internal linkage avoids duplicate-symbol link errors on multi-TU inclusion.
const float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f};
// This one was taken straight from
// https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
// (MiT Licence)
// RGB preview projection for SD3 latents: rows = 16 latent channels, cols = R,G,B.
const float sd3_latent_rgb_proj[16][3] = {
    {-0.0645f, 0.0177f, 0.1052f},
    {0.0028f, 0.0312f, 0.0650f},
    {0.1848f, 0.0762f, 0.0360f},
    {0.0944f, 0.0360f, 0.0889f},
    {0.0897f, 0.0506f, -0.0364f},
    {-0.0020f, 0.1203f, 0.0284f},
    {0.0855f, 0.0118f, 0.0283f},
    {-0.0539f, 0.0658f, 0.1047f},
    {-0.0057f, 0.0116f, 0.0700f},
    {-0.0412f, 0.0281f, -0.0039f},
    {0.1106f, 0.1171f, 0.1220f},
    {-0.0248f, 0.0682f, -0.0481f},
    {0.0815f, 0.0846f, 0.1207f},
    {-0.0120f, -0.0055f, -0.0867f},
    {-0.0749f, -0.0634f, -0.0456f},
    {-0.1418f, -0.1457f, -0.1259f},
};
// const: internal linkage avoids duplicate-symbol link errors on multi-TU inclusion.
const float sd3_latent_rgb_bias[3] = {0, 0, 0};
// RGB preview projection for SDXL latents: rows = 4 latent channels, cols = R,G,B.
const float sdxl_latent_rgb_proj[4][3] = {
    {0.258303f, 0.277640f, 0.329699f},
    {-0.299701f, 0.105446f, 0.014194f},
    {0.050522f, 0.186163f, -0.143257f},
    {-0.211938f, -0.149892f, -0.080036f}};
// const: internal linkage avoids duplicate-symbol link errors on multi-TU inclusion.
const float sdxl_latent_rgb_bias[3] = {0.144381f, -0.033313f, 0.007061f};
// RGB preview projection for SD 1.x/2.x latents: rows = 4 latent channels, cols = R,G,B.
const float sd_latent_rgb_proj[4][3] = {
    {0.337366f, 0.216344f, 0.257386f},
    {0.165636f, 0.386828f, 0.046994f},
    {-0.267803f, 0.237036f, 0.223517f},
    {-0.178022f, -0.200862f, -0.678514f}};
// const: internal linkage avoids duplicate-symbol link errors on multi-TU inclusion.
const float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};
// Render a cheap RGB preview of a latent tensor into `buffer` (packed RGB8,
// 3 bytes per pixel, frames laid out sequentially).
//
// buffer:          output, must hold frames * rgb_width * rgb_height * 3 bytes.
// latents:         latent tensor; ne[0] = width, ne[1] = height, and for a 4-D
//                  tensor ne[2] is the frame count. The channel dimension is the
//                  last one (ne[n_dims - 1]). Assumes f32 element data — TODO
//                  confirm callers never pass quantized latents.
// latent_rgb_proj: per-channel RGB projection matrix (rows = unpatched latent
//                  channels), or nullptr to interpret the first 3 channels
//                  directly as RGB.
// latent_rgb_bias: optional RGB bias added after projection (may be nullptr).
// patch_size:      spatial patch factor (e.g. >1 for patched Flux.2 latents);
//                  each latent cell expands to patch_size x patch_size pixels,
//                  and the channel dim is treated as patch_size^2 interleaved
//                  sub-pixels times the unpatched channel count.
//
// Projected values are mapped from [-1, 1] to [0, 255] with clamping.
void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) {
    const int n_dims             = ggml_n_dims(latents);
    const uint32_t latent_width  = latents->ne[0];
    const uint32_t latent_height = latents->ne[1];
    const uint32_t dim           = latents->ne[n_dims - 1];
    uint32_t frames              = 1;
    if (n_dims == 4) {
        frames = latents->ne[2];
    }
    const uint32_t rgb_width     = latent_width * patch_size;
    const uint32_t rgb_height    = latent_height * patch_size;
    const uint32_t unpatched_dim = dim / (patch_size * patch_size);
    // byte stride of the channel dimension; hoisted out of the pixel loop
    const size_t channel_stride = latents->nb[n_dims - 1];
    for (uint32_t k = 0; k < frames; k++) {
        for (uint32_t rgb_x = 0; rgb_x < rgb_width; rgb_x++) {
            for (uint32_t rgb_y = 0; rgb_y < rgb_height; rgb_y++) {
                const uint32_t latent_x = rgb_x / patch_size;
                const uint32_t latent_y = rgb_y / patch_size;
                // which of the patch_size^2 sub-pixel channel groups this pixel reads
                uint32_t channel_offset = 0;
                if (patch_size > 1) {
                    channel_offset = (rgb_y % patch_size) * patch_size + (rgb_x % patch_size);
                }
                // byte offset of the latent cell (nb[] strides are in bytes)
                const size_t latent_id = latent_x * latents->nb[0] + latent_y * latents->nb[1] + k * latents->nb[2];
                const size_t pixel_id  = (size_t)k * rgb_width * rgb_height + (size_t)rgb_y * rgb_width + rgb_x;
                float r = 0, g = 0, b = 0;
                if (latent_rgb_proj != nullptr) {
                    // project all (unpatched) channels to RGB
                    for (uint32_t d = 0; d < unpatched_dim; d++) {
                        float value = *(float*)((char*)latents->data + latent_id + (d * patch_size * patch_size + channel_offset) * channel_stride);
                        r += value * latent_rgb_proj[d][0];
                        g += value * latent_rgb_proj[d][1];
                        b += value * latent_rgb_proj[d][2];
                    }
                } else {
                    // interpret first 3 channels as RGB
                    r = *(float*)((char*)latents->data + latent_id + 0 * channel_stride);
                    g = *(float*)((char*)latents->data + latent_id + 1 * channel_stride);
                    b = *(float*)((char*)latents->data + latent_id + 2 * channel_stride);
                }
                if (latent_rgb_bias != nullptr) {
                    r += latent_rgb_bias[0];
                    g += latent_rgb_bias[1];
                    b += latent_rgb_bias[2];
                }
                // map [-1, 1] -> [0, 1]
                r = r * .5f + .5f;
                g = g * .5f + .5f;
                b = b * .5f + .5f;
                // clamp to [0, 1]
                r = r >= 0 ? (r <= 1 ? r : 1) : 0;
                g = g >= 0 ? (g <= 1 ? g : 1) : 0;
                b = b >= 0 ? (b <= 1 ? b : 1) : 0;
                buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255);
                buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255);
                buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255);
            }
        }
    }
}

File diff suppressed because it is too large Load Diff

1446
lora.hpp

File diff suppressed because it is too large Load Diff

View File

@ -27,7 +27,7 @@ namespace LTXV {
bias)); bias));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, struct ggml_tensor* x,
bool causal = true) { bool causal = true) {
// x: [N*IC, ID, IH, IW] // x: [N*IC, ID, IH, IW]

270
mmdit.hpp
View File

@ -27,13 +27,13 @@ public:
blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias)); blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, n_token, in_features] // x: [N, n_token, in_features]
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]); auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]); auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
x = fc1->forward(ctx, x); x = fc1->forward(ctx, x);
x = ggml_gelu_inplace(ctx, x); x = ggml_gelu_inplace(ctx->ggml_ctx, x);
x = fc2->forward(ctx, x); x = fc2->forward(ctx, x);
return x; return x;
} }
@ -72,7 +72,7 @@ public:
bias)); bias));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, C, H, W] // x: [N, C, H, W]
// return: [N, H*W, embed_dim] // return: [N, H*W, embed_dim]
auto proj = std::dynamic_pointer_cast<Conv2d>(blocks["proj"]); auto proj = std::dynamic_pointer_cast<Conv2d>(blocks["proj"]);
@ -82,13 +82,13 @@ public:
int64_t H = x->ne[1]; int64_t H = x->ne[1];
int pad_h = (patch_size - H % patch_size) % patch_size; int pad_h = (patch_size - H % patch_size) % patch_size;
int pad_w = (patch_size - W % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size;
x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // TODO: reflect pad mode x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); // TODO: reflect pad mode
} }
x = proj->forward(ctx, x); x = proj->forward(ctx, x);
if (flatten) { if (flatten) {
x = ggml_reshape_3d(ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]); x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]);
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));
} }
return x; return x;
} }
@ -101,22 +101,26 @@ protected:
public: public:
TimestepEmbedder(int64_t hidden_size, TimestepEmbedder(int64_t hidden_size,
int64_t frequency_embedding_size = 256) int64_t frequency_embedding_size = 256,
int64_t out_channels = 0)
: frequency_embedding_size(frequency_embedding_size) { : frequency_embedding_size(frequency_embedding_size) {
if (out_channels <= 0) {
out_channels = hidden_size;
}
blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size, true, true)); blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size, true, true));
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true)); blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, out_channels, true, true));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* t) { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* t) {
// t: [N, ] // t: [N, ]
// return: [N, hidden_size] // return: [N, hidden_size]
auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]); auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]); auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
auto t_freq = ggml_nn_timestep_embedding(ctx, t, frequency_embedding_size); // [N, frequency_embedding_size] auto t_freq = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size); // [N, frequency_embedding_size]
auto t_emb = mlp_0->forward(ctx, t_freq); auto t_emb = mlp_0->forward(ctx, t_freq);
t_emb = ggml_silu_inplace(ctx, t_emb); t_emb = ggml_silu_inplace(ctx->ggml_ctx, t_emb);
t_emb = mlp_2->forward(ctx, t_emb); t_emb = mlp_2->forward(ctx, t_emb);
return t_emb; return t_emb;
} }
@ -131,14 +135,14 @@ public:
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true)); blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, input_dim] // x: [N, input_dim]
// return: [N, hidden_size] // return: [N, hidden_size]
auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]); auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]); auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
x = mlp_0->forward(ctx, x); x = mlp_0->forward(ctx, x);
x = ggml_silu_inplace(ctx, x); x = ggml_silu_inplace(ctx->ggml_ctx, x);
x = mlp_2->forward(ctx, x); x = mlp_2->forward(ctx, x);
return x; return x;
} }
@ -149,16 +153,14 @@ public:
int64_t num_heads; int64_t num_heads;
bool pre_only; bool pre_only;
std::string qk_norm; std::string qk_norm;
bool flash_attn;
public: public:
SelfAttention(int64_t dim, SelfAttention(int64_t dim,
int64_t num_heads = 8, int64_t num_heads = 8,
std::string qk_norm = "", std::string qk_norm = "",
bool qkv_bias = false, bool qkv_bias = false,
bool pre_only = false, bool pre_only = false)
bool flash_attn = false) : num_heads(num_heads), pre_only(pre_only), qk_norm(qk_norm) {
: num_heads(num_heads), pre_only(pre_only), qk_norm(qk_norm), flash_attn(flash_attn) {
int64_t d_head = dim / num_heads; int64_t d_head = dim / num_heads;
blocks["qkv"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias)); blocks["qkv"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
if (!pre_only) { if (!pre_only) {
@ -173,15 +175,15 @@ public:
} }
} }
std::vector<struct ggml_tensor*> pre_attention(struct ggml_context* ctx, struct ggml_tensor* x) { std::vector<struct ggml_tensor*> pre_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]); auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
auto qkv = qkv_proj->forward(ctx, x); auto qkv = qkv_proj->forward(ctx, x);
auto qkv_vec = split_qkv(ctx, qkv); auto qkv_vec = split_qkv(ctx->ggml_ctx, qkv);
int64_t head_dim = qkv_vec[0]->ne[0] / num_heads; int64_t head_dim = qkv_vec[0]->ne[0] / num_heads;
auto q = ggml_reshape_4d(ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]); // [N, n_token, n_head, d_head] auto q = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]); // [N, n_token, n_head, d_head]
auto k = ggml_reshape_4d(ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]); // [N, n_token, n_head, d_head] auto k = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]); // [N, n_token, n_head, d_head]
auto v = qkv_vec[2]; // [N, n_token, n_head*d_head] auto v = qkv_vec[2]; // [N, n_token, n_head*d_head]
if (qk_norm == "rms" || qk_norm == "ln") { if (qk_norm == "rms" || qk_norm == "ln") {
auto ln_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["ln_q"]); auto ln_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["ln_q"]);
@ -190,13 +192,13 @@ public:
k = ln_k->forward(ctx, k); k = ln_k->forward(ctx, k);
} }
q = ggml_reshape_3d(ctx, q, q->ne[0] * q->ne[1], q->ne[2], q->ne[3]); // [N, n_token, n_head*d_head] q = ggml_reshape_3d(ctx->ggml_ctx, q, q->ne[0] * q->ne[1], q->ne[2], q->ne[3]); // [N, n_token, n_head*d_head]
k = ggml_reshape_3d(ctx, k, k->ne[0] * k->ne[1], k->ne[2], k->ne[3]); // [N, n_token, n_head*d_head] k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0] * k->ne[1], k->ne[2], k->ne[3]); // [N, n_token, n_head*d_head]
return {q, k, v}; return {q, k, v};
} }
struct ggml_tensor* post_attention(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* post_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
GGML_ASSERT(!pre_only); GGML_ASSERT(!pre_only);
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]); auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
@ -206,12 +208,11 @@ public:
} }
// x: [N, n_token, dim] // x: [N, n_token, dim]
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x) { struct ggml_tensor* x) {
auto qkv = pre_attention(ctx, x); auto qkv = pre_attention(ctx, x);
x = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, true); // [N, n_token, dim] x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, dim]
x = post_attention(ctx, x); // [N, n_token, dim] x = post_attention(ctx, x); // [N, n_token, dim]
return x; return x;
} }
}; };
@ -236,7 +237,6 @@ public:
int64_t num_heads; int64_t num_heads;
bool pre_only; bool pre_only;
bool self_attn; bool self_attn;
bool flash_attn;
public: public:
DismantledBlock(int64_t hidden_size, DismantledBlock(int64_t hidden_size,
@ -245,17 +245,16 @@ public:
std::string qk_norm = "", std::string qk_norm = "",
bool qkv_bias = false, bool qkv_bias = false,
bool pre_only = false, bool pre_only = false,
bool self_attn = false, bool self_attn = false)
bool flash_attn = false)
: num_heads(num_heads), pre_only(pre_only), self_attn(self_attn) { : num_heads(num_heads), pre_only(pre_only), self_attn(self_attn) {
// rmsnorm is always Flase // rmsnorm is always Flase
// scale_mod_only is always Flase // scale_mod_only is always Flase
// swiglu is always Flase // swiglu is always Flase
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false)); blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
blocks["attn"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, pre_only, flash_attn)); blocks["attn"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, pre_only));
if (self_attn) { if (self_attn) {
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, false, flash_attn)); blocks["attn2"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, false));
} }
if (!pre_only) { if (!pre_only) {
@ -274,9 +273,9 @@ public:
blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, n_mods * hidden_size)); blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, n_mods * hidden_size));
} }
std::tuple<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention_x(struct ggml_context* ctx, std::tuple<std::vector<ggml_tensor*>, std::vector<ggml_tensor*>, std::vector<ggml_tensor*>> pre_attention_x(GGMLRunnerContext* ctx,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* c) { struct ggml_tensor* c) {
GGML_ASSERT(self_attn); GGML_ASSERT(self_attn);
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
// c: [N, hidden_size] // c: [N, hidden_size]
@ -286,35 +285,35 @@ public:
auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]); auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
int64_t n_mods = 9; int64_t n_mods = 9;
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c)); // [N, n_mods * hidden_size] auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, n_mods * hidden_size]
m = ggml_reshape_3d(ctx, m, c->ne[0], n_mods, c->ne[1]); // [N, n_mods, hidden_size] m = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], n_mods, c->ne[1]); // [N, n_mods, hidden_size]
m = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3)); // [n_mods, N, hidden_size] m = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3)); // [n_mods, N, hidden_size]
int64_t offset = m->nb[1] * m->ne[1]; int64_t offset = m->nb[1] * m->ne[1];
auto shift_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size] auto shift_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
auto scale_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size] auto scale_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
auto gate_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2); // [N, hidden_size] auto gate_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2); // [N, hidden_size]
auto shift_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3); // [N, hidden_size] auto shift_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3); // [N, hidden_size]
auto scale_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4); // [N, hidden_size] auto scale_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4); // [N, hidden_size]
auto gate_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5); // [N, hidden_size] auto gate_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5); // [N, hidden_size]
auto shift_msa2 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 6); // [N, hidden_size] auto shift_msa2 = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 6); // [N, hidden_size]
auto scale_msa2 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 7); // [N, hidden_size] auto scale_msa2 = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 7); // [N, hidden_size]
auto gate_msa2 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 8); // [N, hidden_size] auto gate_msa2 = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 8); // [N, hidden_size]
auto x_norm = norm1->forward(ctx, x); auto x_norm = norm1->forward(ctx, x);
auto attn_in = modulate(ctx, x_norm, shift_msa, scale_msa); auto attn_in = modulate(ctx->ggml_ctx, x_norm, shift_msa, scale_msa);
auto qkv = attn->pre_attention(ctx, attn_in); auto qkv = attn->pre_attention(ctx, attn_in);
auto attn2_in = modulate(ctx, x_norm, shift_msa2, scale_msa2); auto attn2_in = modulate(ctx->ggml_ctx, x_norm, shift_msa2, scale_msa2);
auto qkv2 = attn2->pre_attention(ctx, attn2_in); auto qkv2 = attn2->pre_attention(ctx, attn2_in);
return {qkv, qkv2, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2}}; return {qkv, qkv2, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2}};
} }
std::pair<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention(struct ggml_context* ctx, std::pair<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention(GGMLRunnerContext* ctx,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* c) { struct ggml_tensor* c) {
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
@ -327,33 +326,33 @@ public:
if (pre_only) { if (pre_only) {
n_mods = 2; n_mods = 2;
} }
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c)); // [N, n_mods * hidden_size] auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, n_mods * hidden_size]
m = ggml_reshape_3d(ctx, m, c->ne[0], n_mods, c->ne[1]); // [N, n_mods, hidden_size] m = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], n_mods, c->ne[1]); // [N, n_mods, hidden_size]
m = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3)); // [n_mods, N, hidden_size] m = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3)); // [n_mods, N, hidden_size]
int64_t offset = m->nb[1] * m->ne[1]; int64_t offset = m->nb[1] * m->ne[1];
auto shift_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size] auto shift_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
auto scale_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size] auto scale_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
if (!pre_only) { if (!pre_only) {
auto gate_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2); // [N, hidden_size] auto gate_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2); // [N, hidden_size]
auto shift_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3); // [N, hidden_size] auto shift_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3); // [N, hidden_size]
auto scale_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4); // [N, hidden_size] auto scale_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4); // [N, hidden_size]
auto gate_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5); // [N, hidden_size] auto gate_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5); // [N, hidden_size]
auto attn_in = modulate(ctx, norm1->forward(ctx, x), shift_msa, scale_msa); auto attn_in = modulate(ctx->ggml_ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
auto qkv = attn->pre_attention(ctx, attn_in); auto qkv = attn->pre_attention(ctx, attn_in);
return {qkv, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp}}; return {qkv, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp}};
} else { } else {
auto attn_in = modulate(ctx, norm1->forward(ctx, x), shift_msa, scale_msa); auto attn_in = modulate(ctx->ggml_ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
auto qkv = attn->pre_attention(ctx, attn_in); auto qkv = attn->pre_attention(ctx, attn_in);
return {qkv, {nullptr, nullptr, nullptr, nullptr, nullptr}}; return {qkv, {nullptr, nullptr, nullptr, nullptr, nullptr}};
} }
} }
struct ggml_tensor* post_attention_x(struct ggml_context* ctx, struct ggml_tensor* post_attention_x(GGMLRunnerContext* ctx,
struct ggml_tensor* attn_out, struct ggml_tensor* attn_out,
struct ggml_tensor* attn2_out, struct ggml_tensor* attn2_out,
struct ggml_tensor* x, struct ggml_tensor* x,
@ -376,22 +375,22 @@ public:
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]); auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
auto mlp = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]); auto mlp = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]);
gate_msa = ggml_reshape_3d(ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]); // [N, 1, hidden_size] gate_msa = ggml_reshape_3d(ctx->ggml_ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]); // [N, 1, hidden_size]
gate_mlp = ggml_reshape_3d(ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]); // [N, 1, hidden_size] gate_mlp = ggml_reshape_3d(ctx->ggml_ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]); // [N, 1, hidden_size]
gate_msa2 = ggml_reshape_3d(ctx, gate_msa2, gate_msa2->ne[0], 1, gate_msa2->ne[1]); // [N, 1, hidden_size] gate_msa2 = ggml_reshape_3d(ctx->ggml_ctx, gate_msa2, gate_msa2->ne[0], 1, gate_msa2->ne[1]); // [N, 1, hidden_size]
attn_out = attn->post_attention(ctx, attn_out); attn_out = attn->post_attention(ctx, attn_out);
attn2_out = attn2->post_attention(ctx, attn2_out); attn2_out = attn2->post_attention(ctx, attn2_out);
x = ggml_add(ctx, x, ggml_mul(ctx, attn_out, gate_msa)); x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out, gate_msa));
x = ggml_add(ctx, x, ggml_mul(ctx, attn2_out, gate_msa2)); x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn2_out, gate_msa2));
auto mlp_out = mlp->forward(ctx, modulate(ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp)); auto mlp_out = mlp->forward(ctx, modulate(ctx->ggml_ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
x = ggml_add(ctx, x, ggml_mul(ctx, mlp_out, gate_mlp)); x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, mlp_out, gate_mlp));
return x; return x;
} }
struct ggml_tensor* post_attention(struct ggml_context* ctx, struct ggml_tensor* post_attention(GGMLRunnerContext* ctx,
struct ggml_tensor* attn_out, struct ggml_tensor* attn_out,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* gate_msa, struct ggml_tensor* gate_msa,
@ -411,20 +410,19 @@ public:
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]); auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
auto mlp = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]); auto mlp = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]);
gate_msa = ggml_reshape_3d(ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]); // [N, 1, hidden_size] gate_msa = ggml_reshape_3d(ctx->ggml_ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]); // [N, 1, hidden_size]
gate_mlp = ggml_reshape_3d(ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]); // [N, 1, hidden_size] gate_mlp = ggml_reshape_3d(ctx->ggml_ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]); // [N, 1, hidden_size]
attn_out = attn->post_attention(ctx, attn_out); attn_out = attn->post_attention(ctx, attn_out);
x = ggml_add(ctx, x, ggml_mul(ctx, attn_out, gate_msa)); x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out, gate_msa));
auto mlp_out = mlp->forward(ctx, modulate(ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp)); auto mlp_out = mlp->forward(ctx, modulate(ctx->ggml_ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
x = ggml_add(ctx, x, ggml_mul(ctx, mlp_out, gate_mlp)); x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, mlp_out, gate_mlp));
return x; return x;
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* c) { struct ggml_tensor* c) {
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
@ -441,8 +439,8 @@ public:
auto qkv2 = std::get<1>(qkv_intermediates); auto qkv2 = std::get<1>(qkv_intermediates);
auto intermediates = std::get<2>(qkv_intermediates); auto intermediates = std::get<2>(qkv_intermediates);
auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, flash_attn); // [N, n_token, dim] auto attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, dim]
auto attn2_out = ggml_nn_attention_ext(ctx, backend, qkv2[0], qkv2[1], qkv2[2], num_heads, nullptr, false, false, flash_attn); // [N, n_token, dim] auto attn2_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv2[0], qkv2[1], qkv2[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, dim]
x = post_attention_x(ctx, x = post_attention_x(ctx,
attn_out, attn_out,
attn2_out, attn2_out,
@ -458,7 +456,7 @@ public:
auto qkv = qkv_intermediates.first; auto qkv = qkv_intermediates.first;
auto intermediates = qkv_intermediates.second; auto intermediates = qkv_intermediates.second;
auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, flash_attn); // [N, n_token, dim] auto attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, dim]
x = post_attention(ctx, x = post_attention(ctx,
attn_out, attn_out,
intermediates[0], intermediates[0],
@ -472,9 +470,7 @@ public:
}; };
__STATIC_INLINE__ std::pair<struct ggml_tensor*, struct ggml_tensor*> __STATIC_INLINE__ std::pair<struct ggml_tensor*, struct ggml_tensor*>
block_mixing(struct ggml_context* ctx, block_mixing(GGMLRunnerContext* ctx,
ggml_backend_t backend,
bool flash_attn,
struct ggml_tensor* context, struct ggml_tensor* context,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* c, struct ggml_tensor* c,
@ -501,29 +497,29 @@ block_mixing(struct ggml_context* ctx,
} }
std::vector<struct ggml_tensor*> qkv; std::vector<struct ggml_tensor*> qkv;
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
qkv.push_back(ggml_concat(ctx, context_qkv[i], x_qkv[i], 1)); qkv.push_back(ggml_concat(ctx->ggml_ctx, context_qkv[i], x_qkv[i], 1));
} }
auto attn = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, nullptr, false, false, flash_attn); // [N, n_context + n_token, hidden_size] auto attn = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_context + n_token, hidden_size]
attn = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3)); // [n_context + n_token, N, hidden_size] attn = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, attn, 0, 2, 1, 3)); // [n_context + n_token, N, hidden_size]
auto context_attn = ggml_view_3d(ctx, auto context_attn = ggml_view_3d(ctx->ggml_ctx,
attn, attn,
attn->ne[0], attn->ne[0],
attn->ne[1], attn->ne[1],
context->ne[1], context->ne[1],
attn->nb[1], attn->nb[1],
attn->nb[2], attn->nb[2],
0); // [n_context, N, hidden_size] 0); // [n_context, N, hidden_size]
context_attn = ggml_cont(ctx, ggml_permute(ctx, context_attn, 0, 2, 1, 3)); // [N, n_context, hidden_size] context_attn = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, context_attn, 0, 2, 1, 3)); // [N, n_context, hidden_size]
auto x_attn = ggml_view_3d(ctx, auto x_attn = ggml_view_3d(ctx->ggml_ctx,
attn, attn,
attn->ne[0], attn->ne[0],
attn->ne[1], attn->ne[1],
x->ne[1], x->ne[1],
attn->nb[1], attn->nb[1],
attn->nb[2], attn->nb[2],
attn->nb[2] * context->ne[1]); // [n_token, N, hidden_size] attn->nb[2] * context->ne[1]); // [n_token, N, hidden_size]
x_attn = ggml_cont(ctx, ggml_permute(ctx, x_attn, 0, 2, 1, 3)); // [N, n_token, hidden_size] x_attn = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x_attn, 0, 2, 1, 3)); // [N, n_token, hidden_size]
if (!context_block->pre_only) { if (!context_block->pre_only) {
context = context_block->post_attention(ctx, context = context_block->post_attention(ctx,
@ -538,7 +534,7 @@ block_mixing(struct ggml_context* ctx,
} }
if (x_block->self_attn) { if (x_block->self_attn) {
auto attn2 = ggml_nn_attention_ext(ctx, backend, x_qkv2[0], x_qkv2[1], x_qkv2[2], x_block->num_heads); // [N, n_token, hidden_size] auto attn2 = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, x_qkv2[0], x_qkv2[1], x_qkv2[2], x_block->num_heads, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, hidden_size]
x = x_block->post_attention_x(ctx, x = x_block->post_attention_x(ctx,
x_attn, x_attn,
@ -563,8 +559,6 @@ block_mixing(struct ggml_context* ctx,
} }
struct JointBlock : public GGMLBlock { struct JointBlock : public GGMLBlock {
bool flash_attn;
public: public:
JointBlock(int64_t hidden_size, JointBlock(int64_t hidden_size,
int64_t num_heads, int64_t num_heads,
@ -572,22 +566,19 @@ public:
std::string qk_norm = "", std::string qk_norm = "",
bool qkv_bias = false, bool qkv_bias = false,
bool pre_only = false, bool pre_only = false,
bool self_attn_x = false, bool self_attn_x = false) {
bool flash_attn = false) blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, pre_only, false));
: flash_attn(flash_attn) { blocks["x_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x));
blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, pre_only, false, flash_attn));
blocks["x_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x, flash_attn));
} }
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx, std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* context, struct ggml_tensor* context,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* c) { struct ggml_tensor* c) {
auto context_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["context_block"]); auto context_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["context_block"]);
auto x_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["x_block"]); auto x_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["x_block"]);
return block_mixing(ctx, backend, flash_attn, context, x, c, context_block, x_block); return block_mixing(ctx, context, x, c, context_block, x_block);
} }
}; };
@ -603,7 +594,7 @@ public:
blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size)); blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* c) { struct ggml_tensor* c) {
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
@ -613,15 +604,15 @@ public:
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]); auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]); auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c)); // [N, 2 * hidden_size] auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, 2 * hidden_size]
m = ggml_reshape_3d(ctx, m, c->ne[0], 2, c->ne[1]); // [N, 2, hidden_size] m = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], 2, c->ne[1]); // [N, 2, hidden_size]
m = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3)); // [2, N, hidden_size] m = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3)); // [2, N, hidden_size]
int64_t offset = m->nb[1] * m->ne[1]; int64_t offset = m->nb[1] * m->ne[1];
auto shift = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size] auto shift = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
auto scale = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size] auto scale = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
x = modulate(ctx, norm_final->forward(ctx, x), shift, scale); x = modulate(ctx->ggml_ctx, norm_final->forward(ctx, x), shift, scale);
x = linear->forward(ctx, x); x = linear->forward(ctx, x);
return x; return x;
@ -645,16 +636,14 @@ protected:
int64_t context_embedder_out_dim = 1536; int64_t context_embedder_out_dim = 1536;
int64_t hidden_size; int64_t hidden_size;
std::string qk_norm; std::string qk_norm;
bool flash_attn = false;
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") override { void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
enum ggml_type wtype = GGML_TYPE_F32; enum ggml_type wtype = GGML_TYPE_F32;
params["pos_embed"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1); params["pos_embed"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1);
} }
public: public:
MMDiT(bool flash_attn = false, const String2GGMLType& tensor_types = {}) MMDiT(const String2TensorStorage& tensor_storage_map = {}) {
: flash_attn(flash_attn) {
// input_size is always None // input_size is always None
// learn_sigma is always False // learn_sigma is always False
// register_length is alwalys 0 // register_length is alwalys 0
@ -667,8 +656,7 @@ public:
// pos_embed_offset is not used // pos_embed_offset is not used
// context_embedder_config is always {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}} // context_embedder_config is always {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}}
// read tensors from tensor_types for (auto pair : tensor_storage_map) {
for (auto pair : tensor_types) {
std::string tensor_name = pair.first; std::string tensor_name = pair.first;
if (tensor_name.find("model.diffusion_model.") == std::string::npos) if (tensor_name.find("model.diffusion_model.") == std::string::npos)
continue; continue;
@ -722,8 +710,7 @@ public:
qk_norm, qk_norm,
true, true,
i == depth - 1, i == depth - 1,
i <= d_self, i <= d_self));
flash_attn));
} }
blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(hidden_size, patch_size, out_channels)); blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(hidden_size, patch_size, out_channels));
@ -791,8 +778,7 @@ public:
return x; return x;
} }
struct ggml_tensor* forward_core_with_concat(struct ggml_context* ctx, struct ggml_tensor* forward_core_with_concat(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* c_mod, struct ggml_tensor* c_mod,
struct ggml_tensor* context, struct ggml_tensor* context,
@ -811,7 +797,7 @@ public:
auto block = std::dynamic_pointer_cast<JointBlock>(blocks["joint_blocks." + std::to_string(i)]); auto block = std::dynamic_pointer_cast<JointBlock>(blocks["joint_blocks." + std::to_string(i)]);
auto context_x = block->forward(ctx, backend, context, x, c_mod); auto context_x = block->forward(ctx, context, x, c_mod);
context = context_x.first; context = context_x.first;
x = context_x.second; x = context_x.second;
} }
@ -821,8 +807,7 @@ public:
return x; return x;
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* t, struct ggml_tensor* t,
struct ggml_tensor* y = nullptr, struct ggml_tensor* y = nullptr,
@ -840,16 +825,16 @@ public:
int64_t w = x->ne[0]; int64_t w = x->ne[0];
int64_t h = x->ne[1]; int64_t h = x->ne[1];
auto patch_embed = x_embedder->forward(ctx, x); // [N, H*W, hidden_size] auto patch_embed = x_embedder->forward(ctx, x); // [N, H*W, hidden_size]
auto pos_embed = cropped_pos_embed(ctx, h, w); // [1, H*W, hidden_size] auto pos_embed = cropped_pos_embed(ctx->ggml_ctx, h, w); // [1, H*W, hidden_size]
x = ggml_add(ctx, patch_embed, pos_embed); // [N, H*W, hidden_size] x = ggml_add(ctx->ggml_ctx, patch_embed, pos_embed); // [N, H*W, hidden_size]
auto c = t_embedder->forward(ctx, t); // [N, hidden_size] auto c = t_embedder->forward(ctx, t); // [N, hidden_size]
if (y != nullptr && adm_in_channels != -1) { if (y != nullptr && adm_in_channels != -1) {
auto y_embedder = std::dynamic_pointer_cast<VectorEmbedder>(blocks["y_embedder"]); auto y_embedder = std::dynamic_pointer_cast<VectorEmbedder>(blocks["y_embedder"]);
y = y_embedder->forward(ctx, y); // [N, hidden_size] y = y_embedder->forward(ctx, y); // [N, hidden_size]
c = ggml_add(ctx, c, y); c = ggml_add(ctx->ggml_ctx, c, y);
} }
if (context != nullptr) { if (context != nullptr) {
@ -858,9 +843,9 @@ public:
context = context_embedder->forward(ctx, context); // [N, L, D] aka [N, L, 1536] context = context_embedder->forward(ctx, context); // [N, L, D] aka [N, L, 1536]
} }
x = forward_core_with_concat(ctx, backend, x, c, context, skip_layers); // (N, H*W, patch_size ** 2 * out_channels) x = forward_core_with_concat(ctx, x, c, context, skip_layers); // (N, H*W, patch_size ** 2 * out_channels)
x = unpatchify(ctx, x, h, w); // [N, C, H, W] x = unpatchify(ctx->ggml_ctx, x, h, w); // [N, C, H, W]
return x; return x;
} }
@ -870,11 +855,10 @@ struct MMDiTRunner : public GGMLRunner {
MMDiTRunner(ggml_backend_t backend, MMDiTRunner(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
bool flash_attn, const String2TensorStorage& tensor_storage_map = {},
const String2GGMLType& tensor_types = {}, const std::string prefix = "")
const std::string prefix = "") : GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_storage_map) {
: GGMLRunner(backend, offload_params_to_cpu), mmdit(flash_attn, tensor_types) { mmdit.init(params_ctx, tensor_storage_map, prefix);
mmdit.init(params_ctx, tensor_types, prefix);
} }
std::string get_desc() override { std::string get_desc() override {
@ -890,15 +874,15 @@ struct MMDiTRunner : public GGMLRunner {
struct ggml_tensor* context, struct ggml_tensor* context,
struct ggml_tensor* y, struct ggml_tensor* y,
std::vector<int> skip_layers = std::vector<int>()) { std::vector<int> skip_layers = std::vector<int>()) {
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, MMDIT_GRAPH_SIZE, false); struct ggml_cgraph* gf = new_graph_custom(MMDIT_GRAPH_SIZE);
x = to_backend(x); x = to_backend(x);
context = to_backend(context); context = to_backend(context);
y = to_backend(y); y = to_backend(y);
timesteps = to_backend(timesteps); timesteps = to_backend(timesteps);
struct ggml_tensor* out = mmdit.forward(compute_ctx, auto runner_ctx = get_context();
runtime_backend, struct ggml_tensor* out = mmdit.forward(&runner_ctx,
x, x,
timesteps, timesteps,
y, y,
@ -910,7 +894,7 @@ struct MMDiTRunner : public GGMLRunner {
return gf; return gf;
} }
void compute(int n_threads, bool compute(int n_threads,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* timesteps, struct ggml_tensor* timesteps,
struct ggml_tensor* context, struct ggml_tensor* context,
@ -926,7 +910,7 @@ struct MMDiTRunner : public GGMLRunner {
return build_graph(x, timesteps, context, y, skip_layers); return build_graph(x, timesteps, context, y, skip_layers);
}; };
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
} }
void test() { void test() {
@ -972,7 +956,7 @@ struct MMDiTRunner : public GGMLRunner {
// ggml_backend_t backend = ggml_backend_cuda_init(0); // ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init(); ggml_backend_t backend = ggml_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_F16; ggml_type model_data_type = GGML_TYPE_F16;
std::shared_ptr<MMDiTRunner> mmdit = std::make_shared<MMDiTRunner>(backend, false, false); std::shared_ptr<MMDiTRunner> mmdit = std::make_shared<MMDiTRunner>(backend, false);
{ {
LOG_INFO("loading from '%s'", file_path.c_str()); LOG_INFO("loading from '%s'", file_path.c_str());
@ -981,7 +965,7 @@ struct MMDiTRunner : public GGMLRunner {
mmdit->get_param_tensors(tensors, "model.diffusion_model"); mmdit->get_param_tensors(tensors, "model.diffusion_model");
ModelLoader model_loader; ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) { if (!model_loader.init_from_file_and_convert_name(file_path)) {
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
return; return;
} }

1228
model.cpp

File diff suppressed because it is too large Load Diff

79
model.h
View File

@ -15,6 +15,7 @@
#include "ggml.h" #include "ggml.h"
#include "gguf.h" #include "gguf.h"
#include "json.hpp" #include "json.hpp"
#include "ordered_map.hpp"
#include "zip.h" #include "zip.h"
#define SD_MAX_DIMS 5 #define SD_MAX_DIMS 5
@ -26,6 +27,7 @@ enum SDVersion {
VERSION_SD1_TINY_UNET, VERSION_SD1_TINY_UNET,
VERSION_SD2, VERSION_SD2,
VERSION_SD2_INPAINT, VERSION_SD2_INPAINT,
VERSION_SD2_TINY_UNET,
VERSION_SDXL, VERSION_SDXL,
VERSION_SDXL_INPAINT, VERSION_SDXL_INPAINT,
VERSION_SDXL_PIX2PIX, VERSION_SDXL_PIX2PIX,
@ -36,10 +38,14 @@ enum SDVersion {
VERSION_FLUX_FILL, VERSION_FLUX_FILL,
VERSION_FLUX_CONTROLS, VERSION_FLUX_CONTROLS,
VERSION_FLEX_2, VERSION_FLEX_2,
VERSION_CHROMA_RADIANCE,
VERSION_WAN2, VERSION_WAN2,
VERSION_WAN2_2_I2V, VERSION_WAN2_2_I2V,
VERSION_WAN2_2_TI2V, VERSION_WAN2_2_TI2V,
VERSION_QWEN_IMAGE, VERSION_QWEN_IMAGE,
VERSION_FLUX2,
VERSION_Z_IMAGE,
VERSION_OVIS_IMAGE,
VERSION_COUNT, VERSION_COUNT,
}; };
@ -51,7 +57,7 @@ static inline bool sd_version_is_sd1(SDVersion version) {
} }
static inline bool sd_version_is_sd2(SDVersion version) { static inline bool sd_version_is_sd2(SDVersion version) {
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT) { if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
return true; return true;
} }
return false; return false;
@ -64,6 +70,15 @@ static inline bool sd_version_is_sdxl(SDVersion version) {
return false; return false;
} }
static inline bool sd_version_is_unet(SDVersion version) {
if (sd_version_is_sd1(version) ||
sd_version_is_sd2(version) ||
sd_version_is_sdxl(version)) {
return true;
}
return false;
}
static inline bool sd_version_is_sd3(SDVersion version) { static inline bool sd_version_is_sd3(SDVersion version) {
if (version == VERSION_SD3) { if (version == VERSION_SD3) {
return true; return true;
@ -72,7 +87,19 @@ static inline bool sd_version_is_sd3(SDVersion version) {
} }
static inline bool sd_version_is_flux(SDVersion version) { static inline bool sd_version_is_flux(SDVersion version) {
if (version == VERSION_FLUX || version == VERSION_FLUX_FILL || version == VERSION_FLUX_CONTROLS || version == VERSION_FLEX_2) { if (version == VERSION_FLUX ||
version == VERSION_FLUX_FILL ||
version == VERSION_FLUX_CONTROLS ||
version == VERSION_FLEX_2 ||
version == VERSION_OVIS_IMAGE ||
version == VERSION_CHROMA_RADIANCE) {
return true;
}
return false;
}
static inline bool sd_version_is_flux2(SDVersion version) {
if (version == VERSION_FLUX2) {
return true; return true;
} }
return false; return false;
@ -92,8 +119,19 @@ static inline bool sd_version_is_qwen_image(SDVersion version) {
return false; return false;
} }
static inline bool sd_version_is_z_image(SDVersion version) {
if (version == VERSION_Z_IMAGE) {
return true;
}
return false;
}
static inline bool sd_version_is_inpaint(SDVersion version) { static inline bool sd_version_is_inpaint(SDVersion version) {
if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_FILL || version == VERSION_FLEX_2) { if (version == VERSION_SD1_INPAINT ||
version == VERSION_SD2_INPAINT ||
version == VERSION_SDXL_INPAINT ||
version == VERSION_FLUX_FILL ||
version == VERSION_FLEX_2) {
return true; return true;
} }
return false; return false;
@ -101,9 +139,11 @@ static inline bool sd_version_is_inpaint(SDVersion version) {
static inline bool sd_version_is_dit(SDVersion version) { static inline bool sd_version_is_dit(SDVersion version) {
if (sd_version_is_flux(version) || if (sd_version_is_flux(version) ||
sd_version_is_flux2(version) ||
sd_version_is_sd3(version) || sd_version_is_sd3(version) ||
sd_version_is_wan(version) || sd_version_is_wan(version) ||
sd_version_is_qwen_image(version)) { sd_version_is_qwen_image(version) ||
sd_version_is_z_image(version)) {
return true; return true;
} }
return false; return false;
@ -129,7 +169,7 @@ enum PMVersion {
struct TensorStorage { struct TensorStorage {
std::string name; std::string name;
ggml_type type = GGML_TYPE_F32; ggml_type type = GGML_TYPE_F32;
bool is_bf16 = false; ggml_type expected_type = GGML_TYPE_COUNT;
bool is_f8_e4m3 = false; bool is_f8_e4m3 = false;
bool is_f8_e5m2 = false; bool is_f8_e5m2 = false;
bool is_f64 = false; bool is_f64 = false;
@ -163,7 +203,7 @@ struct TensorStorage {
} }
int64_t nbytes_to_read() const { int64_t nbytes_to_read() const {
if (is_bf16 || is_f8_e4m3 || is_f8_e5m2) { if (is_f8_e4m3 || is_f8_e5m2) {
return nbytes() / 2; return nbytes() / 2;
} else if (is_f64 || is_i64) { } else if (is_f64 || is_i64) {
return nbytes() * 2; return nbytes() * 2;
@ -211,9 +251,7 @@ struct TensorStorage {
std::string to_string() const { std::string to_string() const {
std::stringstream ss; std::stringstream ss;
const char* type_name = ggml_type_name(type); const char* type_name = ggml_type_name(type);
if (is_bf16) { if (is_f8_e4m3) {
type_name = "bf16";
} else if (is_f8_e4m3) {
type_name = "f8_e4m3"; type_name = "f8_e4m3";
} else if (is_f8_e5m2) { } else if (is_f8_e5m2) {
type_name = "f8_e5m2"; type_name = "f8_e5m2";
@ -237,12 +275,15 @@ struct TensorStorage {
typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t; typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
typedef std::map<std::string, enum ggml_type> String2GGMLType; typedef OrderedMap<std::string, TensorStorage> String2TensorStorage;
class ModelLoader { class ModelLoader {
protected: protected:
SDVersion version_ = VERSION_COUNT;
std::vector<std::string> file_paths_; std::vector<std::string> file_paths_;
std::vector<TensorStorage> tensor_storages; String2TensorStorage tensor_storage_map;
void add_tensor_storage(const TensorStorage& tensor_storage);
bool parse_data_pkl(uint8_t* buffer, bool parse_data_pkl(uint8_t* buffer,
size_t buffer_size, size_t buffer_size,
@ -257,16 +298,18 @@ protected:
bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = ""); bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = "");
public: public:
String2GGMLType tensor_storages_types;
bool init_from_file(const std::string& file_path, const std::string& prefix = ""); bool init_from_file(const std::string& file_path, const std::string& prefix = "");
bool model_is_unet(); void convert_tensors_name();
bool init_from_file_and_convert_name(const std::string& file_path,
const std::string& prefix = "",
SDVersion version = VERSION_COUNT);
SDVersion get_sd_version(); SDVersion get_sd_version();
std::map<ggml_type, uint32_t> get_wtype_stat(); std::map<ggml_type, uint32_t> get_wtype_stat();
std::map<ggml_type, uint32_t> get_conditioner_wtype_stat(); std::map<ggml_type, uint32_t> get_conditioner_wtype_stat();
std::map<ggml_type, uint32_t> get_diffusion_model_wtype_stat(); std::map<ggml_type, uint32_t> get_diffusion_model_wtype_stat();
std::map<ggml_type, uint32_t> get_vae_wtype_stat(); std::map<ggml_type, uint32_t> get_vae_wtype_stat();
void set_wtype_override(ggml_type wtype, std::string prefix = ""); String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0); bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0);
bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors, bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
std::set<std::string> ignore_tensors = {}, std::set<std::string> ignore_tensors = {},
@ -274,8 +317,8 @@ public:
std::vector<std::string> get_tensor_names() const { std::vector<std::string> get_tensor_names() const {
std::vector<std::string> names; std::vector<std::string> names;
for (const auto& ts : tensor_storages) { for (const auto& [name, tensor_storage] : tensor_storage_map) {
names.push_back(ts.name); names.push_back(name);
} }
return names; return names;
} }
@ -287,6 +330,8 @@ public:
static std::string load_merges(); static std::string load_merges();
static std::string load_qwen2_merges(); static std::string load_qwen2_merges();
static std::string load_mistral_merges();
static std::string load_mistral_vocab_json();
static std::string load_t5_tokenizer_json(); static std::string load_t5_tokenizer_json();
static std::string load_umt5_tokenizer_json(); static std::string load_umt5_tokenizer_json();
}; };

1105
name_conversion.cpp Normal file

File diff suppressed because it is too large Load Diff

14
name_conversion.h Normal file
View File

@ -0,0 +1,14 @@
#ifndef __NAME_CONVERSTION_H__
#define __NAME_CONVERSTION_H__
#include <string>
#include "model.h"
// Predicates classifying a tensor name by the model component it belongs to
// (presumably matched against well-known name prefixes; implementations are
// not visible from this header — see name_conversion.cpp).
bool is_cond_stage_model_name(const std::string& name);
bool is_diffusion_model_name(const std::string& name);
bool is_first_stage_model_name(const std::string& name);
// Converts a checkpoint tensor name into the project's canonical internal
// name for the given model version. `name` is taken by value so the
// implementation can rewrite it in place and return it.
std::string convert_tensor_name(std::string name, SDVersion version);
#endif // __NAME_CONVERSTION_H__

177
ordered_map.hpp Normal file
View File

@ -0,0 +1,177 @@
#ifndef __ORDERED_MAP_HPP__
#define __ORDERED_MAP_HPP__
#include <iostream>
#include <list>
#include <string>
#include <unordered_map>
#include <initializer_list>
#include <iterator>
#include <list>
#include <stdexcept>
#include <unordered_map>
#include <utility>
/**
 * A map that preserves insertion order.
 *
 * Layout: elements live in a std::list (data_), which defines iteration
 * order; an unordered_map (index_) maps each key to its list iterator for
 * O(1) average lookup. std::list iterators stay valid across insertion and
 * erasure of *other* elements, which is what makes this scheme sound.
 *
 * Fix vs. the original: the compiler-generated copy operations were wrong.
 * Copying index_ verbatim left the copy holding iterators into the SOURCE
 * object's list, so lookups/erasures on the copy touched the wrong list
 * (and dangled once the source was destroyed). Copy construction and copy
 * assignment now rebuild the index against the freshly copied list. Move
 * construction stays defaulted (list move construction preserves element
 * iterators); move assignment is implemented with swap for the same reason.
 */
template <typename Key, typename T>
class OrderedMap {
public:
    using key_type        = Key;
    using mapped_type     = T;
    using value_type      = std::pair<const Key, T>;
    using list_type       = std::list<value_type>;
    using size_type       = typename list_type::size_type;
    using difference_type = typename list_type::difference_type;
    using iterator        = typename list_type::iterator;
    using const_iterator  = typename list_type::const_iterator;

private:
    list_type data_;                           // element storage, in insertion order
    std::unordered_map<Key, iterator> index_;  // key -> iterator into data_

    // Recompute index_ from data_ (used after copying the element list, so
    // that the index points into *this* object's storage).
    void rebuild_index_() {
        index_.clear();
        for (auto it = data_.begin(); it != data_.end(); ++it) {
            index_.emplace(it->first, it);
        }
    }

public:
    // --- constructors ---
    OrderedMap() = default;

    OrderedMap(std::initializer_list<value_type> init) {
        for (const auto& kv : init)
            insert(kv);
    }

    // Deep copy: copy the list, then rebuild the index. A defaulted copy
    // would share iterators with `other`, which is incorrect.
    OrderedMap(const OrderedMap& other)
        : data_(other.data_) {
        rebuild_index_();
    }

    OrderedMap& operator=(const OrderedMap& other) {
        if (this != &other) {
            // value_type has a const key and is not copy-assignable, so
            // rebuild the list node by node instead of assigning data_.
            data_.clear();
            index_.clear();
            for (const auto& kv : other.data_) {
                data_.push_back(kv);
                index_.emplace(kv.first, std::prev(data_.end()));
            }
        }
        return *this;
    }

    // std::list move construction preserves element iterators, so the moved
    // index_ entries remain valid inside the new object.
    OrderedMap(OrderedMap&&) noexcept = default;

    // Implemented as swap: std::list::swap and unordered_map::swap never
    // invalidate element iterators, so both objects stay self-consistent.
    OrderedMap& operator=(OrderedMap&& other) noexcept {
        if (this != &other) {
            data_.swap(other.data_);
            index_.swap(other.index_);
        }
        return *this;
    }

    // --- element access ---

    // Returns the mapped value; throws std::out_of_range if absent.
    T& at(const Key& key) {
        auto it = index_.find(key);
        if (it == index_.end())
            throw std::out_of_range("OrderedMap::at: key not found");
        return it->second->second;
    }

    const T& at(const Key& key) const {
        auto it = index_.find(key);
        if (it == index_.end())
            throw std::out_of_range("OrderedMap::at: key not found");
        return it->second->second;
    }

    // Returns the mapped value, default-inserting at the end if absent.
    T& operator[](const Key& key) {
        auto it = index_.find(key);
        if (it == index_.end()) {
            data_.emplace_back(key, T{});
            auto iter    = std::prev(data_.end());
            index_[key]  = iter;
            return iter->second;
        }
        return it->second->second;
    }

    // --- iterators (iterate in insertion order) ---
    iterator begin() noexcept { return data_.begin(); }
    const_iterator begin() const noexcept { return data_.begin(); }
    const_iterator cbegin() const noexcept { return data_.cbegin(); }

    iterator end() noexcept { return data_.end(); }
    const_iterator end() const noexcept { return data_.end(); }
    const_iterator cend() const noexcept { return data_.cend(); }

    // --- capacity ---
    bool empty() const noexcept { return data_.empty(); }
    size_type size() const noexcept { return data_.size(); }

    // --- modifiers ---
    void clear() noexcept {
        data_.clear();
        index_.clear();
    }

    // Inserts at the end if the key is new; otherwise leaves the existing
    // element untouched. Returns {iterator, whether an insertion happened}.
    std::pair<iterator, bool> insert(const value_type& value) {
        auto it = index_.find(value.first);
        if (it != index_.end()) {
            return {it->second, false};
        }
        data_.push_back(value);
        auto iter            = std::prev(data_.end());
        index_[value.first]  = iter;
        return {iter, true};
    }

    std::pair<iterator, bool> insert(value_type&& value) {
        auto it = index_.find(value.first);
        if (it != index_.end()) {
            return {it->second, false};
        }
        data_.push_back(std::move(value));
        auto iter           = std::prev(data_.end());
        index_[iter->first] = iter;
        return {iter, true};
    }

    void erase(const Key& key) {
        auto it = index_.find(key);
        if (it != index_.end()) {
            data_.erase(it->second);
            index_.erase(it);
        }
    }

    iterator erase(iterator pos) {
        index_.erase(pos->first);
        return data_.erase(pos);
    }

    // --- lookup ---
    size_type count(const Key& key) const {
        return index_.count(key);
    }

    iterator find(const Key& key) {
        auto it = index_.find(key);
        if (it == index_.end())
            return data_.end();
        return it->second;
    }

    const_iterator find(const Key& key) const {
        auto it = index_.find(key);
        if (it == index_.end())
            return data_.end();
        return it->second;
    }

    bool contains(const Key& key) const {
        return index_.find(key) != index_.end();
    }

    // --- comparison (order-sensitive: compares the element sequences) ---
    bool operator==(const OrderedMap& other) const {
        return data_ == other.data_;
    }

    bool operator!=(const OrderedMap& other) const {
        return !(*this == other);
    }

    // Constructs a value_type from args, then inserts it if the key is new.
    // Note: the element is constructed even when insertion ends up rejected.
    template <typename... Args>
    std::pair<iterator, bool> emplace(Args&&... args) {
        value_type value(std::forward<Args>(args)...);
        auto it = index_.find(value.first);
        if (it != index_.end()) {
            return {it->second, false};
        }
        data_.push_back(std::move(value));
        auto iter           = std::prev(data_.end());
        index_[iter->first] = iter;
        return {iter, true};
    }

    void swap(OrderedMap& other) noexcept {
        data_.swap(other.data_);
        index_.swap(other.index_);
    }
};
#endif // __ORDERED_MAP_HPP__

138
pmid.hpp
View File

@ -21,7 +21,7 @@ public:
blocks["layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(in_dim)); blocks["layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(in_dim));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w] // x: [N, channels, h, w]
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]); auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
@ -29,15 +29,15 @@ public:
auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layernorm"]); auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layernorm"]);
struct ggml_tensor* r = x; struct ggml_tensor* r = x;
// x = ggml_nn_layer_norm(ctx, x, ln_w, ln_b); // x = ggml_ext_layer_norm(ctx, x, ln_w, ln_b);
x = layer_norm->forward(ctx, x); x = layer_norm->forward(ctx, x);
// x = ggml_add(ctx, ggml_mul_mat(ctx, fc1_w, x), fc1_b); // x = ggml_add(ctx, ggml_mul_mat(ctx, fc1_w, x), fc1_b);
x = fc1->forward(ctx, x); x = fc1->forward(ctx, x);
x = ggml_gelu_inplace(ctx, x); x = ggml_gelu_inplace(ctx->ggml_ctx, x);
x = fc2->forward(ctx, x); x = fc2->forward(ctx, x);
// x = ggml_add(ctx, ggml_mul_mat(ctx, fc2_w, x), fc2_b); // x = ggml_add(ctx, ggml_mul_mat(ctx, fc2_w, x), fc2_b);
if (use_residue) if (use_residue)
x = ggml_add(ctx, x, r); x = ggml_add(ctx->ggml_ctx, x, r);
return x; return x;
} }
}; };
@ -54,7 +54,7 @@ public:
blocks["1"] = std::shared_ptr<GGMLBlock>(new Mlp(dim, inner_dim, dim, false)); blocks["1"] = std::shared_ptr<GGMLBlock>(new Mlp(dim, inner_dim, dim, false));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x) { struct ggml_tensor* x) {
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["0"]); auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["0"]);
auto ff = std::dynamic_pointer_cast<Mlp>(blocks["1"]); auto ff = std::dynamic_pointer_cast<Mlp>(blocks["1"]);
@ -100,7 +100,7 @@ public:
ggml_cont(ctx, tli)}; ggml_cont(ctx, tli)};
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* latents) { struct ggml_tensor* latents) {
// x (torch.Tensor): image features // x (torch.Tensor): image features
@ -118,33 +118,33 @@ public:
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]); auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
auto q = to_q->forward(ctx, latents); auto q = to_q->forward(ctx, latents);
auto kv_input = ggml_concat(ctx, x, latents, 1); auto kv_input = ggml_concat(ctx->ggml_ctx, x, latents, 1);
auto to_kv = std::dynamic_pointer_cast<Linear>(blocks["to_kv"]); auto to_kv = std::dynamic_pointer_cast<Linear>(blocks["to_kv"]);
auto kv = to_kv->forward(ctx, kv_input); auto kv = to_kv->forward(ctx, kv_input);
auto k = ggml_view_4d(ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, 0); auto k = ggml_view_4d(ctx->ggml_ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, 0);
auto v = ggml_view_4d(ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, kv->nb[0] * (kv->ne[0] / 2)); auto v = ggml_view_4d(ctx->ggml_ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, kv->nb[0] * (kv->ne[0] / 2));
k = ggml_cont(ctx, k); k = ggml_cont(ctx->ggml_ctx, k);
v = ggml_cont(ctx, v); v = ggml_cont(ctx->ggml_ctx, v);
q = reshape_tensor(ctx, q, heads); q = reshape_tensor(ctx->ggml_ctx, q, heads);
k = reshape_tensor(ctx, k, heads); k = reshape_tensor(ctx->ggml_ctx, k, heads);
v = reshape_tensor(ctx, v, heads); v = reshape_tensor(ctx->ggml_ctx, v, heads);
scale = 1.f / sqrt(sqrt((float)dim_head)); scale = 1.f / sqrt(sqrt((float)dim_head));
k = ggml_scale_inplace(ctx, k, scale); k = ggml_scale_inplace(ctx->ggml_ctx, k, scale);
q = ggml_scale_inplace(ctx, q, scale); q = ggml_scale_inplace(ctx->ggml_ctx, q, scale);
// auto weight = ggml_mul_mat(ctx, q, k); // auto weight = ggml_mul_mat(ctx, q, k);
auto weight = ggml_mul_mat(ctx, k, q); // NOTE order of mul is opposite to pytorch auto weight = ggml_mul_mat(ctx->ggml_ctx, k, q); // NOTE order of mul is opposite to pytorch
// GGML's softmax() is equivalent to pytorch's softmax(x, dim=-1) // GGML's softmax() is equivalent to pytorch's softmax(x, dim=-1)
// in this case, dimension along which Softmax will be computed is the last dim // in this case, dimension along which Softmax will be computed is the last dim
// in torch and the first dim in GGML, consistent with the convention that pytorch's // in torch and the first dim in GGML, consistent with the convention that pytorch's
// last dimension (varying most rapidly) corresponds to GGML's first (varying most rapidly). // last dimension (varying most rapidly) corresponds to GGML's first (varying most rapidly).
// weight = ggml_soft_max(ctx, weight); // weight = ggml_soft_max(ctx, weight);
weight = ggml_soft_max_inplace(ctx, weight); weight = ggml_soft_max_inplace(ctx->ggml_ctx, weight);
v = ggml_cont(ctx, ggml_transpose(ctx, v)); v = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, v));
// auto out = ggml_mul_mat(ctx, weight, v); // auto out = ggml_mul_mat(ctx, weight, v);
auto out = ggml_mul_mat(ctx, v, weight); // NOTE order of mul is opposite to pytorch auto out = ggml_mul_mat(ctx->ggml_ctx, v, weight); // NOTE order of mul is opposite to pytorch
out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3));
out = ggml_reshape_3d(ctx, out, ne[0], ne[1], ggml_nelements(out) / (ne[0] * ne[1])); out = ggml_reshape_3d(ctx->ggml_ctx, out, ne[0], ne[1], ggml_nelements(out) / (ne[0] * ne[1]));
auto to_out = std::dynamic_pointer_cast<Linear>(blocks["to_out"]); auto to_out = std::dynamic_pointer_cast<Linear>(blocks["to_out"]);
out = to_out->forward(ctx, out); out = to_out->forward(ctx, out);
return out; return out;
@ -176,7 +176,7 @@ public:
} }
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* latents, struct ggml_tensor* latents,
struct ggml_tensor* x) { struct ggml_tensor* x) {
// x: [N, channels, h, w] // x: [N, channels, h, w]
@ -191,9 +191,9 @@ public:
name = "layers." + std::to_string(i) + ".1"; name = "layers." + std::to_string(i) + ".1";
auto ff = std::dynamic_pointer_cast<PMFeedForward>(blocks[name]); auto ff = std::dynamic_pointer_cast<PMFeedForward>(blocks[name]);
auto t = attn->forward(ctx, x, latents); auto t = attn->forward(ctx, x, latents);
latents = ggml_add(ctx, t, latents); latents = ggml_add(ctx->ggml_ctx, t, latents);
t = ff->forward(ctx, latents); t = ff->forward(ctx, latents);
latents = ggml_add(ctx, t, latents); latents = ggml_add(ctx->ggml_ctx, t, latents);
} }
latents = proj_out->forward(ctx, latents); latents = proj_out->forward(ctx, latents);
latents = norm_out->forward(ctx, latents); latents = norm_out->forward(ctx, latents);
@ -225,7 +225,7 @@ public:
4)); 4));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* last_hidden_state) { struct ggml_tensor* last_hidden_state) {
// x: [N, channels, h, w] // x: [N, channels, h, w]
@ -235,11 +235,11 @@ public:
x = token_proj->forward(ctx, x); x = token_proj->forward(ctx, x);
int64_t nel = ggml_nelements(x); int64_t nel = ggml_nelements(x);
x = ggml_reshape_3d(ctx, x, cross_attention_dim, num_tokens, nel / (cross_attention_dim * num_tokens)); x = ggml_reshape_3d(ctx->ggml_ctx, x, cross_attention_dim, num_tokens, nel / (cross_attention_dim * num_tokens));
x = token_norm->forward(ctx, x); x = token_norm->forward(ctx, x);
struct ggml_tensor* out = perceiver_resampler->forward(ctx, x, last_hidden_state); struct ggml_tensor* out = perceiver_resampler->forward(ctx, x, last_hidden_state);
if (use_residul) if (use_residul)
out = ggml_add(ctx, x, out); out = ggml_add(ctx->ggml_ctx, x, out);
return out; return out;
} }
}; };
@ -256,24 +256,24 @@ public:
blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(embed_dim)); blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(embed_dim));
} }
struct ggml_tensor* fuse_fn(struct ggml_context* ctx, struct ggml_tensor* fuse_fn(GGMLRunnerContext* ctx,
struct ggml_tensor* prompt_embeds, struct ggml_tensor* prompt_embeds,
struct ggml_tensor* id_embeds) { struct ggml_tensor* id_embeds) {
auto mlp1 = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp1"]); auto mlp1 = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp1"]);
auto mlp2 = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp2"]); auto mlp2 = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp2"]);
auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm"]); auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm"]);
auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds, id_embeds, 0); auto stacked_id_embeds = ggml_concat(ctx->ggml_ctx, prompt_embeds, id_embeds, 0);
stacked_id_embeds = mlp1->forward(ctx, stacked_id_embeds); stacked_id_embeds = mlp1->forward(ctx, stacked_id_embeds);
stacked_id_embeds = ggml_add(ctx, stacked_id_embeds, prompt_embeds); stacked_id_embeds = ggml_add(ctx->ggml_ctx, stacked_id_embeds, prompt_embeds);
stacked_id_embeds = mlp2->forward(ctx, stacked_id_embeds); stacked_id_embeds = mlp2->forward(ctx, stacked_id_embeds);
stacked_id_embeds = layer_norm->forward(ctx, stacked_id_embeds); stacked_id_embeds = layer_norm->forward(ctx, stacked_id_embeds);
return stacked_id_embeds; return stacked_id_embeds;
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* prompt_embeds, struct ggml_tensor* prompt_embeds,
struct ggml_tensor* id_embeds, struct ggml_tensor* id_embeds,
struct ggml_tensor* class_tokens_mask, struct ggml_tensor* class_tokens_mask,
@ -286,25 +286,25 @@ public:
// # slice out the image token embeddings // # slice out the image token embeddings
ggml_set_name(class_tokens_mask_pos, "class_tokens_mask_pos"); ggml_set_name(class_tokens_mask_pos, "class_tokens_mask_pos");
ggml_set_name(prompt_embeds, "prompt_embeds"); ggml_set_name(prompt_embeds, "prompt_embeds");
struct ggml_tensor* image_token_embeds = ggml_get_rows(ctx, prompt_embeds, class_tokens_mask_pos); struct ggml_tensor* image_token_embeds = ggml_get_rows(ctx->ggml_ctx, prompt_embeds, class_tokens_mask_pos);
ggml_set_name(image_token_embeds, "image_token_embeds"); ggml_set_name(image_token_embeds, "image_token_embeds");
valid_id_embeds = ggml_reshape_2d(ctx, valid_id_embeds, valid_id_embeds->ne[0], valid_id_embeds = ggml_reshape_2d(ctx->ggml_ctx, valid_id_embeds, valid_id_embeds->ne[0],
ggml_nelements(valid_id_embeds) / valid_id_embeds->ne[0]); ggml_nelements(valid_id_embeds) / valid_id_embeds->ne[0]);
struct ggml_tensor* stacked_id_embeds = fuse_fn(ctx, image_token_embeds, valid_id_embeds); struct ggml_tensor* stacked_id_embeds = fuse_fn(ctx, image_token_embeds, valid_id_embeds);
if (left && right) { if (left && right) {
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 1); stacked_id_embeds = ggml_concat(ctx->ggml_ctx, left, stacked_id_embeds, 1);
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 1); stacked_id_embeds = ggml_concat(ctx->ggml_ctx, stacked_id_embeds, right, 1);
} else if (left) { } else if (left) {
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 1); stacked_id_embeds = ggml_concat(ctx->ggml_ctx, left, stacked_id_embeds, 1);
} else if (right) { } else if (right) {
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 1); stacked_id_embeds = ggml_concat(ctx->ggml_ctx, stacked_id_embeds, right, 1);
} }
class_tokens_mask = ggml_cont(ctx, ggml_transpose(ctx, class_tokens_mask)); class_tokens_mask = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, class_tokens_mask));
class_tokens_mask = ggml_repeat(ctx, class_tokens_mask, prompt_embeds); class_tokens_mask = ggml_repeat(ctx->ggml_ctx, class_tokens_mask, prompt_embeds);
prompt_embeds = ggml_mul(ctx, prompt_embeds, class_tokens_mask); prompt_embeds = ggml_mul(ctx->ggml_ctx, prompt_embeds, class_tokens_mask);
struct ggml_tensor* updated_prompt_embeds = ggml_add(ctx, prompt_embeds, stacked_id_embeds); struct ggml_tensor* updated_prompt_embeds = ggml_add(ctx->ggml_ctx, prompt_embeds, stacked_id_embeds);
ggml_set_name(updated_prompt_embeds, "updated_prompt_embeds"); ggml_set_name(updated_prompt_embeds, "updated_prompt_embeds");
return updated_prompt_embeds; return updated_prompt_embeds;
} }
@ -317,8 +317,7 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
blocks["fuse_module"] = std::shared_ptr<GGMLBlock>(new FuseModule(2048)); blocks["fuse_module"] = std::shared_ptr<GGMLBlock>(new FuseModule(2048));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* id_pixel_values, struct ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds, struct ggml_tensor* prompt_embeds,
struct ggml_tensor* class_tokens_mask, struct ggml_tensor* class_tokens_mask,
@ -331,15 +330,15 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
auto visual_projection_2 = std::dynamic_pointer_cast<Linear>(blocks["visual_projection_2"]); auto visual_projection_2 = std::dynamic_pointer_cast<Linear>(blocks["visual_projection_2"]);
auto fuse_module = std::dynamic_pointer_cast<FuseModule>(blocks["fuse_module"]); auto fuse_module = std::dynamic_pointer_cast<FuseModule>(blocks["fuse_module"]);
struct ggml_tensor* shared_id_embeds = vision_model->forward(ctx, backend, id_pixel_values); // [N, hidden_size] struct ggml_tensor* shared_id_embeds = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size]
struct ggml_tensor* id_embeds = visual_projection->forward(ctx, shared_id_embeds); // [N, proj_dim(768)] struct ggml_tensor* id_embeds = visual_projection->forward(ctx, shared_id_embeds); // [N, proj_dim(768)]
struct ggml_tensor* id_embeds_2 = visual_projection_2->forward(ctx, shared_id_embeds); // [N, 1280] struct ggml_tensor* id_embeds_2 = visual_projection_2->forward(ctx, shared_id_embeds); // [N, 1280]
id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3)); id_embeds = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 2, 0, 1, 3));
id_embeds_2 = ggml_cont(ctx, ggml_permute(ctx, id_embeds_2, 2, 0, 1, 3)); id_embeds_2 = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds_2, 2, 0, 1, 3));
id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2, 2); // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right id_embeds = ggml_concat(ctx->ggml_ctx, id_embeds, id_embeds_2, 2); // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 1, 2, 0, 3)); id_embeds = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 1, 2, 0, 3));
struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx, struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
prompt_embeds, prompt_embeds,
@ -366,8 +365,7 @@ struct PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock : public CLIPVisionMo
num_tokens)); num_tokens));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* id_pixel_values, struct ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds, struct ggml_tensor* prompt_embeds,
struct ggml_tensor* class_tokens_mask, struct ggml_tensor* class_tokens_mask,
@ -381,7 +379,7 @@ struct PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock : public CLIPVisionMo
auto qformer_perceiver = std::dynamic_pointer_cast<QFormerPerceiver>(blocks["qformer_perceiver"]); auto qformer_perceiver = std::dynamic_pointer_cast<QFormerPerceiver>(blocks["qformer_perceiver"]);
// struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size] // struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size]
struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, backend, id_pixel_values, false); // [N, hidden_size] struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values, false); // [N, hidden_size]
id_embeds = qformer_perceiver->forward(ctx, id_embeds, last_hidden_state); id_embeds = qformer_perceiver->forward(ctx, id_embeds, last_hidden_state);
struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx, struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
@ -414,7 +412,7 @@ public:
public: public:
PhotoMakerIDEncoder(ggml_backend_t backend, PhotoMakerIDEncoder(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types, const String2TensorStorage& tensor_storage_map,
const std::string prefix, const std::string prefix,
SDVersion version = VERSION_SDXL, SDVersion version = VERSION_SDXL,
PMVersion pm_v = PM_VERSION_1, PMVersion pm_v = PM_VERSION_1,
@ -424,9 +422,9 @@ public:
pm_version(pm_v), pm_version(pm_v),
style_strength(sty) { style_strength(sty) {
if (pm_version == PM_VERSION_1) { if (pm_version == PM_VERSION_1) {
id_encoder.init(params_ctx, tensor_types, prefix); id_encoder.init(params_ctx, tensor_storage_map, prefix);
} else if (pm_version == PM_VERSION_2) { } else if (pm_version == PM_VERSION_2) {
id_encoder2.init(params_ctx, tensor_types, prefix); id_encoder2.init(params_ctx, tensor_storage_map, prefix);
} }
} }
@ -458,7 +456,7 @@ public:
zeros_right.clear(); zeros_right.clear();
zeros_right_16.clear(); zeros_right_16.clear();
ggml_context* ctx0 = compute_ctx; auto runner_ctx = get_context();
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
@ -466,7 +464,7 @@ public:
int64_t seq_length = prompt_embeds->ne[1]; int64_t seq_length = prompt_embeds->ne[1];
ggml_type type = GGML_TYPE_F32; ggml_type type = GGML_TYPE_F32;
struct ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(ctx0, type, class_tokens_mask.size()); struct ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(runner_ctx.ggml_ctx, type, class_tokens_mask.size());
struct ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values); struct ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values);
struct ggml_tensor* prompt_embeds_d = to_backend(prompt_embeds); struct ggml_tensor* prompt_embeds_d = to_backend(prompt_embeds);
@ -488,16 +486,16 @@ public:
} }
// printf("\n"); // printf("\n");
if (ctmpos[0] > 0) { if (ctmpos[0] > 0) {
// left = ggml_new_tensor_3d(ctx0, type, hidden_size, 1, ctmpos[0]); // left = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type, hidden_size, 1, ctmpos[0]);
left = ggml_new_tensor_3d(ctx0, type, hidden_size, ctmpos[0], 1); left = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type, hidden_size, ctmpos[0], 1);
} }
if (ctmpos[ctmpos.size() - 1] < seq_length - 1) { if (ctmpos[ctmpos.size() - 1] < seq_length - 1) {
// right = ggml_new_tensor_3d(ctx0, type, // right = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type,
// hidden_size, 1, seq_length - ctmpos[ctmpos.size() - 1] - 1); // hidden_size, 1, seq_length - ctmpos[ctmpos.size() - 1] - 1);
right = ggml_new_tensor_3d(ctx0, type, right = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type,
hidden_size, seq_length - ctmpos[ctmpos.size() - 1] - 1, 1); hidden_size, seq_length - ctmpos[ctmpos.size() - 1] - 1, 1);
} }
struct ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ctmpos.size()); struct ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(runner_ctx.ggml_ctx, GGML_TYPE_I32, ctmpos.size());
{ {
if (type == GGML_TYPE_F16) if (type == GGML_TYPE_F16)
@ -530,16 +528,14 @@ public:
} }
struct ggml_tensor* updated_prompt_embeds = nullptr; struct ggml_tensor* updated_prompt_embeds = nullptr;
if (pm_version == PM_VERSION_1) if (pm_version == PM_VERSION_1)
updated_prompt_embeds = id_encoder.forward(ctx0, updated_prompt_embeds = id_encoder.forward(&runner_ctx,
runtime_backend,
id_pixel_values_d, id_pixel_values_d,
prompt_embeds_d, prompt_embeds_d,
class_tokens_mask_d, class_tokens_mask_d,
class_tokens_mask_pos, class_tokens_mask_pos,
left, right); left, right);
else if (pm_version == PM_VERSION_2) else if (pm_version == PM_VERSION_2)
updated_prompt_embeds = id_encoder2.forward(ctx0, updated_prompt_embeds = id_encoder2.forward(&runner_ctx,
runtime_backend,
id_pixel_values_d, id_pixel_values_d,
prompt_embeds_d, prompt_embeds_d,
class_tokens_mask_d, class_tokens_mask_d,
@ -552,7 +548,7 @@ public:
return gf; return gf;
} }
void compute(const int n_threads, bool compute(const int n_threads,
struct ggml_tensor* id_pixel_values, struct ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds, struct ggml_tensor* prompt_embeds,
struct ggml_tensor* id_embeds, struct ggml_tensor* id_embeds,
@ -565,7 +561,7 @@ public:
}; };
// GGMLRunner::compute(get_graph, n_threads, updated_prompt_embeds); // GGMLRunner::compute(get_graph, n_threads, updated_prompt_embeds);
GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx); return GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx);
} }
}; };
@ -582,7 +578,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
const std::string& file_path = "", const std::string& file_path = "",
const std::string& prefix = "") const std::string& prefix = "")
: file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) { : file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) {
if (!model_loader->init_from_file(file_path, prefix)) { if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) {
load_failed = true; load_failed = true;
} }
} }

View File

@ -28,7 +28,7 @@ void gaussian_kernel(struct ggml_tensor* kernel) {
for (int x = 0; x < kernel->ne[1]; x++) { for (int x = 0; x < kernel->ne[1]; x++) {
float gy = -ks_mid + x; float gy = -ks_mid + x;
float k_ = expf(-((gx * gx + gy * gy) / (2.0f * powf(sigma, 2.0f)))) * normal; float k_ = expf(-((gx * gx + gy * gy) / (2.0f * powf(sigma, 2.0f)))) * normal;
ggml_tensor_set_f32(kernel, k_, x, y); ggml_ext_tensor_set_f32(kernel, k_, x, y);
} }
} }
} }
@ -36,11 +36,11 @@ void gaussian_kernel(struct ggml_tensor* kernel) {
void grayscale(struct ggml_tensor* rgb_img, struct ggml_tensor* grayscale) { void grayscale(struct ggml_tensor* rgb_img, struct ggml_tensor* grayscale) {
for (int iy = 0; iy < rgb_img->ne[1]; iy++) { for (int iy = 0; iy < rgb_img->ne[1]; iy++) {
for (int ix = 0; ix < rgb_img->ne[0]; ix++) { for (int ix = 0; ix < rgb_img->ne[0]; ix++) {
float r = ggml_tensor_get_f32(rgb_img, ix, iy); float r = ggml_ext_tensor_get_f32(rgb_img, ix, iy);
float g = ggml_tensor_get_f32(rgb_img, ix, iy, 1); float g = ggml_ext_tensor_get_f32(rgb_img, ix, iy, 1);
float b = ggml_tensor_get_f32(rgb_img, ix, iy, 2); float b = ggml_ext_tensor_get_f32(rgb_img, ix, iy, 2);
float gray = 0.2989f * r + 0.5870f * g + 0.1140f * b; float gray = 0.2989f * r + 0.5870f * g + 0.1140f * b;
ggml_tensor_set_f32(grayscale, gray, ix, iy); ggml_ext_tensor_set_f32(grayscale, gray, ix, iy);
} }
} }
} }
@ -81,37 +81,37 @@ void normalize_tensor(struct ggml_tensor* g) {
void non_max_supression(struct ggml_tensor* result, struct ggml_tensor* G, struct ggml_tensor* D) { void non_max_supression(struct ggml_tensor* result, struct ggml_tensor* G, struct ggml_tensor* D) {
for (int iy = 1; iy < result->ne[1] - 1; iy++) { for (int iy = 1; iy < result->ne[1] - 1; iy++) {
for (int ix = 1; ix < result->ne[0] - 1; ix++) { for (int ix = 1; ix < result->ne[0] - 1; ix++) {
float angle = ggml_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_; float angle = ggml_ext_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_;
angle = angle < 0.0f ? angle += 180.0f : angle; angle = angle < 0.0f ? angle += 180.0f : angle;
float q = 1.0f; float q = 1.0f;
float r = 1.0f; float r = 1.0f;
// angle 0 // angle 0
if ((0 >= angle && angle < 22.5f) || (157.5f >= angle && angle <= 180)) { if ((0 >= angle && angle < 22.5f) || (157.5f >= angle && angle <= 180)) {
q = ggml_tensor_get_f32(G, ix, iy + 1); q = ggml_ext_tensor_get_f32(G, ix, iy + 1);
r = ggml_tensor_get_f32(G, ix, iy - 1); r = ggml_ext_tensor_get_f32(G, ix, iy - 1);
} }
// angle 45 // angle 45
else if (22.5f >= angle && angle < 67.5f) { else if (22.5f >= angle && angle < 67.5f) {
q = ggml_tensor_get_f32(G, ix + 1, iy - 1); q = ggml_ext_tensor_get_f32(G, ix + 1, iy - 1);
r = ggml_tensor_get_f32(G, ix - 1, iy + 1); r = ggml_ext_tensor_get_f32(G, ix - 1, iy + 1);
} }
// angle 90 // angle 90
else if (67.5f >= angle && angle < 112.5) { else if (67.5f >= angle && angle < 112.5) {
q = ggml_tensor_get_f32(G, ix + 1, iy); q = ggml_ext_tensor_get_f32(G, ix + 1, iy);
r = ggml_tensor_get_f32(G, ix - 1, iy); r = ggml_ext_tensor_get_f32(G, ix - 1, iy);
} }
// angle 135 // angle 135
else if (112.5 >= angle && angle < 157.5f) { else if (112.5 >= angle && angle < 157.5f) {
q = ggml_tensor_get_f32(G, ix - 1, iy - 1); q = ggml_ext_tensor_get_f32(G, ix - 1, iy - 1);
r = ggml_tensor_get_f32(G, ix + 1, iy + 1); r = ggml_ext_tensor_get_f32(G, ix + 1, iy + 1);
} }
float cur = ggml_tensor_get_f32(G, ix, iy); float cur = ggml_ext_tensor_get_f32(G, ix, iy);
if ((cur >= q) && (cur >= r)) { if ((cur >= q) && (cur >= r)) {
ggml_tensor_set_f32(result, cur, ix, iy); ggml_ext_tensor_set_f32(result, cur, ix, iy);
} else { } else {
ggml_tensor_set_f32(result, 0.0f, ix, iy); ggml_ext_tensor_set_f32(result, 0.0f, ix, iy);
} }
} }
} }
@ -138,9 +138,9 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo
for (int iy = 0; iy < img->ne[1]; iy++) { for (int iy = 0; iy < img->ne[1]; iy++) {
for (int ix = 0; ix < img->ne[0]; ix++) { for (int ix = 0; ix < img->ne[0]; ix++) {
if (ix >= 3 && ix <= img->ne[0] - 3 && iy >= 3 && iy <= img->ne[1] - 3) { if (ix >= 3 && ix <= img->ne[0] - 3 && iy >= 3 && iy <= img->ne[1] - 3) {
ggml_tensor_set_f32(img, ggml_tensor_get_f32(img, ix, iy), ix, iy); ggml_ext_tensor_set_f32(img, ggml_ext_tensor_get_f32(img, ix, iy), ix, iy);
} else { } else {
ggml_tensor_set_f32(img, 0.0f, ix, iy); ggml_ext_tensor_set_f32(img, 0.0f, ix, iy);
} }
} }
} }
@ -148,14 +148,14 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo
// hysteresis // hysteresis
for (int iy = 1; iy < img->ne[1] - 1; iy++) { for (int iy = 1; iy < img->ne[1] - 1; iy++) {
for (int ix = 1; ix < img->ne[0] - 1; ix++) { for (int ix = 1; ix < img->ne[0] - 1; ix++) {
float imd_v = ggml_tensor_get_f32(img, ix, iy); float imd_v = ggml_ext_tensor_get_f32(img, ix, iy);
if (imd_v == weak) { if (imd_v == weak) {
if (ggml_tensor_get_f32(img, ix + 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix + 1, iy) == strong || if (ggml_ext_tensor_get_f32(img, ix + 1, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix + 1, iy) == strong ||
ggml_tensor_get_f32(img, ix, iy - 1) == strong || ggml_tensor_get_f32(img, ix, iy + 1) == strong || ggml_ext_tensor_get_f32(img, ix, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix, iy + 1) == strong ||
ggml_tensor_get_f32(img, ix - 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix - 1, iy) == strong) { ggml_ext_tensor_get_f32(img, ix - 1, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix - 1, iy) == strong) {
ggml_tensor_set_f32(img, strong, ix, iy); ggml_ext_tensor_set_f32(img, strong, ix, iy);
} else { } else {
ggml_tensor_set_f32(img, 0.0f, ix, iy); ggml_ext_tensor_set_f32(img, 0.0f, ix, iy);
} }
} }
} }
@ -198,7 +198,7 @@ bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold,
struct ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray); struct ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray); struct ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* tetha = ggml_dup_tensor(work_ctx, image_gray); struct ggml_tensor* tetha = ggml_dup_tensor(work_ctx, image_gray);
sd_image_to_tensor(img, image); sd_image_to_ggml_tensor(img, image);
grayscale(image, image_gray); grayscale(image, image_gray);
convolve(image_gray, image_gray, gkernel, 2); convolve(image_gray, image_gray, gkernel, 2);
convolve(image_gray, iX, sf_kx, 1); convolve(image_gray, iX, sf_kx, 1);
@ -211,14 +211,14 @@ bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold,
// to RGB channels // to RGB channels
for (int iy = 0; iy < img.height; iy++) { for (int iy = 0; iy < img.height; iy++) {
for (int ix = 0; ix < img.width; ix++) { for (int ix = 0; ix < img.width; ix++) {
float gray = ggml_tensor_get_f32(image_gray, ix, iy); float gray = ggml_ext_tensor_get_f32(image_gray, ix, iy);
gray = inverse ? 1.0f - gray : gray; gray = inverse ? 1.0f - gray : gray;
ggml_tensor_set_f32(image, gray, ix, iy); ggml_ext_tensor_set_f32(image, gray, ix, iy);
ggml_tensor_set_f32(image, gray, ix, iy, 1); ggml_ext_tensor_set_f32(image, gray, ix, iy, 1);
ggml_tensor_set_f32(image, gray, ix, iy, 2); ggml_ext_tensor_set_f32(image, gray, ix, iy, 2);
} }
} }
sd_tensor_to_image(image, img.data); ggml_tensor_to_sd_image(image, img.data);
ggml_free(work_ctx); ggml_free(work_ctx);
return true; return true;
} }

View File

@ -27,18 +27,18 @@ namespace Qwen {
blocks["linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, out_dim, sample_proj_bias)); blocks["linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, out_dim, sample_proj_bias));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* sample, struct ggml_tensor* sample,
struct ggml_tensor* condition = nullptr) { struct ggml_tensor* condition = nullptr) {
if (condition != nullptr) { if (condition != nullptr) {
auto cond_proj = std::dynamic_pointer_cast<Linear>(blocks["cond_proj"]); auto cond_proj = std::dynamic_pointer_cast<Linear>(blocks["cond_proj"]);
sample = ggml_add(ctx, sample, cond_proj->forward(ctx, condition)); sample = ggml_add(ctx->ggml_ctx, sample, cond_proj->forward(ctx, condition));
} }
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]); auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]); auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
sample = linear_1->forward(ctx, sample); sample = linear_1->forward(ctx, sample);
sample = ggml_silu_inplace(ctx, sample); sample = ggml_silu_inplace(ctx->ggml_ctx, sample);
sample = linear_2->forward(ctx, sample); sample = linear_2->forward(ctx, sample);
return sample; return sample;
} }
@ -50,13 +50,13 @@ namespace Qwen {
blocks["timestep_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedding(256, embedding_dim)); blocks["timestep_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedding(256, embedding_dim));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* timesteps) { struct ggml_tensor* timesteps) {
// timesteps: [N,] // timesteps: [N,]
// return: [N, embedding_dim] // return: [N, embedding_dim]
auto timestep_embedder = std::dynamic_pointer_cast<TimestepEmbedding>(blocks["timestep_embedder"]); auto timestep_embedder = std::dynamic_pointer_cast<TimestepEmbedding>(blocks["timestep_embedder"]);
auto timesteps_proj = ggml_nn_timestep_embedding(ctx, timesteps, 256, 10000, 1.f); auto timesteps_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1.f);
auto timesteps_emb = timestep_embedder->forward(ctx, timesteps_proj); auto timesteps_emb = timestep_embedder->forward(ctx, timesteps_proj);
return timesteps_emb; return timesteps_emb;
} }
@ -65,7 +65,6 @@ namespace Qwen {
struct QwenImageAttention : public GGMLBlock { struct QwenImageAttention : public GGMLBlock {
protected: protected:
int64_t dim_head; int64_t dim_head;
bool flash_attn;
public: public:
QwenImageAttention(int64_t query_dim, QwenImageAttention(int64_t query_dim,
@ -75,9 +74,8 @@ namespace Qwen {
int64_t out_context_dim = 0, int64_t out_context_dim = 0,
bool bias = true, bool bias = true,
bool out_bias = true, bool out_bias = true,
float eps = 1e-6, float eps = 1e-6)
bool flash_attn = false) : dim_head(dim_head) {
: dim_head(dim_head), flash_attn(flash_attn) {
int64_t inner_dim = out_dim > 0 ? out_dim : dim_head * num_heads; int64_t inner_dim = out_dim > 0 ? out_dim : dim_head * num_heads;
out_dim = out_dim > 0 ? out_dim : query_dim; out_dim = out_dim > 0 ? out_dim : query_dim;
out_context_dim = out_context_dim > 0 ? out_context_dim : query_dim; out_context_dim = out_context_dim > 0 ? out_context_dim : query_dim;
@ -96,17 +94,20 @@ namespace Qwen {
blocks["norm_added_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps)); blocks["norm_added_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
blocks["norm_added_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps)); blocks["norm_added_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
float scale = 1.f / 32.f; float scale = 1.f / 32.f;
bool force_prec_f32 = false;
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
// The purpose of the scale here is to prevent NaN issues in certain situations. // The purpose of the scale here is to prevent NaN issues in certain situations.
// For example when using CUDA but the weights are k-quants (not all prompts). // For example when using CUDA but the weights are k-quants (not all prompts).
blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, false, scale)); blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale));
// to_out.1 is nn.Dropout // to_out.1 is nn.Dropout
blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias, false, false, scale)); blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias, false, false, scale));
} }
std::pair<ggml_tensor*, ggml_tensor*> forward(struct ggml_context* ctx, std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* img, struct ggml_tensor* img,
struct ggml_tensor* txt, struct ggml_tensor* txt,
struct ggml_tensor* pe, struct ggml_tensor* pe,
@ -138,49 +139,49 @@ namespace Qwen {
auto img_q = to_q->forward(ctx, img); auto img_q = to_q->forward(ctx, img);
int64_t num_heads = img_q->ne[0] / dim_head; int64_t num_heads = img_q->ne[0] / dim_head;
img_q = ggml_reshape_4d(ctx, img_q, dim_head, num_heads, n_img_token, N); // [N, n_img_token, n_head, d_head] img_q = ggml_reshape_4d(ctx->ggml_ctx, img_q, dim_head, num_heads, n_img_token, N); // [N, n_img_token, n_head, d_head]
auto img_k = to_k->forward(ctx, img); auto img_k = to_k->forward(ctx, img);
img_k = ggml_reshape_4d(ctx, img_k, dim_head, num_heads, n_img_token, N); // [N, n_img_token, n_head, d_head] img_k = ggml_reshape_4d(ctx->ggml_ctx, img_k, dim_head, num_heads, n_img_token, N); // [N, n_img_token, n_head, d_head]
auto img_v = to_v->forward(ctx, img); auto img_v = to_v->forward(ctx, img);
img_v = ggml_reshape_4d(ctx, img_v, dim_head, num_heads, n_img_token, N); // [N, n_img_token, n_head, d_head] img_v = ggml_reshape_4d(ctx->ggml_ctx, img_v, dim_head, num_heads, n_img_token, N); // [N, n_img_token, n_head, d_head]
img_q = norm_q->forward(ctx, img_q); img_q = norm_q->forward(ctx, img_q);
img_k = norm_k->forward(ctx, img_k); img_k = norm_k->forward(ctx, img_k);
auto txt_q = add_q_proj->forward(ctx, txt); auto txt_q = add_q_proj->forward(ctx, txt);
txt_q = ggml_reshape_4d(ctx, txt_q, dim_head, num_heads, n_txt_token, N); // [N, n_txt_token, n_head, d_head] txt_q = ggml_reshape_4d(ctx->ggml_ctx, txt_q, dim_head, num_heads, n_txt_token, N); // [N, n_txt_token, n_head, d_head]
auto txt_k = add_k_proj->forward(ctx, txt); auto txt_k = add_k_proj->forward(ctx, txt);
txt_k = ggml_reshape_4d(ctx, txt_k, dim_head, num_heads, n_txt_token, N); // [N, n_txt_token, n_head, d_head] txt_k = ggml_reshape_4d(ctx->ggml_ctx, txt_k, dim_head, num_heads, n_txt_token, N); // [N, n_txt_token, n_head, d_head]
auto txt_v = add_v_proj->forward(ctx, txt); auto txt_v = add_v_proj->forward(ctx, txt);
txt_v = ggml_reshape_4d(ctx, txt_v, dim_head, num_heads, n_txt_token, N); // [N, n_txt_token, n_head, d_head] txt_v = ggml_reshape_4d(ctx->ggml_ctx, txt_v, dim_head, num_heads, n_txt_token, N); // [N, n_txt_token, n_head, d_head]
txt_q = norm_added_q->forward(ctx, txt_q); txt_q = norm_added_q->forward(ctx, txt_q);
txt_k = norm_added_k->forward(ctx, txt_k); txt_k = norm_added_k->forward(ctx, txt_k);
auto q = ggml_concat(ctx, txt_q, img_q, 2); // [N, n_txt_token + n_img_token, n_head, d_head] auto q = ggml_concat(ctx->ggml_ctx, txt_q, img_q, 2); // [N, n_txt_token + n_img_token, n_head, d_head]
auto k = ggml_concat(ctx, txt_k, img_k, 2); // [N, n_txt_token + n_img_token, n_head, d_head] auto k = ggml_concat(ctx->ggml_ctx, txt_k, img_k, 2); // [N, n_txt_token + n_img_token, n_head, d_head]
auto v = ggml_concat(ctx, txt_v, img_v, 2); // [N, n_txt_token + n_img_token, n_head, d_head] auto v = ggml_concat(ctx->ggml_ctx, txt_v, img_v, 2); // [N, n_txt_token + n_img_token, n_head, d_head]
auto attn = Rope::attention(ctx, backend, q, k, v, pe, mask, flash_attn, (1.0f / 128.f)); // [N, n_txt_token + n_img_token, n_head*d_head] auto attn = Rope::attention(ctx, q, k, v, pe, mask, (1.0f / 128.f)); // [N, n_txt_token + n_img_token, n_head*d_head]
attn = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3)); // [n_txt_token + n_img_token, N, hidden_size] attn = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, attn, 0, 2, 1, 3)); // [n_txt_token + n_img_token, N, hidden_size]
auto txt_attn_out = ggml_view_3d(ctx, auto txt_attn_out = ggml_view_3d(ctx->ggml_ctx,
attn, attn,
attn->ne[0], attn->ne[0],
attn->ne[1], attn->ne[1],
txt->ne[1], txt->ne[1],
attn->nb[1], attn->nb[1],
attn->nb[2], attn->nb[2],
0); // [n_txt_token, N, hidden_size] 0); // [n_txt_token, N, hidden_size]
txt_attn_out = ggml_cont(ctx, ggml_permute(ctx, txt_attn_out, 0, 2, 1, 3)); // [N, n_txt_token, hidden_size] txt_attn_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, txt_attn_out, 0, 2, 1, 3)); // [N, n_txt_token, hidden_size]
auto img_attn_out = ggml_view_3d(ctx, auto img_attn_out = ggml_view_3d(ctx->ggml_ctx,
attn, attn,
attn->ne[0], attn->ne[0],
attn->ne[1], attn->ne[1],
img->ne[1], img->ne[1],
attn->nb[1], attn->nb[1],
attn->nb[2], attn->nb[2],
attn->nb[2] * txt->ne[1]); // [n_img_token, N, hidden_size] attn->nb[2] * txt->ne[1]); // [n_img_token, N, hidden_size]
img_attn_out = ggml_cont(ctx, ggml_permute(ctx, img_attn_out, 0, 2, 1, 3)); // [N, n_img_token, hidden_size] img_attn_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, img_attn_out, 0, 2, 1, 3)); // [N, n_img_token, hidden_size]
img_attn_out = to_out_0->forward(ctx, img_attn_out); img_attn_out = to_out_0->forward(ctx, img_attn_out);
txt_attn_out = to_add_out->forward(ctx, txt_attn_out); txt_attn_out = to_add_out->forward(ctx, txt_attn_out);
@ -194,8 +195,7 @@ namespace Qwen {
QwenImageTransformerBlock(int64_t dim, QwenImageTransformerBlock(int64_t dim,
int64_t num_attention_heads, int64_t num_attention_heads,
int64_t attention_head_dim, int64_t attention_head_dim,
float eps = 1e-6, float eps = 1e-6) {
bool flash_attn = false) {
// img_mod.0 is nn.SiLU() // img_mod.0 is nn.SiLU()
blocks["img_mod.1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, 6 * dim, true)); blocks["img_mod.1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, 6 * dim, true));
@ -217,12 +217,10 @@ namespace Qwen {
0, // out_context-dim 0, // out_context-dim
true, // bias true, // bias
true, // out_bias true, // out_bias
eps, eps));
flash_attn));
} }
virtual std::pair<ggml_tensor*, ggml_tensor*> forward(struct ggml_context* ctx, virtual std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* img, struct ggml_tensor* img,
struct ggml_tensor* txt, struct ggml_tensor* txt,
struct ggml_tensor* t_emb, struct ggml_tensor* t_emb,
@ -244,40 +242,40 @@ namespace Qwen {
auto attn = std::dynamic_pointer_cast<QwenImageAttention>(blocks["attn"]); auto attn = std::dynamic_pointer_cast<QwenImageAttention>(blocks["attn"]);
auto img_mod_params = ggml_silu(ctx, t_emb); auto img_mod_params = ggml_silu(ctx->ggml_ctx, t_emb);
img_mod_params = img_mod_1->forward(ctx, img_mod_params); img_mod_params = img_mod_1->forward(ctx, img_mod_params);
auto img_mod_param_vec = ggml_chunk(ctx, img_mod_params, 6, 0); auto img_mod_param_vec = ggml_ext_chunk(ctx->ggml_ctx, img_mod_params, 6, 0);
auto txt_mod_params = ggml_silu(ctx, t_emb); auto txt_mod_params = ggml_silu(ctx->ggml_ctx, t_emb);
txt_mod_params = txt_mod_1->forward(ctx, txt_mod_params); txt_mod_params = txt_mod_1->forward(ctx, txt_mod_params);
auto txt_mod_param_vec = ggml_chunk(ctx, txt_mod_params, 6, 0); auto txt_mod_param_vec = ggml_ext_chunk(ctx->ggml_ctx, txt_mod_params, 6, 0);
auto img_normed = img_norm1->forward(ctx, img); auto img_normed = img_norm1->forward(ctx, img);
auto img_modulated = Flux::modulate(ctx, img_normed, img_mod_param_vec[0], img_mod_param_vec[1]); auto img_modulated = Flux::modulate(ctx->ggml_ctx, img_normed, img_mod_param_vec[0], img_mod_param_vec[1]);
auto img_gate1 = img_mod_param_vec[2]; auto img_gate1 = img_mod_param_vec[2];
auto txt_normed = txt_norm1->forward(ctx, txt); auto txt_normed = txt_norm1->forward(ctx, txt);
auto txt_modulated = Flux::modulate(ctx, txt_normed, txt_mod_param_vec[0], txt_mod_param_vec[1]); auto txt_modulated = Flux::modulate(ctx->ggml_ctx, txt_normed, txt_mod_param_vec[0], txt_mod_param_vec[1]);
auto txt_gate1 = txt_mod_param_vec[2]; auto txt_gate1 = txt_mod_param_vec[2];
auto [img_attn_output, txt_attn_output] = attn->forward(ctx, backend, img_modulated, txt_modulated, pe); auto [img_attn_output, txt_attn_output] = attn->forward(ctx, img_modulated, txt_modulated, pe);
img = ggml_add(ctx, img, ggml_mul(ctx, img_attn_output, img_gate1)); img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_attn_output, img_gate1));
txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_attn_output, txt_gate1)); txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn_output, txt_gate1));
auto img_normed2 = img_norm2->forward(ctx, img); auto img_normed2 = img_norm2->forward(ctx, img);
auto img_modulated2 = Flux::modulate(ctx, img_normed2, img_mod_param_vec[3], img_mod_param_vec[4]); auto img_modulated2 = Flux::modulate(ctx->ggml_ctx, img_normed2, img_mod_param_vec[3], img_mod_param_vec[4]);
auto img_gate2 = img_mod_param_vec[5]; auto img_gate2 = img_mod_param_vec[5];
auto txt_normed2 = txt_norm2->forward(ctx, txt); auto txt_normed2 = txt_norm2->forward(ctx, txt);
auto txt_modulated2 = Flux::modulate(ctx, txt_normed2, txt_mod_param_vec[3], txt_mod_param_vec[4]); auto txt_modulated2 = Flux::modulate(ctx->ggml_ctx, txt_normed2, txt_mod_param_vec[3], txt_mod_param_vec[4]);
auto txt_gate2 = txt_mod_param_vec[5]; auto txt_gate2 = txt_mod_param_vec[5];
auto img_mlp_out = img_mlp->forward(ctx, img_modulated2); auto img_mlp_out = img_mlp->forward(ctx, img_modulated2);
auto txt_mlp_out = txt_mlp->forward(ctx, txt_modulated2); auto txt_mlp_out = txt_mlp->forward(ctx, txt_modulated2);
img = ggml_add(ctx, img, ggml_mul(ctx, img_mlp_out, img_gate2)); img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_mlp_out, img_gate2));
txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_mlp_out, txt_gate2)); txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_gate2));
return {img, txt}; return {img, txt};
} }
@ -294,7 +292,7 @@ namespace Qwen {
blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(conditioning_embedding_dim, embedding_dim * 2, bias)); blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(conditioning_embedding_dim, embedding_dim * 2, bias));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* c) { struct ggml_tensor* c) {
// x: [N, n_token, hidden_size] // x: [N, n_token, hidden_size]
@ -304,13 +302,13 @@ namespace Qwen {
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]); auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]); auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
auto emb = linear->forward(ctx, ggml_silu(ctx, c)); auto emb = linear->forward(ctx, ggml_silu(ctx->ggml_ctx, c));
auto mods = ggml_chunk(ctx, emb, 2, 0); auto mods = ggml_ext_chunk(ctx->ggml_ctx, emb, 2, 0);
auto scale = mods[0]; auto scale = mods[0];
auto shift = mods[1]; auto shift = mods[1];
x = norm->forward(ctx, x); x = norm->forward(ctx, x);
x = Flux::modulate(ctx, x, shift, scale); x = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
return x; return x;
} }
@ -327,7 +325,6 @@ namespace Qwen {
float theta = 10000; float theta = 10000;
std::vector<int> axes_dim = {16, 56, 56}; std::vector<int> axes_dim = {16, 56, 56};
int64_t axes_dim_sum = 128; int64_t axes_dim_sum = 128;
bool flash_attn = false;
}; };
class QwenImageModel : public GGMLBlock { class QwenImageModel : public GGMLBlock {
@ -349,8 +346,7 @@ namespace Qwen {
auto block = std::shared_ptr<GGMLBlock>(new QwenImageTransformerBlock(inner_dim, auto block = std::shared_ptr<GGMLBlock>(new QwenImageTransformerBlock(inner_dim,
params.num_attention_heads, params.num_attention_heads,
params.attention_head_dim, params.attention_head_dim,
1e-6f, 1e-6f));
params.flash_attn));
blocks["transformer_blocks." + std::to_string(i)] = block; blocks["transformer_blocks." + std::to_string(i)] = block;
} }
@ -421,8 +417,7 @@ namespace Qwen {
return x; return x;
} }
struct ggml_tensor* forward_orig(struct ggml_context* ctx, struct ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* timestep, struct ggml_tensor* timestep,
struct ggml_tensor* context, struct ggml_tensor* context,
@ -442,7 +437,7 @@ namespace Qwen {
for (int i = 0; i < params.num_layers; i++) { for (int i = 0; i < params.num_layers; i++) {
auto block = std::dynamic_pointer_cast<QwenImageTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]); auto block = std::dynamic_pointer_cast<QwenImageTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
auto result = block->forward(ctx, backend, img, txt, t_emb, pe); auto result = block->forward(ctx, img, txt, t_emb, pe);
img = result.first; img = result.first;
txt = result.second; txt = result.second;
} }
@ -453,8 +448,7 @@ namespace Qwen {
return img; return img;
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* timestep, struct ggml_tensor* timestep,
struct ggml_tensor* context, struct ggml_tensor* context,
@ -472,32 +466,32 @@ namespace Qwen {
int64_t C = x->ne[2]; int64_t C = x->ne[2];
int64_t N = x->ne[3]; int64_t N = x->ne[3];
auto img = process_img(ctx, x); auto img = process_img(ctx->ggml_ctx, x);
uint64_t img_tokens = img->ne[1]; uint64_t img_tokens = img->ne[1];
if (ref_latents.size() > 0) { if (ref_latents.size() > 0) {
for (ggml_tensor* ref : ref_latents) { for (ggml_tensor* ref : ref_latents) {
ref = process_img(ctx, ref); ref = process_img(ctx->ggml_ctx, ref);
img = ggml_concat(ctx, img, ref, 1); img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
} }
} }
int64_t h_len = ((H + (params.patch_size / 2)) / params.patch_size); int64_t h_len = ((H + (params.patch_size / 2)) / params.patch_size);
int64_t w_len = ((W + (params.patch_size / 2)) / params.patch_size); int64_t w_len = ((W + (params.patch_size / 2)) / params.patch_size);
auto out = forward_orig(ctx, backend, img, timestep, context, pe); // [N, h_len*w_len, ph*pw*C] auto out = forward_orig(ctx, img, timestep, context, pe); // [N, h_len*w_len, ph*pw*C]
if (out->ne[1] > img_tokens) { if (out->ne[1] > img_tokens) {
out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size] out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size]
out = ggml_view_3d(ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0); out = ggml_view_3d(ctx->ggml_ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size] out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size]
} }
out = unpatchify(ctx, out, h_len, w_len); // [N, C, H + pad_h, W + pad_w] out = unpatchify(ctx->ggml_ctx, out, h_len, w_len); // [N, C, H + pad_h, W + pad_w]
// slice // slice
out = ggml_slice(ctx, out, 1, 0, H); // [N, C, H, W + pad_w] out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H); // [N, C, H, W + pad_w]
out = ggml_slice(ctx, out, 0, 0, W); // [N, C, H, W] out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W); // [N, C, H, W]
return out; return out;
} }
@ -512,14 +506,12 @@ namespace Qwen {
QwenImageRunner(ggml_backend_t backend, QwenImageRunner(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}, const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "", const std::string prefix = "",
SDVersion version = VERSION_QWEN_IMAGE, SDVersion version = VERSION_QWEN_IMAGE)
bool flash_attn = false)
: GGMLRunner(backend, offload_params_to_cpu) { : GGMLRunner(backend, offload_params_to_cpu) {
qwen_image_params.flash_attn = flash_attn;
qwen_image_params.num_layers = 0; qwen_image_params.num_layers = 0;
for (auto pair : tensor_types) { for (auto pair : tensor_storage_map) {
std::string tensor_name = pair.first; std::string tensor_name = pair.first;
if (tensor_name.find(prefix) == std::string::npos) if (tensor_name.find(prefix) == std::string::npos)
continue; continue;
@ -538,7 +530,7 @@ namespace Qwen {
} }
LOG_INFO("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers); LOG_INFO("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
qwen_image = QwenImageModel(qwen_image_params); qwen_image = QwenImageModel(qwen_image_params);
qwen_image.init(params_ctx, tensor_types, prefix); qwen_image.init(params_ctx, tensor_storage_map, prefix);
} }
std::string get_desc() override { std::string get_desc() override {
@ -555,7 +547,7 @@ namespace Qwen {
std::vector<ggml_tensor*> ref_latents = {}, std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false) { bool increase_ref_index = false) {
GGML_ASSERT(x->ne[3] == 1); GGML_ASSERT(x->ne[3] == 1);
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, QWEN_IMAGE_GRAPH_SIZE, false); struct ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE);
x = to_backend(x); x = to_backend(x);
context = to_backend(context); context = to_backend(context);
@ -582,8 +574,9 @@ namespace Qwen {
// pe->data = nullptr; // pe->data = nullptr;
set_backend_tensor_data(pe, pe_vec.data()); set_backend_tensor_data(pe, pe_vec.data());
struct ggml_tensor* out = qwen_image.forward(compute_ctx, auto runner_ctx = get_context();
runtime_backend,
struct ggml_tensor* out = qwen_image.forward(&runner_ctx,
x, x,
timesteps, timesteps,
context, context,
@ -595,7 +588,7 @@ namespace Qwen {
return gf; return gf;
} }
void compute(int n_threads, bool compute(int n_threads,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* timesteps, struct ggml_tensor* timesteps,
struct ggml_tensor* context, struct ggml_tensor* context,
@ -610,7 +603,7 @@ namespace Qwen {
return build_graph(x, timesteps, context, ref_latents, increase_ref_index); return build_graph(x, timesteps, context, ref_latents, increase_ref_index);
}; };
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
} }
void test() { void test() {
@ -649,31 +642,29 @@ namespace Qwen {
static void load_from_file_and_test(const std::string& file_path) { static void load_from_file_and_test(const std::string& file_path) {
// cuda q8: pass // cuda q8: pass
// cuda q8 fa: nan // cuda q8 fa: pass
// ggml_backend_t backend = ggml_backend_cuda_init(0); // ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init(); ggml_backend_t backend = ggml_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_Q8_0; ggml_type model_data_type = GGML_TYPE_Q8_0;
ModelLoader model_loader; ModelLoader model_loader;
if (!model_loader.init_from_file(file_path, "model.diffusion_model.")) { if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
return; return;
} }
auto tensor_types = model_loader.tensor_storages_types; auto& tensor_storage_map = model_loader.get_tensor_storage_map();
for (auto& item : tensor_types) { for (auto& [name, tensor_storage] : tensor_storage_map) {
// LOG_DEBUG("%s %u", item.first.c_str(), item.second); if (ends_with(name, "weight")) {
if (ends_with(item.first, "weight")) { tensor_storage.expected_type = model_data_type;
item.second = model_data_type;
} }
} }
std::shared_ptr<QwenImageRunner> qwen_image = std::make_shared<QwenImageRunner>(backend, std::shared_ptr<QwenImageRunner> qwen_image = std::make_shared<QwenImageRunner>(backend,
false, false,
tensor_types, tensor_storage_map,
"model.diffusion_model", "model.diffusion_model",
VERSION_QWEN_IMAGE, VERSION_QWEN_IMAGE);
true);
qwen_image->alloc_params_buffer(); qwen_image->alloc_params_buffer();
std::map<std::string, ggml_tensor*> tensors; std::map<std::string, ggml_tensor*> tensors;

147
rng_mt19937.hpp Normal file
View File

@ -0,0 +1,147 @@
#ifndef __RNG_MT19937_HPP__
#define __RNG_MT19937_HPP__

#include <array>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

#include "rng.hpp"

// RNG imitating torch cpu randn on CPU.
// Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/TransformationHelper.h, for uniform_real
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/native/cpu/DistributionTemplates.h, for normal_kernel/normal_fill/normal_fill_16
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/MT19937RNGEngine.h, for mt19937_engine
// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/DistributionsHelper.h, for uniform_real_distribution/normal_distribution
class MT19937RNG : public RNG {
    // Mersenne Twister (MT19937) constants; values match PyTorch's mt19937_engine.
    static const int N         = 624;  // state vector length
    static const int M         = 397;  // middle word offset
    static const uint32_t MATRIX_A = 0x9908b0dfU;  // twist matrix constant
    static const uint32_t UMASK    = 0x80000000U;  // most significant w-r bits
    static const uint32_t LMASK    = 0x7fffffffU;  // least significant r bits

    // Full generator state, including the Box-Muller carry-over value so that
    // the double-precision fallback path matches PyTorch's pairwise generation.
    struct State {
        uint64_t seed_;
        int left_;       // words remaining before the state must be twisted
        bool seeded_;
        uint32_t next_;  // index of the next word to read from state_
        std::array<uint32_t, N> state_;
        bool has_next_gauss = false;  // second Box-Muller sample pending?
        double next_gauss   = 0.0;
    };

    State s;

    uint32_t mix_bits(uint32_t u, uint32_t v) { return (u & UMASK) | (v & LMASK); }

    uint32_t twist(uint32_t u, uint32_t v) { return (mix_bits(u, v) >> 1) ^ ((v & 1) ? MATRIX_A : 0); }

    // Regenerate all N state words (the MT19937 "twist" step).
    void next_state() {
        uint32_t* p = s.state_.data();
        s.left_     = N;
        s.next_     = 0;
        for (int j = N - M + 1; --j; p++)
            p[0] = p[M] ^ twist(p[0], p[1]);
        for (int j = M; --j; p++)
            p[0] = p[M - N] ^ twist(p[0], p[1]);
        p[0] = p[M - N] ^ twist(p[0], s.state_[0]);
    }

    // Next raw 32-bit output, with MT19937 tempering applied.
    uint32_t rand_uint32() {
        if (--s.left_ == 0)
            next_state();
        uint32_t y = s.state_[s.next_++];
        y ^= (y >> 11);
        y ^= (y << 7) & 0x9d2c5680U;
        y ^= (y << 15) & 0xefc60000U;
        y ^= (y >> 18);
        return y;
    }

    // Two 32-bit draws combined high-to-low, matching PyTorch's random64().
    uint64_t rand_uint64() {
        uint64_t high = (uint64_t)rand_uint32();
        uint64_t low  = (uint64_t)rand_uint32();
        return (high << 32) | low;
    }

    // Map raw bits `val` to a uniform real in [from, to), using only the top
    // std::numeric_limits<T>::digits mantissa bits (PyTorch's uniform_real).
    template <typename T, typename V>
    T uniform_real(V val, T from, T to) {
        constexpr auto MASK    = static_cast<V>((static_cast<uint64_t>(1) << std::numeric_limits<T>::digits) - 1);
        constexpr auto DIVISOR = static_cast<T>(1) / (static_cast<uint64_t>(1) << std::numeric_limits<T>::digits);
        T x                    = (val & MASK) * DIVISOR;
        return (x * (to - from) + from);
    }

    // Box-Muller in double precision; generates samples in pairs and caches
    // the second one, exactly like PyTorch's normal_distribution<double>.
    double normal_double_value(double mean, double std) {
        if (s.has_next_gauss) {
            s.has_next_gauss = false;
            return s.next_gauss;
        }
        double u1        = uniform_real(rand_uint64(), 0., 1.);  // double
        double u2        = uniform_real(rand_uint64(), 0., 1.);  // double
        double r         = std::sqrt(-2.0 * std::log1p(-u2));
        double theta     = 2.0 * 3.14159265358979323846 * u1;
        double value     = r * std::cos(theta) * std + mean;
        s.next_gauss     = r * std::sin(theta) * std + mean;
        s.has_next_gauss = true;
        return value;
    }

    // Transform 16 uniforms in-place into 16 normal samples (PyTorch's
    // normal_fill_16): data[0..7] pair with data[8..15] via Box-Muller.
    void normal_fill_16(float* data, float mean, float std) {
        for (int j = 0; j < 8; ++j) {
            float u1    = 1.0f - data[j];
            float u2    = data[j + 8];
            float r     = std::sqrt(-2.0f * std::log(u1));
            float theta = 2.0f * 3.14159265358979323846 * u2;
            data[j]     = r * std::cos(theta) * std + mean;
            data[j + 8] = r * std::sin(theta) * std + mean;
        }
    }

    // Fill `data` with `size` normal(mean, std) samples, reproducing PyTorch's
    // CPU normal_kernel: the vectorized 16-at-a-time path for size >= 16
    // (recomputing the final 16 for a ragged tail), else the double path.
    void randn(float* data, int64_t size, float mean = 0.0f, float std = 1.0f) {
        if (size >= 16) {
            for (int64_t i = 0; i < size; i++) {
                data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
            }
            for (int64_t i = 0; i < size - 15; i += 16) {
                normal_fill_16(data + i, mean, std);
            }
            if (size % 16 != 0) {
                // Recompute the last 16 values.
                data = data + size - 16;
                for (int64_t i = 0; i < 16; i++) {
                    data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
                }
                normal_fill_16(data, mean, std);
            }
        } else {
            // Strange handling, hard to understand, but keeping it consistent with PyTorch.
            for (int64_t i = 0; i < size; i++) {
                data[i] = (float)normal_double_value(mean, std);
            }
        }
    }

public:
    MT19937RNG(uint64_t seed = 0) { manual_seed(seed); }

    // Re-seed the engine with PyTorch's Knuth-style initialization and clear
    // any cached Box-Muller sample.
    void manual_seed(uint64_t seed) override {
        s.seed_     = seed;
        s.seeded_   = true;
        s.state_[0] = (uint32_t)(seed & 0xffffffffU);
        for (int j = 1; j < N; j++) {
            uint32_t prev = s.state_[j - 1];
            s.state_[j]   = 1812433253U * (prev ^ (prev >> 30)) + j;
        }
        s.left_          = 1;
        s.next_          = 0;
        s.has_next_gauss = false;
    }

    // Return n samples from the standard normal distribution N(0, 1).
    std::vector<float> randn(uint32_t n) override {
        std::vector<float> out;
        out.resize(n);
        randn(out.data(), out.size());
        return out;
    }
};

#endif  // __RNG_MT19937_HPP__

130
rope.hpp
View File

@ -72,15 +72,30 @@ namespace Rope {
} }
// Generate IDs for image patches and text // Generate IDs for image patches and text
__STATIC_INLINE__ std::vector<std::vector<float>> gen_txt_ids(int bs, int context_len) { __STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_txt_ids(int bs, int context_len, int axes_dim_num, std::set<int> arange_dims) {
return std::vector<std::vector<float>>(bs * context_len, std::vector<float>(3, 0.0)); auto txt_ids = std::vector<std::vector<float>>(bs * context_len, std::vector<float>(axes_dim_num, 0.0f));
for (int dim = 0; dim < axes_dim_num; dim++) {
if (arange_dims.find(dim) != arange_dims.end()) {
for (int i = 0; i < bs * context_len; i++) {
txt_ids[i][dim] = (i % context_len);
}
}
}
return txt_ids;
} }
__STATIC_INLINE__ std::vector<std::vector<float>> gen_img_ids(int h, int w, int patch_size, int bs, int index = 0, int h_offset = 0, int w_offset = 0) { __STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_img_ids(int h,
int w,
int patch_size,
int bs,
int axes_dim_num,
int index = 0,
int h_offset = 0,
int w_offset = 0) {
int h_len = (h + (patch_size / 2)) / patch_size; int h_len = (h + (patch_size / 2)) / patch_size;
int w_len = (w + (patch_size / 2)) / patch_size; int w_len = (w + (patch_size / 2)) / patch_size;
std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(3, 0.0)); std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(axes_dim_num, 0.0));
std::vector<float> row_ids = linspace<float>(h_offset, h_len - 1 + h_offset, h_len); std::vector<float> row_ids = linspace<float>(h_offset, h_len - 1 + h_offset, h_len);
std::vector<float> col_ids = linspace<float>(w_offset, w_len - 1 + w_offset, w_len); std::vector<float> col_ids = linspace<float>(w_offset, w_len - 1 + w_offset, w_len);
@ -153,8 +168,10 @@ namespace Rope {
__STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size, __STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size,
int bs, int bs,
int axes_dim_num,
const std::vector<ggml_tensor*>& ref_latents, const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index) { bool increase_ref_index,
float ref_index_scale) {
std::vector<std::vector<float>> ids; std::vector<std::vector<float>> ids;
uint64_t curr_h_offset = 0; uint64_t curr_h_offset = 0;
uint64_t curr_w_offset = 0; uint64_t curr_w_offset = 0;
@ -170,7 +187,14 @@ namespace Rope {
} }
} }
auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, index, h_offset, w_offset); auto ref_ids = gen_flux_img_ids(ref->ne[1],
ref->ne[0],
patch_size,
bs,
axes_dim_num,
static_cast<int>(index * ref_index_scale),
h_offset,
w_offset);
ids = concat_ids(ids, ref_ids, bs); ids = concat_ids(ids, ref_ids, bs);
if (increase_ref_index) { if (increase_ref_index) {
@ -187,15 +211,18 @@ namespace Rope {
int w, int w,
int patch_size, int patch_size,
int bs, int bs,
int axes_dim_num,
int context_len, int context_len,
std::set<int> txt_arange_dims,
const std::vector<ggml_tensor*>& ref_latents, const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index) { bool increase_ref_index,
auto txt_ids = gen_txt_ids(bs, context_len); float ref_index_scale) {
auto img_ids = gen_img_ids(h, w, patch_size, bs); auto txt_ids = gen_flux_txt_ids(bs, context_len, axes_dim_num, txt_arange_dims);
auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);
auto ids = concat_ids(txt_ids, img_ids, bs); auto ids = concat_ids(txt_ids, img_ids, bs);
if (ref_latents.size() > 0) { if (ref_latents.size() > 0) {
auto refs_ids = gen_refs_ids(patch_size, bs, ref_latents, increase_ref_index); auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, ref_index_scale);
ids = concat_ids(ids, refs_ids, bs); ids = concat_ids(ids, refs_ids, bs);
} }
return ids; return ids;
@ -207,11 +234,22 @@ namespace Rope {
int patch_size, int patch_size,
int bs, int bs,
int context_len, int context_len,
std::set<int> txt_arange_dims,
const std::vector<ggml_tensor*>& ref_latents, const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index, bool increase_ref_index,
float ref_index_scale,
int theta, int theta,
const std::vector<int>& axes_dim) { const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_flux_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index); std::vector<std::vector<float>> ids = gen_flux_ids(h,
w,
patch_size,
bs,
static_cast<int>(axes_dim.size()),
context_len,
txt_arange_dims,
ref_latents,
increase_ref_index,
ref_index_scale);
return embed_nd(ids, bs, theta, axes_dim); return embed_nd(ids, bs, theta, axes_dim);
} }
@ -232,10 +270,11 @@ namespace Rope {
txt_ids_repeated[i * txt_ids.size() + j] = {txt_ids[j], txt_ids[j], txt_ids[j]}; txt_ids_repeated[i * txt_ids.size() + j] = {txt_ids[j], txt_ids[j], txt_ids[j]};
} }
} }
auto img_ids = gen_img_ids(h, w, patch_size, bs); int axes_dim_num = 3;
auto ids = concat_ids(txt_ids_repeated, img_ids, bs); auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);
auto ids = concat_ids(txt_ids_repeated, img_ids, bs);
if (ref_latents.size() > 0) { if (ref_latents.size() > 0) {
auto refs_ids = gen_refs_ids(patch_size, bs, ref_latents, increase_ref_index); auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, 1.f);
ids = concat_ids(ids, refs_ids, bs); ids = concat_ids(ids, refs_ids, bs);
} }
return ids; return ids;
@ -345,6 +384,55 @@ namespace Rope {
return embed_nd(ids, 1, theta, axes_dim); return embed_nd(ids, 1, theta, axes_dim);
} }
__STATIC_INLINE__ int bound_mod(int a, int m) {
return (m - (a % m)) % m;
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_z_image_ids(int h,
int w,
int patch_size,
int bs,
int context_len,
int seq_multi_of,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index) {
int padded_context_len = context_len + bound_mod(context_len, seq_multi_of);
auto txt_ids = std::vector<std::vector<float>>(bs * padded_context_len, std::vector<float>(3, 0.0f));
for (int i = 0; i < bs * padded_context_len; i++) {
txt_ids[i][0] = (i % padded_context_len) + 1.f;
}
int axes_dim_num = 3;
int index = padded_context_len + 1;
auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, index);
int img_pad_len = bound_mod(static_cast<int>(img_ids.size() / bs), seq_multi_of);
if (img_pad_len > 0) {
std::vector<std::vector<float>> img_pad_ids(bs * img_pad_len, std::vector<float>(3, 0.f));
img_ids = concat_ids(img_ids, img_pad_ids, bs);
}
auto ids = concat_ids(txt_ids, img_ids, bs);
// ignore ref_latents for now
return ids;
}
// Generate z_image positional embeddings
__STATIC_INLINE__ std::vector<float> gen_z_image_pe(int h,
int w,
int patch_size,
int bs,
int context_len,
int seq_multi_of,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index,
int theta,
const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_z_image_ids(h, w, patch_size, bs, context_len, seq_multi_of, ref_latents, increase_ref_index);
return embed_nd(ids, bs, theta, axes_dim);
}
__STATIC_INLINE__ struct ggml_tensor* apply_rope(struct ggml_context* ctx, __STATIC_INLINE__ struct ggml_tensor* apply_rope(struct ggml_context* ctx,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* pe, struct ggml_tensor* pe,
@ -360,8 +448,8 @@ namespace Rope {
x = ggml_reshape_4d(ctx, x, 2, d_head / 2, L, n_head * N); // [N * n_head, L, d_head/2, 2] x = ggml_reshape_4d(ctx, x, 2, d_head / 2, L, n_head * N); // [N * n_head, L, d_head/2, 2]
x = ggml_cont(ctx, ggml_permute(ctx, x, 3, 0, 1, 2)); // [2, N * n_head, L, d_head/2] x = ggml_cont(ctx, ggml_permute(ctx, x, 3, 0, 1, 2)); // [2, N * n_head, L, d_head/2]
} else { } else {
x = ggml_reshape_4d(ctx, x, d_head / 2, 2, L, n_head * N); // [N * n_head, L, 2, d_head/2] x = ggml_reshape_4d(ctx, x, d_head / 2, 2, L, n_head * N); // [N * n_head, L, 2, d_head/2]
x = ggml_cont(ctx, ggml_torch_permute(ctx, x, 0, 2, 3, 1)); // [2, N * n_head, L, d_head/2] x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 3, 1)); // [2, N * n_head, L, d_head/2]
} }
int64_t offset = x->nb[2] * x->ne[2]; int64_t offset = x->nb[2] * x->ne[2];
@ -386,23 +474,21 @@ namespace Rope {
return x_out; return x_out;
} }
__STATIC_INLINE__ struct ggml_tensor* attention(struct ggml_context* ctx, __STATIC_INLINE__ struct ggml_tensor* attention(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* q, struct ggml_tensor* q,
struct ggml_tensor* k, struct ggml_tensor* k,
struct ggml_tensor* v, struct ggml_tensor* v,
struct ggml_tensor* pe, struct ggml_tensor* pe,
struct ggml_tensor* mask, struct ggml_tensor* mask,
bool flash_attn,
float kv_scale = 1.0f, float kv_scale = 1.0f,
bool rope_interleaved = true) { bool rope_interleaved = true) {
// q,k,v: [N, L, n_head, d_head] // q,k,v: [N, L, n_head, d_head]
// pe: [L, d_head/2, 2, 2] // pe: [L, d_head/2, 2, 2]
// return: [N, L, n_head*d_head] // return: [N, L, n_head*d_head]
q = apply_rope(ctx, q, pe, rope_interleaved); // [N*n_head, L, d_head] q = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved); // [N*n_head, L, d_head]
k = apply_rope(ctx, k, pe, rope_interleaved); // [N*n_head, L, d_head] k = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved); // [N*n_head, L, d_head]
auto x = ggml_nn_attention_ext(ctx, backend, q, k, v, v->ne[1], mask, false, true, flash_attn, kv_scale); // [N, L, n_head*d_head] auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, v->ne[1], mask, false, true, ctx->flash_attn_enabled, kv_scale); // [N, L, n_head*d_head]
return x; return x;
} }
}; // namespace Rope }; // namespace Rope

File diff suppressed because it is too large Load Diff

View File

@ -31,46 +31,46 @@ extern "C" {
enum rng_type_t { enum rng_type_t {
STD_DEFAULT_RNG, STD_DEFAULT_RNG,
CUDA_RNG, CUDA_RNG,
CPU_RNG,
RNG_TYPE_COUNT RNG_TYPE_COUNT
}; };
enum sample_method_t { enum sample_method_t {
SAMPLE_METHOD_DEFAULT, EULER_SAMPLE_METHOD,
EULER, EULER_A_SAMPLE_METHOD,
HEUN, HEUN_SAMPLE_METHOD,
DPM2, DPM2_SAMPLE_METHOD,
DPMPP2S_A, DPMPP2S_A_SAMPLE_METHOD,
DPMPP2M, DPMPP2M_SAMPLE_METHOD,
DPMPP2Mv2, DPMPP2Mv2_SAMPLE_METHOD,
IPNDM, IPNDM_SAMPLE_METHOD,
IPNDM_V, IPNDM_V_SAMPLE_METHOD,
LCM, LCM_SAMPLE_METHOD,
DDIM_TRAILING, DDIM_TRAILING_SAMPLE_METHOD,
TCD, TCD_SAMPLE_METHOD,
EULER_A,
SAMPLE_METHOD_COUNT SAMPLE_METHOD_COUNT
}; };
enum scheduler_t { enum scheduler_t {
DEFAULT, DISCRETE_SCHEDULER,
DISCRETE, KARRAS_SCHEDULER,
KARRAS, EXPONENTIAL_SCHEDULER,
EXPONENTIAL, AYS_SCHEDULER,
AYS, GITS_SCHEDULER,
GITS, SGM_UNIFORM_SCHEDULER,
SGM_UNIFORM, SIMPLE_SCHEDULER,
SIMPLE, SMOOTHSTEP_SCHEDULER,
SMOOTHSTEP, LCM_SCHEDULER,
SCHEDULE_COUNT SCHEDULER_COUNT
}; };
enum prediction_t { enum prediction_t {
DEFAULT_PRED,
EPS_PRED, EPS_PRED,
V_PRED, V_PRED,
EDM_V_PRED, EDM_V_PRED,
SD3_FLOW_PRED, FLOW_PRED,
FLUX_FLOW_PRED, FLUX_FLOW_PRED,
FLUX2_FLOW_PRED,
PREDICTION_COUNT PREDICTION_COUNT
}; };
@ -126,6 +126,21 @@ enum sd_log_level_t {
SD_LOG_ERROR SD_LOG_ERROR
}; };
enum preview_t {
PREVIEW_NONE,
PREVIEW_PROJ,
PREVIEW_TAE,
PREVIEW_VAE,
PREVIEW_COUNT
};
enum lora_apply_mode_t {
LORA_APPLY_AUTO,
LORA_APPLY_IMMEDIATELY,
LORA_APPLY_AT_RUNTIME,
LORA_APPLY_MODE_COUNT,
};
typedef struct { typedef struct {
bool enabled; bool enabled;
int tile_size_x; int tile_size_x;
@ -135,33 +150,43 @@ typedef struct {
float rel_size_y; float rel_size_y;
} sd_tiling_params_t; } sd_tiling_params_t;
typedef struct {
const char* name;
const char* path;
} sd_embedding_t;
typedef struct { typedef struct {
const char* model_path; const char* model_path;
const char* clip_l_path; const char* clip_l_path;
const char* clip_g_path; const char* clip_g_path;
const char* clip_vision_path; const char* clip_vision_path;
const char* t5xxl_path; const char* t5xxl_path;
const char* qwen2vl_path; const char* llm_path;
const char* qwen2vl_vision_path; const char* llm_vision_path;
const char* diffusion_model_path; const char* diffusion_model_path;
const char* high_noise_diffusion_model_path; const char* high_noise_diffusion_model_path;
const char* vae_path; const char* vae_path;
const char* taesd_path; const char* taesd_path;
const char* control_net_path; const char* control_net_path;
const char* lora_model_dir; const char* lora_model_dir;
const char* embedding_dir; const sd_embedding_t* embeddings;
uint32_t embedding_count;
const char* photo_maker_path; const char* photo_maker_path;
const char* tensor_type_rules;
bool vae_decode_only; bool vae_decode_only;
bool free_params_immediately; bool free_params_immediately;
int n_threads; int n_threads;
enum sd_type_t wtype; enum sd_type_t wtype;
enum rng_type_t rng_type; enum rng_type_t rng_type;
enum rng_type_t sampler_rng_type;
enum prediction_t prediction; enum prediction_t prediction;
enum lora_apply_mode_t lora_apply_mode;
bool offload_params_to_cpu; bool offload_params_to_cpu;
bool keep_clip_on_cpu; bool keep_clip_on_cpu;
bool keep_control_net_on_cpu; bool keep_control_net_on_cpu;
bool keep_vae_on_cpu; bool keep_vae_on_cpu;
bool diffusion_flash_attn; bool diffusion_flash_attn;
bool tae_preview_only;
bool diffusion_conv_direct; bool diffusion_conv_direct;
bool vae_conv_direct; bool vae_conv_direct;
bool force_sdxl_vae_conv_scale; bool force_sdxl_vae_conv_scale;
@ -210,6 +235,21 @@ typedef struct {
} sd_pm_params_t; // photo maker } sd_pm_params_t; // photo maker
typedef struct { typedef struct {
bool enabled;
float reuse_threshold;
float start_percent;
float end_percent;
} sd_easycache_params_t;
typedef struct {
bool is_high_noise;
float multiplier;
const char* path;
} sd_lora_t;
typedef struct {
const sd_lora_t* loras;
uint32_t lora_count;
const char* prompt; const char* prompt;
const char* negative_prompt; const char* negative_prompt;
int clip_skip; int clip_skip;
@ -229,9 +269,12 @@ typedef struct {
float control_strength; float control_strength;
sd_pm_params_t pm_params; sd_pm_params_t pm_params;
sd_tiling_params_t vae_tiling_params; sd_tiling_params_t vae_tiling_params;
sd_easycache_params_t easycache;
} sd_img_gen_params_t; } sd_img_gen_params_t;
typedef struct { typedef struct {
const sd_lora_t* loras;
uint32_t lora_count;
const char* prompt; const char* prompt;
const char* negative_prompt; const char* negative_prompt;
int clip_skip; int clip_skip;
@ -248,16 +291,19 @@ typedef struct {
int64_t seed; int64_t seed;
int video_frames; int video_frames;
float vace_strength; float vace_strength;
sd_easycache_params_t easycache;
} sd_vid_gen_params_t; } sd_vid_gen_params_t;
typedef struct sd_ctx_t sd_ctx_t; typedef struct sd_ctx_t sd_ctx_t;
typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, bool is_noisy, void* data);
SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
SD_API int32_t get_num_physical_cores(); SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy, void* data);
SD_API int32_t sd_get_num_physical_cores();
SD_API const char* sd_get_system_info(); SD_API const char* sd_get_system_info();
SD_API const char* sd_type_name(enum sd_type_t type); SD_API const char* sd_type_name(enum sd_type_t type);
@ -266,21 +312,29 @@ SD_API const char* sd_rng_type_name(enum rng_type_t rng_type);
SD_API enum rng_type_t str_to_rng_type(const char* str); SD_API enum rng_type_t str_to_rng_type(const char* str);
SD_API const char* sd_sample_method_name(enum sample_method_t sample_method); SD_API const char* sd_sample_method_name(enum sample_method_t sample_method);
SD_API enum sample_method_t str_to_sample_method(const char* str); SD_API enum sample_method_t str_to_sample_method(const char* str);
SD_API const char* sd_schedule_name(enum scheduler_t scheduler); SD_API const char* sd_scheduler_name(enum scheduler_t scheduler);
SD_API enum scheduler_t str_to_schedule(const char* str); SD_API enum scheduler_t str_to_scheduler(const char* str);
SD_API const char* sd_prediction_name(enum prediction_t prediction); SD_API const char* sd_prediction_name(enum prediction_t prediction);
SD_API enum prediction_t str_to_prediction(const char* str); SD_API enum prediction_t str_to_prediction(const char* str);
SD_API const char* sd_preview_name(enum preview_t preview);
SD_API enum preview_t str_to_preview(const char* str);
SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode);
SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str);
SD_API void sd_easycache_params_init(sd_easycache_params_t* easycache_params);
SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params); SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params); SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params); SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
SD_API enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx);
SD_API void sd_sample_params_init(sd_sample_params_t* sample_params); SD_API void sd_sample_params_init(sd_sample_params_t* sample_params);
SD_API char* sd_sample_params_to_str(const sd_sample_params_t* sample_params); SD_API char* sd_sample_params_to_str(const sd_sample_params_t* sample_params);
SD_API enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx);
SD_API enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx);
SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params); SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params); SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params); SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
@ -293,7 +347,8 @@ typedef struct upscaler_ctx_t upscaler_ctx_t;
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
bool offload_params_to_cpu, bool offload_params_to_cpu,
bool direct, bool direct,
int n_threads); int n_threads,
int tile_size);
SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
@ -315,6 +370,9 @@ SD_API bool preprocess_canny(sd_image_t image,
float strong, float strong,
bool inverse); bool inverse);
SD_API const char* sd_commit(void);
SD_API const char* sd_version(void);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

98
t5.hpp
View File

@ -461,7 +461,7 @@ protected:
int64_t hidden_size; int64_t hidden_size;
float eps; float eps;
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type wtype = GGML_TYPE_F32; enum ggml_type wtype = GGML_TYPE_F32;
params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size);
} }
@ -472,10 +472,10 @@ public:
: hidden_size(hidden_size), : hidden_size(hidden_size),
eps(eps) {} eps(eps) {}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
struct ggml_tensor* w = params["weight"]; struct ggml_tensor* w = params["weight"];
x = ggml_rms_norm(ctx, x, eps); x = ggml_rms_norm(ctx->ggml_ctx, x, eps);
x = ggml_mul(ctx, x, w); x = ggml_mul(ctx->ggml_ctx, x, w);
return x; return x;
} }
}; };
@ -487,13 +487,13 @@ public:
blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false)); blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [N, n_token, model_dim] // x: [N, n_token, model_dim]
auto wi = std::dynamic_pointer_cast<Linear>(blocks["wi"]); auto wi = std::dynamic_pointer_cast<Linear>(blocks["wi"]);
auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]); auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
x = wi->forward(ctx, x); x = wi->forward(ctx, x);
x = ggml_relu_inplace(ctx, x); x = ggml_relu_inplace(ctx->ggml_ctx, x);
x = wo->forward(ctx, x); x = wo->forward(ctx, x);
return x; return x;
} }
@ -509,15 +509,15 @@ public:
blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false, false, false, scale)); blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false, false, false, scale));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [N, n_token, model_dim] // x: [N, n_token, model_dim]
auto wi_0 = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]); auto wi_0 = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]);
auto wi_1 = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]); auto wi_1 = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]);
auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]); auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
auto hidden_gelu = ggml_gelu_inplace(ctx, wi_0->forward(ctx, x)); auto hidden_gelu = ggml_gelu_inplace(ctx->ggml_ctx, wi_0->forward(ctx, x));
auto hidden_linear = wi_1->forward(ctx, x); auto hidden_linear = wi_1->forward(ctx, x);
x = ggml_mul_inplace(ctx, hidden_gelu, hidden_linear); x = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear);
x = wo->forward(ctx, x); x = wo->forward(ctx, x);
return x; return x;
} }
@ -530,14 +530,14 @@ public:
blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim)); blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [N, n_token, model_dim] // x: [N, n_token, model_dim]
auto DenseReluDense = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]); auto DenseReluDense = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]);
auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]); auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
auto forwarded_states = layer_norm->forward(ctx, x); auto forwarded_states = layer_norm->forward(ctx, x);
forwarded_states = DenseReluDense->forward(ctx, forwarded_states); forwarded_states = DenseReluDense->forward(ctx, forwarded_states);
x = ggml_add_inplace(ctx, forwarded_states, x); x = ggml_add_inplace(ctx->ggml_ctx, forwarded_states, x);
return x; return x;
} }
}; };
@ -569,18 +569,17 @@ public:
} }
} }
struct ggml_tensor* compute_bias(struct ggml_context* ctx, struct ggml_tensor* compute_bias(GGMLRunnerContext* ctx,
struct ggml_tensor* relative_position_bucket) { struct ggml_tensor* relative_position_bucket) {
auto relative_attention_bias = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]); auto relative_attention_bias = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]);
auto values = relative_attention_bias->forward(ctx, relative_position_bucket); // shape (query_length, key_length, num_heads) auto values = relative_attention_bias->forward(ctx, relative_position_bucket); // shape (query_length, key_length, num_heads)
values = ggml_cont(ctx, ggml_permute(ctx, values, 2, 0, 1, 3)); // shape (1, num_heads, query_length, key_length) values = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, values, 2, 0, 1, 3)); // shape (1, num_heads, query_length, key_length)
return values; return values;
} }
// x: [N, n_token, model_dim] // x: [N, n_token, model_dim]
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx, std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* past_bias = nullptr, struct ggml_tensor* past_bias = nullptr,
struct ggml_tensor* mask = nullptr, struct ggml_tensor* mask = nullptr,
@ -602,16 +601,16 @@ public:
} }
if (past_bias != nullptr) { if (past_bias != nullptr) {
if (mask != nullptr) { if (mask != nullptr) {
mask = ggml_repeat(ctx, mask, past_bias); mask = ggml_repeat(ctx->ggml_ctx, mask, past_bias);
mask = ggml_add(ctx, mask, past_bias); mask = ggml_add(ctx->ggml_ctx, mask, past_bias);
} else { } else {
mask = past_bias; mask = past_bias;
} }
} }
k = ggml_scale_inplace(ctx, k, sqrt(d_head)); k = ggml_scale_inplace(ctx->ggml_ctx, k, sqrt(d_head));
x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, mask); // [N, n_token, d_head * n_head] x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask); // [N, n_token, d_head * n_head]
x = out_proj->forward(ctx, x); // [N, n_token, model_dim] x = out_proj->forward(ctx, x); // [N, n_token, model_dim]
return {x, past_bias}; return {x, past_bias};
@ -629,8 +628,7 @@ public:
blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim)); blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
} }
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx, std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* past_bias = nullptr, struct ggml_tensor* past_bias = nullptr,
struct ggml_tensor* mask = nullptr, struct ggml_tensor* mask = nullptr,
@ -640,11 +638,11 @@ public:
auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]); auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
auto normed_hidden_state = layer_norm->forward(ctx, x); auto normed_hidden_state = layer_norm->forward(ctx, x);
auto ret = SelfAttention->forward(ctx, backend, normed_hidden_state, past_bias, mask, relative_position_bucket); auto ret = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket);
auto output = ret.first; auto output = ret.first;
past_bias = ret.second; past_bias = ret.second;
x = ggml_add_inplace(ctx, output, x); x = ggml_add_inplace(ctx->ggml_ctx, output, x);
return {x, past_bias}; return {x, past_bias};
} }
}; };
@ -656,8 +654,7 @@ public:
blocks["layer.1"] = std::shared_ptr<GGMLBlock>(new T5LayerFF(model_dim, ff_dim)); blocks["layer.1"] = std::shared_ptr<GGMLBlock>(new T5LayerFF(model_dim, ff_dim));
} }
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx, std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* past_bias = nullptr, struct ggml_tensor* past_bias = nullptr,
struct ggml_tensor* mask = nullptr, struct ggml_tensor* mask = nullptr,
@ -666,7 +663,7 @@ public:
auto layer_0 = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]); auto layer_0 = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]);
auto layer_1 = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]); auto layer_1 = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]);
auto ret = layer_0->forward(ctx, backend, x, past_bias, mask, relative_position_bucket); auto ret = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket);
x = ret.first; x = ret.first;
past_bias = ret.second; past_bias = ret.second;
x = layer_1->forward(ctx, x); x = layer_1->forward(ctx, x);
@ -692,8 +689,7 @@ public:
blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim)); blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* past_bias = nullptr, struct ggml_tensor* past_bias = nullptr,
struct ggml_tensor* attention_mask = nullptr, struct ggml_tensor* attention_mask = nullptr,
@ -702,7 +698,7 @@ public:
for (int i = 0; i < num_layers; i++) { for (int i = 0; i < num_layers; i++) {
auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]); auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]);
auto ret = block->forward(ctx, backend, x, past_bias, attention_mask, relative_position_bucket); auto ret = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
x = ret.first; x = ret.first;
past_bias = ret.second; past_bias = ret.second;
} }
@ -740,8 +736,7 @@ public:
params.model_dim)); params.model_dim));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* input_ids, struct ggml_tensor* input_ids,
struct ggml_tensor* past_bias = nullptr, struct ggml_tensor* past_bias = nullptr,
struct ggml_tensor* attention_mask = nullptr, struct ggml_tensor* attention_mask = nullptr,
@ -752,7 +747,7 @@ public:
auto encoder = std::dynamic_pointer_cast<T5Stack>(blocks["encoder"]); auto encoder = std::dynamic_pointer_cast<T5Stack>(blocks["encoder"]);
auto x = shared->forward(ctx, input_ids); auto x = shared->forward(ctx, input_ids);
x = encoder->forward(ctx, backend, x, past_bias, attention_mask, relative_position_bucket); x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
return x; return x;
} }
}; };
@ -764,7 +759,7 @@ struct T5Runner : public GGMLRunner {
T5Runner(ggml_backend_t backend, T5Runner(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types, const String2TensorStorage& tensor_storage_map,
const std::string prefix, const std::string prefix,
bool is_umt5 = false) bool is_umt5 = false)
: GGMLRunner(backend, offload_params_to_cpu) { : GGMLRunner(backend, offload_params_to_cpu) {
@ -773,7 +768,7 @@ struct T5Runner : public GGMLRunner {
params.relative_attention = false; params.relative_attention = false;
} }
model = T5(params); model = T5(params);
model.init(params_ctx, tensor_types, prefix); model.init(params_ctx, tensor_storage_map, prefix);
} }
std::string get_desc() override { std::string get_desc() override {
@ -784,15 +779,14 @@ struct T5Runner : public GGMLRunner {
model.get_param_tensors(tensors, prefix); model.get_param_tensors(tensors, prefix);
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* input_ids, struct ggml_tensor* input_ids,
struct ggml_tensor* relative_position_bucket, struct ggml_tensor* relative_position_bucket,
struct ggml_tensor* attention_mask = nullptr) { struct ggml_tensor* attention_mask = nullptr) {
size_t N = input_ids->ne[1]; size_t N = input_ids->ne[1];
size_t n_token = input_ids->ne[0]; size_t n_token = input_ids->ne[0];
auto hidden_states = model.forward(ctx, backend, input_ids, nullptr, attention_mask, relative_position_bucket); // [N, n_token, model_dim] auto hidden_states = model.forward(ctx, input_ids, nullptr, attention_mask, relative_position_bucket); // [N, n_token, model_dim]
return hidden_states; return hidden_states;
} }
@ -818,14 +812,15 @@ struct T5Runner : public GGMLRunner {
input_ids->ne[0]); input_ids->ne[0]);
set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data()); set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data());
struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, relative_position_bucket, attention_mask); auto runner_ctx = get_context();
struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask);
ggml_build_forward_expand(gf, hidden_states); ggml_build_forward_expand(gf, hidden_states);
return gf; return gf;
} }
void compute(const int n_threads, bool compute(const int n_threads,
struct ggml_tensor* input_ids, struct ggml_tensor* input_ids,
struct ggml_tensor* attention_mask, struct ggml_tensor* attention_mask,
ggml_tensor** output, ggml_tensor** output,
@ -833,7 +828,7 @@ struct T5Runner : public GGMLRunner {
auto get_graph = [&]() -> struct ggml_cgraph* { auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(input_ids, attention_mask); return build_graph(input_ids, attention_mask);
}; };
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
} }
static std::vector<int> _relative_position_bucket(const std::vector<int>& relative_position, static std::vector<int> _relative_position_bucket(const std::vector<int>& relative_position,
@ -910,10 +905,10 @@ struct T5Embedder {
T5Embedder(ggml_backend_t backend, T5Embedder(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}, const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "", const std::string prefix = "",
bool is_umt5 = false) bool is_umt5 = false)
: model(backend, offload_params_to_cpu, tensor_types, prefix, is_umt5), tokenizer(is_umt5) { : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) {
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
@ -1009,20 +1004,19 @@ struct T5Embedder {
ggml_type model_data_type = GGML_TYPE_F16; ggml_type model_data_type = GGML_TYPE_F16;
ModelLoader model_loader; ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) { if (!model_loader.init_from_file_and_convert_name(file_path)) {
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
return; return;
} }
auto tensor_types = model_loader.tensor_storages_types; auto& tensor_storage_map = model_loader.get_tensor_storage_map();
for (auto& item : tensor_types) { for (auto& [name, tensor_storage] : tensor_storage_map) {
// LOG_DEBUG("%s %u", item.first.c_str(), item.second); if (ends_with(name, "weight")) {
if (ends_with(item.first, "weight")) { tensor_storage.expected_type = model_data_type;
item.second = model_data_type;
} }
} }
std::shared_ptr<T5Embedder> t5 = std::make_shared<T5Embedder>(backend, false, tensor_types, "", true); std::shared_ptr<T5Embedder> t5 = std::make_shared<T5Embedder>(backend, false, tensor_storage_map, "", true);
t5->alloc_params_buffer(); t5->alloc_params_buffer();
std::map<std::string, ggml_tensor*> tensors; std::map<std::string, ggml_tensor*> tensors;

52
tae.hpp
View File

@ -29,7 +29,7 @@ public:
} }
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [n, n_in, h, w] // x: [n, n_in, h, w]
// return: [n, n_out, h, w] // return: [n, n_out, h, w]
@ -38,9 +38,9 @@ public:
auto conv_4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.4"]); auto conv_4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.4"]);
auto h = conv_0->forward(ctx, x); auto h = conv_0->forward(ctx, x);
h = ggml_relu_inplace(ctx, h); h = ggml_relu_inplace(ctx->ggml_ctx, h);
h = conv_2->forward(ctx, h); h = conv_2->forward(ctx, h);
h = ggml_relu_inplace(ctx, h); h = ggml_relu_inplace(ctx->ggml_ctx, h);
h = conv_4->forward(ctx, h); h = conv_4->forward(ctx, h);
if (n_in != n_out) { if (n_in != n_out) {
@ -49,8 +49,8 @@ public:
x = skip->forward(ctx, x); x = skip->forward(ctx, x);
} }
h = ggml_add(ctx, h, x); h = ggml_add(ctx->ggml_ctx, h, x);
h = ggml_relu_inplace(ctx, h); h = ggml_relu_inplace(ctx->ggml_ctx, h);
return h; return h;
} }
}; };
@ -86,7 +86,7 @@ public:
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1})); blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1}));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [n, in_channels, h, w] // x: [n, in_channels, h, w]
// return: [n, z_channels, h/8, w/8] // return: [n, z_channels, h/8, w/8]
@ -136,20 +136,20 @@ public:
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) override { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* z) override {
// z: [n, z_channels, h, w] // z: [n, z_channels, h, w]
// return: [n, out_channels, h*8, w*8] // return: [n, out_channels, h*8, w*8]
auto h = ggml_scale(ctx, z, 1.0f / 3.0f); auto h = ggml_scale(ctx->ggml_ctx, z, 1.0f / 3.0f);
h = ggml_tanh_inplace(ctx, h); h = ggml_tanh_inplace(ctx->ggml_ctx, h);
h = ggml_scale(ctx, h, 3.0f); h = ggml_scale(ctx->ggml_ctx, h, 3.0f);
for (int i = 0; i < num_blocks * 3 + 10; i++) { for (int i = 0; i < num_blocks * 3 + 10; i++) {
if (blocks.find(std::to_string(i)) == blocks.end()) { if (blocks.find(std::to_string(i)) == blocks.end()) {
if (i == 1) { if (i == 1) {
h = ggml_relu_inplace(ctx, h); h = ggml_relu_inplace(ctx->ggml_ctx, h);
} else { } else {
h = ggml_upscale(ctx, h, 2, GGML_SCALE_MODE_NEAREST); h = ggml_upscale(ctx->ggml_ctx, h, 2, GGML_SCALE_MODE_NEAREST);
} }
continue; continue;
} }
@ -180,12 +180,12 @@ public:
} }
} }
struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) { struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) {
auto decoder = std::dynamic_pointer_cast<TinyDecoder>(blocks["decoder.layers"]); auto decoder = std::dynamic_pointer_cast<TinyDecoder>(blocks["decoder.layers"]);
return decoder->forward(ctx, z); return decoder->forward(ctx, z);
} }
struct ggml_tensor* encode(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* encode(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
auto encoder = std::dynamic_pointer_cast<TinyEncoder>(blocks["encoder.layers"]); auto encoder = std::dynamic_pointer_cast<TinyEncoder>(blocks["encoder.layers"]);
return encoder->forward(ctx, x); return encoder->forward(ctx, x);
} }
@ -197,25 +197,14 @@ struct TinyAutoEncoder : public GGMLRunner {
TinyAutoEncoder(ggml_backend_t backend, TinyAutoEncoder(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types, const String2TensorStorage& tensor_storage_map,
const std::string prefix, const std::string prefix,
bool decoder_only = true, bool decoder_only = true,
SDVersion version = VERSION_SD1) SDVersion version = VERSION_SD1)
: decode_only(decoder_only), : decode_only(decoder_only),
taesd(decoder_only, version), taesd(decoder_only, version),
GGMLRunner(backend, offload_params_to_cpu) { GGMLRunner(backend, offload_params_to_cpu) {
taesd.init(params_ctx, tensor_types, prefix); taesd.init(params_ctx, tensor_storage_map, prefix);
}
void enable_conv2d_direct() {
std::vector<GGMLBlock*> blocks;
taesd.get_all_blocks(blocks);
for (auto block : blocks) {
if (block->get_desc() == "Conv2d") {
auto conv_block = (Conv2d*)block;
conv_block->enable_direct();
}
}
} }
std::string get_desc() override { std::string get_desc() override {
@ -233,7 +222,7 @@ struct TinyAutoEncoder : public GGMLRunner {
} }
ModelLoader model_loader; ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) { if (!model_loader.init_from_file_and_convert_name(file_path)) {
LOG_ERROR("init taesd model loader from file failed: '%s'", file_path.c_str()); LOG_ERROR("init taesd model loader from file failed: '%s'", file_path.c_str());
return false; return false;
} }
@ -252,12 +241,13 @@ struct TinyAutoEncoder : public GGMLRunner {
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
z = to_backend(z); z = to_backend(z);
struct ggml_tensor* out = decode_graph ? taesd.decode(compute_ctx, z) : taesd.encode(compute_ctx, z); auto runner_ctx = get_context();
struct ggml_tensor* out = decode_graph ? taesd.decode(&runner_ctx, z) : taesd.encode(&runner_ctx, z);
ggml_build_forward_expand(gf, out); ggml_build_forward_expand(gf, out);
return gf; return gf;
} }
void compute(const int n_threads, bool compute(const int n_threads,
struct ggml_tensor* z, struct ggml_tensor* z,
bool decode_graph, bool decode_graph,
struct ggml_tensor** output, struct ggml_tensor** output,
@ -266,7 +256,7 @@ struct TinyAutoEncoder : public GGMLRunner {
return build_graph(z, decode_graph); return build_graph(z, decode_graph);
}; };
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
} }
}; };

View File

@ -811,6 +811,8 @@ bool starts_with(const std::vector<char32_t>& text,
return std::equal(prefix.begin(), prefix.end(), text.begin() + index); return std::equal(prefix.begin(), prefix.end(), text.begin() + index);
} }
// mistral: [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
// qwen2: (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
std::vector<std::string> token_split(const std::string& text) { std::vector<std::string> token_split(const std::string& text) {
std::vector<std::string> tokens; std::vector<std::string> tokens;
auto cps = utf8_to_codepoints(text); auto cps = utf8_to_codepoints(text);

181
unet.hpp
View File

@ -7,7 +7,7 @@
/*==================================================== UnetModel =====================================================*/ /*==================================================== UnetModel =====================================================*/
#define UNET_GRAPH_SIZE 10240 #define UNET_GRAPH_SIZE 102400
class SpatialVideoTransformer : public SpatialTransformer { class SpatialVideoTransformer : public SpatialTransformer {
protected: protected:
@ -20,9 +20,10 @@ public:
int64_t d_head, int64_t d_head,
int64_t depth, int64_t depth,
int64_t context_dim, int64_t context_dim,
bool use_linear,
int64_t time_depth = 1, int64_t time_depth = 1,
int64_t max_time_embed_period = 10000) int64_t max_time_embed_period = 10000)
: SpatialTransformer(in_channels, n_head, d_head, depth, context_dim), : SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear),
max_time_embed_period(max_time_embed_period) { max_time_embed_period(max_time_embed_period) {
// We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False // We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
// use_spatial_context is always True // use_spatial_context is always True
@ -60,8 +61,7 @@ public:
blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender()); blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* context, struct ggml_tensor* context,
int timesteps) { int timesteps) {
@ -92,7 +92,7 @@ public:
auto time_context = context; // [b*t, n_context, context_dim] auto time_context = context; // [b*t, n_context, context_dim]
auto spatial_context = context; auto spatial_context = context;
// time_context_first_timestep = time_context[::timesteps] // time_context_first_timestep = time_context[::timesteps]
auto time_context_first_timestep = ggml_view_3d(ctx, auto time_context_first_timestep = ggml_view_3d(ctx->ggml_ctx,
time_context, time_context,
time_context->ne[0], time_context->ne[0],
time_context->ne[1], time_context->ne[1],
@ -100,26 +100,26 @@ public:
time_context->nb[1], time_context->nb[1],
time_context->nb[2], time_context->nb[2],
0); // [b, n_context, context_dim] 0); // [b, n_context, context_dim]
time_context = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, time_context = ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32,
time_context_first_timestep->ne[0], time_context_first_timestep->ne[0],
time_context_first_timestep->ne[1], time_context_first_timestep->ne[1],
time_context_first_timestep->ne[2] * h * w); time_context_first_timestep->ne[2] * h * w);
time_context = ggml_repeat(ctx, time_context_first_timestep, time_context); // [b*h*w, n_context, context_dim] time_context = ggml_repeat(ctx->ggml_ctx, time_context_first_timestep, time_context); // [b*h*w, n_context, context_dim]
x = norm->forward(ctx, x); x = norm->forward(ctx, x);
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w] x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim] x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim] x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
auto num_frames = ggml_arange(ctx, 0, timesteps, 1); auto num_frames = ggml_arange(ctx->ggml_ctx, 0, timesteps, 1);
// since b is 1, no need to do repeat // since b is 1, no need to do repeat
auto t_emb = ggml_nn_timestep_embedding(ctx, num_frames, in_channels, max_time_embed_period); // [N, in_channels] auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, num_frames, in_channels, max_time_embed_period); // [N, in_channels]
auto emb = time_pos_embed_0->forward(ctx, t_emb); auto emb = time_pos_embed_0->forward(ctx, t_emb);
emb = ggml_silu_inplace(ctx, emb); emb = ggml_silu_inplace(ctx->ggml_ctx, emb);
emb = time_pos_embed_2->forward(ctx, emb); // [N, in_channels] emb = time_pos_embed_2->forward(ctx, emb); // [N, in_channels]
emb = ggml_reshape_3d(ctx, emb, emb->ne[0], 1, emb->ne[1]); // [N, 1, in_channels] emb = ggml_reshape_3d(ctx->ggml_ctx, emb, emb->ne[0], 1, emb->ne[1]); // [N, 1, in_channels]
for (int i = 0; i < depth; i++) { for (int i = 0; i < depth; i++) {
std::string transformer_name = "transformer_blocks." + std::to_string(i); std::string transformer_name = "transformer_blocks." + std::to_string(i);
@ -128,11 +128,11 @@ public:
auto block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[transformer_name]); auto block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[transformer_name]);
auto mix_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[time_stack_name]); auto mix_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[time_stack_name]);
x = block->forward(ctx, backend, x, spatial_context); // [N, h * w, inner_dim] x = block->forward(ctx, x, spatial_context); // [N, h * w, inner_dim]
// in_channels == inner_dim // in_channels == inner_dim
auto x_mix = x; auto x_mix = x;
x_mix = ggml_add(ctx, x_mix, emb); // [N, h * w, inner_dim] x_mix = ggml_add(ctx->ggml_ctx, x_mix, emb); // [N, h * w, inner_dim]
int64_t N = x_mix->ne[2]; int64_t N = x_mix->ne[2];
int64_t T = timesteps; int64_t T = timesteps;
@ -140,26 +140,26 @@ public:
int64_t S = x_mix->ne[1]; int64_t S = x_mix->ne[1];
int64_t C = x_mix->ne[0]; int64_t C = x_mix->ne[0];
x_mix = ggml_reshape_4d(ctx, x_mix, C, S, T, B); // (b t) s c -> b t s c x_mix = ggml_reshape_4d(ctx->ggml_ctx, x_mix, C, S, T, B); // (b t) s c -> b t s c
x_mix = ggml_cont(ctx, ggml_permute(ctx, x_mix, 0, 2, 1, 3)); // b t s c -> b s t c x_mix = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x_mix, 0, 2, 1, 3)); // b t s c -> b s t c
x_mix = ggml_reshape_3d(ctx, x_mix, C, T, S * B); // b s t c -> (b s) t c x_mix = ggml_reshape_3d(ctx->ggml_ctx, x_mix, C, T, S * B); // b s t c -> (b s) t c
x_mix = mix_block->forward(ctx, backend, x_mix, time_context); // [B * h * w, T, inner_dim] x_mix = mix_block->forward(ctx, x_mix, time_context); // [B * h * w, T, inner_dim]
x_mix = ggml_reshape_4d(ctx, x_mix, C, T, S, B); // (b s) t c -> b s t c x_mix = ggml_reshape_4d(ctx->ggml_ctx, x_mix, C, T, S, B); // (b s) t c -> b s t c
x_mix = ggml_cont(ctx, ggml_permute(ctx, x_mix, 0, 2, 1, 3)); // b s t c -> b t s c x_mix = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x_mix, 0, 2, 1, 3)); // b s t c -> b t s c
x_mix = ggml_reshape_3d(ctx, x_mix, C, S, T * B); // b t s c -> (b t) s c x_mix = ggml_reshape_3d(ctx->ggml_ctx, x_mix, C, S, T * B); // b t s c -> (b t) s c
x = time_mixer->forward(ctx, x, x_mix); // [N, h * w, inner_dim] x = time_mixer->forward(ctx, x, x_mix); // [N, h * w, inner_dim]
} }
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w] x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w] x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
// proj_out // proj_out
x = proj_out->forward(ctx, x); // [N, in_channels, h, w] x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
x = ggml_add(ctx, x, x_in); x = ggml_add(ctx->ggml_ctx, x, x_in);
return x; return x;
} }
}; };
@ -179,17 +179,20 @@ protected:
int num_heads = 8; int num_heads = 8;
int num_head_channels = -1; // channels // num_heads int num_head_channels = -1; // channels // num_heads
int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
bool use_linear_projection = false;
bool tiny_unet = false;
public: public:
int model_channels = 320; int model_channels = 320;
int adm_in_channels = 2816; // only for VERSION_SDXL/SVD int adm_in_channels = 2816; // only for VERSION_SDXL/SVD
UnetModelBlock(SDVersion version = VERSION_SD1, const String2GGMLType& tensor_types = {}, bool flash_attn = false) UnetModelBlock(SDVersion version = VERSION_SD1, const String2TensorStorage& tensor_storage_map = {})
: version(version) { : version(version) {
if (sd_version_is_sd2(version)) { if (sd_version_is_sd2(version)) {
context_dim = 1024; context_dim = 1024;
num_head_channels = 64; num_head_channels = 64;
num_heads = -1; num_heads = -1;
use_linear_projection = true;
} else if (sd_version_is_sdxl(version)) { } else if (sd_version_is_sdxl(version)) {
context_dim = 2048; context_dim = 2048;
attention_resolutions = {4, 2}; attention_resolutions = {4, 2};
@ -197,22 +200,26 @@ public:
transformer_depth = {1, 2, 10}; transformer_depth = {1, 2, 10};
num_head_channels = 64; num_head_channels = 64;
num_heads = -1; num_heads = -1;
use_linear_projection = true;
} else if (version == VERSION_SVD) { } else if (version == VERSION_SVD) {
in_channels = 8; in_channels = 8;
out_channels = 4; out_channels = 4;
context_dim = 1024; context_dim = 1024;
adm_in_channels = 768; adm_in_channels = 768;
num_head_channels = 64; num_head_channels = 64;
num_heads = -1; num_heads = -1;
} else if (version == VERSION_SD1_TINY_UNET) { use_linear_projection = true;
num_res_blocks = 1;
channel_mult = {1, 2, 4};
} }
if (sd_version_is_inpaint(version)) { if (sd_version_is_inpaint(version)) {
in_channels = 9; in_channels = 9;
} else if (sd_version_is_unet_edit(version)) { } else if (sd_version_is_unet_edit(version)) {
in_channels = 8; in_channels = 8;
} }
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET) {
num_res_blocks = 1;
channel_mult = {1, 2, 4};
tiny_unet = true;
}
// dims is always 2 // dims is always 2
// use_temporal_attention is always True for SVD // use_temporal_attention is always True for SVD
@ -250,9 +257,9 @@ public:
int64_t depth, int64_t depth,
int64_t context_dim) -> SpatialTransformer* { int64_t context_dim) -> SpatialTransformer* {
if (version == VERSION_SVD) { if (version == VERSION_SVD) {
return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim); return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
} else { } else {
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, flash_attn); return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
} }
}; };
@ -286,7 +293,7 @@ public:
context_dim)); context_dim));
} }
input_block_chans.push_back(ch); input_block_chans.push_back(ch);
if (version == VERSION_SD1_TINY_UNET) { if (tiny_unet) {
input_block_idx++; input_block_idx++;
} }
} }
@ -307,7 +314,7 @@ public:
d_head = num_head_channels; d_head = num_head_channels;
n_head = ch / d_head; n_head = ch / d_head;
} }
if (version != VERSION_SD1_TINY_UNET) { if (!tiny_unet) {
blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch)); blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
if (version != VERSION_SDXL_SSD1B) { if (version != VERSION_SDXL_SSD1B) {
blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
@ -354,7 +361,7 @@ public:
} }
if (i > 0 && j == num_res_blocks) { if (i > 0 && j == num_res_blocks) {
if (version == VERSION_SD1_TINY_UNET) { if (tiny_unet) {
output_block_idx++; output_block_idx++;
if (output_block_idx == 2) { if (output_block_idx == 2) {
up_sample_idx = 1; up_sample_idx = 1;
@ -377,7 +384,7 @@ public:
} }
struct ggml_tensor* resblock_forward(std::string name, struct ggml_tensor* resblock_forward(std::string name,
struct ggml_context* ctx, GGMLRunnerContext* ctx,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* emb, struct ggml_tensor* emb,
int num_video_frames) { int num_video_frames) {
@ -393,24 +400,22 @@ public:
} }
struct ggml_tensor* attention_layer_forward(std::string name, struct ggml_tensor* attention_layer_forward(std::string name,
struct ggml_context* ctx, GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* context, struct ggml_tensor* context,
int timesteps) { int timesteps) {
if (version == VERSION_SVD) { if (version == VERSION_SVD) {
auto block = std::dynamic_pointer_cast<SpatialVideoTransformer>(blocks[name]); auto block = std::dynamic_pointer_cast<SpatialVideoTransformer>(blocks[name]);
return block->forward(ctx, backend, x, context, timesteps); return block->forward(ctx, x, context, timesteps);
} else { } else {
auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]); auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
return block->forward(ctx, backend, x, context); return block->forward(ctx, x, context);
} }
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_backend_t backend,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* timesteps, struct ggml_tensor* timesteps,
struct ggml_tensor* context, struct ggml_tensor* context,
@ -427,20 +432,20 @@ public:
// return: [N, out_channels, h, w] // return: [N, out_channels, h, w]
if (context != nullptr) { if (context != nullptr) {
if (context->ne[2] != x->ne[3]) { if (context->ne[2] != x->ne[3]) {
context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3])); context = ggml_repeat(ctx->ggml_ctx, context, ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
} }
} }
if (c_concat != nullptr) { if (c_concat != nullptr) {
if (c_concat->ne[3] != x->ne[3]) { if (c_concat->ne[3] != x->ne[3]) {
c_concat = ggml_repeat(ctx, c_concat, x); c_concat = ggml_repeat(ctx->ggml_ctx, c_concat, x);
} }
x = ggml_concat(ctx, x, c_concat, 2); x = ggml_concat(ctx->ggml_ctx, x, c_concat, 2);
} }
if (y != nullptr) { if (y != nullptr) {
if (y->ne[1] != x->ne[3]) { if (y->ne[1] != x->ne[3]) {
y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3])); y = ggml_repeat(ctx->ggml_ctx, y, ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
} }
} }
@ -451,10 +456,10 @@ public:
auto out_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out.0"]); auto out_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out.0"]);
auto out_2 = std::dynamic_pointer_cast<Conv2d>(blocks["out.2"]); auto out_2 = std::dynamic_pointer_cast<Conv2d>(blocks["out.2"]);
auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels); // [N, model_channels] auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, model_channels); // [N, model_channels]
auto emb = time_embed_0->forward(ctx, t_emb); auto emb = time_embed_0->forward(ctx, t_emb);
emb = ggml_silu_inplace(ctx, emb); emb = ggml_silu_inplace(ctx->ggml_ctx, emb);
emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim] emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim]
// SDXL/SVD // SDXL/SVD
@ -463,10 +468,10 @@ public:
auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]); auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);
auto label_emb = label_embed_0->forward(ctx, y); auto label_emb = label_embed_0->forward(ctx, y);
label_emb = ggml_silu_inplace(ctx, label_emb); label_emb = ggml_silu_inplace(ctx->ggml_ctx, label_emb);
label_emb = label_embed_2->forward(ctx, label_emb); // [N, time_embed_dim] label_emb = label_embed_2->forward(ctx, label_emb); // [N, time_embed_dim]
emb = ggml_add(ctx, emb, label_emb); // [N, time_embed_dim] emb = ggml_add(ctx->ggml_ctx, emb, label_emb); // [N, time_embed_dim]
} }
// input_blocks // input_blocks
@ -489,11 +494,11 @@ public:
h = resblock_forward(name, ctx, h, emb, num_video_frames); // [N, mult*model_channels, h, w] h = resblock_forward(name, ctx, h, emb, num_video_frames); // [N, mult*model_channels, h, w]
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) { if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1"; std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
h = attention_layer_forward(name, ctx, backend, h, context, num_video_frames); // [N, mult*model_channels, h, w] h = attention_layer_forward(name, ctx, h, context, num_video_frames); // [N, mult*model_channels, h, w]
} }
hs.push_back(h); hs.push_back(h);
} }
if (version == VERSION_SD1_TINY_UNET) { if (tiny_unet) {
input_block_idx++; input_block_idx++;
} }
if (i != len_mults - 1) { if (i != len_mults - 1) {
@ -510,16 +515,16 @@ public:
// [N, 4*model_channels, h/8, w/8] // [N, 4*model_channels, h/8, w/8]
// middle_block // middle_block
if (version != VERSION_SD1_TINY_UNET) { if (!tiny_unet) {
h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames); // [N, 4*model_channels, h/8, w/8] h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames); // [N, 4*model_channels, h/8, w/8]
if (version != VERSION_SDXL_SSD1B) { if (version != VERSION_SDXL_SSD1B) {
h = attention_layer_forward("middle_block.1", ctx, backend, h, context, num_video_frames); // [N, 4*model_channels, h/8, w/8] h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames); // [N, 4*model_channels, h/8, w/8]
h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames); // [N, 4*model_channels, h/8, w/8] h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames); // [N, 4*model_channels, h/8, w/8]
} }
} }
if (controls.size() > 0) { if (controls.size() > 0) {
auto cs = ggml_scale_inplace(ctx, controls[controls.size() - 1], control_strength); auto cs = ggml_scale_inplace(ctx->ggml_ctx, controls[controls.size() - 1], control_strength);
h = ggml_add(ctx, h, cs); // middle control h = ggml_add(ctx->ggml_ctx, h, cs); // middle control
} }
int control_offset = controls.size() - 2; int control_offset = controls.size() - 2;
@ -531,12 +536,12 @@ public:
hs.pop_back(); hs.pop_back();
if (controls.size() > 0) { if (controls.size() > 0) {
auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength); auto cs = ggml_scale_inplace(ctx->ggml_ctx, controls[control_offset], control_strength);
h_skip = ggml_add(ctx, h_skip, cs); // control net condition h_skip = ggml_add(ctx->ggml_ctx, h_skip, cs); // control net condition
control_offset--; control_offset--;
} }
h = ggml_concat(ctx, h, h_skip, 2); h = ggml_concat(ctx->ggml_ctx, h, h_skip, 2);
std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0"; std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
@ -546,13 +551,13 @@ public:
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) { if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1"; std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";
h = attention_layer_forward(name, ctx, backend, h, context, num_video_frames); h = attention_layer_forward(name, ctx, h, context, num_video_frames);
up_sample_idx++; up_sample_idx++;
} }
if (i > 0 && j == num_res_blocks) { if (i > 0 && j == num_res_blocks) {
if (version == VERSION_SD1_TINY_UNET) { if (tiny_unet) {
output_block_idx++; output_block_idx++;
if (output_block_idx == 2) { if (output_block_idx == 2) {
up_sample_idx = 1; up_sample_idx = 1;
@ -572,7 +577,7 @@ public:
// out // out
h = out_0->forward(ctx, h); h = out_0->forward(ctx, h);
h = ggml_silu_inplace(ctx, h); h = ggml_silu_inplace(ctx->ggml_ctx, h);
h = out_2->forward(ctx, h); h = out_2->forward(ctx, h);
ggml_set_name(h, "bench-end"); ggml_set_name(h, "bench-end");
return h; // [N, out_channels, h, w] return h; // [N, out_channels, h, w]
@ -584,24 +589,11 @@ struct UNetModelRunner : public GGMLRunner {
UNetModelRunner(ggml_backend_t backend, UNetModelRunner(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types, const String2TensorStorage& tensor_storage_map,
const std::string prefix, const std::string prefix,
SDVersion version = VERSION_SD1, SDVersion version = VERSION_SD1)
bool flash_attn = false) : GGMLRunner(backend, offload_params_to_cpu), unet(version, tensor_storage_map) {
: GGMLRunner(backend, offload_params_to_cpu), unet(version, tensor_types, flash_attn) { unet.init(params_ctx, tensor_storage_map, prefix);
unet.init(params_ctx, tensor_types, prefix);
}
void enable_conv2d_direct() {
std::vector<GGMLBlock*> blocks;
unet.get_all_blocks(blocks);
for (auto block : blocks) {
if (block->get_desc() == "Conv2d") {
LOG_DEBUG("block %s", block->get_desc().c_str());
auto conv_block = (Conv2d*)block;
conv_block->enable_direct();
}
}
} }
std::string get_desc() override { std::string get_desc() override {
@ -620,7 +612,7 @@ struct UNetModelRunner : public GGMLRunner {
int num_video_frames = -1, int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {}, std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f) { float control_strength = 0.f) {
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, UNET_GRAPH_SIZE, false); struct ggml_cgraph* gf = new_graph_custom(UNET_GRAPH_SIZE);
if (num_video_frames == -1) { if (num_video_frames == -1) {
num_video_frames = x->ne[3]; num_video_frames = x->ne[3];
@ -636,8 +628,9 @@ struct UNetModelRunner : public GGMLRunner {
controls[i] = to_backend(controls[i]); controls[i] = to_backend(controls[i]);
} }
struct ggml_tensor* out = unet.forward(compute_ctx, auto runner_ctx = get_context();
runtime_backend,
struct ggml_tensor* out = unet.forward(&runner_ctx,
x, x,
timesteps, timesteps,
context, context,
@ -652,7 +645,7 @@ struct UNetModelRunner : public GGMLRunner {
return gf; return gf;
} }
void compute(int n_threads, bool compute(int n_threads,
struct ggml_tensor* x, struct ggml_tensor* x,
struct ggml_tensor* timesteps, struct ggml_tensor* timesteps,
struct ggml_tensor* context, struct ggml_tensor* context,
@ -672,7 +665,7 @@ struct UNetModelRunner : public GGMLRunner {
return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength); return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength);
}; };
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
} }
void test() { void test() {

View File

@ -9,12 +9,15 @@ struct UpscalerGGML {
std::shared_ptr<ESRGAN> esrgan_upscaler; std::shared_ptr<ESRGAN> esrgan_upscaler;
std::string esrgan_path; std::string esrgan_path;
int n_threads; int n_threads;
bool direct = false; bool direct = false;
int tile_size = 128;
UpscalerGGML(int n_threads, UpscalerGGML(int n_threads,
bool direct = false) bool direct = false,
int tile_size = 128)
: n_threads(n_threads), : n_threads(n_threads),
direct(direct) { direct(direct),
tile_size(tile_size) {
} }
bool load_from_file(const std::string& esrgan_path, bool load_from_file(const std::string& esrgan_path,
@ -42,7 +45,7 @@ struct UpscalerGGML {
backend = ggml_backend_sycl_init(0); backend = ggml_backend_sycl_init(0);
#endif #endif
ModelLoader model_loader; ModelLoader model_loader;
if (!model_loader.init_from_file(esrgan_path)) { if (!model_loader.init_from_file_and_convert_name(esrgan_path)) {
LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str()); LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str());
} }
model_loader.set_wtype_override(model_data_type); model_loader.set_wtype_override(model_data_type);
@ -51,9 +54,9 @@ struct UpscalerGGML {
backend = ggml_backend_cpu_init(); backend = ggml_backend_cpu_init();
} }
LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, model_loader.tensor_storages_types); esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map());
if (direct) { if (direct) {
esrgan_upscaler->enable_conv2d_direct(); esrgan_upscaler->set_conv2d_direct_enabled(true);
} }
if (!esrgan_upscaler->load_from_file(esrgan_path, n_threads)) { if (!esrgan_upscaler->load_from_file(esrgan_path, n_threads)) {
return false; return false;
@ -82,7 +85,7 @@ struct UpscalerGGML {
} }
// LOG_DEBUG("upscale work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f); // LOG_DEBUG("upscale work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);
ggml_tensor* input_image_tensor = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, input_image.width, input_image.height, 3, 1); ggml_tensor* input_image_tensor = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, input_image.width, input_image.height, 3, 1);
sd_image_to_tensor(input_image, input_image_tensor); sd_image_to_ggml_tensor(input_image, input_image_tensor);
ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1); ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1);
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
@ -91,8 +94,8 @@ struct UpscalerGGML {
int64_t t0 = ggml_time_ms(); int64_t t0 = ggml_time_ms();
sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, on_tiling); sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, on_tiling);
esrgan_upscaler->free_compute_buffer(); esrgan_upscaler->free_compute_buffer();
ggml_tensor_clamp(upscaled, 0.f, 1.f); ggml_ext_tensor_clamp_inplace(upscaled, 0.f, 1.f);
uint8_t* upscaled_data = sd_tensor_to_image(upscaled); uint8_t* upscaled_data = ggml_tensor_to_sd_image(upscaled);
ggml_free(upscale_ctx); ggml_free(upscale_ctx);
int64_t t3 = ggml_time_ms(); int64_t t3 = ggml_time_ms();
LOG_INFO("input_image_tensor upscaled, taking %.2fs", (t3 - t0) / 1000.0f); LOG_INFO("input_image_tensor upscaled, taking %.2fs", (t3 - t0) / 1000.0f);
@ -113,14 +116,15 @@ struct upscaler_ctx_t {
upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
bool offload_params_to_cpu, bool offload_params_to_cpu,
bool direct, bool direct,
int n_threads) { int n_threads,
int tile_size) {
upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t)); upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
if (upscaler_ctx == nullptr) { if (upscaler_ctx == nullptr) {
return nullptr; return nullptr;
} }
std::string esrgan_path(esrgan_path_c_str); std::string esrgan_path(esrgan_path_c_str);
upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct); upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size);
if (upscaler_ctx->upscaler == nullptr) { if (upscaler_ctx->upscaler == nullptr) {
return nullptr; return nullptr;
} }

133
util.cpp
View File

@ -5,6 +5,7 @@
#include <cstdarg> #include <cstdarg>
#include <fstream> #include <fstream>
#include <locale> #include <locale>
#include <regex>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <thread> #include <thread>
@ -26,9 +27,6 @@
#include "ggml.h" #include "ggml.h"
#include "stable-diffusion.h" #include "stable-diffusion.h"
#define STB_IMAGE_RESIZE_IMPLEMENTATION
#include "stb_image_resize.h"
bool ends_with(const std::string& str, const std::string& ending) { bool ends_with(const std::string& str, const std::string& ending) {
if (str.length() >= ending.length()) { if (str.length() >= ending.length()) {
return (str.compare(str.length() - ending.length(), ending.length(), ending) == 0); return (str.compare(str.length() - ending.length(), ending.length(), ending) == 0);
@ -59,7 +57,7 @@ void replace_all_chars(std::string& str, char target, char replacement) {
} }
} }
std::string format(const char* fmt, ...) { std::string sd_format(const char* fmt, ...) {
va_list ap; va_list ap;
va_list ap2; va_list ap2;
va_start(ap, fmt); va_start(ap, fmt);
@ -97,20 +95,6 @@ bool is_directory(const std::string& path) {
return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY)); return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
} }
std::string get_full_path(const std::string& dir, const std::string& filename) {
std::string full_path = dir + "\\" + filename;
WIN32_FIND_DATA find_file_data;
HANDLE hFind = FindFirstFile(full_path.c_str(), &find_file_data);
if (hFind != INVALID_HANDLE_VALUE) {
FindClose(hFind);
return full_path;
} else {
return "";
}
}
#else // Unix #else // Unix
#include <dirent.h> #include <dirent.h>
#include <sys/stat.h> #include <sys/stat.h>
@ -125,32 +109,12 @@ bool is_directory(const std::string& path) {
return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode)); return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
} }
// TODO: add windows version
std::string get_full_path(const std::string& dir, const std::string& filename) {
DIR* dp = opendir(dir.c_str());
if (dp != nullptr) {
struct dirent* entry;
while ((entry = readdir(dp)) != nullptr) {
if (strcasecmp(entry->d_name, filename.c_str()) == 0) {
closedir(dp);
return dir + "/" + entry->d_name;
}
}
closedir(dp);
}
return "";
}
#endif #endif
// get_num_physical_cores is copy from // get_num_physical_cores is copy from
// https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp // https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
// LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE // LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
int32_t get_num_physical_cores() { int32_t sd_get_num_physical_cores() {
#ifdef __linux__ #ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores // enumerate the set of thread siblings, num entries is num cores
std::unordered_set<std::string> siblings; std::unordered_set<std::string> siblings;
@ -188,6 +152,13 @@ int32_t get_num_physical_cores() {
static sd_progress_cb_t sd_progress_cb = nullptr; static sd_progress_cb_t sd_progress_cb = nullptr;
void* sd_progress_cb_data = nullptr; void* sd_progress_cb_data = nullptr;
static sd_preview_cb_t sd_preview_cb = nullptr;
static void* sd_preview_cb_data = nullptr;
preview_t sd_preview_mode = PREVIEW_NONE;
int sd_preview_interval = 1;
bool sd_preview_denoised = true;
bool sd_preview_noisy = false;
std::u32string utf8_to_utf32(const std::string& utf8_str) { std::u32string utf8_to_utf32(const std::string& utf8_str) {
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter; std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
return converter.from_bytes(utf8_str); return converter.from_bytes(utf8_str);
@ -269,13 +240,16 @@ void pretty_progress(int step, int steps, float time) {
} }
} }
progress += "|"; progress += "|";
printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s\033[K",
progress.c_str(), step, steps, const char* lf = (step == steps ? "\n" : "");
time > 1.0f || time == 0 ? time : (1.0f / time)); const char* unit = "s/it";
fflush(stdout); // for linux float speed = time;
if (step == steps) { if (speed < 1.0f && speed > 0.f) {
printf("\n"); speed = 1.0f / speed;
unit = "it/s";
} }
printf("\r%s %i/%i - %.2f%s\033[K%s", progress.c_str(), step, steps, speed, unit, lf);
fflush(stdout); // for linux
} }
std::string ltrim(const std::string& s) { std::string ltrim(const std::string& s) {
@ -331,23 +305,58 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
sd_progress_cb = cb; sd_progress_cb = cb;
sd_progress_cb_data = data; sd_progress_cb_data = data;
} }
void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode, int interval, bool denoised, bool noisy, void* data) {
sd_preview_cb = cb;
sd_preview_cb_data = data;
sd_preview_mode = mode;
sd_preview_interval = interval;
sd_preview_denoised = denoised;
sd_preview_noisy = noisy;
}
sd_preview_cb_t sd_get_preview_callback() {
return sd_preview_cb;
}
void* sd_get_preview_callback_data() {
return sd_preview_cb_data;
}
preview_t sd_get_preview_mode() {
return sd_preview_mode;
}
int sd_get_preview_interval() {
return sd_preview_interval;
}
bool sd_should_preview_denoised() {
return sd_preview_denoised;
}
bool sd_should_preview_noisy() {
return sd_preview_noisy;
}
sd_progress_cb_t sd_get_progress_callback() {
return sd_progress_cb;
}
void* sd_get_progress_callback_data() {
return sd_progress_cb_data;
}
const char* sd_get_system_info() { const char* sd_get_system_info() {
static char buffer[1024]; static char buffer[1024];
std::stringstream ss; std::stringstream ss;
ss << "System Info: \n"; ss << "System Info: \n";
ss << " SSE3 = " << ggml_cpu_has_sse3() << std::endl; ss << " SSE3 = " << ggml_cpu_has_sse3() << " | ";
ss << " AVX = " << ggml_cpu_has_avx() << std::endl; ss << " AVX = " << ggml_cpu_has_avx() << " | ";
ss << " AVX2 = " << ggml_cpu_has_avx2() << std::endl; ss << " AVX2 = " << ggml_cpu_has_avx2() << " | ";
ss << " AVX512 = " << ggml_cpu_has_avx512() << std::endl; ss << " AVX512 = " << ggml_cpu_has_avx512() << " | ";
ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << std::endl; ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | ";
ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << std::endl; ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | ";
ss << " FMA = " << ggml_cpu_has_fma() << std::endl; ss << " FMA = " << ggml_cpu_has_fma() << " | ";
ss << " NEON = " << ggml_cpu_has_neon() << std::endl; ss << " NEON = " << ggml_cpu_has_neon() << " | ";
ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << std::endl; ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | ";
ss << " F16C = " << ggml_cpu_has_f16c() << std::endl; ss << " F16C = " << ggml_cpu_has_f16c() << " | ";
ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << std::endl; ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | ";
ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << std::endl; ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | ";
ss << " VSX = " << ggml_cpu_has_vsx() << std::endl; ss << " VSX = " << ggml_cpu_has_vsx() << " | ";
snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str()); snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
return buffer; return buffer;
} }
@ -513,6 +522,8 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int targe
// (abc) - increases attention to abc by a multiplier of 1.1 // (abc) - increases attention to abc by a multiplier of 1.1
// (abc:3.12) - increases attention to abc by a multiplier of 3.12 // (abc:3.12) - increases attention to abc by a multiplier of 3.12
// [abc] - decreases attention to abc by a multiplier of 1.1 // [abc] - decreases attention to abc by a multiplier of 1.1
// BREAK - separates the prompt into conceptually distinct parts for sequential processing
// B - internal helper pattern; prevents 'B' in 'BREAK' from being consumed as normal text
// \( - literal character '(' // \( - literal character '('
// \[ - literal character '[' // \[ - literal character '['
// \) - literal character ')' // \) - literal character ')'
@ -548,7 +559,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
float round_bracket_multiplier = 1.1f; float round_bracket_multiplier = 1.1f;
float square_bracket_multiplier = 1 / 1.1f; float square_bracket_multiplier = 1 / 1.1f;
std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)"); std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|\bBREAK\b|[^\\()\[\]:B]+|:|\bB)");
std::regex re_break(R"(\s*\bBREAK\b\s*)"); std::regex re_break(R"(\s*\bBREAK\b\s*)");
auto multiply_range = [&](int start_position, float multiplier) { auto multiply_range = [&](int start_position, float multiplier) {
@ -557,7 +568,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
} }
}; };
std::smatch m; std::smatch m, m2;
std::string remaining_text = text; std::string remaining_text = text;
while (std::regex_search(remaining_text, m, re_attention)) { while (std::regex_search(remaining_text, m, re_attention)) {
@ -581,6 +592,8 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
square_brackets.pop_back(); square_brackets.pop_back();
} else if (text == "\\(") { } else if (text == "\\(") {
res.push_back({text.substr(1), 1.0f}); res.push_back({text.substr(1), 1.0f});
} else if (std::regex_search(text, m2, re_break)) {
res.push_back({"BREAK", -1.0f});
} else { } else {
res.push_back({text, 1.0f}); res.push_back({text, 1.0f});
} }

13
util.h
View File

@ -14,7 +14,7 @@ bool ends_with(const std::string& str, const std::string& ending);
bool starts_with(const std::string& str, const std::string& start); bool starts_with(const std::string& str, const std::string& start);
bool contains(const std::string& str, const std::string& substr); bool contains(const std::string& str, const std::string& substr);
std::string format(const char* fmt, ...); std::string sd_format(const char* fmt, ...);
void replace_all_chars(std::string& str, char target, char replacement); void replace_all_chars(std::string& str, char target, char replacement);
@ -22,7 +22,6 @@ int round_up_to(int value, int base);
bool file_exists(const std::string& filename); bool file_exists(const std::string& filename);
bool is_directory(const std::string& path); bool is_directory(const std::string& path);
std::string get_full_path(const std::string& dir, const std::string& filename);
std::u32string utf8_to_utf32(const std::string& utf8_str); std::u32string utf8_to_utf32(const std::string& utf8_str);
std::string utf32_to_utf8(const std::u32string& utf32_str); std::string utf32_to_utf8(const std::u32string& utf32_str);
@ -54,6 +53,16 @@ std::string trim(const std::string& s);
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text); std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);
sd_progress_cb_t sd_get_progress_callback();
void* sd_get_progress_callback_data();
sd_preview_cb_t sd_get_preview_callback();
void* sd_get_preview_callback_data();
preview_t sd_get_preview_mode();
int sd_get_preview_interval();
bool sd_should_preview_denoised();
bool sd_should_preview_noisy();
#define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)

300
vae.hpp
View File

@ -30,7 +30,7 @@ public:
} }
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [N, in_channels, h, w] // x: [N, in_channels, h, w]
// t_emb is always None // t_emb is always None
auto norm1 = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm1"]); auto norm1 = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm1"]);
@ -40,12 +40,12 @@ public:
auto h = x; auto h = x;
h = norm1->forward(ctx, h); h = norm1->forward(ctx, h);
h = ggml_silu_inplace(ctx, h); // swish h = ggml_silu_inplace(ctx->ggml_ctx, h); // swish
h = conv1->forward(ctx, h); h = conv1->forward(ctx, h);
// return h; // return h;
h = norm2->forward(ctx, h); h = norm2->forward(ctx, h);
h = ggml_silu_inplace(ctx, h); // swish h = ggml_silu_inplace(ctx->ggml_ctx, h); // swish
// dropout, skip for inference // dropout, skip for inference
h = conv2->forward(ctx, h); h = conv2->forward(ctx, h);
@ -56,7 +56,7 @@ public:
x = nin_shortcut->forward(ctx, x); // [N, out_channels, h, w] x = nin_shortcut->forward(ctx, x); // [N, out_channels, h, w]
} }
h = ggml_add(ctx, h, x); h = ggml_add(ctx->ggml_ctx, h, x);
return h; // [N, out_channels, h, w] return h; // [N, out_channels, h, w]
} }
}; };
@ -64,25 +64,51 @@ public:
class AttnBlock : public UnaryBlock { class AttnBlock : public UnaryBlock {
protected: protected:
int64_t in_channels; int64_t in_channels;
bool use_linear;
public: void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
AttnBlock(int64_t in_channels) auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
: in_channels(in_channels) { if (iter != tensor_storage_map.end()) {
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels)); if (iter->second.n_dims == 4 && use_linear) {
blocks["q"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1})); use_linear = false;
blocks["k"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1})); blocks["q"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
blocks["v"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1})); blocks["k"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
blocks["v"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1})); blocks["proj_out"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
} else if (iter->second.n_dims == 2 && !use_linear) {
use_linear = true;
blocks["q"] = std::make_shared<Linear>(in_channels, in_channels);
blocks["k"] = std::make_shared<Linear>(in_channels, in_channels);
blocks["v"] = std::make_shared<Linear>(in_channels, in_channels);
blocks["proj_out"] = std::make_shared<Linear>(in_channels, in_channels);
}
}
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { public:
AttnBlock(int64_t in_channels, bool use_linear)
: in_channels(in_channels), use_linear(use_linear) {
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
if (use_linear) {
blocks["q"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, in_channels));
blocks["k"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, in_channels));
blocks["v"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, in_channels));
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, in_channels));
} else {
blocks["q"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
blocks["k"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
blocks["v"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
}
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [N, in_channels, h, w] // x: [N, in_channels, h, w]
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]); auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
auto q_proj = std::dynamic_pointer_cast<Conv2d>(blocks["q"]); auto q_proj = std::dynamic_pointer_cast<UnaryBlock>(blocks["q"]);
auto k_proj = std::dynamic_pointer_cast<Conv2d>(blocks["k"]); auto k_proj = std::dynamic_pointer_cast<UnaryBlock>(blocks["k"]);
auto v_proj = std::dynamic_pointer_cast<Conv2d>(blocks["v"]); auto v_proj = std::dynamic_pointer_cast<UnaryBlock>(blocks["v"]);
auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]); auto proj_out = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_out"]);
auto h_ = norm->forward(ctx, x); auto h_ = norm->forward(ctx, x);
@ -91,25 +117,46 @@ public:
const int64_t h = h_->ne[1]; const int64_t h = h_->ne[1];
const int64_t w = h_->ne[0]; const int64_t w = h_->ne[0];
auto q = q_proj->forward(ctx, h_); // [N, in_channels, h, w] ggml_tensor* q;
q = ggml_cont(ctx, ggml_permute(ctx, q, 1, 2, 0, 3)); // [N, h, w, in_channels] ggml_tensor* k;
q = ggml_reshape_3d(ctx, q, c, h * w, n); // [N, h * w, in_channels] ggml_tensor* v;
if (use_linear) {
h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 2, 0, 3)); // [N, h, w, in_channels]
h_ = ggml_reshape_3d(ctx->ggml_ctx, h_, c, h * w, n); // [N, h * w, in_channels]
auto k = k_proj->forward(ctx, h_); // [N, in_channels, h, w] q = q_proj->forward(ctx, h_); // [N, h * w, in_channels]
k = ggml_cont(ctx, ggml_permute(ctx, k, 1, 2, 0, 3)); // [N, h, w, in_channels] k = k_proj->forward(ctx, h_); // [N, h * w, in_channels]
k = ggml_reshape_3d(ctx, k, c, h * w, n); // [N, h * w, in_channels] v = v_proj->forward(ctx, h_); // [N, h * w, in_channels]
auto v = v_proj->forward(ctx, h_); // [N, in_channels, h, w] v = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, v, 1, 0, 2, 3)); // [N, in_channels, h * w]
v = ggml_reshape_3d(ctx, v, h * w, c, n); // [N, in_channels, h * w] } else {
q = q_proj->forward(ctx, h_); // [N, in_channels, h, w]
q = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, q, 1, 2, 0, 3)); // [N, h, w, in_channels]
q = ggml_reshape_3d(ctx->ggml_ctx, q, c, h * w, n); // [N, h * w, in_channels]
h_ = ggml_nn_attention(ctx, q, k, v, false); // [N, h * w, in_channels] k = k_proj->forward(ctx, h_); // [N, in_channels, h, w]
k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 1, 2, 0, 3)); // [N, h, w, in_channels]
k = ggml_reshape_3d(ctx->ggml_ctx, k, c, h * w, n); // [N, h * w, in_channels]
h_ = ggml_cont(ctx, ggml_permute(ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w] v = v_proj->forward(ctx, h_); // [N, in_channels, h, w]
h_ = ggml_reshape_4d(ctx, h_, w, h, c, n); // [N, in_channels, h, w] v = ggml_reshape_3d(ctx->ggml_ctx, v, h * w, c, n); // [N, in_channels, h * w]
}
h_ = proj_out->forward(ctx, h_); // [N, in_channels, h, w] h_ = ggml_ext_attention(ctx->ggml_ctx, q, k, v, false); // [N, h * w, in_channels]
h_ = ggml_add(ctx, h_, x); if (use_linear) {
h_ = proj_out->forward(ctx, h_); // [N, h * w, in_channels]
h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w]
h_ = ggml_reshape_4d(ctx->ggml_ctx, h_, w, h, c, n); // [N, in_channels, h, w]
} else {
h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w]
h_ = ggml_reshape_4d(ctx->ggml_ctx, h_, w, h, c, n); // [N, in_channels, h, w]
h_ = proj_out->forward(ctx, h_); // [N, in_channels, h, w]
}
h_ = ggml_add(ctx->ggml_ctx, h_, x);
return h_; return h_;
} }
}; };
@ -133,7 +180,7 @@ public:
kernel_padding)); kernel_padding));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x) override { struct ggml_tensor* x) override {
// timesteps always None // timesteps always None
// skip_video always False // skip_video always False
@ -152,24 +199,24 @@ public:
int64_t H = x->ne[1]; int64_t H = x->ne[1];
int64_t W = x->ne[0]; int64_t W = x->ne[0];
x = ggml_reshape_4d(ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w) x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w) x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
x = time_mix_conv->forward(ctx, x); // [B, OC, T, OH * OW] x = time_mix_conv->forward(ctx, x); // [B, OC, T, OH * OW]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w) x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
x = ggml_reshape_4d(ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
return x; // [B*T, OC, OH, OW] return x; // [B*T, OC, OH, OW]
} }
}; };
class VideoResnetBlock : public ResnetBlock { class VideoResnetBlock : public ResnetBlock {
protected: protected:
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_types, GGML_TYPE_F32); enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_storage_map, GGML_TYPE_F32);
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
} }
float get_alpha() { float get_alpha() {
float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]); float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
return sigmoid(alpha); return sigmoid(alpha);
} }
@ -182,7 +229,7 @@ public:
blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true)); blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true));
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [N, in_channels, h, w] aka [b*t, in_channels, h, w] // x: [N, in_channels, h, w] aka [b*t, in_channels, h, w]
// return: [N, out_channels, h, w] aka [b*t, out_channels, h, w] // return: [N, out_channels, h, w] aka [b*t, out_channels, h, w]
// t_emb is always None // t_emb is always None
@ -199,19 +246,19 @@ public:
int64_t H = x->ne[1]; int64_t H = x->ne[1];
int64_t W = x->ne[0]; int64_t W = x->ne[0];
x = ggml_reshape_4d(ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w) x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w) x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
auto x_mix = x; auto x_mix = x;
x = time_stack->forward(ctx, x); // b t c (h w) x = time_stack->forward(ctx, x); // b t c (h w)
float alpha = get_alpha(); float alpha = get_alpha();
x = ggml_add(ctx, x = ggml_add(ctx->ggml_ctx,
ggml_scale(ctx, x, alpha), ggml_scale(ctx->ggml_ctx, x, alpha),
ggml_scale(ctx, x_mix, 1.0f - alpha)); ggml_scale(ctx->ggml_ctx, x_mix, 1.0f - alpha));
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w) x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
x = ggml_reshape_4d(ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
return x; return x;
} }
@ -233,7 +280,8 @@ public:
int num_res_blocks, int num_res_blocks,
int in_channels, int in_channels,
int z_channels, int z_channels,
bool double_z = true) bool double_z = true,
bool use_linear_projection = false)
: ch(ch), : ch(ch),
ch_mult(ch_mult), ch_mult(ch_mult),
num_res_blocks(num_res_blocks), num_res_blocks(num_res_blocks),
@ -264,14 +312,14 @@ public:
} }
blocks["mid.block_1"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in)); blocks["mid.block_1"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));
blocks["mid.attn_1"] = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in)); blocks["mid.attn_1"] = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in, use_linear_projection));
blocks["mid.block_2"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in)); blocks["mid.block_2"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));
blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(block_in)); blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(block_in));
blocks["conv_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1})); blocks["conv_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}));
} }
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, in_channels, h, w] // x: [N, in_channels, h, w]
auto conv_in = std::dynamic_pointer_cast<Conv2d>(blocks["conv_in"]); auto conv_in = std::dynamic_pointer_cast<Conv2d>(blocks["conv_in"]);
@ -307,8 +355,8 @@ public:
// end // end
h = norm_out->forward(ctx, h); h = norm_out->forward(ctx, h);
h = ggml_silu_inplace(ctx, h); // nonlinearity/swish h = ggml_silu_inplace(ctx->ggml_ctx, h); // nonlinearity/swish
h = conv_out->forward(ctx, h); // [N, z_channels*2, h, w] h = conv_out->forward(ctx, h); // [N, z_channels*2, h, w]
return h; return h;
} }
}; };
@ -351,8 +399,9 @@ public:
std::vector<int> ch_mult, std::vector<int> ch_mult,
int num_res_blocks, int num_res_blocks,
int z_channels, int z_channels,
bool video_decoder = false, bool use_linear_projection = false,
int video_kernel_size = 3) bool video_decoder = false,
int video_kernel_size = 3)
: ch(ch), : ch(ch),
out_ch(out_ch), out_ch(out_ch),
ch_mult(ch_mult), ch_mult(ch_mult),
@ -366,7 +415,7 @@ public:
blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1})); blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}));
blocks["mid.block_1"] = get_resnet_block(block_in, block_in); blocks["mid.block_1"] = get_resnet_block(block_in, block_in);
blocks["mid.attn_1"] = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in)); blocks["mid.attn_1"] = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in, use_linear_projection));
blocks["mid.block_2"] = get_resnet_block(block_in, block_in); blocks["mid.block_2"] = get_resnet_block(block_in, block_in);
for (int i = num_resolutions - 1; i >= 0; i--) { for (int i = num_resolutions - 1; i >= 0; i--) {
@ -388,7 +437,7 @@ public:
blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1}); blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1});
} }
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) { virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* z) {
// z: [N, z_channels, h, w] // z: [N, z_channels, h, w]
// alpha is always 0 // alpha is always 0
// merge_strategy is always learned // merge_strategy is always learned
@ -429,8 +478,8 @@ public:
} }
h = norm_out->forward(ctx, h); h = norm_out->forward(ctx, h);
h = ggml_silu_inplace(ctx, h); // nonlinearity/swish h = ggml_silu_inplace(ctx->ggml_ctx, h); // nonlinearity/swish
h = conv_out->forward(ctx, h); // [N, out_ch, h*8, w*8] h = conv_out->forward(ctx, h); // [N, out_ch, h*8, w*8]
return h; return h;
} }
}; };
@ -438,6 +487,7 @@ public:
// ldm.models.autoencoder.AutoencoderKL // ldm.models.autoencoder.AutoencoderKL
class AutoencodingEngine : public GGMLBlock { class AutoencodingEngine : public GGMLBlock {
protected: protected:
SDVersion version;
bool decode_only = true; bool decode_only = true;
bool use_video_decoder = false; bool use_video_decoder = false;
bool use_quant = true; bool use_quant = true;
@ -454,13 +504,19 @@ protected:
} dd_config; } dd_config;
public: public:
AutoencodingEngine(bool decode_only = true, AutoencodingEngine(SDVersion version = VERSION_SD1,
bool use_video_decoder = false, bool decode_only = true,
SDVersion version = VERSION_SD1) bool use_linear_projection = false,
: decode_only(decode_only), use_video_decoder(use_video_decoder) { bool use_video_decoder = false)
: version(version), decode_only(decode_only), use_video_decoder(use_video_decoder) {
if (sd_version_is_dit(version)) { if (sd_version_is_dit(version)) {
dd_config.z_channels = 16; if (sd_version_is_flux2(version)) {
use_quant = false; dd_config.z_channels = 32;
embed_dim = 32;
} else {
use_quant = false;
dd_config.z_channels = 16;
}
} }
if (use_video_decoder) { if (use_video_decoder) {
use_quant = false; use_quant = false;
@ -470,6 +526,7 @@ public:
dd_config.ch_mult, dd_config.ch_mult,
dd_config.num_res_blocks, dd_config.num_res_blocks,
dd_config.z_channels, dd_config.z_channels,
use_linear_projection,
use_video_decoder)); use_video_decoder));
if (use_quant) { if (use_quant) {
blocks["post_quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(dd_config.z_channels, blocks["post_quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(dd_config.z_channels,
@ -482,7 +539,8 @@ public:
dd_config.num_res_blocks, dd_config.num_res_blocks,
dd_config.in_channels, dd_config.in_channels,
dd_config.z_channels, dd_config.z_channels,
dd_config.double_z)); dd_config.double_z,
use_linear_projection));
if (use_quant) { if (use_quant) {
int factor = dd_config.double_z ? 2 : 1; int factor = dd_config.double_z ? 2 : 1;
@ -493,8 +551,26 @@ public:
} }
} }
struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) { struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) {
// z: [N, z_channels, h, w] // z: [N, z_channels, h, w]
if (sd_version_is_flux2(version)) {
// [N, C*p*p, h, w] -> [N, C, h*p, w*p]
int64_t p = 2;
int64_t N = z->ne[3];
int64_t C = z->ne[2] / p / p;
int64_t h = z->ne[1];
int64_t w = z->ne[0];
int64_t H = h * p;
int64_t W = w * p;
z = ggml_reshape_4d(ctx->ggml_ctx, z, w * h, p * p, C, N); // [N, C, p*p, h*w]
z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 1, 0, 2, 3)); // [N, C, h*w, p*p]
z = ggml_reshape_4d(ctx->ggml_ctx, z, p, p, w, h * C * N); // [N*C*h, w, p, p]
z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 0, 2, 1, 3)); // [N*C*h, p, w, p]
z = ggml_reshape_4d(ctx->ggml_ctx, z, W, H, C, N); // [N, C, h*p, w*p]
}
if (use_quant) { if (use_quant) {
auto post_quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["post_quant_conv"]); auto post_quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["post_quant_conv"]);
z = post_quant_conv->forward(ctx, z); // [N, z_channels, h, w] z = post_quant_conv->forward(ctx, z); // [N, z_channels, h, w]
@ -507,56 +583,100 @@ public:
return h; return h;
} }
struct ggml_tensor* encode(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* encode(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, in_channels, h, w] // x: [N, in_channels, h, w]
auto encoder = std::dynamic_pointer_cast<Encoder>(blocks["encoder"]); auto encoder = std::dynamic_pointer_cast<Encoder>(blocks["encoder"]);
auto h = encoder->forward(ctx, x); // [N, 2*z_channels, h/8, w/8] auto z = encoder->forward(ctx, x); // [N, 2*z_channels, h/8, w/8]
if (use_quant) { if (use_quant) {
auto quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["quant_conv"]); auto quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["quant_conv"]);
h = quant_conv->forward(ctx, h); // [N, 2*embed_dim, h/8, w/8] z = quant_conv->forward(ctx, z); // [N, 2*embed_dim, h/8, w/8]
} }
return h; if (sd_version_is_flux2(version)) {
z = ggml_ext_chunk(ctx->ggml_ctx, z, 2, 2)[0];
// [N, C, H, W] -> [N, C*p*p, H/p, W/p]
int64_t p = 2;
int64_t N = z->ne[3];
int64_t C = z->ne[2];
int64_t H = z->ne[1];
int64_t W = z->ne[0];
int64_t h = H / p;
int64_t w = W / p;
z = ggml_reshape_4d(ctx->ggml_ctx, z, p, w, p, h * C * N); // [N*C*h, p, w, p]
z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 0, 2, 1, 3)); // [N*C*h, w, p, p]
z = ggml_reshape_4d(ctx->ggml_ctx, z, p * p, w * h, C, N); // [N, C, h*w, p*p]
z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 1, 0, 2, 3)); // [N, C, p*p, h*w]
z = ggml_reshape_4d(ctx->ggml_ctx, z, w, h, p * p * C, N); // [N, C*p*p, h*w]
}
return z;
} }
}; };
struct VAE : public GGMLRunner { struct VAE : public GGMLRunner {
VAE(ggml_backend_t backend, bool offload_params_to_cpu) VAE(ggml_backend_t backend, bool offload_params_to_cpu)
: GGMLRunner(backend, offload_params_to_cpu) {} : GGMLRunner(backend, offload_params_to_cpu) {}
virtual void compute(const int n_threads, virtual bool compute(const int n_threads,
struct ggml_tensor* z, struct ggml_tensor* z,
bool decode_graph, bool decode_graph,
struct ggml_tensor** output, struct ggml_tensor** output,
struct ggml_context* output_ctx) = 0; struct ggml_context* output_ctx) = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0; virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0;
virtual void enable_conv2d_direct(){};
virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); }; virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
}; };
// A no-op VAE: "decoding"/"encoding" simply copies the latent tensor through
// unchanged. Useful when the latent space itself is the desired output.
struct FakeVAE : public VAE {
    FakeVAE(ggml_backend_t backend, bool offload_params_to_cpu)
        : VAE(backend, offload_params_to_cpu) {}

    // Copies z element-wise into *output (allocating it from output_ctx when
    // the caller did not provide one). decode_graph is ignored: both
    // directions are the identity.
    bool compute(const int n_threads,
                 struct ggml_tensor* z,
                 bool decode_graph,
                 struct ggml_tensor** output,
                 struct ggml_context* output_ctx) override {
        if (*output == nullptr && output_ctx != nullptr) {
            *output = ggml_dup_tensor(output_ctx, z);
        }
        ggml_ext_tensor_iter(z, [&](ggml_tensor* src, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
            ggml_ext_tensor_set_f32(*output, ggml_ext_tensor_get_f32(src, i0, i1, i2, i3), i0, i1, i2, i3);
        });
        return true;
    }

    // No parameters to load: this VAE has no weights.
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) override {}

    std::string get_desc() override {
        return "fake_vae";
    }
};
struct AutoEncoderKL : public VAE { struct AutoEncoderKL : public VAE {
bool decode_only = true; bool decode_only = true;
AutoencodingEngine ae; AutoencodingEngine ae;
AutoEncoderKL(ggml_backend_t backend, AutoEncoderKL(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types, const String2TensorStorage& tensor_storage_map,
const std::string prefix, const std::string prefix,
bool decode_only = false, bool decode_only = false,
bool use_video_decoder = false, bool use_video_decoder = false,
SDVersion version = VERSION_SD1) SDVersion version = VERSION_SD1)
: decode_only(decode_only), ae(decode_only, use_video_decoder, version), VAE(backend, offload_params_to_cpu) { : decode_only(decode_only), VAE(backend, offload_params_to_cpu) {
ae.init(params_ctx, tensor_types, prefix); bool use_linear_projection = false;
} for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
void enable_conv2d_direct() override { continue;
std::vector<GGMLBlock*> blocks; }
ae.get_all_blocks(blocks); if (ends_with(name, "attn_1.proj_out.weight")) {
for (auto block : blocks) { if (tensor_storage.n_dims == 2) {
if (block->get_desc() == "Conv2d") { use_linear_projection = true;
auto conv_block = (Conv2d*)block; }
conv_block->enable_direct(); break;
} }
} }
ae = AutoencodingEngine(version, decode_only, use_linear_projection, use_video_decoder);
ae.init(params_ctx, tensor_storage_map, prefix);
} }
void set_conv2d_scale(float scale) override { void set_conv2d_scale(float scale) override {
@ -583,14 +703,16 @@ struct AutoEncoderKL : public VAE {
z = to_backend(z); z = to_backend(z);
struct ggml_tensor* out = decode_graph ? ae.decode(compute_ctx, z) : ae.encode(compute_ctx, z); auto runner_ctx = get_context();
struct ggml_tensor* out = decode_graph ? ae.decode(&runner_ctx, z) : ae.encode(&runner_ctx, z);
ggml_build_forward_expand(gf, out); ggml_build_forward_expand(gf, out);
return gf; return gf;
} }
void compute(const int n_threads, bool compute(const int n_threads,
struct ggml_tensor* z, struct ggml_tensor* z,
bool decode_graph, bool decode_graph,
struct ggml_tensor** output, struct ggml_tensor** output,
@ -601,7 +723,7 @@ struct AutoEncoderKL : public VAE {
}; };
// ggml_set_f32(z, 0.5f); // ggml_set_f32(z, 0.5f);
// print_ggml_tensor(z); // print_ggml_tensor(z);
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
} }
void test() { void test() {

20
version.cpp Normal file
View File

@ -0,0 +1,20 @@
// Build-metadata accessors. SDCPP_BUILD_COMMIT and SDCPP_BUILD_VERSION are
// expected to be injected by the build system as raw (unquoted) tokens; they
// fall back to `unknown` when not provided.
#include "stable-diffusion.h"

#ifndef SDCPP_BUILD_COMMIT
#define SDCPP_BUILD_COMMIT unknown
#endif

#ifndef SDCPP_BUILD_VERSION
#define SDCPP_BUILD_VERSION unknown
#endif

// Two-level stringize so the macro's *value* (not its name) is quoted.
#define STRINGIZE2(x) #x
#define STRINGIZE(x) STRINGIZE2(x)

// Returns the git commit hash this binary was built from (or "unknown").
const char* sd_commit(void) {
    return STRINGIZE(SDCPP_BUILD_COMMIT);
}

// Returns the human-readable build version string (or "unknown").
const char* sd_version(void) {
    return STRINGIZE(SDCPP_BUILD_VERSION);
}

488508
vocab_mistral.hpp Normal file

File diff suppressed because it is too large Load Diff

566
wan.hpp

File diff suppressed because it is too large Load Diff

675
z_image.hpp Normal file
View File

@ -0,0 +1,675 @@
#ifndef __Z_IMAGE_HPP__
#define __Z_IMAGE_HPP__
#include <algorithm>
#include "flux.hpp"
#include "ggml_extend.hpp"
#include "mmdit.hpp"
// Ref: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/models/model.py
// Ref: https://github.com/huggingface/diffusers/pull/12703
// Classic min macro; guarded because other headers (e.g. system headers on
// some platforms) may define it first. Beware: arguments are evaluated twice.
#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
namespace ZImage {
// Maximum node count for the ggml compute graph built by ZImageRunner.
constexpr int Z_IMAGE_GRAPH_SIZE = 20480;
// Width of the adaLN modulation embedding (capped against hidden_size via MIN).
constexpr int ADALN_EMBED_DIM = 256;
// Caption/image token sequences are padded up to a multiple of this value.
constexpr int SEQ_MULTI_OF = 32;
// Multi-head self-attention with a fused QKV projection, optional RMS q/k
// normalization, and RoPE applied via Rope::attention. Supports
// num_kv_heads != num_heads (grouped-query layout) — Rope::attention is
// assumed to handle the head-count mismatch.
struct JointAttention : public GGMLBlock {
protected:
    int64_t head_dim;
    int64_t num_heads;
    int64_t num_kv_heads;
    bool qk_norm;

public:
    JointAttention(int64_t hidden_size, int64_t head_dim, int64_t num_heads, int64_t num_kv_heads, bool qk_norm)
        : head_dim(head_dim), num_heads(num_heads), num_kv_heads(num_kv_heads), qk_norm(qk_norm) {
        blocks["qkv"] = std::make_shared<Linear>(hidden_size, (num_heads + num_kv_heads * 2) * head_dim, false);
        float scale   = 1.f;
// FIX: was `#if GGML_USE_HIP`, which errors out if the macro is defined with
// an empty value and is inconsistent with `#ifdef SD_USE_VULKAN` below.
#ifdef GGML_USE_HIP
        // Prevent NaN issues with certain ROCm setups
        scale = 1.f / 16.f;
#endif
        blocks["out"] = std::make_shared<Linear>(num_heads * head_dim, hidden_size, false, false, false, scale);
        if (qk_norm) {
            blocks["q_norm"] = std::make_shared<RMSNorm>(head_dim);
            blocks["k_norm"] = std::make_shared<RMSNorm>(head_dim);
        }
    }

    // x:  [N, n_token, hidden_size]
    // pe: rotary position embedding, consumed by Rope::attention
    // returns [N, n_token, hidden_size]
    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* pe,
                                struct ggml_tensor* mask = nullptr) {
        // x: [N, n_token, hidden_size]
        int64_t n_token = x->ne[1];
        int64_t N       = x->ne[2];
        auto qkv_proj   = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
        auto out_proj   = std::dynamic_pointer_cast<Linear>(blocks["out"]);

        auto qkv = qkv_proj->forward(ctx, x);  // [N, n_token, (num_heads + num_kv_heads*2)*head_dim]
        qkv      = ggml_reshape_4d(ctx->ggml_ctx, qkv, head_dim, num_heads + num_kv_heads * 2, qkv->ne[1], qkv->ne[2]);  // [N, n_token, num_heads + num_kv_heads*2, head_dim]
        qkv      = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, qkv, 0, 2, 3, 1));                     // [num_heads + num_kv_heads*2, N, n_token, head_dim]

        // Split the fused projection into q/k/v views along the head axis.
        auto q = ggml_view_4d(ctx->ggml_ctx, qkv, qkv->ne[0], qkv->ne[1], qkv->ne[2], num_heads, qkv->nb[1], qkv->nb[2], qkv->nb[3], 0);                                          // [num_heads, N, n_token, head_dim]
        auto k = ggml_view_4d(ctx->ggml_ctx, qkv, qkv->ne[0], qkv->ne[1], qkv->ne[2], num_kv_heads, qkv->nb[1], qkv->nb[2], qkv->nb[3], qkv->nb[3] * num_heads);                  // [num_kv_heads, N, n_token, head_dim]
        auto v = ggml_view_4d(ctx->ggml_ctx, qkv, qkv->ne[0], qkv->ne[1], qkv->ne[2], num_kv_heads, qkv->nb[1], qkv->nb[2], qkv->nb[3], qkv->nb[3] * (num_heads + num_kv_heads)); // [num_kv_heads, N, n_token, head_dim]

        q = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, q, 0, 3, 1, 2));  // [N, n_token, num_heads, head_dim]
        k = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, k, 0, 3, 1, 2));  // [N, n_token, num_kv_heads, head_dim]
        v = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, v, 0, 3, 1, 2));  // [N, n_token, num_kv_heads, head_dim]

        if (qk_norm) {
            auto q_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
            auto k_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
            q           = q_norm->forward(ctx, q);
            k           = k_norm->forward(ctx, k);
        }

        // NOTE(review): fixed scale 1/128 is passed to Rope::attention —
        // presumably tied to head_dim = 128; confirm against the reference model.
        x = Rope::attention(ctx, q, k, v, pe, mask, 1.f / 128.f);  // [N, n_token, num_heads * head_dim]

        x = out_proj->forward(ctx, x);  // [N, n_token, hidden_size]
        return x;
    }
};
// SwiGLU feed-forward block: w2(silu(w1(x)) * w3(x)).
// hidden_dim is optionally rescaled by ffn_dim_multiplier and then rounded
// up to the nearest multiple of multiple_of.
class FeedForward : public GGMLBlock {
public:
    FeedForward(int64_t dim,
                int64_t hidden_dim,
                int64_t multiple_of,
                float ffn_dim_multiplier = 0.f) {
        if (ffn_dim_multiplier > 0.f) {
            hidden_dim = static_cast<int64_t>(ffn_dim_multiplier * hidden_dim);
        }
        // Round up to a multiple of multiple_of.
        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) / multiple_of);

        bool force_prec_f32 = false;
        float scale         = 1.f / 128.f;
#ifdef SD_USE_VULKAN
        force_prec_f32 = true;
#endif
        blocks["w1"] = std::make_shared<Linear>(dim, hidden_dim, false);
        // The purpose of the scale here is to prevent NaN issues in certain situations.
        // For example, when using CUDA but the weights are k-quants.
        blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false, false, force_prec_f32, scale);
        blocks["w3"] = std::make_shared<Linear>(dim, hidden_dim, false);
    }

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["w1"]);
        auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
        auto up_proj   = std::dynamic_pointer_cast<Linear>(blocks["w3"]);

        auto gated = ggml_mul(ctx->ggml_ctx,
                              ggml_silu(ctx->ggml_ctx, gate_proj->forward(ctx, x)),
                              up_proj->forward(ctx, x));
        return down_proj->forward(ctx, gated);
    }
};
// adaLN-style modulation: returns x * (1 + scale), broadcasting scale over
// the token axis. x: [N, L, C]; scale: [N, C].
__STATIC_INLINE__ struct ggml_tensor* modulate(struct ggml_context* ctx,
                                               struct ggml_tensor* x,
                                               struct ggml_tensor* scale) {
    auto scale_b = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]);  // [N, 1, C]
    return ggml_add(ctx, x, ggml_mul(ctx, x, scale_b));
}
// Transformer block used for the refiner and main stacks. With modulation
// enabled, attention and FFN branches are scaled/gated by projections of
// adaln_input (the timestep embedding); without it, the block is a plain
// pre-norm transformer layer (used by the context refiner).
struct JointTransformerBlock : public GGMLBlock {
protected:
    bool modulation;  // whether adaLN conditioning is applied

public:
    // layer_id is currently unused; kept for parity with the reference code.
    JointTransformerBlock(int layer_id,
                          int64_t hidden_size,
                          int64_t head_dim,
                          int64_t num_heads,
                          int64_t num_kv_heads,
                          int64_t multiple_of,
                          float ffn_dim_multiplier,
                          float norm_eps,
                          bool qk_norm,
                          bool modulation = true)
        : modulation(modulation) {
        blocks["attention"]       = std::make_shared<JointAttention>(hidden_size, head_dim, num_heads, num_kv_heads, qk_norm);
        blocks["feed_forward"]    = std::make_shared<FeedForward>(hidden_size, hidden_size, multiple_of, ffn_dim_multiplier);
        // *_norm1: pre-norms before attention/FFN; *_norm2: post-norms applied
        // to the branch output before the residual add.
        blocks["attention_norm1"] = std::make_shared<RMSNorm>(hidden_size, norm_eps);
        blocks["ffn_norm1"]       = std::make_shared<RMSNorm>(hidden_size, norm_eps);
        blocks["attention_norm2"] = std::make_shared<RMSNorm>(hidden_size, norm_eps);
        blocks["ffn_norm2"]       = std::make_shared<RMSNorm>(hidden_size, norm_eps);
        if (modulation) {
            // Produces [scale_msa, gate_msa, scale_mlp, gate_mlp].
            blocks["adaLN_modulation.0"] = std::make_shared<Linear>(MIN(hidden_size, ADALN_EMBED_DIM), 4 * hidden_size);
        }
    }

    // x: [N, n_token, hidden_size]; adaln_input: [N, MIN(hidden_size, ADALN_EMBED_DIM)]
    // (required iff modulation is enabled). Returns [N, n_token, hidden_size].
    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* pe,
                                struct ggml_tensor* mask        = nullptr,
                                struct ggml_tensor* adaln_input = nullptr) {
        auto attention       = std::dynamic_pointer_cast<JointAttention>(blocks["attention"]);
        auto feed_forward    = std::dynamic_pointer_cast<FeedForward>(blocks["feed_forward"]);
        auto attention_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["attention_norm1"]);
        auto ffn_norm1       = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm1"]);
        auto attention_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["attention_norm2"]);
        auto ffn_norm2       = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm2"]);

        if (modulation) {
            GGML_ASSERT(adaln_input != nullptr);
            auto adaLN_modulation_0 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.0"]);

            auto m         = adaLN_modulation_0->forward(ctx, adaln_input);  // [N, 4 * hidden_size]
            auto mods      = ggml_ext_chunk(ctx->ggml_ctx, m, 4, 0);
            auto scale_msa = mods[0];
            auto gate_msa  = mods[1];
            auto scale_mlp = mods[2];
            auto gate_mlp  = mods[3];

            // Attention branch: modulate -> attend -> post-norm -> tanh gate -> residual.
            auto residual = x;
            x             = modulate(ctx->ggml_ctx, attention_norm1->forward(ctx, x), scale_msa);
            x             = attention->forward(ctx, x, pe, mask);
            x             = attention_norm2->forward(ctx, x);
            x             = ggml_mul(ctx->ggml_ctx, x, ggml_tanh(ctx->ggml_ctx, gate_msa));
            x             = ggml_add(ctx->ggml_ctx, x, residual);

            // FFN branch: same shape, with the MLP scale/gate pair.
            residual = x;
            x        = modulate(ctx->ggml_ctx, ffn_norm1->forward(ctx, x), scale_mlp);
            x        = feed_forward->forward(ctx, x);
            x        = ffn_norm2->forward(ctx, x);
            x        = ggml_mul(ctx->ggml_ctx, x, ggml_tanh(ctx->ggml_ctx, gate_mlp));
            x        = ggml_add(ctx->ggml_ctx, x, residual);
        } else {
            GGML_ASSERT(adaln_input == nullptr);

            // Unconditioned variant: no modulation, no gating.
            auto residual = x;
            x             = attention_norm1->forward(ctx, x);
            x             = attention->forward(ctx, x, pe, mask);
            x             = attention_norm2->forward(ctx, x);
            x             = ggml_add(ctx->ggml_ctx, x, residual);

            residual = x;
            x        = ffn_norm1->forward(ctx, x);
            x        = feed_forward->forward(ctx, x);
            x        = ffn_norm2->forward(ctx, x);
            x        = ggml_add(ctx->ggml_ctx, x, residual);
        }
        return x;
    }
};
// Output head: LayerNorm (no affine) -> adaLN modulation by the timestep
// embedding -> linear projection to patch_size^2 * out_channels per token.
struct FinalLayer : public GGMLBlock {
public:
    FinalLayer(int64_t hidden_size,
               int64_t patch_size,
               int64_t out_channels) {
        blocks["norm_final"]         = std::make_shared<LayerNorm>(hidden_size, 1e-06f, false);
        blocks["linear"]             = std::make_shared<Linear>(hidden_size, patch_size * patch_size * out_channels, true, true);
        blocks["adaLN_modulation.1"] = std::make_shared<Linear>(MIN(hidden_size, ADALN_EMBED_DIM), hidden_size);
    }

    // x: [N, n_token, hidden_size]; c: [N, hidden_size]
    // returns [N, n_token, patch_size * patch_size * out_channels]
    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* c) {
        auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_final"]);
        auto proj = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
        auto mod  = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);

        auto scale = mod->forward(ctx, ggml_silu(ctx->ggml_ctx, c));  // [N, hidden_size]
        auto h     = norm->forward(ctx, x);
        h          = modulate(ctx->ggml_ctx, h, scale);
        return proj->forward(ctx, h);
    }
};
// Hyper-parameters of the Z-Image DiT. Defaults mirror the reference
// configuration (Lumina-Image-2.0 derivative).
struct ZImageParams {
    int64_t patch_size         = 2;     // spatial patch edge in latent pixels
    int64_t hidden_size        = 3840;  // transformer width
    int64_t in_channels        = 16;    // latent channels in
    int64_t out_channels       = 16;    // latent channels out
    int64_t num_layers         = 30;    // main transformer depth
    int64_t num_refiner_layers = 2;     // depth of noise/context refiner stacks
    int64_t head_dim           = 128;
    int64_t num_heads          = 30;
    int64_t num_kv_heads       = 30;    // equal to num_heads => plain MHA
    int64_t multiple_of        = 256;   // FFN hidden_dim rounding granularity
    float ffn_dim_multiplier   = 8.0 / 3.0f;  // SwiGLU width factor
    float norm_eps             = 1e-5f;
    bool qk_norm               = true;  // RMS-normalize q/k per head
    int64_t cap_feat_dim       = 2560;  // caption (text) feature width
    float theta                = 256.f; // RoPE base frequency
    std::vector<int> axes_dim  = {32, 48, 48};  // RoPE split across (t, h, w) axes
    int64_t axes_dim_sum       = 128;   // sum of axes_dim; must match head_dim
};
// Z-Image diffusion transformer. Pipeline: patchify latents -> embed ->
// refine caption and image tokens separately -> joint transformer over the
// concatenated sequence -> final layer -> unpatchify back to latent space.
class ZImageModel : public GGMLBlock {
protected:
    ZImageParams z_image_params;

    // Learned pad tokens used to fill caption/image sequences up to a
    // multiple of SEQ_MULTI_OF.
    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        params["cap_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_image_params.hidden_size);
        params["x_pad_token"]   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_image_params.hidden_size);
    }

public:
    ZImageModel() = default;
    ZImageModel(ZImageParams z_image_params)
        : z_image_params(z_image_params) {
        blocks["x_embedder"]     = std::make_shared<Linear>(z_image_params.patch_size * z_image_params.patch_size * z_image_params.in_channels, z_image_params.hidden_size);
        blocks["t_embedder"]     = std::make_shared<TimestepEmbedder>(MIN(z_image_params.hidden_size, 1024), 256, 256);
        blocks["cap_embedder.0"] = std::make_shared<RMSNorm>(z_image_params.cap_feat_dim, z_image_params.norm_eps);
        blocks["cap_embedder.1"] = std::make_shared<Linear>(z_image_params.cap_feat_dim, z_image_params.hidden_size);
        // Modulated refiner over image tokens.
        for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
            auto block = std::make_shared<JointTransformerBlock>(i,
                                                                 z_image_params.hidden_size,
                                                                 z_image_params.head_dim,
                                                                 z_image_params.num_heads,
                                                                 z_image_params.num_kv_heads,
                                                                 z_image_params.multiple_of,
                                                                 z_image_params.ffn_dim_multiplier,
                                                                 z_image_params.norm_eps,
                                                                 z_image_params.qk_norm,
                                                                 true);
            blocks["noise_refiner." + std::to_string(i)] = block;
        }
        // Unmodulated refiner over caption tokens.
        for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
            auto block = std::make_shared<JointTransformerBlock>(i,
                                                                 z_image_params.hidden_size,
                                                                 z_image_params.head_dim,
                                                                 z_image_params.num_heads,
                                                                 z_image_params.num_kv_heads,
                                                                 z_image_params.multiple_of,
                                                                 z_image_params.ffn_dim_multiplier,
                                                                 z_image_params.norm_eps,
                                                                 z_image_params.qk_norm,
                                                                 false);
            blocks["context_refiner." + std::to_string(i)] = block;
        }
        // Main joint stack over the concatenated caption+image sequence.
        for (int i = 0; i < z_image_params.num_layers; i++) {
            auto block = std::make_shared<JointTransformerBlock>(i,
                                                                 z_image_params.hidden_size,
                                                                 z_image_params.head_dim,
                                                                 z_image_params.num_heads,
                                                                 z_image_params.num_kv_heads,
                                                                 z_image_params.multiple_of,
                                                                 z_image_params.ffn_dim_multiplier,
                                                                 z_image_params.norm_eps,
                                                                 z_image_params.qk_norm,
                                                                 true);
            blocks["layers." + std::to_string(i)] = block;
        }
        blocks["final_layer"] = std::make_shared<FinalLayer>(z_image_params.hidden_size, z_image_params.patch_size, z_image_params.out_channels);
    }

    // Zero-pads H/W up to the next multiple of patch_size.
    struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx,
                                          struct ggml_tensor* x) {
        int64_t W = x->ne[0];
        int64_t H = x->ne[1];

        int pad_h = (z_image_params.patch_size - H % z_image_params.patch_size) % z_image_params.patch_size;
        int pad_w = (z_image_params.patch_size - W % z_image_params.patch_size) % z_image_params.patch_size;
        x         = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
        return x;
    }

    // [N, C, H, W] -> [N, h*w, patch_size*patch_size*C]; H and W must already
    // be multiples of patch_size (see pad_to_patch_size).
    struct ggml_tensor* patchify(struct ggml_context* ctx,
                                 struct ggml_tensor* x) {
        // x: [N, C, H, W]
        // return: [N, h*w, patch_size*patch_size*C]
        int64_t N = x->ne[3];
        int64_t C = x->ne[2];
        int64_t H = x->ne[1];
        int64_t W = x->ne[0];
        int64_t p = z_image_params.patch_size;
        int64_t h = H / z_image_params.patch_size;
        int64_t w = W / z_image_params.patch_size;

        GGML_ASSERT(h * p == H && w * p == W);

        x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N);             // [N*C*h, p, w, p]
        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));        // [N*C*h, w, p, p]
        x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N);             // [N, C, h*w, p*p]
        x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3));  // [N, h*w, C, p*p]
        x = ggml_reshape_3d(ctx, x, C * p * p, w * h, N);            // [N, h*w, p*p*C]
        return x;
    }

    // pad + patchify in one step.
    struct ggml_tensor* process_img(struct ggml_context* ctx,
                                    struct ggml_tensor* x) {
        x = pad_to_patch_size(ctx, x);
        x = patchify(ctx, x);
        return x;
    }

    // Inverse of patchify: [N, h*w, patch_size*patch_size*C] -> [N, C, H, W].
    struct ggml_tensor* unpatchify(struct ggml_context* ctx,
                                   struct ggml_tensor* x,
                                   int64_t h,
                                   int64_t w) {
        // x: [N, h*w, patch_size*patch_size*C]
        // return: [N, C, H, W]
        int64_t N = x->ne[2];
        int64_t C = x->ne[0] / z_image_params.patch_size / z_image_params.patch_size;
        int64_t H = h * z_image_params.patch_size;
        int64_t W = w * z_image_params.patch_size;
        int64_t p = z_image_params.patch_size;

        GGML_ASSERT(C * p * p == x->ne[0]);

        x = ggml_reshape_4d(ctx, x, C, p * p, w * h, N);                 // [N, h*w, p*p, C]
        x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3));  // [N, C, h*w, p*p]
        x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N);                 // [N*C*h, w, p, p]
        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));            // [N*C*h, p, w, p]
        x = ggml_reshape_4d(ctx, x, W, H, C, N);                         // [N, C, h*p, w*p]
        return x;
    }

    // Runs embedders, refiners, the joint stack and the final layer over
    // already-patchified tokens. Returns image tokens only (pads stripped),
    // [N, n_img_token, p*p*C].
    struct ggml_tensor* forward_core(GGMLRunnerContext* ctx,
                                     struct ggml_tensor* x,
                                     struct ggml_tensor* timestep,
                                     struct ggml_tensor* context,
                                     struct ggml_tensor* pe) {
        auto x_embedder     = std::dynamic_pointer_cast<Linear>(blocks["x_embedder"]);
        auto t_embedder     = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);
        auto cap_embedder_0 = std::dynamic_pointer_cast<RMSNorm>(blocks["cap_embedder.0"]);
        auto cap_embedder_1 = std::dynamic_pointer_cast<Linear>(blocks["cap_embedder.1"]);
        // FIX: removed `auto norm_final = ... blocks["norm_final"]` — that key
        // is never registered on this model (it belongs to FinalLayer), so the
        // lookup was null, unused, and inserted a stray null entry into `blocks`.
        auto final_layer    = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
        auto txt_pad_token  = params["cap_pad_token"];
        auto img_pad_token  = params["x_pad_token"];

        int64_t N           = x->ne[2];
        int64_t n_img_token = x->ne[1];
        int64_t n_txt_token = context->ne[1];

        auto t_emb = t_embedder->forward(ctx, timestep);
        auto txt   = cap_embedder_1->forward(ctx, cap_embedder_0->forward(ctx, context));  // [N, n_txt_token, hidden_size]
        auto img   = x_embedder->forward(ctx, x);                                          // [N, n_img_token, hidden_size]

        // Pad both sequences up to a multiple of SEQ_MULTI_OF with the
        // learned pad tokens.
        int64_t n_txt_pad_token = Rope::bound_mod(n_txt_token, SEQ_MULTI_OF);
        if (n_txt_pad_token > 0) {
            auto txt_pad_tokens = ggml_repeat_4d(ctx->ggml_ctx, txt_pad_token, txt_pad_token->ne[0], n_txt_pad_token, N, 1);
            txt                 = ggml_concat(ctx->ggml_ctx, txt, txt_pad_tokens, 1);  // [N, n_txt_token + n_txt_pad_token, hidden_size]
        }

        int64_t n_img_pad_token = Rope::bound_mod(n_img_token, SEQ_MULTI_OF);
        if (n_img_pad_token > 0) {
            auto img_pad_tokens = ggml_repeat_4d(ctx->ggml_ctx, img_pad_token, img_pad_token->ne[0], n_img_pad_token, N, 1);
            img                 = ggml_concat(ctx->ggml_ctx, img, img_pad_tokens, 1);  // [N, n_img_token + n_img_pad_token, hidden_size]
        }

        GGML_ASSERT(txt->ne[1] + img->ne[1] == pe->ne[3]);

        // pe is laid out caption-first, matching the concat order below.
        auto txt_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, 0, txt->ne[1]);
        auto img_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt->ne[1], pe->ne[3]);

        for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
            auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["context_refiner." + std::to_string(i)]);

            txt = block->forward(ctx, txt, txt_pe, nullptr, nullptr);
        }

        for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
            auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["noise_refiner." + std::to_string(i)]);

            img = block->forward(ctx, img, img_pe, nullptr, t_emb);
        }

        auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1);  // [N, n_txt_token + n_txt_pad_token + n_img_token + n_img_pad_token, hidden_size]
        for (int i = 0; i < z_image_params.num_layers; i++) {
            auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["layers." + std::to_string(i)]);

            txt_img = block->forward(ctx, txt_img, pe, nullptr, t_emb);
        }

        txt_img = final_layer->forward(ctx, txt_img, t_emb);  // [N, n_txt_token + n_txt_pad_token + n_img_token + n_img_pad_token, ph*pw*C]

        // Keep only the (unpadded) image tokens.
        img = ggml_ext_slice(ctx->ggml_ctx, txt_img, 1, n_txt_token + n_txt_pad_token, n_txt_token + n_txt_pad_token + n_img_token);  // [N, n_img_token, ph*pw*C]

        return img;
    }

    // Forward pass of the DiT.
    // x: [N, C, H, W]; timestep: [N,]; context: [N, L, D];
    // pe: [L, d_head/2, 2, 2]; optional ref_latents are appended as extra
    // image tokens (conditioning) and stripped from the output.
    // return: [N, C, H, W]
    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* timestep,
                                struct ggml_tensor* context,
                                struct ggml_tensor* pe,
                                std::vector<ggml_tensor*> ref_latents = {}) {
        int64_t W = x->ne[0];
        int64_t H = x->ne[1];
        int64_t C = x->ne[2];
        int64_t N = x->ne[3];

        auto img               = process_img(ctx->ggml_ctx, x);
        uint64_t n_img_token   = img->ne[1];  // NOTE(review): ne[] is int64_t; value is non-negative so the conversion is safe

        if (ref_latents.size() > 0) {
            for (ggml_tensor* ref : ref_latents) {
                ref = process_img(ctx->ggml_ctx, ref);
                img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
            }
        }

        // Token grid dims after pad_to_patch_size (round up to patch grid).
        int64_t h_len = ((H + (z_image_params.patch_size / 2)) / z_image_params.patch_size);
        int64_t w_len = ((W + (z_image_params.patch_size / 2)) / z_image_params.patch_size);

        auto out = forward_core(ctx, img, timestep, context, pe);  // [N, n_img_token + n_ref_token, ph*pw*C]

        out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, n_img_token);  // [N, n_img_token, ph*pw*C]

        out = unpatchify(ctx->ggml_ctx, out, h_len, w_len);  // [N, C, H + pad_h, W + pad_w]

        // Crop away the padding added by pad_to_patch_size.
        out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H);  // [N, C, H, W + pad_w]
        out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W);  // [N, C, H, W]

        // The model predicts the negated velocity; flip the sign here.
        out = ggml_scale(ctx->ggml_ctx, out, -1.f);

        return out;
    }
};
// GGMLRunner wrapper around ZImageModel: builds the compute graph per call,
// uploads inputs/positional embeddings, and exposes a compute() entry point.
struct ZImageRunner : public GGMLRunner {
public:
    ZImageParams z_image_params;
    ZImageModel z_image;
    std::vector<float> pe_vec;        // host-side positional-embedding data referenced by the graph
    std::vector<float> timestep_vec;  // currently unused; kept for interface stability
    SDVersion version;

    ZImageRunner(ggml_backend_t backend,
                 bool offload_params_to_cpu,
                 const String2TensorStorage& tensor_storage_map = {},
                 const std::string prefix                       = "",
                 SDVersion version                              = VERSION_Z_IMAGE)
        // FIX: `version` member was never initialized (the ctor argument was
        // silently dropped), leaving it indeterminate.
        : GGMLRunner(backend, offload_params_to_cpu), version(version) {
        z_image = ZImageModel(z_image_params);
        z_image.init(params_ctx, tensor_storage_map, prefix);
    }

    std::string get_desc() override {
        return "z_image";
    }

    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        z_image.get_param_tensors(tensors, prefix);
    }

    // Builds the forward graph for one denoising step. Only batch size 1 is
    // supported (asserted below).
    struct ggml_cgraph* build_graph(struct ggml_tensor* x,
                                    struct ggml_tensor* timesteps,
                                    struct ggml_tensor* context,
                                    std::vector<ggml_tensor*> ref_latents = {},
                                    bool increase_ref_index               = false) {
        GGML_ASSERT(x->ne[3] == 1);

        struct ggml_cgraph* gf = new_graph_custom(Z_IMAGE_GRAPH_SIZE);

        x         = to_backend(x);
        context   = to_backend(context);
        timesteps = to_backend(timesteps);
        for (size_t i = 0; i < ref_latents.size(); i++) {  // size_t: avoid signed/unsigned comparison
            ref_latents[i] = to_backend(ref_latents[i]);
        }

        // pe_vec must outlive graph execution; it is a member for that reason.
        pe_vec = Rope::gen_z_image_pe(x->ne[1],
                                      x->ne[0],
                                      z_image_params.patch_size,
                                      x->ne[3],
                                      context->ne[1],
                                      SEQ_MULTI_OF,
                                      ref_latents,
                                      increase_ref_index,
                                      z_image_params.theta,
                                      z_image_params.axes_dim);
        int pos_len = pe_vec.size() / z_image_params.axes_dim_sum / 2;
        // LOG_DEBUG("pos_len %d", pos_len);
        auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, z_image_params.axes_dim_sum / 2, pos_len);
        // pe->data = pe_vec.data();
        // print_ggml_tensor(pe, true, "pe");
        // pe->data = nullptr;
        set_backend_tensor_data(pe, pe_vec.data());

        auto runner_ctx = get_context();

        struct ggml_tensor* out = z_image.forward(&runner_ctx,
                                                  x,
                                                  timesteps,
                                                  context,
                                                  pe,
                                                  ref_latents);

        ggml_build_forward_expand(gf, out);

        return gf;
    }

    // x: [N, in_channels, h, w]; timesteps: [N,]; context: [N, max_position, hidden_size]
    // Returns true on success; *output receives the denoised latents.
    bool compute(int n_threads,
                 struct ggml_tensor* x,
                 struct ggml_tensor* timesteps,
                 struct ggml_tensor* context,
                 std::vector<ggml_tensor*> ref_latents = {},
                 bool increase_ref_index               = false,
                 struct ggml_tensor** output           = nullptr,
                 struct ggml_context* output_ctx       = nullptr) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x, timesteps, context, ref_latents, increase_ref_index);
        };

        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }

    // Developer smoke test: loads dumped tensors from disk and runs one step.
    void test() {
        struct ggml_init_params params;
        params.mem_size   = static_cast<size_t>(1024 * 1024) * 1024;  // 1GB
        params.mem_buffer = nullptr;
        params.no_alloc   = false;

        struct ggml_context* work_ctx = ggml_init(params);
        GGML_ASSERT(work_ctx != nullptr);

        {
            // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1);
            // ggml_set_f32(x, 0.01f);
            auto x = load_tensor_from_file(work_ctx, "./z_image_x.bin");
            print_ggml_tensor(x);

            std::vector<float> timesteps_vec(1, 0.f);
            auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);

            // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 2560, 256, 1);
            // ggml_set_f32(context, 0.01f);
            auto context = load_tensor_from_file(work_ctx, "./z_image_context.bin");
            print_ggml_tensor(context);

            struct ggml_tensor* out = nullptr;
            // FIX: ggml_time_ms() returns int64_t; don't truncate before subtracting.
            int64_t t0 = ggml_time_ms();
            compute(8, x, timesteps, context, {}, false, &out, work_ctx);
            int64_t t1 = ggml_time_ms();
            print_ggml_tensor(out);
            LOG_DEBUG("z_image test done in %dms", (int)(t1 - t0));
        }
    }

    // Developer helper: load a model file, quantize weights, run test().
    static void load_from_file_and_test(const std::string& file_path) {
        // cuda q8: pass
        // cuda q8 fa: pass
        // ggml_backend_t backend    = ggml_backend_cuda_init(0);
        ggml_backend_t backend         = ggml_backend_cpu_init();
        ggml_type model_data_type      = GGML_TYPE_Q8_0;

        ModelLoader model_loader;
        if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
            LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
            return;
        }

        auto& tensor_storage_map = model_loader.get_tensor_storage_map();
        if (model_data_type != GGML_TYPE_COUNT) {
            for (auto& [name, tensor_storage] : tensor_storage_map) {
                if (ends_with(name, "weight")) {
                    tensor_storage.expected_type = model_data_type;
                }
            }
        }

        // NOTE(review): passes VERSION_QWEN_IMAGE rather than VERSION_Z_IMAGE;
        // looks like a copy-paste from the Qwen-Image test harness — confirm.
        std::shared_ptr<ZImageRunner> z_image = std::make_shared<ZImageRunner>(backend,
                                                                               false,
                                                                               tensor_storage_map,
                                                                               "model.diffusion_model",
                                                                               VERSION_QWEN_IMAGE);

        z_image->alloc_params_buffer();
        std::map<std::string, ggml_tensor*> tensors;
        z_image->get_param_tensors(tensors, "model.diffusion_model");

        bool success = model_loader.load_tensors(tensors);

        if (!success) {
            LOG_ERROR("load tensors from model loader failed");
            return;
        }

        LOG_INFO("z_image model loaded");

        z_image->test();
    }
};
} // namespace ZImage
#endif // __Z_IMAGE_HPP__