diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 749366e7..a4f0365d 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -444,12 +444,98 @@ jobs:
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
+ windows-latest-rocm:
+ runs-on: windows-2022
+
+ env:
+ ROCM_VERSION: "7.12.0"
+ GPU_TARGETS: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
+
+ steps:
+ - uses: actions/checkout@v3
+ with:
+ submodules: recursive
+
+ - name: Cache ROCm Installation
+ id: cache-rocm
+ uses: actions/cache@v4
+ with:
+ path: C:\TheRock\build
+ key: rocm-${{ env.ROCM_VERSION }}-gfx1151-${{ runner.os }}
+
+ - name: ccache
+ uses: ggml-org/ccache-action@v1.2.16
+ with:
+ key: windows-latest-rocm-${{ env.ROCM_VERSION }}-x64
+ evict-old-files: 1d
+
+ - name: Install ROCm
+ if: steps.cache-rocm.outputs.cache-hit != 'true'
+ run: |
+ $ErrorActionPreference = "Stop"
+ write-host "Downloading AMD ROCm ${{ env.ROCM_VERSION }} tarball"
+ Invoke-WebRequest -Uri "https://repo.amd.com/rocm/tarball/therock-dist-windows-gfx1151-${{ env.ROCM_VERSION }}.tar.gz" -OutFile "${env:RUNNER_TEMP}\rocm.tar.gz"
+ write-host "Extracting ROCm tarball"
+ mkdir C:\TheRock\build -Force
+ tar -xzf "${env:RUNNER_TEMP}\rocm.tar.gz" -C C:\TheRock\build --strip-components=1
+ write-host "Completed ROCm extraction"
+
+ - name: Setup ROCm Environment
+ run: |
+ $rocmPath = "C:\TheRock\build"
+ echo "HIP_PATH=$rocmPath" >> $env:GITHUB_ENV
+ echo "HIP_DEVICE_LIB_PATH=$rocmPath\lib\llvm\amdgcn\bitcode" >> $env:GITHUB_ENV
+ echo "HIP_PLATFORM=amd" >> $env:GITHUB_ENV
+ echo "LLVM_PATH=$rocmPath\lib\llvm" >> $env:GITHUB_ENV
+ echo "$rocmPath\bin" >> $env:GITHUB_PATH
+ echo "$rocmPath\lib\llvm\bin" >> $env:GITHUB_PATH
+
+ - name: Build
+ run: |
+ mkdir build
+ cd build
+ cmake .. `
+ -G "Unix Makefiles" `
+ -DCMAKE_PREFIX_PATH="${env:HIP_PATH}" `
+ -DSD_HIPBLAS=ON `
+ -DSD_BUILD_SHARED_LIBS=ON `
+ -DGGML_NATIVE=OFF `
+ -DCMAKE_C_COMPILER="${env:HIP_PATH}\lib\llvm\bin\clang.exe" `
+ -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\lib\llvm\bin\clang++.exe" `
+ -DCMAKE_HIP_COMPILER="${env:HIP_PATH}\lib\llvm\bin\clang.exe" `
+ -DHIP_PATH="${env:HIP_PATH}" `
+ -DCMAKE_BUILD_TYPE=Release `
+ -DGPU_TARGETS="${{ env.GPU_TARGETS }}"
+ cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
+
+ - name: Get commit hash
+ id: commit
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: pr-mpt/actions-commit-hash@v2
+
+ - name: Pack artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+ cp "${env:HIP_PATH}\bin\libhipblaslt.dll" "build\bin\"
+ cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
+ 7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip .\build\bin\*
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip
+ path: |
+ sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip
+
windows-latest-cmake-hip:
runs-on: windows-2022
env:
- HIPSDK_INSTALLER_VERSION: "25.Q3"
- GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
+ HIPSDK_INSTALLER_VERSION: "26.Q1"
+ ROCM_VERSION: "7.1.1"
+ GPU_TARGETS: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
steps:
- uses: actions/checkout@v3
@@ -484,7 +570,7 @@ jobs:
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
- Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+ Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
$completed = $proc.WaitForExit(600000)
@@ -537,32 +623,38 @@ jobs:
run: |
md "build\bin\rocblas\library\"
md "build\bin\hipblaslt\library"
- cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
- cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
+ cp "${env:HIP_PATH}\bin\libhipblas.dll" "build\bin\"
+ cp "${env:HIP_PATH}\bin\libhipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
- 7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\*
+ 7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip .\build\bin\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
- name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+ name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip
path: |
- sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+ sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip
ubuntu-latest-rocm:
- runs-on: ubuntu-latest
- container: rocm/dev-ubuntu-24.04:7.2
+ runs-on: ubuntu-24.04
env:
- ROCM_VERSION: "7.2"
UBUNTU_VERSION: "24.04"
- GPU_TARGETS: "gfx1151;gfx1150;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+
+ strategy:
+ matrix:
+ include:
+ - ROCM_VERSION: "7.2.1"
+ gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1031;gfx1032;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201"
+ build: 'x64'
+ - ROCM_VERSION: "7.12.0"
+ gpu_targets: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
+ build: x64
steps:
- - run: apt-get update && apt-get install -y git
- name: Clone
id: checkout
uses: actions/checkout@v6
@@ -579,6 +671,38 @@ jobs:
with:
version: 10.15.1
+ - name: ccache
+ uses: ggml-org/ccache-action@v1.2.16
+ with:
+ key: ubuntu-rocm-cmake-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
+ evict-old-files: 1d
+
+ - name: Dependencies
+ id: depends
+ run: |
+ sudo apt install -y build-essential cmake wget zip ninja-build
+
+ - name: Setup Legacy ROCm
+ if: matrix.ROCM_VERSION == '7.2.1'
+ id: legacy_env
+ run: |
+ sudo mkdir --parents --mode=0755 /etc/apt/keyrings
+ wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
+ gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
+
+ sudo tee /etc/apt/sources.list.d/rocm.list << EOF
+ deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${{ matrix.ROCM_VERSION }} noble main
+ EOF
+
+ sudo tee /etc/apt/preferences.d/rocm-pin-600 << EOF
+ Package: *
+ Pin: release o=repo.radeon.com
+ Pin-Priority: 600
+ EOF
+
+ sudo apt update
+ sudo apt-get install -y libssl-dev rocm-hip-sdk
+
- name: Free disk space
run: |
# Remove preinstalled SDKs and caches not needed for this job
@@ -592,51 +716,17 @@ jobs:
sudo rm -rf /var/lib/apt/lists/* || true
sudo apt clean
- - name: Dependencies
- id: depends
+ - name: Setup TheRock
+ if: matrix.ROCM_VERSION != '7.2.1'
+ id: therock_env
run: |
- sudo apt-get update
- sudo apt install -y \
- cmake \
- hip-dev \
- hipblas-dev \
- ninja-build \
- rocm-dev \
- zip
- # Clean apt caches to recover disk space
- sudo apt clean
- sudo rm -rf /var/lib/apt/lists/* || true
-
- - name: Setup ROCm Environment
- run: |
- # Add ROCm to PATH for current session
- echo "/opt/rocm/bin" >> $GITHUB_PATH
-
- # Build regex pattern from ${{ env.GPU_TARGETS }} (match target as substring)
- TARGET_REGEX="($(printf '%s' "${{ env.GPU_TARGETS }}" | sed 's/;/|/g'))"
-
- # Remove library files for architectures we're not building for to save disk space
- echo "Cleaning up unneeded architecture files..."
- cd /opt/rocm/lib/rocblas/library
- # Keep only our target architectures
- for file in *; do
- if printf '%s' "$file" | grep -q 'gfx'; then
- if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
- echo "Removing $file" &&
- sudo rm -f "$file";
- fi
- fi
- done
-
- cd /opt/rocm/lib/hipblaslt/library
- for file in *; do
- if printf '%s' "$file" | grep -q 'gfx'; then
- if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
- echo "Removing $file" &&
- sudo rm -f "$file";
- fi
- fi
- done
+ wget https://repo.amd.com/rocm/tarball/therock-dist-linux-gfx1151-${{ matrix.ROCM_VERSION }}.tar.gz
+ mkdir install
+ tar -xf *.tar.gz -C install
+ export ROCM_PATH=$(pwd)/install
+ echo ROCM_PATH=$ROCM_PATH >> $GITHUB_ENV
+ echo PATH=$PATH:$ROCM_PATH/bin >> $GITHUB_ENV
+ echo LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/llvm/lib:$ROCM_PATH/lib/rocprofiler-systems >> $GITHUB_ENV
- name: Build
id: cmake_build
@@ -644,12 +734,12 @@ jobs:
mkdir build
cd build
cmake .. -G Ninja \
- -DCMAKE_CXX_COMPILER=amdclang++ \
- -DCMAKE_C_COMPILER=amdclang \
+ -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
+ -DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \
-DCMAKE_BUILD_TYPE=Release \
-DSD_HIPBLAS=ON \
- -DGPU_TARGETS="${{ env.GPU_TARGETS }}" \
- -DAMDGPU_TARGETS="${{ env.GPU_TARGETS }}" \
+ -DHIP_PLATFORM=amd \
+ -DGPU_TARGETS="${{ matrix.gpu_targets }}" \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DSD_BUILD_SHARED_LIBS=ON
@@ -668,16 +758,6 @@ jobs:
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
- # Move ROCm runtime libraries (to avoid double space consumption)
- sudo mv /opt/rocm/lib/librocsparse.so* ./build/bin/
- sudo mv /opt/rocm/lib/libhsa-runtime64.so* ./build/bin/
- sudo mv /opt/rocm/lib/libamdhip64.so* ./build/bin/
- sudo mv /opt/rocm/lib/libhipblas.so* ./build/bin/
- sudo mv /opt/rocm/lib/libhipblaslt.so* ./build/bin/
- sudo mv /opt/rocm/lib/librocblas.so* ./build/bin/
- sudo mv /opt/rocm/lib/rocblas/ ./build/bin/
- sudo mv /opt/rocm/lib/hipblaslt/ ./build/bin/
-
- name: Fetch system info
id: system-info
run: |
@@ -692,15 +772,15 @@ jobs:
run: |
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
- zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip ./build/bin
+ zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm-${{ matrix.ROCM_VERSION }}.zip ./build/bin
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
- name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
+ name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm-${{ matrix.ROCM_VERSION }}.zip
path: |
- sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
+ sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm-${{ matrix.ROCM_VERSION }}.zip
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -715,6 +795,7 @@ jobs:
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-hip
+ - windows-latest-rocm
steps:
- name: Clone
diff --git a/README.md b/README.md
index 33c272e9..d889f28f 100644
--- a/README.md
+++ b/README.md
@@ -133,9 +133,11 @@ API and command-line option may change frequently.***
## Performance
If you want to improve performance or reduce VRAM/RAM usage, please refer to [performance guide](./docs/performance.md).
+For runtime and parameter backend placement, see the [backend selection guide](./docs/backend.md).
## More Guides
+- [Backend selection](./docs/backend.md)
- [SD1.x/SD2.x/SDXL](./docs/sd.md)
- [SD3/SD3.5](./docs/sd3.md)
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
diff --git a/docs/backend.md b/docs/backend.md
new file mode 100644
index 00000000..53088b0e
--- /dev/null
+++ b/docs/backend.md
@@ -0,0 +1,122 @@
+# Backend selection
+
+`stable-diffusion.cpp` has two backend assignments:
+
+- `--backend` selects the runtime backend used to execute model graphs.
+- `--params-backend` selects the backend used to allocate model parameters.
+
+If `--params-backend` is not set, parameters use the same backend as their module runtime backend.
+
+## Syntax
+
+A backend assignment can be a single backend name:
+
+```shell
+sd-cli -m model.safetensors -p "a cat" --backend cpu
+```
+
+This applies to every module that does not have a more specific assignment.
+
+Assignments can also target individual modules:
+
+```shell
+sd-cli -m model.safetensors -p "a cat" --backend te=cpu,vae=cuda0,diffusion=vulkan0
+```
+
+The same syntax is used for parameter placement:
+
+```shell
+sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend te=cpu,vae=cpu
+```
+
+Module names are case-insensitive. Hyphens and underscores in module names are ignored, so `clip_vision`, `clip-vision`, and `clipvision` are equivalent.
+
+`all=`, `default=`, and `*=` can be used to set the default backend inside a mixed assignment:
+
+```shell
+sd-cli -m model.safetensors -p "a cat" --backend all=cuda0,te=cpu
+```
+
+## Modules
+
+| Module | Purpose | Accepted names |
+| --- | --- | --- |
+| `diffusion` | UNet, DiT, MMDiT, Flux, Wan, Qwen Image, and other diffusion models | `diffusion`, `model`, `unet`, `dit` |
+| `te` | Text encoders and conditioners | `te`, `clip`, `text`, `textencoder`, `textencoders`, `conditioner`, `cond`, `llm`, `t5`, `t5xxl` |
+| `clip_vision` | CLIP vision encoder | `clip_vision`, `clipvision`, `clip-vision`, `vision` |
+| `vae` | VAE and TAE | `vae`, `firststage`, `autoencoder`, `tae` |
+| `controlnet` | ControlNet | `controlnet`, `control` |
+| `photomaker` | PhotoMaker ID encoder and PhotoMaker LoRA | `photomaker`, `photomakerid`, `pmid`, `photo` |
+| `upscaler` | ESRGAN upscaler | `upscaler`, `esrgan`, `hires` |
+
+`te` is the preferred module name for text encoders. `clip` is kept as an accepted alias because many existing commands and model names use CLIP terminology.
+
+## Backend names
+
+Backend names are resolved against the GGML backend device list. Matching is case-insensitive and accepts exact names or unique prefixes, so common values include names such as:
+
+- `cpu`
+- `cuda0`
+- `vulkan0`
+- `metal`
+
+The special values `auto`, `default`, and an empty backend name select the default backend. The default preference is GPU, then integrated GPU, then CPU.
+
+The special value `gpu` selects the first GPU backend, falling back to the first integrated GPU backend.
+
+## Runtime backend vs. parameter backend
+
+The runtime backend controls where graph execution runs. The parameter backend controls where model weights are allocated.
+
+For example:
+
+```shell
+sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend cpu
+```
+
+This runs all modules on `cuda0`, but stores parameters in CPU RAM. During execution, parameters are moved to the runtime backend as needed.
+
+Per-module assignments can be mixed:
+
+```shell
+sd-cli -m model.safetensors -p "a cat" --backend diffusion=cuda0,te=cpu,vae=cpu --params-backend diffusion=cuda0,te=cpu,vae=cpu
+```
+
+This keeps text encoding and VAE execution on CPU while the diffusion model runs on GPU.
+
+## Backend sharing and lifetime
+
+Backends are managed by `SDBackendManager`.
+
+Within one manager, backend instances are cached by resolved backend device name. If multiple modules request the same backend, they share the same `ggml_backend_t`.
+
+For example:
+
+```shell
+--backend te=cpu,vae=cpu
+```
+
+uses one shared CPU backend for both `te` and `vae` runtime execution.
+
+Runtime and parameter assignments also share the same backend cache. If `--backend diffusion=cuda0` and `--params-backend diffusion=cuda0` resolve to the same device, both use the same backend instance.
+
+`SDBackendManager` owns the backend instances and frees them when the context or upscaler is destroyed. Model runners receive non-owning runtime and parameter backend pointers and do not free them.
+
+## Compatibility flags
+
+The older CPU placement flags are still supported:
+
+- `--clip-on-cpu`
+- `--vae-on-cpu`
+- `--control-net-cpu`
+- `--offload-to-cpu`
+
+`--clip-on-cpu`, `--vae-on-cpu`, and `--control-net-cpu` affect runtime backend assignment only when `--backend` is not set. They map to `te=cpu`, `vae=cpu`, and `controlnet=cpu`.
+
+`--offload-to-cpu` affects parameter backend assignment only when `--params-backend` is not set. It is equivalent to:
+
+```shell
+--params-backend cpu
+```
+
+Explicit `--backend` and `--params-backend` assignments are preferred for new commands.
diff --git a/docs/hipBLAS_on_Windows.md b/docs/hipBLAS_on_Windows.md
index b5105ad1..cd846572 100644
--- a/docs/hipBLAS_on_Windows.md
+++ b/docs/hipBLAS_on_Windows.md
@@ -26,12 +26,12 @@ Fortunately, `AMD` provides complete help documentation, you can use the help do
Then we must set `ROCM` as environment variables before running cmake.
-Usually if you install according to the official tutorial and do not modify the ROCM path, then there is a high probability that it is here `C:\Program Files\AMD\ROCm\5.5\bin`
+Usually if you install according to the official tutorial and do not modify the ROCM path, then there is a high probability that it is here `C:\Program Files\AMD\ROCm\7.1.1\bin`
This is what I use to set the clang:
```Commandline
-set CC=C:\Program Files\AMD\ROCm\5.5\bin\clang.exe
-set CXX=C:\Program Files\AMD\ROCm\5.5\bin\clang++.exe
+set CC=C:\Program Files\AMD\ROCm\7.1.1\bin\clang.exe
+set CXX=C:\Program Files\AMD\ROCm\7.1.1\bin\clang++.exe
```
## Ninja
@@ -46,7 +46,7 @@ set ninja=C:\Program Files\ninja\ninja.exe
## Building stable-diffusion.cpp
The thing different from the regular CPU build is `-DSD_HIPBLAS=ON` ,
-`-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, `-DAMDGPU_TARGETS=gfx1100`
+`-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, `-DAMDGPU_TARGETS=gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032`
>**Notice**: check the `clang` and `clang++` information:
```Commandline
@@ -59,26 +59,29 @@ If you see like this, we can continue:
clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be)
Target: x86_64-pc-windows-msvc
Thread model: posix
-InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin
+InstalledDir: C:\Program Files\AMD\ROCm\7.1.1\bin
```
```
clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be)
Target: x86_64-pc-windows-msvc
Thread model: posix
-InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin
+InstalledDir: C:\Program Files\AMD\ROCm\7.1.1\bin
```
->**Notice** that the `gfx1100` is the GPU architecture of my GPU, you can change it to your GPU architecture. Click here to see your architecture [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus)
+>**Notice** that the GPU targets are now compatible with multiple GPU architectures (ROCm 7.1.1 targets). You can change them to match your GPU architecture. Click here to see your architecture [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus)
-My GPU is AMD Radeon™ RX 7900 XTX Graphics, so I set it to `gfx1100`.
+Examples:
+- AMD Radeon™ RX 7900 XTX Graphics: `gfx1100`
+- AMD Radeon™ RX 7900 XT Graphics: `gfx1101`
+- AMD Radeon™ RX 7900 GRE Graphics: `gfx1102`
option:
```commandline
mkdir build
cd build
-cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
+cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
cmake --build . --config Release
```
diff --git a/docs/z_image.md b/docs/z_image.md
index 2ea66f9b..58c2d1fe 100644
--- a/docs/z_image.md
+++ b/docs/z_image.md
@@ -21,7 +21,7 @@ You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or ev
### Z-Image-Turbo
```
-.\bin\Release\sd-cli.exe --diffusion-model z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
+.\bin\Release\sd-cli.exe --diffusion-model z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512 --steps 8
```
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index d13ca6d9..dc5013b3 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -800,7 +800,9 @@ int main(int argc, const char* argv[]) {
ctx_params.offload_params_to_cpu,
ctx_params.diffusion_conv_direct,
ctx_params.n_threads,
- gen_params.upscale_tile_size));
+ gen_params.upscale_tile_size,
+ ctx_params.backend.c_str(),
+ ctx_params.params_backend.c_str()));
if (upscaler_ctx == nullptr) {
LOG_ERROR("new_upscaler_ctx failed");
diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index ff90930a..e9bc212b 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -388,6 +388,14 @@ ArgOptions SDContextParams::get_options() {
"--upscale-model",
"path to esrgan model.",
&esrgan_path},
+ {"",
+ "--backend",
+ "runtime backend assignment, e.g. cpu or clip=cpu,vae=cuda0,diffusion=vulkan0",
+ &backend},
+ {"",
+ "--params-backend",
+ "parameter backend assignment, e.g. cpu or diffusion=cpu,clip=cpu",
+ ¶ms_backend},
};
options.int_options = {
@@ -686,6 +694,8 @@ std::string SDContextParams::to_string() const {
<< " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
<< " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
<< " max_vram: " << max_vram << ",\n"
+ << " backend: \"" << backend << "\",\n"
+ << " params_backend: \"" << params_backend << "\",\n"
<< " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
<< " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
<< " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n"
@@ -763,6 +773,8 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
chroma_t5_mask_pad,
qwen_image_zero_cond_t,
max_vram,
+ backend.c_str(),
+ params_backend.c_str(),
};
return sd_ctx_params;
}
diff --git a/examples/common/common.h b/examples/common/common.h
index a83a2bc8..a515adf4 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -112,14 +112,16 @@ struct SDContextParams {
rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
bool offload_params_to_cpu = false;
float max_vram = 0.f;
- bool enable_mmap = false;
- bool control_net_cpu = false;
- bool clip_on_cpu = false;
- bool vae_on_cpu = false;
- bool flash_attn = false;
- bool diffusion_flash_attn = false;
- bool diffusion_conv_direct = false;
- bool vae_conv_direct = false;
+ std::string backend;
+ std::string params_backend;
+ bool enable_mmap = false;
+ bool control_net_cpu = false;
+ bool clip_on_cpu = false;
+ bool vae_on_cpu = false;
+ bool flash_attn = false;
+ bool diffusion_flash_attn = false;
+ bool diffusion_conv_direct = false;
+ bool vae_conv_direct = false;
bool circular = false;
bool circular_x = false;
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index bb68dd76..55bd52ad 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -209,6 +209,8 @@ typedef struct {
int chroma_t5_mask_pad;
bool qwen_image_zero_cond_t;
float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
+ const char* backend;
+ const char* params_backend;
} sd_ctx_params_t;
typedef struct {
@@ -443,7 +445,9 @@ SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
bool offload_params_to_cpu,
bool direct,
int n_threads,
- int tile_size);
+ int tile_size,
+ const char* backend,
+ const char* params_backend);
SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
diff --git a/src/anima.hpp b/src/anima.hpp
index 4bfc0474..486aec3a 100644
--- a/src/anima.hpp
+++ b/src/anima.hpp
@@ -526,10 +526,10 @@ namespace Anima {
AnimaNet net;
AnimaRunner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
- : GGMLRunner(backend, offload_params_to_cpu) {
+ : GGMLRunner(backend, params_backend) {
int64_t num_layers = 0;
std::string layer_tag = prefix + ".net.blocks.";
for (const auto& kv : tensor_storage_map) {
diff --git a/src/auto_encoder_kl.hpp b/src/auto_encoder_kl.hpp
index 4fb28a16..489f8fd3 100644
--- a/src/auto_encoder_kl.hpp
+++ b/src/auto_encoder_kl.hpp
@@ -664,13 +664,13 @@ struct AutoEncoderKL : public VAE {
AutoEncoderKLModel ae;
AutoEncoderKL(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
bool decode_only = false,
bool use_video_decoder = false,
SDVersion version = VERSION_SD1)
- : decode_only(decode_only), VAE(version, backend, offload_params_to_cpu) {
+ : decode_only(decode_only), VAE(version, backend, params_backend) {
if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
scale_factor = 0.18215f;
shift_factor = 0.f;
diff --git a/src/clip.hpp b/src/clip.hpp
index 8b2084c4..a3567324 100644
--- a/src/clip.hpp
+++ b/src/clip.hpp
@@ -469,13 +469,13 @@ struct CLIPTextModelRunner : public GGMLRunner {
std::vector attention_mask_vec;
CLIPTextModelRunner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool with_final_ln = true,
bool force_clip_f32 = false)
- : GGMLRunner(backend, offload_params_to_cpu) {
+ : GGMLRunner(backend, params_backend) {
bool proj_in = false;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
diff --git a/src/conditioner.hpp b/src/conditioner.hpp
index 8831f1a1..e5a702b9 100644
--- a/src/conditioner.hpp
+++ b/src/conditioner.hpp
@@ -147,7 +147,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
std::map> embedding_pos_map;
FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map,
const std::map& orig_embedding_map,
SDVersion version = VERSION_SD1,
@@ -161,12 +161,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
}
bool force_clip_f32 = !embedding_map.empty();
if (sd_version_is_sd1(version)) {
- text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
+ text_model = std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
} else if (sd_version_is_sd2(version)) {
- text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
+ text_model = std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
} else if (sd_version_is_sdxl(version)) {
- text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
- text_model2 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
+ text_model = std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+ text_model2 = std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
}
}
@@ -683,9 +683,9 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
CLIPVisionModelProjection vision_model;
FrozenCLIPVisionEmbedder(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {})
- : GGMLRunner(backend, offload_params_to_cpu) {
+ : GGMLRunner(backend, params_backend) {
std::string prefix = "cond_stage_model.transformer";
bool proj_in = false;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
@@ -742,7 +742,7 @@ struct SD3CLIPEmbedder : public Conditioner {
std::shared_ptr t5;
SD3CLIPEmbedder(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {})
: clip_g_tokenizer(0) {
bool use_clip_l = false;
@@ -762,13 +762,13 @@ struct SD3CLIPEmbedder : public Conditioner {
return;
}
if (use_clip_l) {
- clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+ clip_l = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
}
if (use_clip_g) {
- clip_g = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+ clip_g = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
}
if (use_t5) {
- t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
+ t5 = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer");
}
}
@@ -1110,7 +1110,7 @@ struct FluxCLIPEmbedder : public Conditioner {
size_t chunk_len = 256;
FluxCLIPEmbedder(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {}) {
bool use_clip_l = false;
bool use_t5 = false;
@@ -1128,12 +1128,12 @@ struct FluxCLIPEmbedder : public Conditioner {
}
if (use_clip_l) {
- clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
+ clip_l = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
} else {
LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded.");
}
if (use_t5) {
- t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
+ t5 = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer");
} else {
LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded.");
}
@@ -1364,7 +1364,7 @@ struct T5CLIPEmbedder : public Conditioner {
bool is_umt5 = false;
T5CLIPEmbedder(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
bool use_mask = false,
int mask_pad = 0,
@@ -1381,7 +1381,7 @@ struct T5CLIPEmbedder : public Conditioner {
LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
return;
} else {
- t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5);
+ t5 = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5);
}
}
@@ -1566,12 +1566,12 @@ struct AnimaConditioner : public Conditioner {
std::shared_ptr llm;
AnimaConditioner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {}) {
qwen_tokenizer = std::make_shared();
llm = std::make_shared(LLM::LLMArch::QWEN3,
backend,
- offload_params_to_cpu,
+ params_backend,
tensor_storage_map,
"text_encoders.llm",
false);
@@ -1638,10 +1638,11 @@ struct AnimaConditioner : public Conditioner {
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
- std::vector curr_tokens = t5_tokenizer.tokenize(curr_text, nullptr, true);
+ std::vector curr_tokens = t5_tokenizer.encode(curr_text);
t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
}
+ t5_tokenizer.pad_tokens(t5_tokens, &t5_weights, nullptr);
return {qwen_tokens, qwen_weights, t5_tokens, t5_weights};
}
@@ -1684,7 +1685,7 @@ struct LLMEmbedder : public Conditioner {
std::shared_ptr llm;
LLMEmbedder(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_QWEN_IMAGE,
const std::string prefix = "",
@@ -1705,7 +1706,7 @@ struct LLMEmbedder : public Conditioner {
}
llm = std::make_shared(arch,
backend,
- offload_params_to_cpu,
+ params_backend,
tensor_storage_map,
"text_encoders.llm",
enable_vision);
@@ -2069,10 +2070,10 @@ struct LTXAVTextProjectionRunner : public GGMLRunner {
LTXAVTextProjection model;
LTXAVTextProjectionRunner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string& prefix = "")
- : GGMLRunner(backend, offload_params_to_cpu),
+ : GGMLRunner(backend, params_backend),
model(tensor_storage_map.find(prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end()) {
model.init(params_ctx, tensor_storage_map, prefix);
}
@@ -2113,20 +2114,20 @@ struct LTXAVEmbedder : public Conditioner {
bool dual_projection = false;
LTXAVEmbedder(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string& llm_prefix = "text_encoders.llm",
const std::string& projector_prefix = "text_embedding_projection") {
tokenizer = std::make_shared();
llm = std::make_shared(LLM::LLMArch::GEMMA3_12B,
backend,
- offload_params_to_cpu,
+ params_backend,
tensor_storage_map,
llm_prefix,
false);
dual_projection = tensor_storage_map.find(projector_prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end();
projector = std::make_shared(backend,
- offload_params_to_cpu,
+ params_backend,
tensor_storage_map,
projector_prefix);
}
diff --git a/src/control.hpp b/src/control.hpp
index d227ec94..fd1f6d86 100644
--- a/src/control.hpp
+++ b/src/control.hpp
@@ -319,10 +319,10 @@ struct ControlNet : public GGMLRunner {
bool guided_hint_cached = false;
ControlNet(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_SD1)
- : GGMLRunner(backend, offload_params_to_cpu), control_net(version) {
+ : GGMLRunner(backend, params_backend), control_net(version) {
control_net.init(params_ctx, tensor_storage_map, "");
}
diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp
index 0d344226..bbfab57d 100644
--- a/src/diffusion_model.hpp
+++ b/src/diffusion_model.hpp
@@ -68,10 +68,10 @@ struct UNetModel : public DiffusionModel {
UNetModelRunner unet;
UNetModel(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_SD1)
- : unet(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version) {
+ : unet(backend, params_backend, tensor_storage_map, "model.diffusion_model", version) {
}
std::string get_desc() override {
@@ -139,9 +139,9 @@ struct MMDiTModel : public DiffusionModel {
MMDiTRunner mmdit;
MMDiTModel(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {})
- : mmdit(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model") {
+ : mmdit(backend, params_backend, tensor_storage_map, "model.diffusion_model") {
}
std::string get_desc() override {
@@ -206,11 +206,11 @@ struct FluxModel : public DiffusionModel {
Flux::FluxRunner flux;
FluxModel(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_FLUX,
bool use_mask = false)
- : flux(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version, use_mask) {
+ : flux(backend, params_backend, tensor_storage_map, "model.diffusion_model", version, use_mask) {
}
std::string get_desc() override {
@@ -281,10 +281,10 @@ struct AnimaModel : public DiffusionModel {
Anima::AnimaRunner anima;
AnimaModel(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
- : prefix(prefix), anima(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
+ : prefix(prefix), anima(backend, params_backend, tensor_storage_map, prefix) {
}
std::string get_desc() override {
@@ -349,11 +349,11 @@ struct WanModel : public DiffusionModel {
WAN::WanRunner wan;
WanModel(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_WAN2)
- : prefix(prefix), wan(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
+ : prefix(prefix), wan(backend, params_backend, tensor_storage_map, prefix, version) {
}
std::string get_desc() override {
@@ -421,12 +421,12 @@ struct QwenImageModel : public DiffusionModel {
Qwen::QwenImageRunner qwen_image;
QwenImageModel(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_QWEN_IMAGE,
bool zero_cond_t = false)
- : prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version, zero_cond_t) {
+ : prefix(prefix), qwen_image(backend, params_backend, tensor_storage_map, prefix, version, zero_cond_t) {
}
std::string get_desc() override {
@@ -492,10 +492,10 @@ struct HiDreamO1Model : public DiffusionModel {
HiDreamO1::HiDreamO1Runner hidream_o1;
HiDreamO1Model(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string& prefix = "model")
- : prefix(prefix), hidream_o1(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
+ : prefix(prefix), hidream_o1(backend, params_backend, tensor_storage_map, prefix) {
}
std::string get_desc() override {
@@ -568,11 +568,11 @@ struct ZImageModel : public DiffusionModel {
ZImage::ZImageRunner z_image;
ZImageModel(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_Z_IMAGE)
- : prefix(prefix), z_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
+ : prefix(prefix), z_image(backend, params_backend, tensor_storage_map, prefix, version) {
}
std::string get_desc() override {
@@ -638,10 +638,10 @@ struct ErnieImageModel : public DiffusionModel {
ErnieImage::ErnieImageRunner ernie_image;
ErnieImageModel(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
- : prefix(prefix), ernie_image(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
+ : prefix(prefix), ernie_image(backend, params_backend, tensor_storage_map, prefix) {
}
std::string get_desc() override {
@@ -704,10 +704,10 @@ struct LTXAVModel : public DiffusionModel {
LTXV::LTXAVRunner ltxav;
LTXAVModel(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
- : prefix(prefix), ltxav(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
+ : prefix(prefix), ltxav(backend, params_backend, tensor_storage_map, prefix) {
}
std::string get_desc() override {
diff --git a/src/ernie_image.hpp b/src/ernie_image.hpp
index 931794f1..bea84cdf 100644
--- a/src/ernie_image.hpp
+++ b/src/ernie_image.hpp
@@ -331,10 +331,10 @@ namespace ErnieImage {
std::vector pe_vec;
ErnieImageRunner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "")
- : GGMLRunner(backend, offload_params_to_cpu) {
+ : GGMLRunner(backend, params_backend) {
ernie_params.num_layers = 0;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
diff --git a/src/esrgan.hpp b/src/esrgan.hpp
index f84b77a2..f54baca3 100644
--- a/src/esrgan.hpp
+++ b/src/esrgan.hpp
@@ -161,10 +161,10 @@ struct ESRGAN : public GGMLRunner {
int tile_size = 128; // avoid cuda OOM for 4gb VRAM
ESRGAN(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
int tile_size = 128,
const String2TensorStorage& tensor_storage_map = {})
- : GGMLRunner(backend, offload_params_to_cpu) {
+ : GGMLRunner(backend, params_backend) {
this->tile_size = tile_size;
}
diff --git a/src/flux.hpp b/src/flux.hpp
index 732a3719..2aac3be0 100644
--- a/src/flux.hpp
+++ b/src/flux.hpp
@@ -1189,12 +1189,12 @@ namespace Flux {
bool use_mask = false;
FluxRunner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "",
SDVersion version = VERSION_FLUX,
bool use_mask = false)
- : GGMLRunner(backend, offload_params_to_cpu), version(version), use_mask(use_mask) {
+ : GGMLRunner(backend, params_backend), version(version), use_mask(use_mask) {
flux_params.version = version;
flux_params.guidance_embed = false;
flux_params.depth = 0;
@@ -1564,7 +1564,7 @@ namespace Flux {
}
std::shared_ptr flux = std::make_shared(backend,
- false,
+ backend,
tensor_storage_map,
"model.diffusion_model",
VERSION_FLUX2,
diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index d2e655ce..8e2ed694 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -26,7 +26,7 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml.h"
-#include "ggml_extend_backend.hpp"
+#include "ggml_extend_backend.h"
#include "ggml_graph_cut.h"
#include "model.h"
@@ -73,48 +73,6 @@ __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const cha
}
}
-__STATIC_INLINE__ bool backend_name_exists(std::string name) {
- ggml_backend_load_all_once();
- const size_t device_count = ggml_backend_dev_count();
- for (size_t i = 0; i < device_count; ++i) {
- if (name == ggml_backend_dev_name(ggml_backend_dev_get(i))) {
- return true;
- }
- }
- return false;
-}
-
-__STATIC_INLINE__ std::string sanitize_backend_name(std::string name) {
- if (name == "" || backend_name_exists(name)) {
- return name;
- } else {
- LOG_WARN("Backend %s not found, using default backend", name.c_str());
- return "";
- }
-}
-
-__STATIC_INLINE__ std::string get_default_backend_name() {
- ggml_backend_load_all_once();
- // should pick the same backend as ggml_backend_init_best
- ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
- dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
- dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- if (dev == nullptr) {
- return "";
- }
- return ggml_backend_dev_name(dev);
-}
-
-__STATIC_INLINE__ ggml_backend_t init_named_backend(std::string name = "") {
- ggml_backend_load_all_once();
- LOG_DEBUG("Initializing backend: %s", name.c_str());
- if (name.empty()) {
- return ggml_backend_init_best();
- } else {
- return ggml_backend_init_by_name(name.c_str(), nullptr);
- }
-}
-
static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128");
// n-mode tensor-matrix product
@@ -190,7 +148,7 @@ __STATIC_INLINE__ void ggml_ext_im_set_randn_f32(ggml_tensor* tensor, std::share
uint32_t n = (uint32_t)ggml_nelements(tensor);
std::vector random_numbers = rng->randn(n);
for (uint32_t i = 0; i < n; i++) {
- ggml_set_f32_1d(tensor, i, random_numbers[i]);
+ ggml_ext_im_set_f32_1d(tensor, i, random_numbers[i]);
}
}
@@ -422,39 +380,6 @@ __STATIC_INLINE__ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const st
// file.close();
// }
-__STATIC_INLINE__ void copy_ggml_tensor(ggml_tensor* dst, ggml_tensor* src) {
- if (dst->type == src->type) {
- dst->nb[0] = src->nb[0];
- dst->nb[1] = src->nb[1];
- dst->nb[2] = src->nb[2];
- dst->nb[3] = src->nb[3];
-
- memcpy(((char*)dst->data), ((char*)src->data), ggml_nbytes(dst));
- return;
- }
- ggml_init_params params;
- params.mem_size = 10 * 1024 * 1024; // for padding
- params.mem_buffer = nullptr;
- params.no_alloc = false;
- ggml_context* ctx = ggml_init(params);
- if (!ctx) {
- LOG_ERROR("ggml_init() failed");
- return;
- }
- ggml_tensor* final = ggml_cpy(ctx, src, dst);
-
- ggml_cgraph* graph = ggml_new_graph(ctx);
- ggml_build_forward_expand(graph, final);
- ggml_graph_compute_with_ctx(ctx, graph, 1);
- ggml_free(ctx);
-}
-
-__STATIC_INLINE__ ggml_tensor* ggml_ext_dup_and_cpy_tensor(ggml_context* ctx, ggml_tensor* src) {
- ggml_tensor* dup = ggml_dup_tensor(ctx, src);
- copy_ggml_tensor(dup, src);
- return dup;
-}
-
__STATIC_INLINE__ float sigmoid(float x) {
return 1 / (1.0f + expf(-x));
}
@@ -2669,13 +2594,11 @@ protected:
public:
virtual std::string get_desc() = 0;
- GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false)
- : runtime_backend(backend) {
- if (!ggml_backend_is_cpu(runtime_backend) && offload_params_to_cpu) {
- params_backend = ggml_backend_cpu_init();
- } else {
- params_backend = runtime_backend;
- }
+ GGMLRunner(ggml_backend_t backend, ggml_backend_t params_backend)
+ : params_backend(params_backend),
+ runtime_backend(backend) {
+ GGML_ASSERT(runtime_backend != nullptr);
+ GGML_ASSERT(params_backend != nullptr);
alloc_params_ctx();
}
@@ -2684,9 +2607,6 @@ public:
free_compute_buffer();
free_params_ctx();
free_compute_ctx();
- if (params_backend != runtime_backend) {
- ggml_backend_free(params_backend);
- }
free_cache_ctx_and_buffer();
}
diff --git a/src/ggml_extend_backend.cpp b/src/ggml_extend_backend.cpp
new file mode 100644
index 00000000..4bf8268e
--- /dev/null
+++ b/src/ggml_extend_backend.cpp
@@ -0,0 +1,600 @@
+#include "ggml_extend_backend.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "util.h"
+
+static std::string trim_copy(const std::string& value) {
+ size_t begin = 0;
+ while (begin < value.size() && std::isspace(static_cast(value[begin]))) {
+ ++begin;
+ }
+ size_t end = value.size();
+ while (end > begin && std::isspace(static_cast(value[end - 1]))) {
+ --end;
+ }
+ return value.substr(begin, end - begin);
+}
+
+static std::string lower_copy(std::string value) {
+ std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) {
+ return static_cast(std::tolower(c));
+ });
+ return value;
+}
+
+static std::vector split_copy(const std::string& value, char delimiter) {
+ std::vector parts;
+ std::string part;
+ std::istringstream stream(value);
+ while (std::getline(stream, part, delimiter)) {
+ parts.push_back(part);
+ }
+ return parts;
+}
+
+static bool is_default_backend_token(const std::string& name) {
+ const std::string lower = lower_copy(trim_copy(name));
+ return lower.empty() || lower == "default" || lower == "auto";
+}
+
+static bool parse_backend_module(const std::string& raw_name, SDBackendModule* module) {
+ std::string name = lower_copy(trim_copy(raw_name));
+ name.erase(std::remove(name.begin(), name.end(), '-'), name.end());
+ name.erase(std::remove(name.begin(), name.end(), '_'), name.end());
+
+ if (name == "diffusion" || name == "model" || name == "unet" || name == "dit") {
+ *module = SDBackendModule::DIFFUSION;
+ return true;
+ }
+ if (name == "te" || name == "clip" || name == "text" || name == "textencoder" || name == "textencoders" || name == "conditioner" || name == "cond" || name == "llm" || name == "t5" || name == "t5xxl") {
+ *module = SDBackendModule::TE;
+ return true;
+ }
+ if (name == "clipvision" || name == "vision") {
+ *module = SDBackendModule::CLIP_VISION;
+ return true;
+ }
+ if (name == "vae" || name == "firststage" || name == "autoencoder" || name == "tae") {
+ *module = SDBackendModule::VAE;
+ return true;
+ }
+ if (name == "controlnet" || name == "control") {
+ *module = SDBackendModule::CONTROL_NET;
+ return true;
+ }
+ if (name == "photomaker" || name == "photomakerid" || name == "pmid" || name == "photo") {
+ *module = SDBackendModule::PHOTOMAKER;
+ return true;
+ }
+ if (name == "upscaler" || name == "esrgan" || name == "hires") {
+ *module = SDBackendModule::UPSCALER;
+ return true;
+ }
+ return false;
+}
+
+static std::string module_assignment_name(const SDBackendAssignment& assignment, SDBackendModule module) {
+ auto it = assignment.module_names.find(module);
+ if (it != assignment.module_names.end()) {
+ return it->second;
+ }
+ return assignment.default_name;
+}
+
+static std::string backend_cache_key(ggml_backend_t backend) {
+ if (backend == nullptr) {
+ return "";
+ }
+ ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+ if (dev != nullptr) {
+ return lower_copy(ggml_backend_dev_name(dev));
+ }
+ const char* backend_name = ggml_backend_name(backend);
+ return backend_name != nullptr ? lower_copy(backend_name) : "";
+}
+
+static std::string resolve_first_device_by_type(enum ggml_backend_dev_type type) {
+ ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
+ if (dev == nullptr) {
+ return "";
+ }
+ return ggml_backend_dev_name(dev);
+}
+
+static ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) {
+ if (tensor == nullptr) {
+ return nullptr;
+ }
+
+ return tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+}
+
+static bool ggml_backend_tensor_is_host_accessible(const struct ggml_tensor* tensor) {
+ if (tensor == nullptr || tensor->data == nullptr) {
+ return false;
+ }
+
+ ggml_backend_buffer_t buffer = ggml_backend_tensor_buffer(tensor);
+ return buffer == nullptr || ggml_backend_buffer_is_host(buffer);
+}
+
+static size_t ggml_backend_tensor_offset(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+ return static_cast(i0 * tensor->nb[0] + i1 * tensor->nb[1] + i2 * tensor->nb[2] + i3 * tensor->nb[3]);
+}
+
+template
+static void ggml_backend_tensor_write_scalar(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, T value) {
+ const size_t offset = ggml_backend_tensor_offset(tensor, i0, i1, i2, i3);
+
+ if (ggml_backend_tensor_is_host_accessible(tensor)) {
+ auto* dst = reinterpret_cast(reinterpret_cast(tensor->data) + offset);
+ *dst = value;
+ return;
+ }
+
+ ggml_backend_tensor_set(const_cast(tensor), &value, offset, sizeof(T));
+}
+
+static void ggml_set_f32_nd(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float value) {
+ switch (tensor->type) {
+ case GGML_TYPE_I8:
+ ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value));
+ break;
+ case GGML_TYPE_I16:
+ ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value));
+ break;
+ case GGML_TYPE_I32:
+ ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value));
+ break;
+ case GGML_TYPE_F16:
+ ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_fp16(value));
+ break;
+ case GGML_TYPE_BF16:
+ ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_bf16(value));
+ break;
+ case GGML_TYPE_F32:
+ ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, value);
+ break;
+ default:
+ GGML_ABORT("fatal error");
+ }
+}
+
+void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value) {
+ if (!ggml_is_contiguous(tensor)) {
+ int64_t id[4] = {0, 0, 0, 0};
+ ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
+ ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value);
+ return;
+ }
+
+ switch (tensor->type) {
+ case GGML_TYPE_I8:
+ ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value));
+ break;
+ case GGML_TYPE_I16:
+ ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value));
+ break;
+ case GGML_TYPE_I32:
+ ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value));
+ break;
+ case GGML_TYPE_F16:
+ ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_fp16(value));
+ break;
+ case GGML_TYPE_BF16:
+ ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_bf16(value));
+ break;
+ case GGML_TYPE_F32:
+ ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, value);
+ break;
+ default:
+ GGML_ABORT("fatal error");
+ }
+}
+
+static void ggml_backend_load_all_once() {
+ // If the registry already has devices and the CPU backend is present,
+ // assume either static registration or explicit host-side preloading has
+ // completed and avoid rescanning the default paths.
+ if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) {
+ return;
+ }
+ // In dynamic-backend mode the backend modules are discovered at runtime,
+ // so we must load them before asking for the CPU backend or its proc table.
+ // If the host preloaded only a subset of backends, allow one default-path
+ // scan so missing modules can still be discovered.
+ static std::once_flag once;
+ std::call_once(once, []() {
+ if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) {
+ return;
+ }
+ ggml_backend_load_all();
+ });
+}
+
+bool sd_backend_is(ggml_backend_t backend, const std::string& name) {
+ if (!backend) {
+ return false;
+ }
+ ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+ if (!dev) {
+ return false;
+ }
+ std::string dev_name = ggml_backend_dev_name(dev);
+ return lower_copy(dev_name).find(lower_copy(name)) != std::string::npos;
+}
+
+static std::string get_default_backend_name() {
+ ggml_backend_load_all_once();
+ // should pick the same backend preference as ggml_backend_init_best
+ std::string name = resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
+ if (!name.empty()) {
+ return name;
+ }
+ name = resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
+ if (!name.empty()) {
+ return name;
+ }
+ return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+}
+
+static std::string sd_resolve_backend_name(const std::string& name) {
+ ggml_backend_load_all_once();
+ std::string requested = trim_copy(name);
+ std::string lower = lower_copy(requested);
+
+ if (is_default_backend_token(lower)) {
+ return get_default_backend_name();
+ }
+ if (lower == "gpu") {
+ std::string result = resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
+ if (!result.empty()) {
+ return result;
+ }
+ return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
+ }
+
+ const size_t device_count = ggml_backend_dev_count();
+ for (size_t i = 0; i < device_count; ++i) {
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+ std::string dev_name = ggml_backend_dev_name(dev);
+ if (lower_copy(dev_name) == lower) {
+ return dev_name;
+ }
+ }
+
+ for (size_t i = 0; i < device_count; ++i) {
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+ std::string dev_name = ggml_backend_dev_name(dev);
+ std::string dev_lower = lower_copy(dev_name);
+ if (dev_lower.rfind(lower, 0) == 0) {
+ return dev_name;
+ }
+ }
+
+ return "";
+}
+
+static bool backend_name_exists(const std::string& name) {
+ return !sd_resolve_backend_name(name).empty();
+}
+
+static ggml_backend_t init_named_backend(const std::string& name) {
+ ggml_backend_load_all_once();
+ LOG_DEBUG("Initializing backend: %s", name.c_str());
+ if (trim_copy(name).empty()) {
+ return ggml_backend_init_best();
+ }
+
+ std::string resolved = sd_resolve_backend_name(name);
+ if (resolved.empty()) {
+ return nullptr;
+ }
+ return ggml_backend_init_by_name(resolved.c_str(), nullptr);
+}
+
+static ggml_backend_t sd_get_default_backend() {
+ ggml_backend_load_all_once();
+ static std::once_flag once;
+ std::call_once(once, []() {
+ size_t dev_count = ggml_backend_dev_count();
+ if (dev_count == 0) {
+ LOG_ERROR("No devices found!");
+ } else {
+ LOG_DEBUG("Found %zu backend devices:", dev_count);
+ for (size_t i = 0; i < dev_count; ++i) {
+ auto dev = ggml_backend_dev_get(i);
+ LOG_DEBUG("#%zu: %s", i, ggml_backend_dev_name(dev));
+ }
+ }
+ });
+
+ ggml_backend_t backend = nullptr;
+ const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE");
+ if (SD_VK_DEVICE != nullptr) {
+ std::string sd_vk_device_str = SD_VK_DEVICE;
+ try {
+ unsigned long long device = std::stoull(sd_vk_device_str);
+ std::string vk_device_name = "Vulkan" + std::to_string(device);
+ if (backend_name_exists(vk_device_name)) {
+ LOG_INFO("Selecting %s as main device by env var SD_VK_DEVICE", vk_device_name.c_str());
+ backend = init_named_backend(vk_device_name);
+ if (!backend) {
+ LOG_WARN("Device %s requested by SD_VK_DEVICE failed to init. Falling back to the default device.", vk_device_name.c_str());
+ }
+ } else {
+ LOG_WARN("Device %s requested by SD_VK_DEVICE was not found. Falling back to the default device.", vk_device_name.c_str());
+ }
+ } catch (const std::invalid_argument&) {
+ LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to the default device.", SD_VK_DEVICE);
+ } catch (const std::out_of_range&) {
+ LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to the default device.", SD_VK_DEVICE);
+ }
+ }
+
+ if (!backend) {
+ std::string dev_name = get_default_backend_name();
+ backend = init_named_backend(dev_name);
+ if (!backend && !dev_name.empty()) {
+ LOG_WARN("device %s failed to init", dev_name.c_str());
+ }
+ }
+
+ if (!backend) {
+ LOG_WARN("loading CPU backend");
+ backend = ggml_backend_cpu_init();
+ }
+
+ if (ggml_backend_is_cpu(backend)) {
+ LOG_DEBUG("Using CPU backend");
+ }
+
+ return backend;
+}
+
+static bool sd_parse_backend_assignment(const std::string& spec, SDBackendAssignment* assignment, std::string* error) {
+ if (assignment == nullptr) {
+ return false;
+ }
+
+ *assignment = {};
+ const std::string in = trim_copy(spec);
+ if (in.empty()) {
+ return true;
+ }
+
+ for (const std::string& raw_part : split_copy(in, ',')) {
+ const std::string part = trim_copy(raw_part);
+ if (part.empty()) {
+ continue;
+ }
+
+ const size_t eq = part.find('=');
+ if (eq == std::string::npos) {
+ assignment->set_default(part);
+ continue;
+ }
+
+ const std::string key = trim_copy(part.substr(0, eq));
+ const std::string value = trim_copy(part.substr(eq + 1));
+ if (key.empty() || value.empty()) {
+ if (error != nullptr) {
+ *error = "invalid backend assignment '" + part + "'";
+ }
+ return false;
+ }
+
+ const std::string key_lower = lower_copy(key);
+ if (key_lower == "all" || key_lower == "default" || key_lower == "*") {
+ assignment->set_default(value);
+ continue;
+ }
+
+ SDBackendModule module = SDBackendModule::DIFFUSION;
+ if (!parse_backend_module(key, &module)) {
+ if (error != nullptr) {
+ *error = "unknown backend module '" + key + "'";
+ }
+ return false;
+ }
+ assignment->set_module(module, value);
+ }
+ return true;
+}
+
+bool SDBackendAssignment::empty() const {
+ return default_name.empty() && module_names.empty();
+}
+
+std::string SDBackendAssignment::get(SDBackendModule module) const {
+ return module_assignment_name(*this, module);
+}
+
+void SDBackendAssignment::set_default(const std::string& name) {
+ default_name = trim_copy(name);
+}
+
+void SDBackendAssignment::set_module(SDBackendModule module, const std::string& name) {
+ module_names[module] = trim_copy(name);
+}
+
+void SDBackendHandleDeleter::operator()(ggml_backend_t backend) const {
+ ggml_backend_free(backend);
+}
+
+SDBackendManager::~SDBackendManager() {
+ reset();
+}
+
+void SDBackendManager::reset() {
+ backends_.clear();
+ runtime_assignment_ = {};
+ params_assignment_ = {};
+}
+
+ggml_backend_t SDBackendManager::runtime_backend(SDBackendModule module) {
+ return init_cached_backend(runtime_assignment_.get(module));
+}
+
+ggml_backend_t SDBackendManager::params_backend(SDBackendModule module) {
+ std::string name = params_assignment_.get(module);
+ if (name.empty()) {
+ return runtime_backend(module);
+ }
+ return init_cached_backend(name);
+}
+
+bool SDBackendManager::runtime_backend_is_cpu(SDBackendModule module) {
+ return ggml_backend_is_cpu(runtime_backend(module));
+}
+
+bool SDBackendManager::params_backend_is_cpu(SDBackendModule module) {
+ return ggml_backend_is_cpu(params_backend(module));
+}
+
+bool SDBackendManager::runtime_backend_supports_host_buffer(SDBackendModule module) {
+ ggml_backend_t backend = runtime_backend(module);
+ if (backend == nullptr) {
+ return false;
+ }
+ if (ggml_backend_is_cpu(backend)) {
+ return true;
+ }
+ ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+ if (dev == nullptr) {
+ return false;
+ }
+ ggml_backend_dev_props props;
+ ggml_backend_dev_get_props(dev, &props);
+ return props.caps.buffer_from_host_ptr;
+}
+
+bool SDBackendManager::init(const char* backend_spec,
+ const char* params_backend_spec,
+ bool offload_params_to_cpu,
+ bool keep_clip_on_cpu,
+ bool keep_vae_on_cpu,
+ bool keep_control_net_on_cpu,
+ std::string* error) {
+ reset();
+
+ if (!sd_parse_backend_assignment(SAFE_STR(backend_spec), &runtime_assignment_, error)) {
+ return false;
+ }
+ if (!sd_parse_backend_assignment(SAFE_STR(params_backend_spec), ¶ms_assignment_, error)) {
+ return false;
+ }
+
+ if (runtime_assignment_.empty()) {
+ if (keep_clip_on_cpu) {
+ runtime_assignment_.set_module(SDBackendModule::TE, "cpu");
+ }
+ if (keep_vae_on_cpu) {
+ runtime_assignment_.set_module(SDBackendModule::VAE, "cpu");
+ }
+ if (keep_control_net_on_cpu) {
+ runtime_assignment_.set_module(SDBackendModule::CONTROL_NET, "cpu");
+ }
+ }
+
+ if (params_assignment_.empty() && offload_params_to_cpu) {
+ params_assignment_.set_default("cpu");
+ }
+
+ return validate(error);
+}
+
+bool SDBackendManager::validate(std::string* error) const {
+ auto validate_name = [&](const std::string& name) -> bool {
+ if (is_default_backend_token(name)) {
+ return true;
+ }
+ if (!sd_resolve_backend_name(name).empty()) {
+ return true;
+ }
+ if (error != nullptr) {
+ *error = "backend '" + name + "' was not found";
+ }
+ return false;
+ };
+
+ if (!validate_name(runtime_assignment_.default_name) ||
+ !validate_name(params_assignment_.default_name)) {
+ return false;
+ }
+ for (const auto& kv : runtime_assignment_.module_names) {
+ if (!validate_name(kv.second)) {
+ return false;
+ }
+ }
+ for (const auto& kv : params_assignment_.module_names) {
+ if (!validate_name(kv.second)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+ggml_backend_t SDBackendManager::init_cached_backend(const std::string& name) {
+ std::string resolved = sd_resolve_backend_name(name);
+ std::string key = lower_copy(resolved);
+ ggml_backend_t backend = nullptr;
+
+ if (!key.empty()) {
+ auto it = backends_.find(key);
+ if (it != backends_.end()) {
+ return it->second.get();
+ }
+ } else if (!is_default_backend_token(name)) {
+ LOG_ERROR("backend '%s' was not found", name.c_str());
+ return nullptr;
+ }
+
+ backend = is_default_backend_token(name) ? sd_get_default_backend() : init_named_backend(resolved);
+ if (backend == nullptr) {
+ LOG_ERROR("failed to initialize backend '%s'", name.c_str());
+ return nullptr;
+ }
+
+ std::string actual_key = backend_cache_key(backend);
+ if (actual_key.empty()) {
+ actual_key = !key.empty() ? key : lower_copy(trim_copy(name));
+ }
+
+ auto it = backends_.find(actual_key);
+ if (it != backends_.end()) {
+ ggml_backend_free(backend);
+ return it->second.get();
+ }
+
+ SDBackendHandle handle(backend);
+ backends_.emplace(actual_key, std::move(handle));
+ return backend;
+}
+
+const char* sd_backend_module_name(SDBackendModule module) {
+ switch (module) {
+ case SDBackendModule::DIFFUSION:
+ return "diffusion";
+ case SDBackendModule::TE:
+ return "te";
+ case SDBackendModule::CLIP_VISION:
+ return "clip_vision";
+ case SDBackendModule::VAE:
+ return "vae";
+ case SDBackendModule::CONTROL_NET:
+ return "controlnet";
+ case SDBackendModule::PHOTOMAKER:
+ return "photomaker";
+ case SDBackendModule::UPSCALER:
+ return "upscaler";
+ }
+ return "unknown";
+}
diff --git a/src/ggml_extend_backend.h b/src/ggml_extend_backend.h
new file mode 100644
index 00000000..b98efc10
--- /dev/null
+++ b/src/ggml_extend_backend.h
@@ -0,0 +1,77 @@
+#ifndef __SD_GGML_EXTEND_BACKEND_H__
+#define __SD_GGML_EXTEND_BACKEND_H__
+
+#include
+#include
+#include
+#include
+#include
+
+#include "ggml-backend.h"
+#include "ggml-cpu.h"
+#include "ggml.h"
+
+enum class SDBackendModule {
+ DIFFUSION,
+ TE,
+ CLIP_VISION,
+ VAE,
+ CONTROL_NET,
+ PHOTOMAKER,
+ UPSCALER,
+};
+
+struct SDBackendAssignment {
+ std::string default_name;
+ std::unordered_map module_names;
+
+ bool empty() const;
+ std::string get(SDBackendModule module) const;
+ void set_default(const std::string& name);
+ void set_module(SDBackendModule module, const std::string& name);
+};
+
+struct SDBackendHandleDeleter {
+ void operator()(ggml_backend_t backend) const;
+};
+
+using SDBackendHandle = std::unique_ptr;
+
+class SDBackendManager {
+private:
+ SDBackendAssignment runtime_assignment_;
+ SDBackendAssignment params_assignment_;
+ std::unordered_map backends_;
+
+public:
+ SDBackendManager() = default;
+ ~SDBackendManager();
+
+ SDBackendManager(const SDBackendManager&) = delete;
+ SDBackendManager& operator=(const SDBackendManager&) = delete;
+
+ bool init(const char* backend_spec,
+ const char* params_backend_spec,
+ bool offload_params_to_cpu,
+ bool keep_clip_on_cpu,
+ bool keep_vae_on_cpu,
+ bool keep_control_net_on_cpu,
+ std::string* error);
+ void reset();
+
+ ggml_backend_t runtime_backend(SDBackendModule module);
+ ggml_backend_t params_backend(SDBackendModule module);
+
+ bool runtime_backend_is_cpu(SDBackendModule module);
+ bool params_backend_is_cpu(SDBackendModule module);
+ bool runtime_backend_supports_host_buffer(SDBackendModule module);
+
+private:
+ bool validate(std::string* error) const;
+ ggml_backend_t init_cached_backend(const std::string& name);
+};
+
+bool sd_backend_is(ggml_backend_t backend, const std::string& name);
+const char* sd_backend_module_name(SDBackendModule module);
+void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value);
+#endif
diff --git a/src/ggml_extend_backend.hpp b/src/ggml_extend_backend.hpp
deleted file mode 100644
index 50158c88..00000000
--- a/src/ggml_extend_backend.hpp
+++ /dev/null
@@ -1,298 +0,0 @@
-#ifndef __GGML_EXTEND_BACKEND_HPP__
-#define __GGML_EXTEND_BACKEND_HPP__
-
-#include
-#include
-
-#include "ggml-backend.h"
-#include "ggml.h"
-
-#ifndef __STATIC_INLINE__
-#define __STATIC_INLINE__ static inline
-#endif
-
-inline void ggml_backend_load_all_once() {
- // If the registry already has devices and the CPU backend is present,
- // assume either static registration or explicit host-side preloading has
- // completed and avoid rescanning the default paths.
- if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) {
- return;
- }
- // In dynamic-backend mode the backend modules are discovered at runtime,
- // so we must load them before asking for the CPU backend or its proc table.
- // If the host preloaded only a subset of backends, allow one default-path
- // scan so missing modules can still be discovered.
- static std::once_flag once;
- std::call_once(once, []() {
- if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) {
- return;
- }
- ggml_backend_load_all();
- });
-}
-
-// Do not gate this branch on GGML_CPU or GGML_CPU_ALL_VARIANTS:
-// those are CMake options used to configure ggml itself, but they are not
-// exported as PUBLIC compile definitions to stable-diffusion in backend-DL mode.
-// In practice, this target can reliably see GGML_BACKEND_DL, but not whether
-// the CPU backend was compiled as a loadable module. We therefore use runtime
-// backend discovery instead of compile-time assumptions.
-
-__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_cpu_reg() {
- ggml_backend_reg_t reg = ggml_backend_reg_by_name("CPU");
- if (reg != nullptr) {
- return reg;
- }
-
- ggml_backend_load_all_once();
- return ggml_backend_reg_by_name("CPU");
-}
-
-__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_reg_from_backend(ggml_backend_t backend) {
- if (backend != nullptr) {
- ggml_backend_dev_t device = ggml_backend_get_device(backend);
- if (device != nullptr) {
- return ggml_backend_dev_backend_reg(device);
- }
- }
-
- return ggml_backend_cpu_reg();
-}
-
-__STATIC_INLINE__ ggml_backend_t ggml_backend_cpu_init() {
- ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
- if (backend != nullptr) {
- return backend;
- }
-
- ggml_backend_load_all_once();
- return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
-}
-
-__STATIC_INLINE__ bool ggml_backend_is_cpu(ggml_backend_t backend) {
- if (backend == nullptr) {
- return false;
- }
-
- ggml_backend_dev_t device = ggml_backend_get_device(backend);
- if (device != nullptr) {
- return ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU;
- }
-
- const char* backend_name = ggml_backend_name(backend);
- return backend_name != nullptr && std::strcmp(backend_name, "CPU") == 0;
-}
-
-__STATIC_INLINE__ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
- ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu);
- if (reg == nullptr) {
- return;
- }
-
- auto fn = reinterpret_cast(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"));
- if (fn != nullptr) {
- fn(backend_cpu, n_threads);
- }
-}
-
-using __ggml_backend_cpu_set_threadpool_t = void (*)(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
-
-__STATIC_INLINE__ void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
- ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu);
- if (reg == nullptr) {
- return;
- }
-
- auto fn = reinterpret_cast<__ggml_backend_cpu_set_threadpool_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"));
- if (fn != nullptr) {
- fn(backend_cpu, threadpool);
- }
-}
-
-__STATIC_INLINE__ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void* abort_callback_data) {
- ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu);
- if (reg == nullptr) {
- return;
- }
-
- auto fn = reinterpret_cast(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"));
- if (fn != nullptr) {
- fn(backend_cpu, abort_callback, abort_callback_data);
- }
-}
-
-__STATIC_INLINE__ ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) {
- if (tensor == nullptr) {
- return nullptr;
- }
-
- return tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-}
-
-__STATIC_INLINE__ bool ggml_backend_tensor_is_host_accessible(const struct ggml_tensor* tensor) {
- if (tensor == nullptr || tensor->data == nullptr) {
- return false;
- }
-
- ggml_backend_buffer_t buffer = ggml_backend_tensor_buffer(tensor);
- return buffer == nullptr || ggml_backend_buffer_is_host(buffer);
-}
-
-__STATIC_INLINE__ size_t ggml_backend_tensor_offset(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
- return (size_t)(i0 * tensor->nb[0] + i1 * tensor->nb[1] + i2 * tensor->nb[2] + i3 * tensor->nb[3]);
-}
-
-template
-__STATIC_INLINE__ void ggml_backend_tensor_write_scalar(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, T value) {
- const size_t offset = ggml_backend_tensor_offset(tensor, i0, i1, i2, i3);
-
- if (ggml_backend_tensor_is_host_accessible(tensor)) {
- auto* dst = reinterpret_cast(reinterpret_cast(tensor->data) + offset);
- *dst = value;
- return;
- }
-
- ggml_backend_tensor_set(const_cast(tensor), &value, offset, sizeof(T));
-}
-
-__STATIC_INLINE__ void ggml_set_f32_nd(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float value) {
- switch (tensor->type) {
- case GGML_TYPE_I8:
- ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value));
- break;
- case GGML_TYPE_I16:
- ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value));
- break;
- case GGML_TYPE_I32:
- ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value));
- break;
- case GGML_TYPE_F16:
- ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_fp16(value));
- break;
- case GGML_TYPE_BF16:
- ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_bf16(value));
- break;
- case GGML_TYPE_F32:
- ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, value);
- break;
- default:
- GGML_ABORT("fatal error");
- }
-}
-
-__STATIC_INLINE__ void ggml_set_f32_1d(const struct ggml_tensor* tensor, int i, float value) {
- if (!ggml_is_contiguous(tensor)) {
- int64_t id[4] = {0, 0, 0, 0};
- ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
- ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value);
- return;
- }
-
- switch (tensor->type) {
- case GGML_TYPE_I8:
- ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value));
- break;
- case GGML_TYPE_I16:
- ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value));
- break;
- case GGML_TYPE_I32:
- ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value));
- break;
- case GGML_TYPE_F16:
- ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_fp16(value));
- break;
- case GGML_TYPE_BF16:
- ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_bf16(value));
- break;
- case GGML_TYPE_F32:
- ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, value);
- break;
- default:
- GGML_ABORT("fatal error");
- }
-}
-
-__STATIC_INLINE__ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context* ctx, struct ggml_cgraph* cgraph, int n_threads) {
- (void)ctx;
-
- // The legacy ggml_graph_compute_with_ctx() symbol lives in ggml-cpu, but
- // the backend proc table does not expose it in GGML_BACKEND_DL mode.
- // Recreate the old behavior by initializing the CPU backend explicitly and
- // executing the graph through the generic backend API.
- ggml_backend_t backend = ggml_backend_cpu_init();
- if (backend == nullptr) {
- return GGML_STATUS_ALLOC_FAILED;
- }
-
- ggml_backend_cpu_set_n_threads(backend, n_threads);
-
- const enum ggml_status status = ggml_backend_graph_compute(backend, cgraph);
- ggml_backend_free(backend);
-
- return status;
-}
-
-__STATIC_INLINE__ ggml_tensor* ggml_set_f32(struct ggml_tensor* tensor, float value) {
- GGML_ASSERT(tensor != nullptr);
-
- if (ggml_backend_tensor_is_host_accessible(tensor) && ggml_is_contiguous(tensor)) {
- const int64_t nelements = ggml_nelements(tensor);
-
- switch (tensor->type) {
- case GGML_TYPE_I8: {
- auto* data = reinterpret_cast(tensor->data);
- const int8_t v = static_cast(value);
- for (int64_t i = 0; i < nelements; ++i) {
- data[i] = v;
- }
- } break;
- case GGML_TYPE_I16: {
- auto* data = reinterpret_cast(tensor->data);
- const int16_t v = static_cast(value);
- for (int64_t i = 0; i < nelements; ++i) {
- data[i] = v;
- }
- } break;
- case GGML_TYPE_I32: {
- auto* data = reinterpret_cast(tensor->data);
- const int32_t v = static_cast(value);
- for (int64_t i = 0; i < nelements; ++i) {
- data[i] = v;
- }
- } break;
- case GGML_TYPE_F16: {
- auto* data = reinterpret_cast(tensor->data);
- const ggml_fp16_t v = ggml_fp32_to_fp16(value);
- for (int64_t i = 0; i < nelements; ++i) {
- data[i] = v;
- }
- } break;
- case GGML_TYPE_BF16: {
- auto* data = reinterpret_cast(tensor->data);
- const ggml_bf16_t v = ggml_fp32_to_bf16(value);
- for (int64_t i = 0; i < nelements; ++i) {
- data[i] = v;
- }
- } break;
- case GGML_TYPE_F32: {
- auto* data = reinterpret_cast(tensor->data);
- for (int64_t i = 0; i < nelements; ++i) {
- data[i] = value;
- }
- } break;
- default:
- GGML_ABORT("fatal error");
- }
-
- return tensor;
- }
-
- const int64_t nelements = ggml_nelements(tensor);
- for (int64_t i = 0; i < nelements; ++i) {
- ggml_set_f32_1d(tensor, static_cast(i), value);
- }
-
- return tensor;
-}
-
-#endif
diff --git a/src/hidream_o1.hpp b/src/hidream_o1.hpp
index 908f2de3..d72739d5 100644
--- a/src/hidream_o1.hpp
+++ b/src/hidream_o1.hpp
@@ -279,10 +279,10 @@ namespace HiDreamO1 {
std::array, 4> pos_embed_weight_data_;
HiDreamO1VisionRunner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string& prefix = "model.visual")
- : GGMLRunner(backend, offload_params_to_cpu),
+ : GGMLRunner(backend, params_backend),
params(make_hidream_o1_params()),
model(std::make_shared(false, params.llm.vision)) {
model->init(params_ctx, tensor_storage_map, prefix);
@@ -336,10 +336,10 @@ namespace HiDreamO1 {
std::vector attention_mask_vec;
HiDreamO1Runner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string& prefix = "model")
- : GGMLRunner(backend, offload_params_to_cpu),
+ : GGMLRunner(backend, params_backend),
params(make_hidream_o1_params()) {
model = HiDreamO1Model(params);
model.init(params_ctx, tensor_storage_map, prefix);
@@ -461,9 +461,9 @@ namespace HiDreamO1 {
std::shared_ptr vision_runner;
HiDreamO1Conditioner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {})
- : vision_runner(std::make_shared(backend, offload_params_to_cpu, tensor_storage_map)) {}
+ : vision_runner(std::make_shared(backend, params_backend, tensor_storage_map)) {}
void get_param_tensors(std::map& tensors) override {
vision_runner->get_param_tensors(tensors);
diff --git a/src/llm.hpp b/src/llm.hpp
index bb83312c..ab673b20 100644
--- a/src/llm.hpp
+++ b/src/llm.hpp
@@ -1166,11 +1166,11 @@ namespace LLM {
public:
LLMRunner(LLMArch arch,
ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
bool enable_vision_ = false)
- : GGMLRunner(backend, offload_params_to_cpu), enable_vision(enable_vision_) {
+ : GGMLRunner(backend, params_backend), enable_vision(enable_vision_) {
params.arch = arch;
if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) {
params.head_dim = 128;
@@ -1477,11 +1477,11 @@ namespace LLM {
LLMEmbedder(LLMArch arch,
ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "",
bool enable_vision = false)
- : model(arch, backend, offload_params_to_cpu, tensor_storage_map, prefix, enable_vision) {
+ : model(arch, backend, params_backend, tensor_storage_map, prefix, enable_vision) {
if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) {
tokenizer = std::make_shared();
} else {
@@ -1731,7 +1731,7 @@ namespace LLM {
std::shared_ptr llm = std::make_shared(arch,
backend,
- true,
+ backend,
tensor_storage_map,
"text_encoders.llm",
true);
diff --git a/src/lora.hpp b/src/lora.hpp
index b57bc422..3d2b7699 100644
--- a/src/lora.hpp
+++ b/src/lora.hpp
@@ -22,10 +22,11 @@ struct LoraModel : public GGMLRunner {
LoraModel(const std::string& lora_id,
ggml_backend_t backend,
+ ggml_backend_t params_backend,
const std::string& file_path = "",
std::string prefix = "",
SDVersion version = VERSION_COUNT)
- : lora_id(lora_id), file_path(file_path), GGMLRunner(backend, false) {
+ : lora_id(lora_id), file_path(file_path), GGMLRunner(backend, params_backend) {
prefix = "lora." + prefix;
if (!model_loader.init_from_file_and_convert_name(file_path, prefix, version)) {
load_failed = true;
diff --git a/src/ltx_audio_vae.h b/src/ltx_audio_vae.h
index d5ee30bc..d1d765d7 100644
--- a/src/ltx_audio_vae.h
+++ b/src/ltx_audio_vae.h
@@ -963,10 +963,10 @@ namespace LTXV {
LTXAudioVAE model;
LTXAudioVAERunner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map,
const std::string& prefix = "")
- : GGMLRunner(backend, offload_params_to_cpu),
+ : GGMLRunner(backend, params_backend),
config(LTXAudioVAEConfig::detect_from_weights(tensor_storage_map)),
model(config) {
model.init(params_ctx, tensor_storage_map, prefix);
@@ -1086,7 +1086,7 @@ namespace LTXV {
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
auto ltx_audio_vae = std::make_shared(backend,
- false,
+ backend,
tensor_storage_map,
prefix);
diff --git a/src/ltx_vae.hpp b/src/ltx_vae.hpp
index 8bcc1ca8..a4d47b50 100644
--- a/src/ltx_vae.hpp
+++ b/src/ltx_vae.hpp
@@ -1037,7 +1037,7 @@ struct LTXVideoVAE : public VAE {
LTXVAE::VideoVAE vae;
LTXVideoVAE(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map,
const std::string& prefix,
bool decode_only = true,
@@ -1053,7 +1053,7 @@ struct LTXVideoVAE : public VAE {
patch_size,
tensor_storage_map,
prefix),
- VAE(version, backend, offload_params_to_cpu) {
+ VAE(version, backend, params_backend) {
vae.init(params_ctx, tensor_storage_map, prefix);
decode_timestep_tensor.values()[0] = vae.decode_timestep;
}
@@ -1176,7 +1176,7 @@ struct LTXVideoVAE : public VAE {
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
std::shared_ptr vae = std::make_shared(backend,
- false,
+ backend,
tensor_storage_map,
"first_stage_model",
true,
diff --git a/src/ltxv.hpp b/src/ltxv.hpp
index 7cb52a1f..50142cb1 100644
--- a/src/ltxv.hpp
+++ b/src/ltxv.hpp
@@ -1450,10 +1450,10 @@ namespace LTXV {
}
LTXAVRunner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string& prefix = "model.diffusion_model")
- : GGMLRunner(backend, offload_params_to_cpu),
+ : GGMLRunner(backend, params_backend),
prefix(prefix),
params(),
model(params) {
@@ -1864,7 +1864,7 @@ namespace LTXV {
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
std::shared_ptr ltxav = std::make_shared(backend,
- false,
+ backend,
tensor_storage_map,
"model.diffusion_model");
diff --git a/src/mmdit.hpp b/src/mmdit.hpp
index e57041dc..6fcd732e 100644
--- a/src/mmdit.hpp
+++ b/src/mmdit.hpp
@@ -828,10 +828,10 @@ struct MMDiTRunner : public GGMLRunner {
MMDiT mmdit;
MMDiTRunner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "")
- : GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_storage_map) {
+ : GGMLRunner(backend, params_backend), mmdit(tensor_storage_map) {
mmdit.init(params_ctx, tensor_storage_map, prefix);
}
@@ -934,7 +934,7 @@ struct MMDiTRunner : public GGMLRunner {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_F16;
- std::shared_ptr mmdit = std::make_shared(backend, false);
+ std::shared_ptr mmdit = std::make_shared(backend, backend);
{
LOG_INFO("loading from '%s'", file_path.c_str());
diff --git a/src/model.cpp b/src/model.cpp
index b3aa8b0d..9929605e 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -24,7 +24,7 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml.h"
-#include "ggml_extend_backend.hpp"
+#include "ggml_extend_backend.h"
#include "zip.h"
#include "name_conversion.h"
diff --git a/src/pmid.hpp b/src/pmid.hpp
index f19a8c3c..2a9d2da7 100644
--- a/src/pmid.hpp
+++ b/src/pmid.hpp
@@ -411,13 +411,13 @@ public:
public:
PhotoMakerIDEncoder(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
SDVersion version = VERSION_SDXL,
PMVersion pm_v = PM_VERSION_1,
float sty = 20.f)
- : GGMLRunner(backend, offload_params_to_cpu),
+ : GGMLRunner(backend, params_backend),
version(version),
pm_version(pm_v),
style_strength(sty) {
@@ -568,11 +568,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
bool applied = false;
PhotoMakerIDEmbed(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
ModelLoader* ml,
const std::string& file_path = "",
const std::string& prefix = "")
- : file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) {
+ : file_path(file_path), GGMLRunner(backend, params_backend), model_loader(ml) {
if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) {
load_failed = true;
}
diff --git a/src/qwen_image.hpp b/src/qwen_image.hpp
index 35d32109..73c1f9ae 100644
--- a/src/qwen_image.hpp
+++ b/src/qwen_image.hpp
@@ -488,12 +488,12 @@ namespace Qwen {
SDVersion version;
QwenImageRunner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "",
SDVersion version = VERSION_QWEN_IMAGE,
bool zero_cond_t = false)
- : GGMLRunner(backend, offload_params_to_cpu) {
+ : GGMLRunner(backend, params_backend) {
qwen_image_params.num_layers = 0;
qwen_image_params.zero_cond_t = zero_cond_t;
for (auto pair : tensor_storage_map) {
@@ -686,7 +686,7 @@ namespace Qwen {
}
std::shared_ptr qwen_image = std::make_shared(backend,
- false,
+ backend,
tensor_storage_map,
"model.diffusion_model",
VERSION_QWEN_IMAGE);
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 8a08f12c..ac2ff224 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -116,11 +116,7 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) {
class StableDiffusionGGML {
public:
std::vector mmap_tensor_store;
- ggml_backend_t backend = nullptr; // general backend
- ggml_backend_t clip_backend = nullptr;
- ggml_backend_t control_net_backend = nullptr;
- ggml_backend_t vae_backend = nullptr;
- ggml_backend_t audio_backend = nullptr;
+ SDBackendManager backend_manager;
SDVersion version;
bool vae_decode_only = false;
@@ -156,6 +152,8 @@ public:
bool offload_params_to_cpu = false;
float max_vram = 0.f;
bool use_pmid = false;
+ std::string backend_spec;
+ std::string params_backend_spec;
bool is_using_v_parameterization = false;
bool is_using_edm_v_parameterization = false;
@@ -169,24 +167,44 @@ public:
StableDiffusionGGML() = default;
- ~StableDiffusionGGML() {
- if (clip_backend != backend) {
- ggml_backend_free(clip_backend);
+ ~StableDiffusionGGML() = default;
+
+ ggml_backend_t backend_for(SDBackendModule module) {
+ ggml_backend_t module_backend = backend_manager.runtime_backend(module);
+ if (module_backend == nullptr) {
+ LOG_ERROR("failed to initialize %s backend", sd_backend_module_name(module));
}
- if (control_net_backend != backend) {
- ggml_backend_free(control_net_backend);
- }
- if (vae_backend != backend) {
- ggml_backend_free(vae_backend);
- }
- if (audio_backend != nullptr && audio_backend != backend) {
- ggml_backend_free(audio_backend);
- }
- ggml_backend_free(backend);
+ return module_backend;
}
- void init_backend() {
- backend = sd_get_default_backend();
+ ggml_backend_t params_backend_for(SDBackendModule module) {
+ ggml_backend_t module_backend = backend_manager.params_backend(module);
+ if (module_backend == nullptr) {
+ LOG_ERROR("failed to initialize %s params backend", sd_backend_module_name(module));
+ }
+ return module_backend;
+ }
+
+ bool ensure_backend_pair(SDBackendModule module) {
+ if (backend_for(module) == nullptr) {
+ return false;
+ }
+ return params_backend_for(module) != nullptr;
+ }
+
+ bool init_backend(const sd_ctx_params_t* sd_ctx_params) {
+ std::string error;
+ if (!backend_manager.init(sd_ctx_params->backend,
+ sd_ctx_params->params_backend,
+ sd_ctx_params->offload_params_to_cpu,
+ sd_ctx_params->keep_clip_on_cpu,
+ sd_ctx_params->keep_vae_on_cpu,
+ sd_ctx_params->keep_control_net_on_cpu,
+ &error)) {
+ LOG_ERROR("backend config failed: %s", error.c_str());
+ return false;
+ }
+ return ensure_backend_pair(SDBackendModule::DIFFUSION);
}
std::shared_ptr get_rng(rng_type_t rng_type) {
@@ -205,6 +223,8 @@ public:
free_params_immediately = sd_ctx_params->free_params_immediately;
offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu;
max_vram = sd_ctx_params->max_vram;
+ backend_spec = SAFE_STR(sd_ctx_params->backend);
+ params_backend_spec = SAFE_STR(sd_ctx_params->params_backend);
bool use_tae = false;
bool use_audio_vae = false;
@@ -218,8 +238,10 @@ public:
ggml_log_set(ggml_log_callback_default, nullptr);
- init_backend();
- max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend);
+ if (!init_backend(sd_ctx_params)) {
+ return false;
+ }
+ max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend_for(SDBackendModule::DIFFUSION));
ModelLoader model_loader;
@@ -393,7 +415,6 @@ public:
std::map mmap_able_tensors;
bool enable_mmap_tensors = false;
- bool main_backend_mmap = false;
bool needs_writable_mmap = false;
if (sd_ctx_params->enable_mmap) {
if (apply_lora_immediately) {
@@ -401,21 +422,19 @@ public:
LOG_WARN("in mode 'immediately', LoRAs will cause extra memory usage with mmap");
}
enable_mmap_tensors = true;
- if (offload_params_to_cpu) {
- main_backend_mmap = true;
- } else {
- ggml_backend_dev_t dev = ggml_backend_get_device(backend);
- struct ggml_backend_dev_props props;
- ggml_backend_dev_get_props(dev, &props);
- main_backend_mmap = props.caps.buffer_from_host_ptr;
- }
}
// split definition to avoid msvc choking on the extra parameter handling
- auto get_param_tensors_p = [&](auto&& model, bool force_cpu, const char* prefix) {
+ auto module_can_mmap = [&](SDBackendModule module) {
+ return enable_mmap_tensors &&
+ (backend_manager.runtime_backend_is_cpu(module) ||
+ backend_manager.params_backend_is_cpu(module) ||
+ backend_manager.runtime_backend_supports_host_buffer(module));
+ };
+
+ auto get_param_tensors_p = [&](auto&& model, bool do_mmap, const char* prefix) {
std::map temp;
model->get_param_tensors(temp, prefix);
- bool do_mmap = enable_mmap_tensors && (main_backend_mmap || force_cpu);
for (const auto& [key, tensor] : temp) {
tensors[key] = tensor;
if (do_mmap) {
@@ -424,10 +443,9 @@ public:
}
};
- auto get_param_tensors = [&](auto&& model, bool force_cpu = false) {
+ auto get_param_tensors = [&](auto&& model, bool do_mmap) {
std::map temp;
model->get_param_tensors(temp);
- bool do_mmap = enable_mmap_tensors && (main_backend_mmap || force_cpu);
for (const auto& [key, tensor] : temp) {
tensors[key] = tensor;
if (do_mmap) {
@@ -454,22 +472,20 @@ public:
LOG_INFO("Using circular padding for convolutions");
}
- bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;
-
const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(max_vram);
{
- clip_backend = backend;
- if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
- LOG_INFO("CLIP: Using CPU backend");
- clip_backend = ggml_backend_cpu_init();
+ if (!ensure_backend_pair(SDBackendModule::TE) ||
+ !ensure_backend_pair(SDBackendModule::DIFFUSION)) {
+ return false;
}
+
if (sd_version_is_sd3(version)) {
- cond_stage_model = std::make_shared(clip_backend,
- offload_params_to_cpu,
+ cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
+ params_backend_for(SDBackendModule::TE),
tensor_storage_map);
- diffusion_model = std::make_shared(backend,
- offload_params_to_cpu,
+ diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION),
+ params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map);
} else if (sd_version_is_flux(version)) {
bool is_chroma = false;
@@ -489,62 +505,62 @@ public:
"--chroma-disable-dit-mask as a workaround.");
}
- cond_stage_model = std::make_shared(clip_backend,
- offload_params_to_cpu,
+ cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
+ params_backend_for(SDBackendModule::TE),
tensor_storage_map,
sd_ctx_params->chroma_use_t5_mask,
sd_ctx_params->chroma_t5_mask_pad);
} else if (version == VERSION_OVIS_IMAGE) {
- cond_stage_model = std::make_shared(clip_backend,
- offload_params_to_cpu,
+ cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
+ params_backend_for(SDBackendModule::TE),
tensor_storage_map,
version,
"",
false);
} else {
- cond_stage_model = std::make_shared(clip_backend,
- offload_params_to_cpu,
+ cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
+ params_backend_for(SDBackendModule::TE),
tensor_storage_map);
}
- diffusion_model = std::make_shared(backend,
- offload_params_to_cpu,
+ diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION),
+ params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
version,
sd_ctx_params->chroma_use_dit_mask);
} else if (sd_version_is_flux2(version)) {
bool is_chroma = false;
- cond_stage_model = std::make_shared(clip_backend,
- offload_params_to_cpu,
+ cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
+ params_backend_for(SDBackendModule::TE),
tensor_storage_map,
version);
- diffusion_model = std::make_shared(backend,
- offload_params_to_cpu,
+ diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION),
+ params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
version,
sd_ctx_params->chroma_use_dit_mask);
} else if (sd_version_is_ltxav(version)) {
- cond_stage_model = std::make_shared(clip_backend,
- offload_params_to_cpu,
+ cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
+ params_backend_for(SDBackendModule::TE),
tensor_storage_map);
- diffusion_model = std::make_shared(backend,
- offload_params_to_cpu,
+ diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION),
+ params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model");
} else if (sd_version_is_wan(version)) {
- cond_stage_model = std::make_shared(clip_backend,
- offload_params_to_cpu,
+ cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
+ params_backend_for(SDBackendModule::TE),
tensor_storage_map,
true,
0,
true);
- diffusion_model = std::make_shared(backend,
- offload_params_to_cpu,
+ diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION),
+ params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model",
version);
if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
- high_noise_diffusion_model = std::make_shared(backend,
- offload_params_to_cpu,
+ high_noise_diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION),
+ params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.high_noise_diffusion_model",
version);
@@ -552,62 +568,65 @@ public:
if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" ||
diffusion_model->get_desc() == "Wan2.1-FLF2V-14B" ||
diffusion_model->get_desc() == "Wan2.1-I2V-1.3B") {
- clip_vision = std::make_shared(backend,
- offload_params_to_cpu,
+ if (!ensure_backend_pair(SDBackendModule::CLIP_VISION)) {
+ return false;
+ }
+ clip_vision = std::make_shared(backend_for(SDBackendModule::CLIP_VISION),
+ params_backend_for(SDBackendModule::CLIP_VISION),
tensor_storage_map);
clip_vision->set_max_graph_vram_bytes(max_graph_vram_bytes);
- get_param_tensors(clip_vision);
+ get_param_tensors(clip_vision, module_can_mmap(SDBackendModule::CLIP_VISION));
}
} else if (sd_version_is_qwen_image(version)) {
bool enable_vision = false;
if (!vae_decode_only) {
enable_vision = true;
}
- cond_stage_model = std::make_shared(clip_backend,
- offload_params_to_cpu,
+ cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
+ params_backend_for(SDBackendModule::TE),
tensor_storage_map,
version,
"",
enable_vision);
- diffusion_model = std::make_shared(backend,
- offload_params_to_cpu,
+ diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION),
+ params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model",
version,
sd_ctx_params->qwen_image_zero_cond_t);
} else if (version == VERSION_HIDREAM_O1) {
- cond_stage_model = std::make_shared(clip_backend,
- offload_params_to_cpu,
+ cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
+ params_backend_for(SDBackendModule::TE),
tensor_storage_map);
- diffusion_model = std::make_shared(backend,
- offload_params_to_cpu,
+ diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION),
+ params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model");
} else if (sd_version_is_anima(version)) {
- cond_stage_model = std::make_shared(clip_backend,
- offload_params_to_cpu,
+ cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
+ params_backend_for(SDBackendModule::TE),
tensor_storage_map);
- diffusion_model = std::make_shared(backend,
- offload_params_to_cpu,
+ diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION),
+ params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model");
} else if (sd_version_is_z_image(version)) {
- cond_stage_model = std::make_shared(clip_backend,
- offload_params_to_cpu,
+ cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
+ params_backend_for(SDBackendModule::TE),
tensor_storage_map,
version);
- diffusion_model = std::make_shared(backend,
- offload_params_to_cpu,
+ diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION),
+ params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model",
version);
} else if (sd_version_is_ernie_image(version)) {
- cond_stage_model = std::make_shared(clip_backend,
- offload_params_to_cpu,
+ cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
+ params_backend_for(SDBackendModule::TE),
tensor_storage_map,
version);
- diffusion_model = std::make_shared(backend,
- offload_params_to_cpu,
+ diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION),
+ params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model");
} else { // SD1.x SD2.x SDXL
@@ -616,21 +635,21 @@ public:
embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path));
}
if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
- cond_stage_model = std::make_shared(clip_backend,
- offload_params_to_cpu,
+ cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
+ params_backend_for(SDBackendModule::TE),
tensor_storage_map,
embbeding_map,
version,
PM_VERSION_2);
} else {
- cond_stage_model = std::make_shared(clip_backend,
- offload_params_to_cpu,
+ cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
+ params_backend_for(SDBackendModule::TE),
tensor_storage_map,
embbeding_map,
version);
}
- diffusion_model = std::make_shared(backend,
- offload_params_to_cpu,
+ diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION),
+ params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
version);
if (sd_ctx_params->diffusion_conv_direct) {
@@ -640,10 +659,10 @@ public:
}
cond_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
- get_param_tensors(cond_stage_model, clip_on_cpu);
+ get_param_tensors(cond_stage_model, module_can_mmap(SDBackendModule::TE));
diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
- get_param_tensors(diffusion_model);
+ get_param_tensors(diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION));
if (sd_version_is_unet_edit(version)) {
vae_decode_only = false;
@@ -651,30 +670,27 @@ public:
if (high_noise_diffusion_model) {
high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
- get_param_tensors(high_noise_diffusion_model);
+ get_param_tensors(high_noise_diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION));
}
- if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) {
- LOG_INFO("VAE Autoencoder: Using CPU backend");
- vae_backend = ggml_backend_cpu_init();
- } else {
- vae_backend = backend;
+ if (!ensure_backend_pair(SDBackendModule::VAE)) {
+ return false;
}
auto create_tae = [&]() -> std::shared_ptr {
if (sd_version_is_wan(version) ||
sd_version_is_qwen_image(version) ||
sd_version_is_anima(version)) {
- return std::make_shared(vae_backend,
- offload_params_to_cpu,
+ return std::make_shared(backend_for(SDBackendModule::VAE),
+ params_backend_for(SDBackendModule::VAE),
tensor_storage_map,
"decoder",
vae_decode_only,
version);
} else {
- auto model = std::make_shared(vae_backend,
- offload_params_to_cpu,
+ auto model = std::make_shared(backend_for(SDBackendModule::VAE),
+ params_backend_for(SDBackendModule::VAE),
tensor_storage_map,
"decoder.layers",
vae_decode_only,
@@ -685,8 +701,8 @@ public:
auto create_vae = [&]() -> std::shared_ptr {
if (sd_version_is_ltxav(version)) {
- return std::make_shared(vae_backend,
- offload_params_to_cpu,
+ return std::make_shared(backend_for(SDBackendModule::VAE),
+ params_backend_for(SDBackendModule::VAE),
tensor_storage_map,
"first_stage_model",
true,
@@ -694,15 +710,15 @@ public:
} else if (sd_version_is_wan(version) ||
sd_version_is_qwen_image(version) ||
sd_version_is_anima(version)) {
- return std::make_shared(vae_backend,
- offload_params_to_cpu,
+ return std::make_shared(backend_for(SDBackendModule::VAE),
+ params_backend_for(SDBackendModule::VAE),
tensor_storage_map,
"first_stage_model",
vae_decode_only,
version);
} else {
- auto model = std::make_shared(vae_backend,
- offload_params_to_cpu,
+ auto model = std::make_shared(backend_for(SDBackendModule::VAE),
+ params_backend_for(SDBackendModule::VAE),
tensor_storage_map,
"first_stage_model",
vae_decode_only,
@@ -721,43 +737,36 @@ public:
}
};
- bool force_vae_cpu = sd_ctx_params->keep_vae_on_cpu;
+ bool vae_mmap = module_can_mmap(SDBackendModule::VAE);
if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) {
LOG_INFO("using FakeVAE");
first_stage_model = std::make_shared(version,
- vae_backend,
- offload_params_to_cpu);
+ backend_for(SDBackendModule::VAE),
+ params_backend_for(SDBackendModule::VAE));
} else if (use_tae && !tae_preview_only) {
LOG_INFO("using TAE for encoding / decoding");
first_stage_model = create_tae();
first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
- get_param_tensors_p(first_stage_model, force_vae_cpu, "tae");
+ get_param_tensors_p(first_stage_model, vae_mmap, "tae");
} else {
LOG_INFO("using VAE for encoding / decoding");
first_stage_model = create_vae();
first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
- get_param_tensors_p(first_stage_model, force_vae_cpu, "first_stage_model");
+ get_param_tensors_p(first_stage_model, vae_mmap, "first_stage_model");
if (use_tae && tae_preview_only) {
LOG_INFO("using TAE for preview");
preview_vae = create_tae();
preview_vae->set_max_graph_vram_bytes(max_graph_vram_bytes);
- get_param_tensors_p(first_stage_model, force_vae_cpu, "vae");
+ get_param_tensors_p(first_stage_model, vae_mmap, "vae");
}
}
if (use_audio_vae) {
- if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) {
- LOG_INFO("LTX audio VAE: Using CPU backend");
- audio_backend = ggml_backend_cpu_init();
- } else {
- audio_backend = backend;
- }
- audio_vae_model = std::make_shared(audio_backend,
- false,
+ audio_vae_model = std::make_shared(backend_for(SDBackendModule::VAE),
+ params_backend_for(SDBackendModule::VAE),
tensor_storage_map);
- audio_vae_model->alloc_params_buffer();
- audio_vae_model->get_param_tensors(tensors, "");
+ get_param_tensors_p(audio_vae_model, vae_mmap, "");
}
if (sd_ctx_params->vae_conv_direct) {
@@ -769,15 +778,11 @@ public:
}
if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) {
- ggml_backend_t controlnet_backend = nullptr;
- if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) {
- LOG_DEBUG("ControlNet: Using CPU backend");
- controlnet_backend = ggml_backend_cpu_init();
- } else {
- controlnet_backend = backend;
+ if (!ensure_backend_pair(SDBackendModule::CONTROL_NET)) {
+ return false;
}
- control_net = std::make_shared(controlnet_backend,
- offload_params_to_cpu,
+ control_net = std::make_shared(backend_for(SDBackendModule::CONTROL_NET),
+ params_backend_for(SDBackendModule::CONTROL_NET),
tensor_storage_map,
version);
if (sd_ctx_params->diffusion_conv_direct) {
@@ -786,23 +791,31 @@ public:
}
}
- if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
- pmid_model = std::make_shared(backend,
- offload_params_to_cpu,
- tensor_storage_map,
- "pmid",
- version,
- PM_VERSION_2);
- LOG_INFO("using PhotoMaker Version 2");
- } else {
- pmid_model = std::make_shared(backend,
- offload_params_to_cpu,
- tensor_storage_map,
- "pmid",
- version);
- }
if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) {
- pmid_lora = std::make_shared("pmid", backend, sd_ctx_params->photo_maker_path, "", version);
+ if (!ensure_backend_pair(SDBackendModule::PHOTOMAKER)) {
+ return false;
+ }
+ if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
+ pmid_model = std::make_shared(backend_for(SDBackendModule::PHOTOMAKER),
+ params_backend_for(SDBackendModule::PHOTOMAKER),
+ tensor_storage_map,
+ "pmid",
+ version,
+ PM_VERSION_2);
+ LOG_INFO("using PhotoMaker Version 2");
+ } else {
+ pmid_model = std::make_shared(backend_for(SDBackendModule::PHOTOMAKER),
+ params_backend_for(SDBackendModule::PHOTOMAKER),
+ tensor_storage_map,
+ "pmid",
+ version);
+ }
+ pmid_lora = std::make_shared("pmid",
+ backend_for(SDBackendModule::PHOTOMAKER),
+ params_backend_for(SDBackendModule::PHOTOMAKER),
+ sd_ctx_params->photo_maker_path,
+ "",
+ version);
auto lora_tensor_filter = [&](const std::string& tensor_name) {
if (starts_with(tensor_name, "lora.model")) {
return true;
@@ -821,7 +834,7 @@ public:
}
}
if (use_pmid) {
- get_param_tensors_p(pmid_model, false, "pmid");
+ get_param_tensors_p(pmid_model, module_can_mmap(SDBackendModule::PHOTOMAKER), "pmid");
}
if (sd_ctx_params->flash_attn) {
@@ -917,8 +930,10 @@ public:
}
}
- if (clip_vision) {
- clip_vision->alloc_params_buffer();
+ if (clip_vision && !clip_vision->alloc_params_buffer()) {
+ LOG_ERROR("CLIP vision params buffer allocation failed");
+ ggml_free(ctx);
+ return false;
}
if (cond_stage_model) {
cond_stage_model->alloc_params_buffer();
@@ -929,18 +944,25 @@ public:
if (high_noise_diffusion_model) {
high_noise_diffusion_model->alloc_params_buffer();
}
- if (first_stage_model) {
- first_stage_model->alloc_params_buffer();
+ if (first_stage_model && !first_stage_model->alloc_params_buffer()) {
+ LOG_ERROR("VAE params buffer allocation failed");
+ ggml_free(ctx);
+ return false;
}
- if (preview_vae) {
- preview_vae->alloc_params_buffer();
+ if (preview_vae && !preview_vae->alloc_params_buffer()) {
+ LOG_ERROR("preview VAE params buffer allocation failed");
+ ggml_free(ctx);
+ return false;
}
- if (use_pmid && pmid_model) {
- if (!pmid_model->alloc_params_buffer()) {
- LOG_ERROR(" pmid model params buffer allocation failed");
- ggml_free(ctx);
- return false;
- }
+ if (audio_vae_model && !audio_vae_model->alloc_params_buffer()) {
+ LOG_ERROR("LTX audio VAE params buffer allocation failed");
+ ggml_free(ctx);
+ return false;
+ }
+ if (use_pmid && pmid_model && !pmid_model->alloc_params_buffer()) {
+ LOG_ERROR("PhotoMaker params buffer allocation failed");
+ ggml_free(ctx);
+ return false;
}
bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap);
@@ -969,6 +991,7 @@ public:
size_t control_net_params_mem_size = 0;
if (control_net) {
if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path), n_threads)) {
+ ggml_free(ctx);
return false;
}
control_net_params_mem_size = control_net->get_params_buffer_size();
@@ -980,28 +1003,39 @@ public:
size_t total_params_ram_size = 0;
size_t total_params_vram_size = 0;
- if (ggml_backend_is_cpu(clip_backend)) {
- total_params_ram_size += clip_params_mem_size + pmid_params_mem_size;
- } else {
- total_params_vram_size += clip_params_mem_size + pmid_params_mem_size;
- }
+ auto add_params_memory = [&](size_t size, SDBackendModule module) {
+ if (size == 0) {
+ return true;
+ }
+ ggml_backend_t module_backend = params_backend_for(module);
+ if (module_backend == nullptr) {
+ return false;
+ }
+ if (ggml_backend_is_cpu(module_backend)) {
+ total_params_ram_size += size;
+ } else {
+ total_params_vram_size += size;
+ }
+ return true;
+ };
+ auto params_memory_location = [&](size_t size, SDBackendModule module) {
+ if (size == 0) {
+ return "N/A";
+ }
+ ggml_backend_t module_backend = params_backend_for(module);
+ if (module_backend == nullptr) {
+ return "N/A";
+ }
+ return ggml_backend_is_cpu(module_backend) ? "RAM" : "VRAM";
+ };
- if (ggml_backend_is_cpu(backend)) {
- total_params_ram_size += unet_params_mem_size;
- } else {
- total_params_vram_size += unet_params_mem_size;
- }
-
- if (ggml_backend_is_cpu(vae_backend)) {
- total_params_ram_size += vae_params_mem_size;
- } else {
- total_params_vram_size += vae_params_mem_size;
- }
-
- if (ggml_backend_is_cpu(control_net_backend)) {
- total_params_ram_size += control_net_params_mem_size;
- } else {
- total_params_vram_size += control_net_params_mem_size;
+ if (!add_params_memory(clip_params_mem_size, SDBackendModule::TE) ||
+ !add_params_memory(pmid_params_mem_size, SDBackendModule::PHOTOMAKER) ||
+ !add_params_memory(unet_params_mem_size, SDBackendModule::DIFFUSION) ||
+ !add_params_memory(vae_params_mem_size, SDBackendModule::VAE) ||
+ !add_params_memory(control_net_params_mem_size, SDBackendModule::CONTROL_NET)) {
+ ggml_free(ctx);
+ return false;
}
size_t total_params_size = total_params_ram_size + total_params_vram_size;
@@ -1012,15 +1046,15 @@ public:
total_params_vram_size / 1024.0 / 1024.0,
total_params_ram_size / 1024.0 / 1024.0,
clip_params_mem_size / 1024.0 / 1024.0,
- ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM",
+ params_memory_location(clip_params_mem_size, SDBackendModule::TE),
unet_params_mem_size / 1024.0 / 1024.0,
- ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
+ params_memory_location(unet_params_mem_size, SDBackendModule::DIFFUSION),
vae_params_mem_size / 1024.0 / 1024.0,
- ggml_backend_is_cpu(vae_backend) ? "RAM" : "VRAM",
+ params_memory_location(vae_params_mem_size, SDBackendModule::VAE),
control_net_params_mem_size / 1024.0 / 1024.0,
- ggml_backend_is_cpu(control_net_backend) ? "RAM" : "VRAM",
+ params_memory_location(control_net_params_mem_size, SDBackendModule::CONTROL_NET),
pmid_params_mem_size / 1024.0 / 1024.0,
- ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM");
+ params_memory_location(pmid_params_mem_size, SDBackendModule::PHOTOMAKER));
}
// init denoiser
@@ -1163,7 +1197,7 @@ public:
std::shared_ptr load_lora_model_from_file(const std::string& lora_id,
float multiplier,
- ggml_backend_t backend,
+ SDBackendModule module,
LoraModel::filter_t lora_tensor_filter = nullptr) {
std::string lora_path = lora_id;
static std::string high_noise_tag = "|high_noise|";
@@ -1173,7 +1207,15 @@ public:
is_high_noise = true;
LOG_DEBUG("high noise lora: %s", lora_path.c_str());
}
- auto lora = std::make_shared(lora_id, backend, lora_path, is_high_noise ? "model.high_noise_" : "", version);
+ if (!ensure_backend_pair(module)) {
+ return nullptr;
+ }
+ auto lora = std::make_shared(lora_id,
+ backend_for(module),
+ params_backend_for(module),
+ lora_path,
+ is_high_noise ? "model.high_noise_" : "",
+ version);
if (!lora->load_from_file(n_threads, lora_tensor_filter)) {
LOG_WARN("load lora tensors from %s failed", lora_path.c_str());
return nullptr;
@@ -1212,7 +1254,7 @@ public:
for (auto& kv : lora_state_diff) {
int64_t t0 = ggml_time_ms();
- auto lora = load_lora_model_from_file(kv.first, kv.second, backend);
+ auto lora = load_lora_model_from_file(kv.first, kv.second, SDBackendModule::DIFFUSION);
if (!lora || lora->lora_tensors.empty()) {
continue;
}
@@ -1270,7 +1312,7 @@ public:
const std::string& lora_id = kv.first;
float multiplier = kv.second;
- auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backend, lora_tensor_filter);
+ auto lora = load_lora_model_from_file(lora_id, multiplier, SDBackendModule::TE, lora_tensor_filter);
if (lora && !lora->lora_tensors.empty()) {
lora->preprocess_lora_tensors(tensors);
cond_stage_lora_models.push_back(lora);
@@ -1307,7 +1349,7 @@ public:
const std::string& lora_name = kv.first;
float multiplier = kv.second;
- auto lora = load_lora_model_from_file(lora_name, multiplier, backend, lora_tensor_filter);
+ auto lora = load_lora_model_from_file(lora_name, multiplier, SDBackendModule::DIFFUSION, lora_tensor_filter);
if (lora && !lora->lora_tensors.empty()) {
lora->preprocess_lora_tensors(tensors);
diffusion_lora_models.push_back(lora);
@@ -1345,7 +1387,7 @@ public:
const std::string& lora_name = kv.first;
float multiplier = kv.second;
- auto lora = load_lora_model_from_file(lora_name, multiplier, vae_backend, lora_tensor_filter);
+ auto lora = load_lora_model_from_file(lora_name, multiplier, SDBackendModule::VAE, lora_tensor_filter);
if (lora && !lora->lora_tensors.empty()) {
lora->preprocess_lora_tensors(tensors);
first_stage_lora_models.push_back(lora);
@@ -2387,15 +2429,17 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
sd_ctx_params->chroma_use_dit_mask = true;
sd_ctx_params->chroma_use_t5_mask = false;
sd_ctx_params->chroma_t5_mask_pad = 1;
+ sd_ctx_params->backend = nullptr;
+ sd_ctx_params->params_backend = nullptr;
}
char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
- char* buf = (char*)malloc(4096);
+ char* buf = (char*)malloc(8192);
if (!buf)
return nullptr;
buf[0] = '\0';
- snprintf(buf + strlen(buf), 4096 - strlen(buf),
+ snprintf(buf + strlen(buf), 8192 - strlen(buf),
"model_path: %s\n"
"clip_l_path: %s\n"
"clip_g_path: %s\n"
@@ -2421,6 +2465,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
"prediction: %s\n"
"offload_params_to_cpu: %s\n"
"max_vram: %.3f\n"
+ "backend: %s\n"
+ "params_backend: %s\n"
"keep_clip_on_cpu: %s\n"
"keep_control_net_on_cpu: %s\n"
"keep_vae_on_cpu: %s\n"
@@ -2456,6 +2502,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
sd_prediction_name(sd_ctx_params->prediction),
BOOL_STR(sd_ctx_params->offload_params_to_cpu),
sd_ctx_params->max_vram,
+ SAFE_STR(sd_ctx_params->backend),
+ SAFE_STR(sd_ctx_params->params_backend),
BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
BOOL_STR(sd_ctx_params->keep_vae_on_cpu),
@@ -3822,7 +3870,9 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
LOG_INFO("hires fix: loading model upscaler from '%s'", request.hires.model_path);
hires_upscaler = std::make_unique(sd_ctx->sd->n_threads,
false,
- request.hires.upscale_tile_size);
+ request.hires.upscale_tile_size,
+ sd_ctx->sd->backend_spec,
+ sd_ctx->sd->params_backend_spec);
const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram);
hires_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
if (!hires_upscaler->load_from_file(request.hires.model_path,
diff --git a/src/t5.hpp b/src/t5.hpp
index 71545e52..01c35d7d 100644
--- a/src/t5.hpp
+++ b/src/t5.hpp
@@ -321,11 +321,11 @@ struct T5Runner : public GGMLRunner {
std::vector relative_position_bucket_vec;
T5Runner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
bool is_umt5 = false)
- : GGMLRunner(backend, offload_params_to_cpu) {
+ : GGMLRunner(backend, params_backend) {
if (is_umt5) {
params.vocab_size = 256384;
params.relative_attention = false;
@@ -464,11 +464,11 @@ struct T5Embedder {
T5Runner model;
T5Embedder(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "",
bool is_umt5 = false)
- : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) {
+ : model(backend, params_backend, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) {
}
void get_param_tensors(std::map& tensors, const std::string prefix) {
@@ -576,7 +576,7 @@ struct T5Embedder {
}
}
- std::shared_ptr t5 = std::make_shared(backend, false, tensor_storage_map, "", true);
+ std::shared_ptr t5 = std::make_shared(backend, backend, tensor_storage_map, "", true);
t5->alloc_params_buffer();
std::map tensors;
diff --git a/src/tae.hpp b/src/tae.hpp
index 41b53515..823cff2d 100644
--- a/src/tae.hpp
+++ b/src/tae.hpp
@@ -541,14 +541,14 @@ struct TinyImageAutoEncoder : public VAE {
bool decode_only = false;
TinyImageAutoEncoder(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
bool decoder_only = true,
SDVersion version = VERSION_SD1)
: decode_only(decoder_only),
taesd(decoder_only, version),
- VAE(version, backend, offload_params_to_cpu) {
+ VAE(version, backend, params_backend) {
scale_input = false;
taesd.init(params_ctx, tensor_storage_map, prefix);
}
@@ -603,14 +603,14 @@ struct TinyVideoAutoEncoder : public VAE {
bool decode_only = false;
TinyVideoAutoEncoder(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
bool decoder_only = true,
SDVersion version = VERSION_WAN2)
: decode_only(decoder_only),
taehv(decoder_only, version),
- VAE(version, backend, offload_params_to_cpu) {
+ VAE(version, backend, params_backend) {
scale_input = false;
taehv.init(params_ctx, tensor_storage_map, prefix);
}
diff --git a/src/unet.hpp b/src/unet.hpp
index d7ea8c3f..a67fe076 100644
--- a/src/unet.hpp
+++ b/src/unet.hpp
@@ -603,11 +603,11 @@ struct UNetModelRunner : public GGMLRunner {
UnetModelBlock unet;
UNetModelRunner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
SDVersion version = VERSION_SD1)
- : GGMLRunner(backend, offload_params_to_cpu), unet(version, tensor_storage_map) {
+ : GGMLRunner(backend, params_backend), unet(version, tensor_storage_map) {
unet.init(params_ctx, tensor_storage_map, prefix);
}
diff --git a/src/upscaler.cpp b/src/upscaler.cpp
index 25fc0c5d..1197ce35 100644
--- a/src/upscaler.cpp
+++ b/src/upscaler.cpp
@@ -4,12 +4,18 @@
#include "stable-diffusion.h"
#include "util.h"
+#include
+
UpscalerGGML::UpscalerGGML(int n_threads,
bool direct,
- int tile_size)
+ int tile_size,
+ std::string backend_spec,
+ std::string params_backend_spec)
: n_threads(n_threads),
direct(direct),
- tile_size(tile_size) {
+ tile_size(tile_size),
+ backend_spec(std::move(backend_spec)),
+ params_backend_spec(std::move(params_backend_spec)) {
}
void UpscalerGGML::set_max_graph_vram_bytes(size_t max_vram_bytes) {
@@ -24,19 +30,51 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path,
int n_threads) {
ggml_log_set(ggml_log_callback_default, nullptr);
- backend = sd_get_default_backend();
+ std::string error;
+ if (!backend_manager.init(backend_spec.c_str(),
+ params_backend_spec.c_str(),
+ offload_params_to_cpu,
+ false,
+ false,
+ false,
+ &error)) {
+ LOG_ERROR("upscaler backend config failed: %s", error.c_str());
+ return false;
+ }
+ auto backend_for = [&](SDBackendModule module) {
+ ggml_backend_t module_backend = backend_manager.runtime_backend(module);
+ if (module_backend == nullptr) {
+ LOG_ERROR("failed to initialize %s backend", sd_backend_module_name(module));
+ }
+ return module_backend;
+ };
+ auto params_backend_for = [&](SDBackendModule module) {
+ ggml_backend_t module_backend = backend_manager.params_backend(module);
+ if (module_backend == nullptr) {
+ LOG_ERROR("failed to initialize %s params backend", sd_backend_module_name(module));
+ }
+ return module_backend;
+ };
+ auto ensure_backend_pair = [&](SDBackendModule module) {
+ if (backend_for(module) == nullptr) {
+ return false;
+ }
+ return params_backend_for(module) != nullptr;
+ };
+ if (!ensure_backend_pair(SDBackendModule::UPSCALER)) {
+ return false;
+ }
ModelLoader model_loader;
if (!model_loader.init_from_file_and_convert_name(esrgan_path)) {
LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str());
}
model_loader.set_wtype_override(model_data_type);
- if (!backend) {
- LOG_DEBUG("Using CPU backend");
- backend = ggml_backend_cpu_init();
- }
LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
- esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map());
+ esrgan_upscaler = std::make_shared(backend_for(SDBackendModule::UPSCALER),
+ params_backend_for(SDBackendModule::UPSCALER),
+ tile_size,
+ model_loader.get_tensor_storage_map());
esrgan_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
if (direct) {
esrgan_upscaler->set_conv2d_direct_enabled(true);
@@ -110,14 +148,16 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
bool offload_params_to_cpu,
bool direct,
int n_threads,
- int tile_size) {
+ int tile_size,
+ const char* backend,
+ const char* params_backend) {
upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
if (upscaler_ctx == nullptr) {
return nullptr;
}
std::string esrgan_path(esrgan_path_c_str);
- upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size);
+ upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size, SAFE_STR(backend), SAFE_STR(params_backend));
if (upscaler_ctx->upscaler == nullptr) {
return nullptr;
}
diff --git a/src/upscaler.h b/src/upscaler.h
index d667a6f1..e3967865 100644
--- a/src/upscaler.h
+++ b/src/upscaler.h
@@ -2,6 +2,7 @@
#define __SD_UPSCALER_H__
#include "esrgan.hpp"
+#include "ggml_extend_backend.h"
#include "stable-diffusion.h"
#include "tensor.hpp"
@@ -9,7 +10,7 @@
#include
struct UpscalerGGML {
- ggml_backend_t backend = nullptr; // general backend
+ SDBackendManager backend_manager;
ggml_type model_data_type = GGML_TYPE_F16;
std::shared_ptr esrgan_upscaler;
std::string esrgan_path;
@@ -17,10 +18,14 @@ struct UpscalerGGML {
bool direct = false;
int tile_size = 128;
size_t max_graph_vram_bytes = 0;
+ std::string backend_spec;
+ std::string params_backend_spec;
UpscalerGGML(int n_threads,
- bool direct = false,
- int tile_size = 128);
+ bool direct = false,
+ int tile_size = 128,
+ std::string backend_spec = "",
+ std::string params_backend_spec = "");
bool load_from_file(const std::string& esrgan_path,
bool offload_params_to_cpu,
diff --git a/src/util.cpp b/src/util.cpp
index 586284c8..1c2e5e89 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -25,7 +25,7 @@
#include "ggml-backend.h"
#include "ggml.h"
-#include "ggml_extend_backend.hpp"
+#include "ggml_extend_backend.h"
#include "stable-diffusion.h"
bool ends_with(const std::string& str, const std::string& ending) {
@@ -758,76 +758,6 @@ std::vector> parse_prompt_attention(const std::str
return res;
}
-// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc.
-bool sd_backend_is(ggml_backend_t backend, const std::string& name) {
- if (!backend) {
- return false;
- }
- ggml_backend_dev_t dev = ggml_backend_get_device(backend);
- if (!dev)
- return false;
- std::string dev_name = ggml_backend_dev_name(dev);
- return dev_name.find(name) != std::string::npos;
-}
-
-ggml_backend_t sd_get_default_backend() {
- ggml_backend_load_all_once();
- static std::once_flag once;
- std::call_once(once, []() {
- size_t dev_count = ggml_backend_dev_count();
- if (dev_count == 0) {
- LOG_ERROR("No devices found!");
- } else {
- LOG_DEBUG("Found %zu backend devices:", dev_count);
- for (size_t i = 0; i < dev_count; ++i) {
- auto dev = ggml_backend_dev_get(i);
- LOG_DEBUG("#%zu: %s", i, ggml_backend_dev_name(dev));
- }
- }
- });
- ggml_backend_t backend = nullptr;
- const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE");
- if (SD_VK_DEVICE != nullptr) {
- std::string sd_vk_device_str = SD_VK_DEVICE;
- try {
- unsigned long long device = std::stoull(sd_vk_device_str);
- std::string vk_device_name = "Vulkan" + std::to_string(device);
- if (backend_name_exists(vk_device_name)) {
- LOG_INFO("Selecting %s as main device by env var SD_VK_DEVICE", vk_device_name.c_str());
- backend = init_named_backend(vk_device_name);
- if (!backend) {
- LOG_WARN("Device %s requested by SD_VK_DEVICE failed to init. Falling back to the default device.", vk_device_name.c_str());
- }
- } else {
- LOG_WARN("Device %s requested by SD_VK_DEVICE was not found. Falling back to the default device.", vk_device_name.c_str());
- }
- } catch (const std::invalid_argument&) {
- LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to the default device.", SD_VK_DEVICE);
- } catch (const std::out_of_range&) {
- LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to the default device.", SD_VK_DEVICE);
- }
- }
-
- if (!backend) {
- std::string dev_name = get_default_backend_name();
- backend = init_named_backend(dev_name);
- if (!backend && !dev_name.empty()) {
- LOG_WARN("device %s failed to init", dev_name.c_str());
- }
- }
-
- if (!backend) {
- LOG_WARN("loading CPU backend");
- backend = ggml_backend_cpu_init();
- }
-
- if (ggml_backend_is_cpu(backend)) {
- LOG_DEBUG("Using CPU backend");
- }
-
- return backend;
-}
-
// namespace is needed to avoid conflicts with ggml_backend_extend.hpp
namespace ggml_cpu {
#include "ggml-cpu.h"
diff --git a/src/util.h b/src/util.h
index 628a1f9d..9843ae18 100644
--- a/src/util.h
+++ b/src/util.h
@@ -86,7 +86,6 @@ bool sd_should_preview_noisy();
// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc.
bool sd_backend_is(ggml_backend_t backend, const std::string& name);
-ggml_backend_t sd_get_default_backend();
#define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
diff --git a/src/vae.hpp b/src/vae.hpp
index 4e6fa714..d7e0fdee 100644
--- a/src/vae.hpp
+++ b/src/vae.hpp
@@ -62,8 +62,8 @@ protected:
}
public:
- VAE(SDVersion version, ggml_backend_t backend, bool offload_params_to_cpu)
- : version(version), GGMLRunner(backend, offload_params_to_cpu) {}
+ VAE(SDVersion version, ggml_backend_t backend, ggml_backend_t params_backend)
+ : version(version), GGMLRunner(backend, params_backend) {}
int get_scale_factor() {
int scale_factor = 8;
@@ -219,8 +219,8 @@ public:
};
struct FakeVAE : public VAE {
- FakeVAE(SDVersion version, ggml_backend_t backend, bool offload_params_to_cpu)
- : VAE(version, backend, offload_params_to_cpu) {}
+ FakeVAE(SDVersion version, ggml_backend_t backend, ggml_backend_t params_backend)
+ : VAE(version, backend, params_backend) {}
int get_encoder_output_channels(int input_channels) {
return input_channels;
diff --git a/src/wan.hpp b/src/wan.hpp
index 64890c04..ddd9680e 100644
--- a/src/wan.hpp
+++ b/src/wan.hpp
@@ -1126,12 +1126,12 @@ namespace WAN {
WanVAE ae;
WanVAERunner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "",
bool decode_only = false,
SDVersion version = VERSION_WAN2)
- : decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V), VAE(version, backend, offload_params_to_cpu) {
+ : decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V), VAE(version, backend, params_backend) {
ae.init(params_ctx, tensor_storage_map, prefix);
}
@@ -1329,7 +1329,7 @@ namespace WAN {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_F16;
- std::shared_ptr vae = std::make_shared(backend, false, String2TensorStorage{}, "", false, VERSION_WAN2_2_TI2V);
+ std::shared_ptr vae = std::make_shared(backend, backend, String2TensorStorage{}, "", false, VERSION_WAN2_2_TI2V);
{
LOG_INFO("loading from '%s'", file_path.c_str());
@@ -2094,11 +2094,11 @@ namespace WAN {
SDVersion version;
WanRunner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "",
SDVersion version = VERSION_WAN2)
- : GGMLRunner(backend, offload_params_to_cpu) {
+ : GGMLRunner(backend, params_backend) {
wan_params.num_layers = 0;
for (auto pair : tensor_storage_map) {
std::string tensor_name = pair.first;
@@ -2346,7 +2346,7 @@ namespace WAN {
}
std::shared_ptr wan = std::make_shared(backend,
- false,
+ backend,
tensor_storage_map,
"model.diffusion_model",
VERSION_WAN2_2_TI2V);
diff --git a/src/z_image.hpp b/src/z_image.hpp
index 00b69c26..c0546931 100644
--- a/src/z_image.hpp
+++ b/src/z_image.hpp
@@ -473,11 +473,11 @@ namespace ZImage {
SDVersion version;
ZImageRunner(ggml_backend_t backend,
- bool offload_params_to_cpu,
+ ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "",
SDVersion version = VERSION_Z_IMAGE)
- : GGMLRunner(backend, offload_params_to_cpu) {
+ : GGMLRunner(backend, params_backend) {
z_image = ZImageModel(z_image_params);
z_image.init(params_ctx, tensor_storage_map, prefix);
}
@@ -620,7 +620,7 @@ namespace ZImage {
}
std::shared_ptr z_image = std::make_shared(backend,
- false,
+ backend,
tensor_storage_map,
"model.diffusion_model",
VERSION_QWEN_IMAGE);