diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 749366e7..a4f0365d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -444,12 +444,98 @@ jobs: path: | sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip + windows-latest-rocm: + runs-on: windows-2022 + + env: + ROCM_VERSION: "7.12.0" + GPU_TARGETS: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201" + + steps: + - uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Cache ROCm Installation + id: cache-rocm + uses: actions/cache@v4 + with: + path: C:\TheRock\build + key: rocm-${{ env.ROCM_VERSION }}-gfx1151-${{ runner.os }} + + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: windows-latest-rocm-${{ env.ROCM_VERSION }}-x64 + evict-old-files: 1d + + - name: Install ROCm + if: steps.cache-rocm.outputs.cache-hit != 'true' + run: | + $ErrorActionPreference = "Stop" + write-host "Downloading AMD ROCm ${{ env.ROCM_VERSION }} tarball" + Invoke-WebRequest -Uri "https://repo.amd.com/rocm/tarball/therock-dist-windows-gfx1151-${{ env.ROCM_VERSION }}.tar.gz" -OutFile "${env:RUNNER_TEMP}\rocm.tar.gz" + write-host "Extracting ROCm tarball" + mkdir C:\TheRock\build -Force + tar -xzf "${env:RUNNER_TEMP}\rocm.tar.gz" -C C:\TheRock\build --strip-components=1 + write-host "Completed ROCm extraction" + + - name: Setup ROCm Environment + run: | + $rocmPath = "C:\TheRock\build" + echo "HIP_PATH=$rocmPath" >> $env:GITHUB_ENV + echo "HIP_DEVICE_LIB_PATH=$rocmPath\lib\llvm\amdgcn\bitcode" >> $env:GITHUB_ENV + echo "HIP_PLATFORM=amd" >> $env:GITHUB_ENV + echo "LLVM_PATH=$rocmPath\lib\llvm" >> $env:GITHUB_ENV + echo "$rocmPath\bin" >> $env:GITHUB_PATH + echo "$rocmPath\lib\llvm\bin" >> $env:GITHUB_PATH + + - name: Build + run: | + mkdir build + cd build + cmake .. ` + -G "Unix Makefiles" ` + -DCMAKE_PREFIX_PATH="${env:HIP_PATH}" ` + -DSD_HIPBLAS=ON ` + -DSD_BUILD_SHARED_LIBS=ON ` + -DGGML_NATIVE=OFF ` + -DCMAKE_C_COMPILER="${env:HIP_PATH}\lib\llvm\bin\clang.exe" ` + -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\lib\llvm\bin\clang++.exe" ` + -DCMAKE_HIP_COMPILER="${env:HIP_PATH}\lib\llvm\bin\clang.exe" ` + -DHIP_PATH="${env:HIP_PATH}" ` + -DCMAKE_BUILD_TYPE=Release ` + -DGPU_TARGETS="${{ env.GPU_TARGETS }}" + cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS} + + - name: Get commit hash + id: commit + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: pr-mpt/actions-commit-hash@v2 + + - name: Pack artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\libhipblaslt.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" + 7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip .\build\bin\* + + - name: Upload artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-artifact@v4 + with: + name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip + path: | + sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip + windows-latest-cmake-hip: runs-on: windows-2022 env: - HIPSDK_INSTALLER_VERSION: "25.Q3" - GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" + HIPSDK_INSTALLER_VERSION: "26.Q1" + ROCM_VERSION: "7.1.1" + GPU_TARGETS: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" steps: - uses: actions/checkout@v3 @@ -484,7 +570,7 @@ jobs: run: | $ErrorActionPreference = "Stop" write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru $completed = $proc.WaitForExit(600000) @@ -537,32 +623,38 @@ jobs: run: | md "build\bin\rocblas\library\" md "build\bin\hipblaslt\library" - cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\libhipblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\libhipblaslt.dll" "build\bin\" cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\" - 7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\* + 7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip .\build\bin\* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip + name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip path: | - sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip + sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip ubuntu-latest-rocm: - runs-on: ubuntu-latest - container: rocm/dev-ubuntu-24.04:7.2 + runs-on: ubuntu-24.04 env: - ROCM_VERSION: "7.2" UBUNTU_VERSION: "24.04" - GPU_TARGETS: "gfx1151;gfx1150;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" + + strategy: + matrix: + include: + - ROCM_VERSION: "7.2.1" + gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1031;gfx1032;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201" + build: 'x64' + - ROCM_VERSION: "7.12.0" + gpu_targets: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201" + build: x64 steps: - - run: apt-get update && apt-get install -y git - name: Clone id: checkout uses: actions/checkout@v6 @@ -579,6 +671,38 @@ jobs: with: version: 10.15.1 + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: ubuntu-rocm-cmake-${{ matrix.ROCM_VERSION }}-${{ matrix.build }} + evict-old-files: 1d + + - name: Dependencies + id: depends + run: | + sudo apt install -y build-essential cmake wget zip ninja-build + + - name: Setup Legacy ROCm + if: matrix.ROCM_VERSION == '7.2.1' + id: legacy_env + run: | + sudo mkdir --parents --mode=0755 /etc/apt/keyrings + wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \ + gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null + + sudo tee /etc/apt/sources.list.d/rocm.list << EOF + deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${{ matrix.ROCM_VERSION }} noble main + EOF + + sudo tee /etc/apt/preferences.d/rocm-pin-600 << EOF + Package: * + Pin: release o=repo.radeon.com + Pin-Priority: 600 + EOF + + sudo apt update + sudo apt-get install -y libssl-dev rocm-hip-sdk + - name: Free disk space run: | # Remove preinstalled SDKs and caches not needed for this job @@ -592,51 +716,17 @@ jobs: sudo rm -rf /var/lib/apt/lists/* || true sudo apt clean - - name: Dependencies - id: depends + - name: Setup TheRock + if: matrix.ROCM_VERSION != '7.2.1' + id: therock_env run: | - sudo apt-get update - sudo apt install -y \ - cmake \ - hip-dev \ - hipblas-dev \ - ninja-build \ - rocm-dev \ - zip - # Clean apt caches to recover disk space - sudo apt clean - sudo rm -rf /var/lib/apt/lists/* || true - - - name: Setup ROCm Environment - run: | - # Add ROCm to PATH for current session - echo "/opt/rocm/bin" >> $GITHUB_PATH - - # Build regex pattern from ${{ env.GPU_TARGETS }} (match target as substring) - TARGET_REGEX="($(printf '%s' "${{ env.GPU_TARGETS }}" | sed 's/;/|/g'))" - - # Remove library files for architectures we're not building for to save disk space - echo "Cleaning up unneeded architecture files..." - cd /opt/rocm/lib/rocblas/library - # Keep only our target architectures - for file in *; do - if printf '%s' "$file" | grep -q 'gfx'; then - if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then - echo "Removing $file" && - sudo rm -f "$file"; - fi - fi - done - - cd /opt/rocm/lib/hipblaslt/library - for file in *; do - if printf '%s' "$file" | grep -q 'gfx'; then - if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then - echo "Removing $file" && - sudo rm -f "$file"; - fi - fi - done + wget https://repo.amd.com/rocm/tarball/therock-dist-linux-gfx1151-${{ matrix.ROCM_VERSION }}.tar.gz + mkdir install + tar -xf *.tar.gz -C install + export ROCM_PATH=$(pwd)/install + echo ROCM_PATH=$ROCM_PATH >> $GITHUB_ENV + echo PATH=$PATH:$ROCM_PATH/bin >> $GITHUB_ENV + echo LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/llvm/lib:$ROCM_PATH/lib/rocprofiler-systems >> $GITHUB_ENV - name: Build id: cmake_build @@ -644,12 +734,12 @@ jobs: mkdir build cd build cmake .. -G Ninja \ - -DCMAKE_CXX_COMPILER=amdclang++ \ - -DCMAKE_C_COMPILER=amdclang \ + -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ + -DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \ -DCMAKE_BUILD_TYPE=Release \ -DSD_HIPBLAS=ON \ - -DGPU_TARGETS="${{ env.GPU_TARGETS }}" \ - -DAMDGPU_TARGETS="${{ env.GPU_TARGETS }}" \ + -DHIP_PLATFORM=amd \ + -DGPU_TARGETS="${{ matrix.gpu_targets }}" \ -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ -DSD_BUILD_SHARED_LIBS=ON @@ -668,16 +758,6 @@ jobs: cp ggml/LICENSE ./build/bin/ggml.txt cp LICENSE ./build/bin/stable-diffusion.cpp.txt - # Move ROCm runtime libraries (to avoid double space consumption) - sudo mv /opt/rocm/lib/librocsparse.so* ./build/bin/ - sudo mv /opt/rocm/lib/libhsa-runtime64.so* ./build/bin/ - sudo mv /opt/rocm/lib/libamdhip64.so* ./build/bin/ - sudo mv /opt/rocm/lib/libhipblas.so* ./build/bin/ - sudo mv /opt/rocm/lib/libhipblaslt.so* ./build/bin/ - sudo mv /opt/rocm/lib/librocblas.so* ./build/bin/ - sudo mv /opt/rocm/lib/rocblas/ ./build/bin/ - sudo mv /opt/rocm/lib/hipblaslt/ ./build/bin/ - - name: Fetch system info id: system-info run: | @@ -692,15 +772,15 @@ jobs: run: | cp ggml/LICENSE ./build/bin/ggml.txt cp LICENSE ./build/bin/stable-diffusion.cpp.txt - zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip ./build/bin + zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm-${{ matrix.ROCM_VERSION }}.zip ./build/bin - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip + name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm-${{ matrix.ROCM_VERSION }}.zip path: | - sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip + sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm-${{ matrix.ROCM_VERSION }}.zip release: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} @@ -715,6 +795,7 @@ jobs: - macOS-latest-cmake - windows-latest-cmake - windows-latest-cmake-hip + - windows-latest-rocm steps: - name: Clone diff --git a/README.md b/README.md index 33c272e9..d889f28f 100644 --- a/README.md +++ b/README.md @@ -133,9 +133,11 @@ API and command-line option may change frequently.*** ## Performance If you want to improve performance or reduce VRAM/RAM usage, please refer to [performance guide](./docs/performance.md). +For runtime and parameter backend placement, see the [backend selection guide](./docs/backend.md). ## More Guides +- [Backend selection](./docs/backend.md) - [SD1.x/SD2.x/SDXL](./docs/sd.md) - [SD3/SD3.5](./docs/sd3.md) - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md) diff --git a/docs/backend.md b/docs/backend.md new file mode 100644 index 00000000..53088b0e --- /dev/null +++ b/docs/backend.md @@ -0,0 +1,122 @@ +# Backend selection + +`stable-diffusion.cpp` has two backend assignments: + +- `--backend` selects the runtime backend used to execute model graphs. +- `--params-backend` selects the backend used to allocate model parameters. + +If `--params-backend` is not set, parameters use the same backend as their module runtime backend. + +## Syntax + +A backend assignment can be a single backend name: + +```shell +sd-cli -m model.safetensors -p "a cat" --backend cpu +``` + +This applies to every module that does not have a more specific assignment. + +Assignments can also target individual modules: + +```shell +sd-cli -m model.safetensors -p "a cat" --backend te=cpu,vae=cuda0,diffusion=vulkan0 +``` + +The same syntax is used for parameter placement: + +```shell +sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend te=cpu,vae=cpu +``` + +Module names are case-insensitive. Hyphens and underscores in module names are ignored, so `clip_vision`, `clip-vision`, and `clipvision` are equivalent. + +`all=`, `default=`, and `*=` can be used to set the default backend inside a mixed assignment: + +```shell +sd-cli -m model.safetensors -p "a cat" --backend all=cuda0,te=cpu +``` + +## Modules + +| Module | Purpose | Accepted names | +| --- | --- | --- | +| `diffusion` | UNet, DiT, MMDiT, Flux, Wan, Qwen Image, and other diffusion models | `diffusion`, `model`, `unet`, `dit` | +| `te` | Text encoders and conditioners | `te`, `clip`, `text`, `textencoder`, `textencoders`, `conditioner`, `cond`, `llm`, `t5`, `t5xxl` | +| `clip_vision` | CLIP vision encoder | `clip_vision`, `clipvision`, `clip-vision`, `vision` | +| `vae` | VAE and TAE | `vae`, `firststage`, `autoencoder`, `tae` | +| `controlnet` | ControlNet | `controlnet`, `control` | +| `photomaker` | PhotoMaker ID encoder and PhotoMaker LoRA | `photomaker`, `photomakerid`, `pmid`, `photo` | +| `upscaler` | ESRGAN upscaler | `upscaler`, `esrgan`, `hires` | + +`te` is the preferred module name for text encoders. `clip` is kept as an accepted alias because many existing commands and model names use CLIP terminology. + +## Backend names + +Backend names are resolved against the GGML backend device list. Matching is case-insensitive and accepts exact names or unique prefixes, so common values include names such as: + +- `cpu` +- `cuda0` +- `vulkan0` +- `metal` + +The special values `auto`, `default`, and an empty backend name select the default backend. The default preference is GPU, then integrated GPU, then CPU. + +The special value `gpu` selects the first GPU backend, falling back to the first integrated GPU backend. + +## Runtime backend vs. parameter backend + +The runtime backend controls where graph execution runs. The parameter backend controls where model weights are allocated. + +For example: + +```shell +sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend cpu +``` + +This runs all modules on `cuda0`, but stores parameters in CPU RAM. During execution, parameters are moved to the runtime backend as needed. + +Per-module assignments can be mixed: + +```shell +sd-cli -m model.safetensors -p "a cat" --backend diffusion=cuda0,te=cpu,vae=cpu --params-backend diffusion=cuda0,te=cpu,vae=cpu +``` + +This keeps text encoding and VAE execution on CPU while the diffusion model runs on GPU. + +## Backend sharing and lifetime + +Backends are managed by `SDBackendManager`. + +Within one manager, backend instances are cached by resolved backend device name. If multiple modules request the same backend, they share the same `ggml_backend_t`. + +For example: + +```shell +--backend te=cpu,vae=cpu +``` + +uses one shared CPU backend for both `te` and `vae` runtime execution. + +Runtime and parameter assignments also share the same backend cache. If `--backend diffusion=cuda0` and `--params-backend diffusion=cuda0` resolve to the same device, both use the same backend instance. + +`SDBackendManager` owns the backend instances and frees them when the context or upscaler is destroyed. Model runners receive non-owning runtime and parameter backend pointers and do not free them. + +## Compatibility flags + +The older CPU placement flags are still supported: + +- `--clip-on-cpu` +- `--vae-on-cpu` +- `--control-net-cpu` +- `--offload-to-cpu` + +`--clip-on-cpu`, `--vae-on-cpu`, and `--control-net-cpu` affect runtime backend assignment only when `--backend` is not set. They map to `te=cpu`, `vae=cpu`, and `controlnet=cpu`. + +`--offload-to-cpu` affects parameter backend assignment only when `--params-backend` is not set. It is equivalent to: + +```shell +--params-backend cpu +``` + +Explicit `--backend` and `--params-backend` assignments are preferred for new commands. diff --git a/docs/hipBLAS_on_Windows.md b/docs/hipBLAS_on_Windows.md index b5105ad1..cd846572 100644 --- a/docs/hipBLAS_on_Windows.md +++ b/docs/hipBLAS_on_Windows.md @@ -26,12 +26,12 @@ Fortunately, `AMD` provides complete help documentation, you can use the help do Then we must set `ROCM` as environment variables before running cmake. -Usually if you install according to the official tutorial and do not modify the ROCM path, then there is a high probability that it is here `C:\Program Files\AMD\ROCm\5.5\bin` +Usually if you install according to the official tutorial and do not modify the ROCM path, then there is a high probability that it is here `C:\Program Files\AMD\ROCm\7.1.1\bin` This is what I use to set the clang: ```Commandline -set CC=C:\Program Files\AMD\ROCm\5.5\bin\clang.exe -set CXX=C:\Program Files\AMD\ROCm\5.5\bin\clang++.exe +set CC=C:\Program Files\AMD\ROCm\7.1.1\bin\clang.exe +set CXX=C:\Program Files\AMD\ROCm\7.1.1\bin\clang++.exe ``` ## Ninja @@ -46,7 +46,7 @@ set ninja=C:\Program Files\ninja\ninja.exe ## Building stable-diffusion.cpp The thing different from the regular CPU build is `-DSD_HIPBLAS=ON` , -`-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, `-DAMDGPU_TARGETS=gfx1100` +`-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, `-DAMDGPU_TARGETS=gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032` >**Notice**: check the `clang` and `clang++` information: ```Commandline @@ -59,26 +59,29 @@ If you see like this, we can continue: clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be) Target: x86_64-pc-windows-msvc Thread model: posix -InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin +InstalledDir: C:\Program Files\AMD\ROCm\7.1.1\bin ``` ``` clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be) Target: x86_64-pc-windows-msvc Thread model: posix -InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin +InstalledDir: C:\Program Files\AMD\ROCm\7.1.1\bin ``` ->**Notice** that the `gfx1100` is the GPU architecture of my GPU, you can change it to your GPU architecture. Click here to see your architecture [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus) +>**Notice** that the GPU targets are now compatible with multiple GPU architectures (ROCm 7.1.1 targets). You can change them to match your GPU architecture. Click here to see your architecture [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus) -My GPU is AMD Radeon™ RX 7900 XTX Graphics, so I set it to `gfx1100`. +Examples: +- AMD Radeon™ RX 7900 XTX Graphics: `gfx1100` +- AMD Radeon™ RX 7900 XT Graphics: `gfx1101` +- AMD Radeon™ RX 7900 GRE Graphics: `gfx1102` option: ```commandline mkdir build cd build -cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100 +cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" cmake --build . --config Release ``` diff --git a/docs/z_image.md b/docs/z_image.md index 2ea66f9b..58c2d1fe 100644 --- a/docs/z_image.md +++ b/docs/z_image.md @@ -21,7 +21,7 @@ You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or ev ### Z-Image-Turbo ``` -.\bin\Release\sd-cli.exe --diffusion-model z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512 +.\bin\Release\sd-cli.exe --diffusion-model z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512 --steps 8 ``` z-image example diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index d13ca6d9..dc5013b3 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -800,7 +800,9 @@ int main(int argc, const char* argv[]) { ctx_params.offload_params_to_cpu, ctx_params.diffusion_conv_direct, ctx_params.n_threads, - gen_params.upscale_tile_size)); + gen_params.upscale_tile_size, + ctx_params.backend.c_str(), + ctx_params.params_backend.c_str())); if (upscaler_ctx == nullptr) { LOG_ERROR("new_upscaler_ctx failed"); diff --git a/examples/common/common.cpp b/examples/common/common.cpp index ff90930a..e9bc212b 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -388,6 +388,14 @@ ArgOptions SDContextParams::get_options() { "--upscale-model", "path to esrgan model.", &esrgan_path}, + {"", + "--backend", + "runtime backend assignment, e.g. cpu or clip=cpu,vae=cuda0,diffusion=vulkan0", + &backend}, + {"", + "--params-backend", + "parameter backend assignment, e.g. cpu or diffusion=cpu,clip=cpu", + ¶ms_backend}, }; options.int_options = { @@ -686,6 +694,8 @@ std::string SDContextParams::to_string() const { << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" << " max_vram: " << max_vram << ",\n" + << " backend: \"" << backend << "\",\n" + << " params_backend: \"" << params_backend << "\",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" @@ -763,6 +773,8 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f chroma_t5_mask_pad, qwen_image_zero_cond_t, max_vram, + backend.c_str(), + params_backend.c_str(), }; return sd_ctx_params; } diff --git a/examples/common/common.h b/examples/common/common.h index a83a2bc8..a515adf4 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -112,14 +112,16 @@ struct SDContextParams { rng_type_t sampler_rng_type = RNG_TYPE_COUNT; bool offload_params_to_cpu = false; float max_vram = 0.f; - bool enable_mmap = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; - bool flash_attn = false; - bool diffusion_flash_attn = false; - bool diffusion_conv_direct = false; - bool vae_conv_direct = false; + std::string backend; + std::string params_backend; + bool enable_mmap = false; + bool control_net_cpu = false; + bool clip_on_cpu = false; + bool vae_on_cpu = false; + bool flash_attn = false; + bool diffusion_flash_attn = false; + bool diffusion_conv_direct = false; + bool vae_conv_direct = false; bool circular = false; bool circular_x = false; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index bb68dd76..55bd52ad 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -209,6 +209,8 @@ typedef struct { int chroma_t5_mask_pad; bool qwen_image_zero_cond_t; float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB) + const char* backend; + const char* params_backend; } sd_ctx_params_t; typedef struct { @@ -443,7 +445,9 @@ SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, bool offload_params_to_cpu, bool direct, int n_threads, - int tile_size); + int tile_size, + const char* backend, + const char* params_backend); SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, diff --git a/src/anima.hpp b/src/anima.hpp index 4bfc0474..486aec3a 100644 --- a/src/anima.hpp +++ b/src/anima.hpp @@ -526,10 +526,10 @@ namespace Anima { AnimaNet net; AnimaRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "model.diffusion_model") - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { int64_t num_layers = 0; std::string layer_tag = prefix + ".net.blocks."; for (const auto& kv : tensor_storage_map) { diff --git a/src/auto_encoder_kl.hpp b/src/auto_encoder_kl.hpp index 4fb28a16..489f8fd3 100644 --- a/src/auto_encoder_kl.hpp +++ b/src/auto_encoder_kl.hpp @@ -664,13 +664,13 @@ struct AutoEncoderKL : public VAE { AutoEncoderKLModel ae; AutoEncoderKL(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, bool decode_only = false, bool use_video_decoder = false, SDVersion version = VERSION_SD1) - : decode_only(decode_only), VAE(version, backend, offload_params_to_cpu) { + : decode_only(decode_only), VAE(version, backend, params_backend) { if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) { scale_factor = 0.18215f; shift_factor = 0.f; diff --git a/src/clip.hpp b/src/clip.hpp index 8b2084c4..a3567324 100644 --- a/src/clip.hpp +++ b/src/clip.hpp @@ -469,13 +469,13 @@ struct CLIPTextModelRunner : public GGMLRunner { std::vector attention_mask_vec; CLIPTextModelRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool with_final_ln = true, bool force_clip_f32 = false) - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { bool proj_in = false; for (const auto& [name, tensor_storage] : tensor_storage_map) { if (!starts_with(name, prefix)) { diff --git a/src/conditioner.hpp b/src/conditioner.hpp index 8831f1a1..e5a702b9 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -147,7 +147,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::map> embedding_pos_map; FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::map& orig_embedding_map, SDVersion version = VERSION_SD1, @@ -161,12 +161,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } bool force_clip_f32 = !embedding_map.empty(); if (sd_version_is_sd1(version)) { - text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32); + text_model = std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32); } else if (sd_version_is_sd2(version)) { - text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32); + text_model = std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32); } else if (sd_version_is_sdxl(version)) { - text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32); - text_model2 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32); + text_model = std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32); + text_model2 = std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32); } } @@ -683,9 +683,9 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner { CLIPVisionModelProjection vision_model; FrozenCLIPVisionEmbedder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}) - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { std::string prefix = "cond_stage_model.transformer"; bool proj_in = false; for (const auto& [name, tensor_storage] : tensor_storage_map) { @@ -742,7 +742,7 @@ struct SD3CLIPEmbedder : public Conditioner { std::shared_ptr t5; SD3CLIPEmbedder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}) : clip_g_tokenizer(0) { bool use_clip_l = false; @@ -762,13 +762,13 @@ struct SD3CLIPEmbedder : public Conditioner { return; } if (use_clip_l) { - clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); + clip_l = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); } if (use_clip_g) { - clip_g = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); + clip_g = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); } if (use_t5) { - t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer"); + t5 = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer"); } } @@ -1110,7 +1110,7 @@ struct FluxCLIPEmbedder : public Conditioner { size_t chunk_len = 256; FluxCLIPEmbedder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}) { bool use_clip_l = false; bool use_t5 = false; @@ -1128,12 +1128,12 @@ struct FluxCLIPEmbedder : public Conditioner { } if (use_clip_l) { - clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); + clip_l = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); } else { LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded."); } if (use_t5) { - t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer"); + t5 = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer"); } else { LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded."); } @@ -1364,7 +1364,7 @@ struct T5CLIPEmbedder : public Conditioner { bool is_umt5 = false; T5CLIPEmbedder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, bool use_mask = false, int mask_pad = 0, @@ -1381,7 +1381,7 @@ struct T5CLIPEmbedder : public Conditioner { LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!"); return; } else { - t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5); + t5 = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5); } } @@ -1566,12 +1566,12 @@ struct AnimaConditioner : public Conditioner { std::shared_ptr llm; AnimaConditioner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}) { qwen_tokenizer = std::make_shared(); llm = std::make_shared(LLM::LLMArch::QWEN3, backend, - offload_params_to_cpu, + params_backend, tensor_storage_map, "text_encoders.llm", false); @@ -1638,10 +1638,11 @@ struct AnimaConditioner : public Conditioner { for (const auto& item : parsed_attention) { const std::string& curr_text = item.first; float curr_weight = item.second; - std::vector curr_tokens = t5_tokenizer.tokenize(curr_text, nullptr, true); + std::vector curr_tokens = t5_tokenizer.encode(curr_text); t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); } + t5_tokenizer.pad_tokens(t5_tokens, &t5_weights, nullptr); return {qwen_tokens, qwen_weights, t5_tokens, t5_weights}; } @@ -1684,7 +1685,7 @@ struct LLMEmbedder : public Conditioner { std::shared_ptr llm; LLMEmbedder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, SDVersion version = VERSION_QWEN_IMAGE, const std::string prefix = "", @@ -1705,7 +1706,7 @@ struct LLMEmbedder : public Conditioner { } llm = std::make_shared(arch, backend, - offload_params_to_cpu, + params_backend, tensor_storage_map, "text_encoders.llm", enable_vision); @@ -2069,10 +2070,10 @@ struct LTXAVTextProjectionRunner : public GGMLRunner { LTXAVTextProjection model; LTXAVTextProjectionRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string& prefix = "") - : GGMLRunner(backend, offload_params_to_cpu), + : GGMLRunner(backend, params_backend), model(tensor_storage_map.find(prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end()) { model.init(params_ctx, tensor_storage_map, prefix); } @@ -2113,20 +2114,20 @@ struct LTXAVEmbedder : public Conditioner { bool dual_projection = false; LTXAVEmbedder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string& llm_prefix = "text_encoders.llm", const std::string& projector_prefix = "text_embedding_projection") { tokenizer = std::make_shared(); llm = std::make_shared(LLM::LLMArch::GEMMA3_12B, backend, - offload_params_to_cpu, + params_backend, tensor_storage_map, llm_prefix, false); dual_projection = tensor_storage_map.find(projector_prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end(); projector = std::make_shared(backend, - offload_params_to_cpu, + params_backend, tensor_storage_map, projector_prefix); } diff --git a/src/control.hpp b/src/control.hpp index d227ec94..fd1f6d86 100644 --- a/src/control.hpp +++ b/src/control.hpp @@ -319,10 +319,10 @@ struct ControlNet : public GGMLRunner { bool guided_hint_cached = false; ControlNet(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, SDVersion version = VERSION_SD1) - : GGMLRunner(backend, offload_params_to_cpu), control_net(version) { + : GGMLRunner(backend, params_backend), control_net(version) { control_net.init(params_ctx, tensor_storage_map, ""); } diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index 0d344226..bbfab57d 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -68,10 +68,10 @@ struct UNetModel : public DiffusionModel { UNetModelRunner unet; UNetModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, SDVersion version = VERSION_SD1) - : unet(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version) { + : unet(backend, params_backend, tensor_storage_map, "model.diffusion_model", version) { } std::string get_desc() override { @@ -139,9 +139,9 @@ struct MMDiTModel : public DiffusionModel { MMDiTRunner mmdit; MMDiTModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}) - : mmdit(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model") { + : mmdit(backend, params_backend, tensor_storage_map, "model.diffusion_model") { } std::string get_desc() override { @@ -206,11 +206,11 @@ struct FluxModel : public DiffusionModel { Flux::FluxRunner flux; FluxModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, SDVersion version = VERSION_FLUX, bool use_mask = false) - : flux(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version, use_mask) { + : flux(backend, params_backend, tensor_storage_map, "model.diffusion_model", version, use_mask) { } std::string get_desc() override { @@ -281,10 +281,10 @@ struct AnimaModel : public DiffusionModel { Anima::AnimaRunner anima; AnimaModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "model.diffusion_model") - : prefix(prefix), anima(backend, offload_params_to_cpu, tensor_storage_map, prefix) { + : prefix(prefix), anima(backend, params_backend, tensor_storage_map, prefix) { } std::string get_desc() override { @@ -349,11 +349,11 @@ struct WanModel : public DiffusionModel { WAN::WanRunner wan; WanModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "model.diffusion_model", SDVersion version = VERSION_WAN2) - : prefix(prefix), wan(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) { + : prefix(prefix), wan(backend, params_backend, tensor_storage_map, prefix, version) { } std::string get_desc() override { @@ -421,12 +421,12 @@ struct QwenImageModel : public DiffusionModel { Qwen::QwenImageRunner qwen_image; QwenImageModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "model.diffusion_model", SDVersion version = VERSION_QWEN_IMAGE, bool zero_cond_t = false) - : prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version, zero_cond_t) { + : prefix(prefix), qwen_image(backend, params_backend, tensor_storage_map, prefix, version, zero_cond_t) { } std::string get_desc() override { @@ -492,10 +492,10 @@ struct HiDreamO1Model : public DiffusionModel { HiDreamO1::HiDreamO1Runner hidream_o1; HiDreamO1Model(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string& prefix = "model") - : prefix(prefix), hidream_o1(backend, offload_params_to_cpu, tensor_storage_map, prefix) { + : prefix(prefix), hidream_o1(backend, params_backend, tensor_storage_map, prefix) { } std::string get_desc() override { @@ -568,11 +568,11 @@ struct ZImageModel : public DiffusionModel { ZImage::ZImageRunner z_image; ZImageModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "model.diffusion_model", SDVersion version = VERSION_Z_IMAGE) - : prefix(prefix), z_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) { + : prefix(prefix), z_image(backend, params_backend, tensor_storage_map, prefix, version) { } std::string get_desc() override { @@ -638,10 +638,10 @@ struct ErnieImageModel : public DiffusionModel { ErnieImage::ErnieImageRunner ernie_image; ErnieImageModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "model.diffusion_model") - : prefix(prefix), ernie_image(backend, offload_params_to_cpu, tensor_storage_map, prefix) { + : prefix(prefix), ernie_image(backend, params_backend, tensor_storage_map, prefix) { } std::string get_desc() override { @@ -704,10 +704,10 @@ struct LTXAVModel : public DiffusionModel { LTXV::LTXAVRunner ltxav; LTXAVModel(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "model.diffusion_model") - : prefix(prefix), ltxav(backend, offload_params_to_cpu, tensor_storage_map, prefix) { + : prefix(prefix), ltxav(backend, params_backend, tensor_storage_map, prefix) { } std::string get_desc() override { diff --git a/src/ernie_image.hpp b/src/ernie_image.hpp index 931794f1..bea84cdf 100644 --- a/src/ernie_image.hpp +++ b/src/ernie_image.hpp @@ -331,10 +331,10 @@ namespace ErnieImage { std::vector pe_vec; ErnieImageRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { ernie_params.num_layers = 0; for (const auto& [name, tensor_storage] : tensor_storage_map) { if (!starts_with(name, prefix)) { diff --git a/src/esrgan.hpp b/src/esrgan.hpp index f84b77a2..f54baca3 100644 --- a/src/esrgan.hpp +++ b/src/esrgan.hpp @@ -161,10 +161,10 @@ struct ESRGAN : public GGMLRunner { int tile_size = 128; // avoid cuda OOM for 4gb VRAM ESRGAN(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, int tile_size = 128, const String2TensorStorage& tensor_storage_map = {}) - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { this->tile_size = tile_size; } diff --git a/src/flux.hpp b/src/flux.hpp index 732a3719..2aac3be0 100644 --- a/src/flux.hpp +++ b/src/flux.hpp @@ -1189,12 +1189,12 @@ namespace Flux { bool use_mask = false; FluxRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "", SDVersion version = VERSION_FLUX, bool use_mask = false) - : GGMLRunner(backend, offload_params_to_cpu), version(version), use_mask(use_mask) { + : GGMLRunner(backend, params_backend), version(version), use_mask(use_mask) { flux_params.version = version; flux_params.guidance_embed = false; flux_params.depth = 0; @@ -1564,7 +1564,7 @@ namespace Flux { } std::shared_ptr flux = std::make_shared(backend, - false, + backend, tensor_storage_map, "model.diffusion_model", VERSION_FLUX2, diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index d2e655ce..8e2ed694 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -26,7 +26,7 @@ #include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml.h" -#include "ggml_extend_backend.hpp" +#include "ggml_extend_backend.h" #include "ggml_graph_cut.h" #include "model.h" @@ -73,48 +73,6 @@ __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const cha } } -__STATIC_INLINE__ bool backend_name_exists(std::string name) { - ggml_backend_load_all_once(); - const size_t device_count = ggml_backend_dev_count(); - for (size_t i = 0; i < device_count; ++i) { - if (name == ggml_backend_dev_name(ggml_backend_dev_get(i))) { - return true; - } - } - return false; -} - -__STATIC_INLINE__ std::string sanitize_backend_name(std::string name) { - if (name == "" || backend_name_exists(name)) { - return name; - } else { - LOG_WARN("Backend %s not found, using default backend", name.c_str()); - return ""; - } -} - -__STATIC_INLINE__ std::string get_default_backend_name() { - ggml_backend_load_all_once(); - // should pick the same backend as ggml_backend_init_best - ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU); - dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU); - dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - if (dev == nullptr) { - return ""; - } - return ggml_backend_dev_name(dev); -} - -__STATIC_INLINE__ ggml_backend_t init_named_backend(std::string name = "") { - ggml_backend_load_all_once(); - LOG_DEBUG("Initializing backend: %s", name.c_str()); - if (name.empty()) { - return ggml_backend_init_best(); - } else { - return ggml_backend_init_by_name(name.c_str(), nullptr); - } -} - static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128"); // n-mode tensor-matrix product @@ -190,7 +148,7 @@ __STATIC_INLINE__ void ggml_ext_im_set_randn_f32(ggml_tensor* tensor, std::share uint32_t n = (uint32_t)ggml_nelements(tensor); std::vector random_numbers = rng->randn(n); for (uint32_t i = 0; i < n; i++) { - ggml_set_f32_1d(tensor, i, random_numbers[i]); + ggml_ext_im_set_f32_1d(tensor, i, random_numbers[i]); } } @@ -422,39 +380,6 @@ __STATIC_INLINE__ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const st // file.close(); // } -__STATIC_INLINE__ void copy_ggml_tensor(ggml_tensor* dst, ggml_tensor* src) { - if (dst->type == src->type) { - dst->nb[0] = src->nb[0]; - dst->nb[1] = src->nb[1]; - dst->nb[2] = src->nb[2]; - dst->nb[3] = src->nb[3]; - - memcpy(((char*)dst->data), ((char*)src->data), ggml_nbytes(dst)); - return; - } - ggml_init_params params; - params.mem_size = 10 * 1024 * 1024; // for padding - params.mem_buffer = nullptr; - params.no_alloc = false; - ggml_context* ctx = ggml_init(params); - if (!ctx) { - LOG_ERROR("ggml_init() failed"); - return; - } - ggml_tensor* final = ggml_cpy(ctx, src, dst); - - ggml_cgraph* graph = ggml_new_graph(ctx); - ggml_build_forward_expand(graph, final); - ggml_graph_compute_with_ctx(ctx, graph, 1); - ggml_free(ctx); -} - -__STATIC_INLINE__ ggml_tensor* ggml_ext_dup_and_cpy_tensor(ggml_context* ctx, ggml_tensor* src) { - ggml_tensor* dup = ggml_dup_tensor(ctx, src); - copy_ggml_tensor(dup, src); - return dup; -} - __STATIC_INLINE__ float sigmoid(float x) { return 1 / (1.0f + expf(-x)); } @@ -2669,13 +2594,11 @@ protected: public: virtual std::string get_desc() = 0; - GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false) - : runtime_backend(backend) { - if (!ggml_backend_is_cpu(runtime_backend) && offload_params_to_cpu) { - params_backend = ggml_backend_cpu_init(); - } else { - params_backend = runtime_backend; - } + GGMLRunner(ggml_backend_t backend, ggml_backend_t params_backend) + : params_backend(params_backend), + runtime_backend(backend) { + GGML_ASSERT(runtime_backend != nullptr); + GGML_ASSERT(params_backend != nullptr); alloc_params_ctx(); } @@ -2684,9 +2607,6 @@ public: free_compute_buffer(); free_params_ctx(); free_compute_ctx(); - if (params_backend != runtime_backend) { - ggml_backend_free(params_backend); - } free_cache_ctx_and_buffer(); } diff --git a/src/ggml_extend_backend.cpp b/src/ggml_extend_backend.cpp new file mode 100644 index 00000000..4bf8268e --- /dev/null +++ b/src/ggml_extend_backend.cpp @@ -0,0 +1,600 @@ +#include "ggml_extend_backend.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" + +static std::string trim_copy(const std::string& value) { + size_t begin = 0; + while (begin < value.size() && std::isspace(static_cast(value[begin]))) { + ++begin; + } + size_t end = value.size(); + while (end > begin && std::isspace(static_cast(value[end - 1]))) { + --end; + } + return value.substr(begin, end - begin); +} + +static std::string lower_copy(std::string value) { + std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) { + return static_cast(std::tolower(c)); + }); + return value; +} + +static std::vector split_copy(const std::string& value, char delimiter) { + std::vector parts; + std::string part; + std::istringstream stream(value); + while (std::getline(stream, part, delimiter)) { + parts.push_back(part); + } + return parts; +} + +static bool is_default_backend_token(const std::string& name) { + const std::string lower = lower_copy(trim_copy(name)); + return lower.empty() || lower == "default" || lower == "auto"; +} + +static bool parse_backend_module(const std::string& raw_name, SDBackendModule* module) { + std::string name = lower_copy(trim_copy(raw_name)); + name.erase(std::remove(name.begin(), name.end(), '-'), name.end()); + name.erase(std::remove(name.begin(), name.end(), '_'), name.end()); + + if (name == "diffusion" || name == "model" || name == "unet" || name == "dit") { + *module = SDBackendModule::DIFFUSION; + return true; + } + if (name == "te" || name == "clip" || name == "text" || name == "textencoder" || name == "textencoders" || name == "conditioner" || name == "cond" || name == "llm" || name == "t5" || name == "t5xxl") { + *module = SDBackendModule::TE; + return true; + } + if (name == "clipvision" || name == "vision") { + *module = SDBackendModule::CLIP_VISION; + return true; + } + if (name == "vae" || name == "firststage" || name == "autoencoder" || name == "tae") { + *module = SDBackendModule::VAE; + return true; + } + if (name == "controlnet" || name == "control") { + *module = SDBackendModule::CONTROL_NET; + return true; + } + if (name == "photomaker" || name == "photomakerid" || name == "pmid" || name == "photo") { + *module = SDBackendModule::PHOTOMAKER; + return true; + } + if (name == "upscaler" || name == "esrgan" || name == "hires") { + *module = SDBackendModule::UPSCALER; + return true; + } + return false; +} + +static std::string module_assignment_name(const SDBackendAssignment& assignment, SDBackendModule module) { + auto it = assignment.module_names.find(module); + if (it != assignment.module_names.end()) { + return it->second; + } + return assignment.default_name; +} + +static std::string backend_cache_key(ggml_backend_t backend) { + if (backend == nullptr) { + return ""; + } + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + if (dev != nullptr) { + return lower_copy(ggml_backend_dev_name(dev)); + } + const char* backend_name = ggml_backend_name(backend); + return backend_name != nullptr ? lower_copy(backend_name) : ""; +} + +static std::string resolve_first_device_by_type(enum ggml_backend_dev_type type) { + ggml_backend_dev_t dev = ggml_backend_dev_by_type(type); + if (dev == nullptr) { + return ""; + } + return ggml_backend_dev_name(dev); +} + +static ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) { + if (tensor == nullptr) { + return nullptr; + } + + return tensor->view_src ? tensor->view_src->buffer : tensor->buffer; +} + +static bool ggml_backend_tensor_is_host_accessible(const struct ggml_tensor* tensor) { + if (tensor == nullptr || tensor->data == nullptr) { + return false; + } + + ggml_backend_buffer_t buffer = ggml_backend_tensor_buffer(tensor); + return buffer == nullptr || ggml_backend_buffer_is_host(buffer); +} + +static size_t ggml_backend_tensor_offset(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + return static_cast(i0 * tensor->nb[0] + i1 * tensor->nb[1] + i2 * tensor->nb[2] + i3 * tensor->nb[3]); +} + +template +static void ggml_backend_tensor_write_scalar(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, T value) { + const size_t offset = ggml_backend_tensor_offset(tensor, i0, i1, i2, i3); + + if (ggml_backend_tensor_is_host_accessible(tensor)) { + auto* dst = reinterpret_cast(reinterpret_cast(tensor->data) + offset); + *dst = value; + return; + } + + ggml_backend_tensor_set(const_cast(tensor), &value, offset, sizeof(T)); +} + +static void ggml_set_f32_nd(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float value) { + switch (tensor->type) { + case GGML_TYPE_I8: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); + break; + case GGML_TYPE_I16: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); + break; + case GGML_TYPE_I32: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); + break; + case GGML_TYPE_F16: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_fp16(value)); + break; + case GGML_TYPE_BF16: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_bf16(value)); + break; + case GGML_TYPE_F32: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, value); + break; + default: + GGML_ABORT("fatal error"); + } +} + +void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = {0, 0, 0, 0}; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } + + switch (tensor->type) { + case GGML_TYPE_I8: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); + break; + case GGML_TYPE_I16: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); + break; + case GGML_TYPE_I32: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); + break; + case GGML_TYPE_F16: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_fp16(value)); + break; + case GGML_TYPE_BF16: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_bf16(value)); + break; + case GGML_TYPE_F32: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, value); + break; + default: + GGML_ABORT("fatal error"); + } +} + +static void ggml_backend_load_all_once() { + // If the registry already has devices and the CPU backend is present, + // assume either static registration or explicit host-side preloading has + // completed and avoid rescanning the default paths. + if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) { + return; + } + // In dynamic-backend mode the backend modules are discovered at runtime, + // so we must load them before asking for the CPU backend or its proc table. + // If the host preloaded only a subset of backends, allow one default-path + // scan so missing modules can still be discovered. + static std::once_flag once; + std::call_once(once, []() { + if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) { + return; + } + ggml_backend_load_all(); + }); +} + +bool sd_backend_is(ggml_backend_t backend, const std::string& name) { + if (!backend) { + return false; + } + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + if (!dev) { + return false; + } + std::string dev_name = ggml_backend_dev_name(dev); + return lower_copy(dev_name).find(lower_copy(name)) != std::string::npos; +} + +static std::string get_default_backend_name() { + ggml_backend_load_all_once(); + // should pick the same backend preference as ggml_backend_init_best + std::string name = resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_GPU); + if (!name.empty()) { + return name; + } + name = resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU); + if (!name.empty()) { + return name; + } + return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); +} + +static std::string sd_resolve_backend_name(const std::string& name) { + ggml_backend_load_all_once(); + std::string requested = trim_copy(name); + std::string lower = lower_copy(requested); + + if (is_default_backend_token(lower)) { + return get_default_backend_name(); + } + if (lower == "gpu") { + std::string result = resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_GPU); + if (!result.empty()) { + return result; + } + return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU); + } + + const size_t device_count = ggml_backend_dev_count(); + for (size_t i = 0; i < device_count; ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + std::string dev_name = ggml_backend_dev_name(dev); + if (lower_copy(dev_name) == lower) { + return dev_name; + } + } + + for (size_t i = 0; i < device_count; ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + std::string dev_name = ggml_backend_dev_name(dev); + std::string dev_lower = lower_copy(dev_name); + if (dev_lower.rfind(lower, 0) == 0) { + return dev_name; + } + } + + return ""; +} + +static bool backend_name_exists(const std::string& name) { + return !sd_resolve_backend_name(name).empty(); +} + +static ggml_backend_t init_named_backend(const std::string& name) { + ggml_backend_load_all_once(); + LOG_DEBUG("Initializing backend: %s", name.c_str()); + if (trim_copy(name).empty()) { + return ggml_backend_init_best(); + } + + std::string resolved = sd_resolve_backend_name(name); + if (resolved.empty()) { + return nullptr; + } + return ggml_backend_init_by_name(resolved.c_str(), nullptr); +} + +static ggml_backend_t sd_get_default_backend() { + ggml_backend_load_all_once(); + static std::once_flag once; + std::call_once(once, []() { + size_t dev_count = ggml_backend_dev_count(); + if (dev_count == 0) { + LOG_ERROR("No devices found!"); + } else { + LOG_DEBUG("Found %zu backend devices:", dev_count); + for (size_t i = 0; i < dev_count; ++i) { + auto dev = ggml_backend_dev_get(i); + LOG_DEBUG("#%zu: %s", i, ggml_backend_dev_name(dev)); + } + } + }); + + ggml_backend_t backend = nullptr; + const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE"); + if (SD_VK_DEVICE != nullptr) { + std::string sd_vk_device_str = SD_VK_DEVICE; + try { + unsigned long long device = std::stoull(sd_vk_device_str); + std::string vk_device_name = "Vulkan" + std::to_string(device); + if (backend_name_exists(vk_device_name)) { + LOG_INFO("Selecting %s as main device by env var SD_VK_DEVICE", vk_device_name.c_str()); + backend = init_named_backend(vk_device_name); + if (!backend) { + LOG_WARN("Device %s requested by SD_VK_DEVICE failed to init. Falling back to the default device.", vk_device_name.c_str()); + } + } else { + LOG_WARN("Device %s requested by SD_VK_DEVICE was not found. Falling back to the default device.", vk_device_name.c_str()); + } + } catch (const std::invalid_argument&) { + LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to the default device.", SD_VK_DEVICE); + } catch (const std::out_of_range&) { + LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to the default device.", SD_VK_DEVICE); + } + } + + if (!backend) { + std::string dev_name = get_default_backend_name(); + backend = init_named_backend(dev_name); + if (!backend && !dev_name.empty()) { + LOG_WARN("device %s failed to init", dev_name.c_str()); + } + } + + if (!backend) { + LOG_WARN("loading CPU backend"); + backend = ggml_backend_cpu_init(); + } + + if (ggml_backend_is_cpu(backend)) { + LOG_DEBUG("Using CPU backend"); + } + + return backend; +} + +static bool sd_parse_backend_assignment(const std::string& spec, SDBackendAssignment* assignment, std::string* error) { + if (assignment == nullptr) { + return false; + } + + *assignment = {}; + const std::string in = trim_copy(spec); + if (in.empty()) { + return true; + } + + for (const std::string& raw_part : split_copy(in, ',')) { + const std::string part = trim_copy(raw_part); + if (part.empty()) { + continue; + } + + const size_t eq = part.find('='); + if (eq == std::string::npos) { + assignment->set_default(part); + continue; + } + + const std::string key = trim_copy(part.substr(0, eq)); + const std::string value = trim_copy(part.substr(eq + 1)); + if (key.empty() || value.empty()) { + if (error != nullptr) { + *error = "invalid backend assignment '" + part + "'"; + } + return false; + } + + const std::string key_lower = lower_copy(key); + if (key_lower == "all" || key_lower == "default" || key_lower == "*") { + assignment->set_default(value); + continue; + } + + SDBackendModule module = SDBackendModule::DIFFUSION; + if (!parse_backend_module(key, &module)) { + if (error != nullptr) { + *error = "unknown backend module '" + key + "'"; + } + return false; + } + assignment->set_module(module, value); + } + return true; +} + +bool SDBackendAssignment::empty() const { + return default_name.empty() && module_names.empty(); +} + +std::string SDBackendAssignment::get(SDBackendModule module) const { + return module_assignment_name(*this, module); +} + +void SDBackendAssignment::set_default(const std::string& name) { + default_name = trim_copy(name); +} + +void SDBackendAssignment::set_module(SDBackendModule module, const std::string& name) { + module_names[module] = trim_copy(name); +} + +void SDBackendHandleDeleter::operator()(ggml_backend_t backend) const { + ggml_backend_free(backend); +} + +SDBackendManager::~SDBackendManager() { + reset(); +} + +void SDBackendManager::reset() { + backends_.clear(); + runtime_assignment_ = {}; + params_assignment_ = {}; +} + +ggml_backend_t SDBackendManager::runtime_backend(SDBackendModule module) { + return init_cached_backend(runtime_assignment_.get(module)); +} + +ggml_backend_t SDBackendManager::params_backend(SDBackendModule module) { + std::string name = params_assignment_.get(module); + if (name.empty()) { + return runtime_backend(module); + } + return init_cached_backend(name); +} + +bool SDBackendManager::runtime_backend_is_cpu(SDBackendModule module) { + return ggml_backend_is_cpu(runtime_backend(module)); +} + +bool SDBackendManager::params_backend_is_cpu(SDBackendModule module) { + return ggml_backend_is_cpu(params_backend(module)); +} + +bool SDBackendManager::runtime_backend_supports_host_buffer(SDBackendModule module) { + ggml_backend_t backend = runtime_backend(module); + if (backend == nullptr) { + return false; + } + if (ggml_backend_is_cpu(backend)) { + return true; + } + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + if (dev == nullptr) { + return false; + } + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + return props.caps.buffer_from_host_ptr; +} + +bool SDBackendManager::init(const char* backend_spec, + const char* params_backend_spec, + bool offload_params_to_cpu, + bool keep_clip_on_cpu, + bool keep_vae_on_cpu, + bool keep_control_net_on_cpu, + std::string* error) { + reset(); + + if (!sd_parse_backend_assignment(SAFE_STR(backend_spec), &runtime_assignment_, error)) { + return false; + } + if (!sd_parse_backend_assignment(SAFE_STR(params_backend_spec), ¶ms_assignment_, error)) { + return false; + } + + if (runtime_assignment_.empty()) { + if (keep_clip_on_cpu) { + runtime_assignment_.set_module(SDBackendModule::TE, "cpu"); + } + if (keep_vae_on_cpu) { + runtime_assignment_.set_module(SDBackendModule::VAE, "cpu"); + } + if (keep_control_net_on_cpu) { + runtime_assignment_.set_module(SDBackendModule::CONTROL_NET, "cpu"); + } + } + + if (params_assignment_.empty() && offload_params_to_cpu) { + params_assignment_.set_default("cpu"); + } + + return validate(error); +} + +bool SDBackendManager::validate(std::string* error) const { + auto validate_name = [&](const std::string& name) -> bool { + if (is_default_backend_token(name)) { + return true; + } + if (!sd_resolve_backend_name(name).empty()) { + return true; + } + if (error != nullptr) { + *error = "backend '" + name + "' was not found"; + } + return false; + }; + + if (!validate_name(runtime_assignment_.default_name) || + !validate_name(params_assignment_.default_name)) { + return false; + } + for (const auto& kv : runtime_assignment_.module_names) { + if (!validate_name(kv.second)) { + return false; + } + } + for (const auto& kv : params_assignment_.module_names) { + if (!validate_name(kv.second)) { + return false; + } + } + return true; +} + +ggml_backend_t SDBackendManager::init_cached_backend(const std::string& name) { + std::string resolved = sd_resolve_backend_name(name); + std::string key = lower_copy(resolved); + ggml_backend_t backend = nullptr; + + if (!key.empty()) { + auto it = backends_.find(key); + if (it != backends_.end()) { + return it->second.get(); + } + } else if (!is_default_backend_token(name)) { + LOG_ERROR("backend '%s' was not found", name.c_str()); + return nullptr; + } + + backend = is_default_backend_token(name) ? sd_get_default_backend() : init_named_backend(resolved); + if (backend == nullptr) { + LOG_ERROR("failed to initialize backend '%s'", name.c_str()); + return nullptr; + } + + std::string actual_key = backend_cache_key(backend); + if (actual_key.empty()) { + actual_key = !key.empty() ? key : lower_copy(trim_copy(name)); + } + + auto it = backends_.find(actual_key); + if (it != backends_.end()) { + ggml_backend_free(backend); + return it->second.get(); + } + + SDBackendHandle handle(backend); + backends_.emplace(actual_key, std::move(handle)); + return backend; +} + +const char* sd_backend_module_name(SDBackendModule module) { + switch (module) { + case SDBackendModule::DIFFUSION: + return "diffusion"; + case SDBackendModule::TE: + return "te"; + case SDBackendModule::CLIP_VISION: + return "clip_vision"; + case SDBackendModule::VAE: + return "vae"; + case SDBackendModule::CONTROL_NET: + return "controlnet"; + case SDBackendModule::PHOTOMAKER: + return "photomaker"; + case SDBackendModule::UPSCALER: + return "upscaler"; + } + return "unknown"; +} diff --git a/src/ggml_extend_backend.h b/src/ggml_extend_backend.h new file mode 100644 index 00000000..b98efc10 --- /dev/null +++ b/src/ggml_extend_backend.h @@ -0,0 +1,77 @@ +#ifndef __SD_GGML_EXTEND_BACKEND_H__ +#define __SD_GGML_EXTEND_BACKEND_H__ + +#include +#include +#include +#include +#include + +#include "ggml-backend.h" +#include "ggml-cpu.h" +#include "ggml.h" + +enum class SDBackendModule { + DIFFUSION, + TE, + CLIP_VISION, + VAE, + CONTROL_NET, + PHOTOMAKER, + UPSCALER, +}; + +struct SDBackendAssignment { + std::string default_name; + std::unordered_map module_names; + + bool empty() const; + std::string get(SDBackendModule module) const; + void set_default(const std::string& name); + void set_module(SDBackendModule module, const std::string& name); +}; + +struct SDBackendHandleDeleter { + void operator()(ggml_backend_t backend) const; +}; + +using SDBackendHandle = std::unique_ptr; + +class SDBackendManager { +private: + SDBackendAssignment runtime_assignment_; + SDBackendAssignment params_assignment_; + std::unordered_map backends_; + +public: + SDBackendManager() = default; + ~SDBackendManager(); + + SDBackendManager(const SDBackendManager&) = delete; + SDBackendManager& operator=(const SDBackendManager&) = delete; + + bool init(const char* backend_spec, + const char* params_backend_spec, + bool offload_params_to_cpu, + bool keep_clip_on_cpu, + bool keep_vae_on_cpu, + bool keep_control_net_on_cpu, + std::string* error); + void reset(); + + ggml_backend_t runtime_backend(SDBackendModule module); + ggml_backend_t params_backend(SDBackendModule module); + + bool runtime_backend_is_cpu(SDBackendModule module); + bool params_backend_is_cpu(SDBackendModule module); + bool runtime_backend_supports_host_buffer(SDBackendModule module); + +private: + bool validate(std::string* error) const; + ggml_backend_t init_cached_backend(const std::string& name); +}; + +bool sd_backend_is(ggml_backend_t backend, const std::string& name); +const char* sd_backend_module_name(SDBackendModule module); +void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value); +#endif diff --git a/src/ggml_extend_backend.hpp b/src/ggml_extend_backend.hpp deleted file mode 100644 index 50158c88..00000000 --- a/src/ggml_extend_backend.hpp +++ /dev/null @@ -1,298 +0,0 @@ -#ifndef __GGML_EXTEND_BACKEND_HPP__ -#define __GGML_EXTEND_BACKEND_HPP__ - -#include -#include - -#include "ggml-backend.h" -#include "ggml.h" - -#ifndef __STATIC_INLINE__ -#define __STATIC_INLINE__ static inline -#endif - -inline void ggml_backend_load_all_once() { - // If the registry already has devices and the CPU backend is present, - // assume either static registration or explicit host-side preloading has - // completed and avoid rescanning the default paths. - if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) { - return; - } - // In dynamic-backend mode the backend modules are discovered at runtime, - // so we must load them before asking for the CPU backend or its proc table. - // If the host preloaded only a subset of backends, allow one default-path - // scan so missing modules can still be discovered. - static std::once_flag once; - std::call_once(once, []() { - if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) { - return; - } - ggml_backend_load_all(); - }); -} - -// Do not gate this branch on GGML_CPU or GGML_CPU_ALL_VARIANTS: -// those are CMake options used to configure ggml itself, but they are not -// exported as PUBLIC compile definitions to stable-diffusion in backend-DL mode. -// In practice, this target can reliably see GGML_BACKEND_DL, but not whether -// the CPU backend was compiled as a loadable module. We therefore use runtime -// backend discovery instead of compile-time assumptions. - -__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_cpu_reg() { - ggml_backend_reg_t reg = ggml_backend_reg_by_name("CPU"); - if (reg != nullptr) { - return reg; - } - - ggml_backend_load_all_once(); - return ggml_backend_reg_by_name("CPU"); -} - -__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_reg_from_backend(ggml_backend_t backend) { - if (backend != nullptr) { - ggml_backend_dev_t device = ggml_backend_get_device(backend); - if (device != nullptr) { - return ggml_backend_dev_backend_reg(device); - } - } - - return ggml_backend_cpu_reg(); -} - -__STATIC_INLINE__ ggml_backend_t ggml_backend_cpu_init() { - ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - if (backend != nullptr) { - return backend; - } - - ggml_backend_load_all_once(); - return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); -} - -__STATIC_INLINE__ bool ggml_backend_is_cpu(ggml_backend_t backend) { - if (backend == nullptr) { - return false; - } - - ggml_backend_dev_t device = ggml_backend_get_device(backend); - if (device != nullptr) { - return ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU; - } - - const char* backend_name = ggml_backend_name(backend); - return backend_name != nullptr && std::strcmp(backend_name, "CPU") == 0; -} - -__STATIC_INLINE__ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { - ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu); - if (reg == nullptr) { - return; - } - - auto fn = reinterpret_cast(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads")); - if (fn != nullptr) { - fn(backend_cpu, n_threads); - } -} - -using __ggml_backend_cpu_set_threadpool_t = void (*)(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); - -__STATIC_INLINE__ void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) { - ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu); - if (reg == nullptr) { - return; - } - - auto fn = reinterpret_cast<__ggml_backend_cpu_set_threadpool_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool")); - if (fn != nullptr) { - fn(backend_cpu, threadpool); - } -} - -__STATIC_INLINE__ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void* abort_callback_data) { - ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu); - if (reg == nullptr) { - return; - } - - auto fn = reinterpret_cast(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback")); - if (fn != nullptr) { - fn(backend_cpu, abort_callback, abort_callback_data); - } -} - -__STATIC_INLINE__ ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) { - if (tensor == nullptr) { - return nullptr; - } - - return tensor->view_src ? tensor->view_src->buffer : tensor->buffer; -} - -__STATIC_INLINE__ bool ggml_backend_tensor_is_host_accessible(const struct ggml_tensor* tensor) { - if (tensor == nullptr || tensor->data == nullptr) { - return false; - } - - ggml_backend_buffer_t buffer = ggml_backend_tensor_buffer(tensor); - return buffer == nullptr || ggml_backend_buffer_is_host(buffer); -} - -__STATIC_INLINE__ size_t ggml_backend_tensor_offset(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - return (size_t)(i0 * tensor->nb[0] + i1 * tensor->nb[1] + i2 * tensor->nb[2] + i3 * tensor->nb[3]); -} - -template -__STATIC_INLINE__ void ggml_backend_tensor_write_scalar(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, T value) { - const size_t offset = ggml_backend_tensor_offset(tensor, i0, i1, i2, i3); - - if (ggml_backend_tensor_is_host_accessible(tensor)) { - auto* dst = reinterpret_cast(reinterpret_cast(tensor->data) + offset); - *dst = value; - return; - } - - ggml_backend_tensor_set(const_cast(tensor), &value, offset, sizeof(T)); -} - -__STATIC_INLINE__ void ggml_set_f32_nd(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float value) { - switch (tensor->type) { - case GGML_TYPE_I8: - ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); - break; - case GGML_TYPE_I16: - ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); - break; - case GGML_TYPE_I32: - ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); - break; - case GGML_TYPE_F16: - ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_fp16(value)); - break; - case GGML_TYPE_BF16: - ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_bf16(value)); - break; - case GGML_TYPE_F32: - ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, value); - break; - default: - GGML_ABORT("fatal error"); - } -} - -__STATIC_INLINE__ void ggml_set_f32_1d(const struct ggml_tensor* tensor, int i, float value) { - if (!ggml_is_contiguous(tensor)) { - int64_t id[4] = {0, 0, 0, 0}; - ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); - ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value); - return; - } - - switch (tensor->type) { - case GGML_TYPE_I8: - ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); - break; - case GGML_TYPE_I16: - ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); - break; - case GGML_TYPE_I32: - ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); - break; - case GGML_TYPE_F16: - ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_fp16(value)); - break; - case GGML_TYPE_BF16: - ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_bf16(value)); - break; - case GGML_TYPE_F32: - ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, value); - break; - default: - GGML_ABORT("fatal error"); - } -} - -__STATIC_INLINE__ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context* ctx, struct ggml_cgraph* cgraph, int n_threads) { - (void)ctx; - - // The legacy ggml_graph_compute_with_ctx() symbol lives in ggml-cpu, but - // the backend proc table does not expose it in GGML_BACKEND_DL mode. - // Recreate the old behavior by initializing the CPU backend explicitly and - // executing the graph through the generic backend API. - ggml_backend_t backend = ggml_backend_cpu_init(); - if (backend == nullptr) { - return GGML_STATUS_ALLOC_FAILED; - } - - ggml_backend_cpu_set_n_threads(backend, n_threads); - - const enum ggml_status status = ggml_backend_graph_compute(backend, cgraph); - ggml_backend_free(backend); - - return status; -} - -__STATIC_INLINE__ ggml_tensor* ggml_set_f32(struct ggml_tensor* tensor, float value) { - GGML_ASSERT(tensor != nullptr); - - if (ggml_backend_tensor_is_host_accessible(tensor) && ggml_is_contiguous(tensor)) { - const int64_t nelements = ggml_nelements(tensor); - - switch (tensor->type) { - case GGML_TYPE_I8: { - auto* data = reinterpret_cast(tensor->data); - const int8_t v = static_cast(value); - for (int64_t i = 0; i < nelements; ++i) { - data[i] = v; - } - } break; - case GGML_TYPE_I16: { - auto* data = reinterpret_cast(tensor->data); - const int16_t v = static_cast(value); - for (int64_t i = 0; i < nelements; ++i) { - data[i] = v; - } - } break; - case GGML_TYPE_I32: { - auto* data = reinterpret_cast(tensor->data); - const int32_t v = static_cast(value); - for (int64_t i = 0; i < nelements; ++i) { - data[i] = v; - } - } break; - case GGML_TYPE_F16: { - auto* data = reinterpret_cast(tensor->data); - const ggml_fp16_t v = ggml_fp32_to_fp16(value); - for (int64_t i = 0; i < nelements; ++i) { - data[i] = v; - } - } break; - case GGML_TYPE_BF16: { - auto* data = reinterpret_cast(tensor->data); - const ggml_bf16_t v = ggml_fp32_to_bf16(value); - for (int64_t i = 0; i < nelements; ++i) { - data[i] = v; - } - } break; - case GGML_TYPE_F32: { - auto* data = reinterpret_cast(tensor->data); - for (int64_t i = 0; i < nelements; ++i) { - data[i] = value; - } - } break; - default: - GGML_ABORT("fatal error"); - } - - return tensor; - } - - const int64_t nelements = ggml_nelements(tensor); - for (int64_t i = 0; i < nelements; ++i) { - ggml_set_f32_1d(tensor, static_cast(i), value); - } - - return tensor; -} - -#endif diff --git a/src/hidream_o1.hpp b/src/hidream_o1.hpp index 908f2de3..d72739d5 100644 --- a/src/hidream_o1.hpp +++ b/src/hidream_o1.hpp @@ -279,10 +279,10 @@ namespace HiDreamO1 { std::array, 4> pos_embed_weight_data_; HiDreamO1VisionRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string& prefix = "model.visual") - : GGMLRunner(backend, offload_params_to_cpu), + : GGMLRunner(backend, params_backend), params(make_hidream_o1_params()), model(std::make_shared(false, params.llm.vision)) { model->init(params_ctx, tensor_storage_map, prefix); @@ -336,10 +336,10 @@ namespace HiDreamO1 { std::vector attention_mask_vec; HiDreamO1Runner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string& prefix = "model") - : GGMLRunner(backend, offload_params_to_cpu), + : GGMLRunner(backend, params_backend), params(make_hidream_o1_params()) { model = HiDreamO1Model(params); model.init(params_ctx, tensor_storage_map, prefix); @@ -461,9 +461,9 @@ namespace HiDreamO1 { std::shared_ptr vision_runner; HiDreamO1Conditioner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}) - : vision_runner(std::make_shared(backend, offload_params_to_cpu, tensor_storage_map)) {} + : vision_runner(std::make_shared(backend, params_backend, tensor_storage_map)) {} void get_param_tensors(std::map& tensors) override { vision_runner->get_param_tensors(tensors); diff --git a/src/llm.hpp b/src/llm.hpp index bb83312c..ab673b20 100644 --- a/src/llm.hpp +++ b/src/llm.hpp @@ -1166,11 +1166,11 @@ namespace LLM { public: LLMRunner(LLMArch arch, ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, bool enable_vision_ = false) - : GGMLRunner(backend, offload_params_to_cpu), enable_vision(enable_vision_) { + : GGMLRunner(backend, params_backend), enable_vision(enable_vision_) { params.arch = arch; if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) { params.head_dim = 128; @@ -1477,11 +1477,11 @@ namespace LLM { LLMEmbedder(LLMArch arch, ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "", bool enable_vision = false) - : model(arch, backend, offload_params_to_cpu, tensor_storage_map, prefix, enable_vision) { + : model(arch, backend, params_backend, tensor_storage_map, prefix, enable_vision) { if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) { tokenizer = std::make_shared(); } else { @@ -1731,7 +1731,7 @@ namespace LLM { std::shared_ptr llm = std::make_shared(arch, backend, - true, + backend, tensor_storage_map, "text_encoders.llm", true); diff --git a/src/lora.hpp b/src/lora.hpp index b57bc422..3d2b7699 100644 --- a/src/lora.hpp +++ b/src/lora.hpp @@ -22,10 +22,11 @@ struct LoraModel : public GGMLRunner { LoraModel(const std::string& lora_id, ggml_backend_t backend, + ggml_backend_t params_backend, const std::string& file_path = "", std::string prefix = "", SDVersion version = VERSION_COUNT) - : lora_id(lora_id), file_path(file_path), GGMLRunner(backend, false) { + : lora_id(lora_id), file_path(file_path), GGMLRunner(backend, params_backend) { prefix = "lora." + prefix; if (!model_loader.init_from_file_and_convert_name(file_path, prefix, version)) { load_failed = true; diff --git a/src/ltx_audio_vae.h b/src/ltx_audio_vae.h index d5ee30bc..d1d765d7 100644 --- a/src/ltx_audio_vae.h +++ b/src/ltx_audio_vae.h @@ -963,10 +963,10 @@ namespace LTXV { LTXAudioVAE model; LTXAudioVAERunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string& prefix = "") - : GGMLRunner(backend, offload_params_to_cpu), + : GGMLRunner(backend, params_backend), config(LTXAudioVAEConfig::detect_from_weights(tensor_storage_map)), model(config) { model.init(params_ctx, tensor_storage_map, prefix); @@ -1086,7 +1086,7 @@ namespace LTXV { auto& tensor_storage_map = model_loader.get_tensor_storage_map(); auto ltx_audio_vae = std::make_shared(backend, - false, + backend, tensor_storage_map, prefix); diff --git a/src/ltx_vae.hpp b/src/ltx_vae.hpp index 8bcc1ca8..a4d47b50 100644 --- a/src/ltx_vae.hpp +++ b/src/ltx_vae.hpp @@ -1037,7 +1037,7 @@ struct LTXVideoVAE : public VAE { LTXVAE::VideoVAE vae; LTXVideoVAE(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string& prefix, bool decode_only = true, @@ -1053,7 +1053,7 @@ struct LTXVideoVAE : public VAE { patch_size, tensor_storage_map, prefix), - VAE(version, backend, offload_params_to_cpu) { + VAE(version, backend, params_backend) { vae.init(params_ctx, tensor_storage_map, prefix); decode_timestep_tensor.values()[0] = vae.decode_timestep; } @@ -1176,7 +1176,7 @@ struct LTXVideoVAE : public VAE { auto& tensor_storage_map = model_loader.get_tensor_storage_map(); std::shared_ptr vae = std::make_shared(backend, - false, + backend, tensor_storage_map, "first_stage_model", true, diff --git a/src/ltxv.hpp b/src/ltxv.hpp index 7cb52a1f..50142cb1 100644 --- a/src/ltxv.hpp +++ b/src/ltxv.hpp @@ -1450,10 +1450,10 @@ namespace LTXV { } LTXAVRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string& prefix = "model.diffusion_model") - : GGMLRunner(backend, offload_params_to_cpu), + : GGMLRunner(backend, params_backend), prefix(prefix), params(), model(params) { @@ -1864,7 +1864,7 @@ namespace LTXV { auto& tensor_storage_map = model_loader.get_tensor_storage_map(); std::shared_ptr ltxav = std::make_shared(backend, - false, + backend, tensor_storage_map, "model.diffusion_model"); diff --git a/src/mmdit.hpp b/src/mmdit.hpp index e57041dc..6fcd732e 100644 --- a/src/mmdit.hpp +++ b/src/mmdit.hpp @@ -828,10 +828,10 @@ struct MMDiTRunner : public GGMLRunner { MMDiT mmdit; MMDiTRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") - : GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_storage_map) { + : GGMLRunner(backend, params_backend), mmdit(tensor_storage_map) { mmdit.init(params_ctx, tensor_storage_map, prefix); } @@ -934,7 +934,7 @@ struct MMDiTRunner : public GGMLRunner { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr mmdit = std::make_shared(backend, false); + std::shared_ptr mmdit = std::make_shared(backend, backend); { LOG_INFO("loading from '%s'", file_path.c_str()); diff --git a/src/model.cpp b/src/model.cpp index b3aa8b0d..9929605e 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -24,7 +24,7 @@ #include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml.h" -#include "ggml_extend_backend.hpp" +#include "ggml_extend_backend.h" #include "zip.h" #include "name_conversion.h" diff --git a/src/pmid.hpp b/src/pmid.hpp index f19a8c3c..2a9d2da7 100644 --- a/src/pmid.hpp +++ b/src/pmid.hpp @@ -411,13 +411,13 @@ public: public: PhotoMakerIDEncoder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, SDVersion version = VERSION_SDXL, PMVersion pm_v = PM_VERSION_1, float sty = 20.f) - : GGMLRunner(backend, offload_params_to_cpu), + : GGMLRunner(backend, params_backend), version(version), pm_version(pm_v), style_strength(sty) { @@ -568,11 +568,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner { bool applied = false; PhotoMakerIDEmbed(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, ModelLoader* ml, const std::string& file_path = "", const std::string& prefix = "") - : file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) { + : file_path(file_path), GGMLRunner(backend, params_backend), model_loader(ml) { if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) { load_failed = true; } diff --git a/src/qwen_image.hpp b/src/qwen_image.hpp index 35d32109..73c1f9ae 100644 --- a/src/qwen_image.hpp +++ b/src/qwen_image.hpp @@ -488,12 +488,12 @@ namespace Qwen { SDVersion version; QwenImageRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "", SDVersion version = VERSION_QWEN_IMAGE, bool zero_cond_t = false) - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { qwen_image_params.num_layers = 0; qwen_image_params.zero_cond_t = zero_cond_t; for (auto pair : tensor_storage_map) { @@ -686,7 +686,7 @@ namespace Qwen { } std::shared_ptr qwen_image = std::make_shared(backend, - false, + backend, tensor_storage_map, "model.diffusion_model", VERSION_QWEN_IMAGE); diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 8a08f12c..ac2ff224 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -116,11 +116,7 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) { class StableDiffusionGGML { public: std::vector mmap_tensor_store; - ggml_backend_t backend = nullptr; // general backend - ggml_backend_t clip_backend = nullptr; - ggml_backend_t control_net_backend = nullptr; - ggml_backend_t vae_backend = nullptr; - ggml_backend_t audio_backend = nullptr; + SDBackendManager backend_manager; SDVersion version; bool vae_decode_only = false; @@ -156,6 +152,8 @@ public: bool offload_params_to_cpu = false; float max_vram = 0.f; bool use_pmid = false; + std::string backend_spec; + std::string params_backend_spec; bool is_using_v_parameterization = false; bool is_using_edm_v_parameterization = false; @@ -169,24 +167,44 @@ public: StableDiffusionGGML() = default; - ~StableDiffusionGGML() { - if (clip_backend != backend) { - ggml_backend_free(clip_backend); + ~StableDiffusionGGML() = default; + + ggml_backend_t backend_for(SDBackendModule module) { + ggml_backend_t module_backend = backend_manager.runtime_backend(module); + if (module_backend == nullptr) { + LOG_ERROR("failed to initialize %s backend", sd_backend_module_name(module)); } - if (control_net_backend != backend) { - ggml_backend_free(control_net_backend); - } - if (vae_backend != backend) { - ggml_backend_free(vae_backend); - } - if (audio_backend != nullptr && audio_backend != backend) { - ggml_backend_free(audio_backend); - } - ggml_backend_free(backend); + return module_backend; } - void init_backend() { - backend = sd_get_default_backend(); + ggml_backend_t params_backend_for(SDBackendModule module) { + ggml_backend_t module_backend = backend_manager.params_backend(module); + if (module_backend == nullptr) { + LOG_ERROR("failed to initialize %s params backend", sd_backend_module_name(module)); + } + return module_backend; + } + + bool ensure_backend_pair(SDBackendModule module) { + if (backend_for(module) == nullptr) { + return false; + } + return params_backend_for(module) != nullptr; + } + + bool init_backend(const sd_ctx_params_t* sd_ctx_params) { + std::string error; + if (!backend_manager.init(sd_ctx_params->backend, + sd_ctx_params->params_backend, + sd_ctx_params->offload_params_to_cpu, + sd_ctx_params->keep_clip_on_cpu, + sd_ctx_params->keep_vae_on_cpu, + sd_ctx_params->keep_control_net_on_cpu, + &error)) { + LOG_ERROR("backend config failed: %s", error.c_str()); + return false; + } + return ensure_backend_pair(SDBackendModule::DIFFUSION); } std::shared_ptr get_rng(rng_type_t rng_type) { @@ -205,6 +223,8 @@ public: free_params_immediately = sd_ctx_params->free_params_immediately; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; max_vram = sd_ctx_params->max_vram; + backend_spec = SAFE_STR(sd_ctx_params->backend); + params_backend_spec = SAFE_STR(sd_ctx_params->params_backend); bool use_tae = false; bool use_audio_vae = false; @@ -218,8 +238,10 @@ public: ggml_log_set(ggml_log_callback_default, nullptr); - init_backend(); - max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend); + if (!init_backend(sd_ctx_params)) { + return false; + } + max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend_for(SDBackendModule::DIFFUSION)); ModelLoader model_loader; @@ -393,7 +415,6 @@ public: std::map mmap_able_tensors; bool enable_mmap_tensors = false; - bool main_backend_mmap = false; bool needs_writable_mmap = false; if (sd_ctx_params->enable_mmap) { if (apply_lora_immediately) { @@ -401,21 +422,19 @@ public: LOG_WARN("in mode 'immediately', LoRAs will cause extra memory usage with mmap"); } enable_mmap_tensors = true; - if (offload_params_to_cpu) { - main_backend_mmap = true; - } else { - ggml_backend_dev_t dev = ggml_backend_get_device(backend); - struct ggml_backend_dev_props props; - ggml_backend_dev_get_props(dev, &props); - main_backend_mmap = props.caps.buffer_from_host_ptr; - } } // split definition to avoid msvc choking on the extra parameter handling - auto get_param_tensors_p = [&](auto&& model, bool force_cpu, const char* prefix) { + auto module_can_mmap = [&](SDBackendModule module) { + return enable_mmap_tensors && + (backend_manager.runtime_backend_is_cpu(module) || + backend_manager.params_backend_is_cpu(module) || + backend_manager.runtime_backend_supports_host_buffer(module)); + }; + + auto get_param_tensors_p = [&](auto&& model, bool do_mmap, const char* prefix) { std::map temp; model->get_param_tensors(temp, prefix); - bool do_mmap = enable_mmap_tensors && (main_backend_mmap || force_cpu); for (const auto& [key, tensor] : temp) { tensors[key] = tensor; if (do_mmap) { @@ -424,10 +443,9 @@ public: } }; - auto get_param_tensors = [&](auto&& model, bool force_cpu = false) { + auto get_param_tensors = [&](auto&& model, bool do_mmap) { std::map temp; model->get_param_tensors(temp); - bool do_mmap = enable_mmap_tensors && (main_backend_mmap || force_cpu); for (const auto& [key, tensor] : temp) { tensors[key] = tensor; if (do_mmap) { @@ -454,22 +472,20 @@ public: LOG_INFO("Using circular padding for convolutions"); } - bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu; - const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(max_vram); { - clip_backend = backend; - if (clip_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("CLIP: Using CPU backend"); - clip_backend = ggml_backend_cpu_init(); + if (!ensure_backend_pair(SDBackendModule::TE) || + !ensure_backend_pair(SDBackendModule::DIFFUSION)) { + return false; } + if (sd_version_is_sd3(version)) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map); } else if (sd_version_is_flux(version)) { bool is_chroma = false; @@ -489,62 +505,62 @@ public: "--chroma-disable-dit-mask as a workaround."); } - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, sd_ctx_params->chroma_use_t5_mask, sd_ctx_params->chroma_t5_mask_pad); } else if (version == VERSION_OVIS_IMAGE) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, version, "", false); } else { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map); } - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_flux2(version)) { bool is_chroma = false; - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, version); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_ltxav(version)) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model"); } else if (sd_version_is_wan(version)) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, true, 0, true); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model", version); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { - high_noise_diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + high_noise_diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.high_noise_diffusion_model", version); @@ -552,62 +568,65 @@ public: if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" || diffusion_model->get_desc() == "Wan2.1-FLF2V-14B" || diffusion_model->get_desc() == "Wan2.1-I2V-1.3B") { - clip_vision = std::make_shared(backend, - offload_params_to_cpu, + if (!ensure_backend_pair(SDBackendModule::CLIP_VISION)) { + return false; + } + clip_vision = std::make_shared(backend_for(SDBackendModule::CLIP_VISION), + params_backend_for(SDBackendModule::CLIP_VISION), tensor_storage_map); clip_vision->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors(clip_vision); + get_param_tensors(clip_vision, module_can_mmap(SDBackendModule::CLIP_VISION)); } } else if (sd_version_is_qwen_image(version)) { bool enable_vision = false; if (!vae_decode_only) { enable_vision = true; } - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, version, "", enable_vision); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model", version, sd_ctx_params->qwen_image_zero_cond_t); } else if (version == VERSION_HIDREAM_O1) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model"); } else if (sd_version_is_anima(version)) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model"); } else if (sd_version_is_z_image(version)) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, version); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model", version); } else if (sd_version_is_ernie_image(version)) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, version); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model"); } else { // SD1.x SD2.x SDXL @@ -616,21 +635,21 @@ public: embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path)); } if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, embbeding_map, version, PM_VERSION_2); } else { - cond_stage_model = std::make_shared(clip_backend, - offload_params_to_cpu, + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + params_backend_for(SDBackendModule::TE), tensor_storage_map, embbeding_map, version); } - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, version); if (sd_ctx_params->diffusion_conv_direct) { @@ -640,10 +659,10 @@ public: } cond_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors(cond_stage_model, clip_on_cpu); + get_param_tensors(cond_stage_model, module_can_mmap(SDBackendModule::TE)); diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors(diffusion_model); + get_param_tensors(diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION)); if (sd_version_is_unet_edit(version)) { vae_decode_only = false; @@ -651,30 +670,27 @@ public: if (high_noise_diffusion_model) { high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors(high_noise_diffusion_model); + get_param_tensors(high_noise_diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION)); } - if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("VAE Autoencoder: Using CPU backend"); - vae_backend = ggml_backend_cpu_init(); - } else { - vae_backend = backend; + if (!ensure_backend_pair(SDBackendModule::VAE)) { + return false; } auto create_tae = [&]() -> std::shared_ptr { if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { - return std::make_shared(vae_backend, - offload_params_to_cpu, + return std::make_shared(backend_for(SDBackendModule::VAE), + params_backend_for(SDBackendModule::VAE), tensor_storage_map, "decoder", vae_decode_only, version); } else { - auto model = std::make_shared(vae_backend, - offload_params_to_cpu, + auto model = std::make_shared(backend_for(SDBackendModule::VAE), + params_backend_for(SDBackendModule::VAE), tensor_storage_map, "decoder.layers", vae_decode_only, @@ -685,8 +701,8 @@ public: auto create_vae = [&]() -> std::shared_ptr { if (sd_version_is_ltxav(version)) { - return std::make_shared(vae_backend, - offload_params_to_cpu, + return std::make_shared(backend_for(SDBackendModule::VAE), + params_backend_for(SDBackendModule::VAE), tensor_storage_map, "first_stage_model", true, @@ -694,15 +710,15 @@ public: } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { - return std::make_shared(vae_backend, - offload_params_to_cpu, + return std::make_shared(backend_for(SDBackendModule::VAE), + params_backend_for(SDBackendModule::VAE), tensor_storage_map, "first_stage_model", vae_decode_only, version); } else { - auto model = std::make_shared(vae_backend, - offload_params_to_cpu, + auto model = std::make_shared(backend_for(SDBackendModule::VAE), + params_backend_for(SDBackendModule::VAE), tensor_storage_map, "first_stage_model", vae_decode_only, @@ -721,43 +737,36 @@ public: } }; - bool force_vae_cpu = sd_ctx_params->keep_vae_on_cpu; + bool vae_mmap = module_can_mmap(SDBackendModule::VAE); if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) { LOG_INFO("using FakeVAE"); first_stage_model = std::make_shared(version, - vae_backend, - offload_params_to_cpu); + backend_for(SDBackendModule::VAE), + params_backend_for(SDBackendModule::VAE)); } else if (use_tae && !tae_preview_only) { LOG_INFO("using TAE for encoding / decoding"); first_stage_model = create_tae(); first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors_p(first_stage_model, force_vae_cpu, "tae"); + get_param_tensors_p(first_stage_model, vae_mmap, "tae"); } else { LOG_INFO("using VAE for encoding / decoding"); first_stage_model = create_vae(); first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors_p(first_stage_model, force_vae_cpu, "first_stage_model"); + get_param_tensors_p(first_stage_model, vae_mmap, "first_stage_model"); if (use_tae && tae_preview_only) { LOG_INFO("using TAE for preview"); preview_vae = create_tae(); preview_vae->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors_p(first_stage_model, force_vae_cpu, "vae"); + get_param_tensors_p(first_stage_model, vae_mmap, "vae"); } } if (use_audio_vae) { - if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("LTX audio VAE: Using CPU backend"); - audio_backend = ggml_backend_cpu_init(); - } else { - audio_backend = backend; - } - audio_vae_model = std::make_shared(audio_backend, - false, + audio_vae_model = std::make_shared(backend_for(SDBackendModule::VAE), + params_backend_for(SDBackendModule::VAE), tensor_storage_map); - audio_vae_model->alloc_params_buffer(); - audio_vae_model->get_param_tensors(tensors, ""); + get_param_tensors_p(audio_vae_model, vae_mmap, ""); } if (sd_ctx_params->vae_conv_direct) { @@ -769,15 +778,11 @@ public: } if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) { - ggml_backend_t controlnet_backend = nullptr; - if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_DEBUG("ControlNet: Using CPU backend"); - controlnet_backend = ggml_backend_cpu_init(); - } else { - controlnet_backend = backend; + if (!ensure_backend_pair(SDBackendModule::CONTROL_NET)) { + return false; } - control_net = std::make_shared(controlnet_backend, - offload_params_to_cpu, + control_net = std::make_shared(backend_for(SDBackendModule::CONTROL_NET), + params_backend_for(SDBackendModule::CONTROL_NET), tensor_storage_map, version); if (sd_ctx_params->diffusion_conv_direct) { @@ -786,23 +791,31 @@ public: } } - if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { - pmid_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "pmid", - version, - PM_VERSION_2); - LOG_INFO("using PhotoMaker Version 2"); - } else { - pmid_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "pmid", - version); - } if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) { - pmid_lora = std::make_shared("pmid", backend, sd_ctx_params->photo_maker_path, "", version); + if (!ensure_backend_pair(SDBackendModule::PHOTOMAKER)) { + return false; + } + if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { + pmid_model = std::make_shared(backend_for(SDBackendModule::PHOTOMAKER), + params_backend_for(SDBackendModule::PHOTOMAKER), + tensor_storage_map, + "pmid", + version, + PM_VERSION_2); + LOG_INFO("using PhotoMaker Version 2"); + } else { + pmid_model = std::make_shared(backend_for(SDBackendModule::PHOTOMAKER), + params_backend_for(SDBackendModule::PHOTOMAKER), + tensor_storage_map, + "pmid", + version); + } + pmid_lora = std::make_shared("pmid", + backend_for(SDBackendModule::PHOTOMAKER), + params_backend_for(SDBackendModule::PHOTOMAKER), + sd_ctx_params->photo_maker_path, + "", + version); auto lora_tensor_filter = [&](const std::string& tensor_name) { if (starts_with(tensor_name, "lora.model")) { return true; @@ -821,7 +834,7 @@ public: } } if (use_pmid) { - get_param_tensors_p(pmid_model, false, "pmid"); + get_param_tensors_p(pmid_model, module_can_mmap(SDBackendModule::PHOTOMAKER), "pmid"); } if (sd_ctx_params->flash_attn) { @@ -917,8 +930,10 @@ public: } } - if (clip_vision) { - clip_vision->alloc_params_buffer(); + if (clip_vision && !clip_vision->alloc_params_buffer()) { + LOG_ERROR("CLIP vision params buffer allocation failed"); + ggml_free(ctx); + return false; } if (cond_stage_model) { cond_stage_model->alloc_params_buffer(); @@ -929,18 +944,25 @@ public: if (high_noise_diffusion_model) { high_noise_diffusion_model->alloc_params_buffer(); } - if (first_stage_model) { - first_stage_model->alloc_params_buffer(); + if (first_stage_model && !first_stage_model->alloc_params_buffer()) { + LOG_ERROR("VAE params buffer allocation failed"); + ggml_free(ctx); + return false; } - if (preview_vae) { - preview_vae->alloc_params_buffer(); + if (preview_vae && !preview_vae->alloc_params_buffer()) { + LOG_ERROR("preview VAE params buffer allocation failed"); + ggml_free(ctx); + return false; } - if (use_pmid && pmid_model) { - if (!pmid_model->alloc_params_buffer()) { - LOG_ERROR(" pmid model params buffer allocation failed"); - ggml_free(ctx); - return false; - } + if (audio_vae_model && !audio_vae_model->alloc_params_buffer()) { + LOG_ERROR("LTX audio VAE params buffer allocation failed"); + ggml_free(ctx); + return false; + } + if (use_pmid && pmid_model && !pmid_model->alloc_params_buffer()) { + LOG_ERROR("PhotoMaker params buffer allocation failed"); + ggml_free(ctx); + return false; } bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap); @@ -969,6 +991,7 @@ public: size_t control_net_params_mem_size = 0; if (control_net) { if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path), n_threads)) { + ggml_free(ctx); return false; } control_net_params_mem_size = control_net->get_params_buffer_size(); @@ -980,28 +1003,39 @@ public: size_t total_params_ram_size = 0; size_t total_params_vram_size = 0; - if (ggml_backend_is_cpu(clip_backend)) { - total_params_ram_size += clip_params_mem_size + pmid_params_mem_size; - } else { - total_params_vram_size += clip_params_mem_size + pmid_params_mem_size; - } + auto add_params_memory = [&](size_t size, SDBackendModule module) { + if (size == 0) { + return true; + } + ggml_backend_t module_backend = params_backend_for(module); + if (module_backend == nullptr) { + return false; + } + if (ggml_backend_is_cpu(module_backend)) { + total_params_ram_size += size; + } else { + total_params_vram_size += size; + } + return true; + }; + auto params_memory_location = [&](size_t size, SDBackendModule module) { + if (size == 0) { + return "N/A"; + } + ggml_backend_t module_backend = params_backend_for(module); + if (module_backend == nullptr) { + return "N/A"; + } + return ggml_backend_is_cpu(module_backend) ? "RAM" : "VRAM"; + }; - if (ggml_backend_is_cpu(backend)) { - total_params_ram_size += unet_params_mem_size; - } else { - total_params_vram_size += unet_params_mem_size; - } - - if (ggml_backend_is_cpu(vae_backend)) { - total_params_ram_size += vae_params_mem_size; - } else { - total_params_vram_size += vae_params_mem_size; - } - - if (ggml_backend_is_cpu(control_net_backend)) { - total_params_ram_size += control_net_params_mem_size; - } else { - total_params_vram_size += control_net_params_mem_size; + if (!add_params_memory(clip_params_mem_size, SDBackendModule::TE) || + !add_params_memory(pmid_params_mem_size, SDBackendModule::PHOTOMAKER) || + !add_params_memory(unet_params_mem_size, SDBackendModule::DIFFUSION) || + !add_params_memory(vae_params_mem_size, SDBackendModule::VAE) || + !add_params_memory(control_net_params_mem_size, SDBackendModule::CONTROL_NET)) { + ggml_free(ctx); + return false; } size_t total_params_size = total_params_ram_size + total_params_vram_size; @@ -1012,15 +1046,15 @@ public: total_params_vram_size / 1024.0 / 1024.0, total_params_ram_size / 1024.0 / 1024.0, clip_params_mem_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM", + params_memory_location(clip_params_mem_size, SDBackendModule::TE), unet_params_mem_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(backend) ? "RAM" : "VRAM", + params_memory_location(unet_params_mem_size, SDBackendModule::DIFFUSION), vae_params_mem_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(vae_backend) ? "RAM" : "VRAM", + params_memory_location(vae_params_mem_size, SDBackendModule::VAE), control_net_params_mem_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(control_net_backend) ? "RAM" : "VRAM", + params_memory_location(control_net_params_mem_size, SDBackendModule::CONTROL_NET), pmid_params_mem_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM"); + params_memory_location(pmid_params_mem_size, SDBackendModule::PHOTOMAKER)); } // init denoiser @@ -1163,7 +1197,7 @@ public: std::shared_ptr load_lora_model_from_file(const std::string& lora_id, float multiplier, - ggml_backend_t backend, + SDBackendModule module, LoraModel::filter_t lora_tensor_filter = nullptr) { std::string lora_path = lora_id; static std::string high_noise_tag = "|high_noise|"; @@ -1173,7 +1207,15 @@ public: is_high_noise = true; LOG_DEBUG("high noise lora: %s", lora_path.c_str()); } - auto lora = std::make_shared(lora_id, backend, lora_path, is_high_noise ? "model.high_noise_" : "", version); + if (!ensure_backend_pair(module)) { + return nullptr; + } + auto lora = std::make_shared(lora_id, + backend_for(module), + params_backend_for(module), + lora_path, + is_high_noise ? "model.high_noise_" : "", + version); if (!lora->load_from_file(n_threads, lora_tensor_filter)) { LOG_WARN("load lora tensors from %s failed", lora_path.c_str()); return nullptr; @@ -1212,7 +1254,7 @@ public: for (auto& kv : lora_state_diff) { int64_t t0 = ggml_time_ms(); - auto lora = load_lora_model_from_file(kv.first, kv.second, backend); + auto lora = load_lora_model_from_file(kv.first, kv.second, SDBackendModule::DIFFUSION); if (!lora || lora->lora_tensors.empty()) { continue; } @@ -1270,7 +1312,7 @@ public: const std::string& lora_id = kv.first; float multiplier = kv.second; - auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backend, lora_tensor_filter); + auto lora = load_lora_model_from_file(lora_id, multiplier, SDBackendModule::TE, lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); cond_stage_lora_models.push_back(lora); @@ -1307,7 +1349,7 @@ public: const std::string& lora_name = kv.first; float multiplier = kv.second; - auto lora = load_lora_model_from_file(lora_name, multiplier, backend, lora_tensor_filter); + auto lora = load_lora_model_from_file(lora_name, multiplier, SDBackendModule::DIFFUSION, lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); diffusion_lora_models.push_back(lora); @@ -1345,7 +1387,7 @@ public: const std::string& lora_name = kv.first; float multiplier = kv.second; - auto lora = load_lora_model_from_file(lora_name, multiplier, vae_backend, lora_tensor_filter); + auto lora = load_lora_model_from_file(lora_name, multiplier, SDBackendModule::VAE, lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); first_stage_lora_models.push_back(lora); @@ -2387,15 +2429,17 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->chroma_use_dit_mask = true; sd_ctx_params->chroma_use_t5_mask = false; sd_ctx_params->chroma_t5_mask_pad = 1; + sd_ctx_params->backend = nullptr; + sd_ctx_params->params_backend = nullptr; } char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { - char* buf = (char*)malloc(4096); + char* buf = (char*)malloc(8192); if (!buf) return nullptr; buf[0] = '\0'; - snprintf(buf + strlen(buf), 4096 - strlen(buf), + snprintf(buf + strlen(buf), 8192 - strlen(buf), "model_path: %s\n" "clip_l_path: %s\n" "clip_g_path: %s\n" @@ -2421,6 +2465,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "prediction: %s\n" "offload_params_to_cpu: %s\n" "max_vram: %.3f\n" + "backend: %s\n" + "params_backend: %s\n" "keep_clip_on_cpu: %s\n" "keep_control_net_on_cpu: %s\n" "keep_vae_on_cpu: %s\n" @@ -2456,6 +2502,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_prediction_name(sd_ctx_params->prediction), BOOL_STR(sd_ctx_params->offload_params_to_cpu), sd_ctx_params->max_vram, + SAFE_STR(sd_ctx_params->backend), + SAFE_STR(sd_ctx_params->params_backend), BOOL_STR(sd_ctx_params->keep_clip_on_cpu), BOOL_STR(sd_ctx_params->keep_control_net_on_cpu), BOOL_STR(sd_ctx_params->keep_vae_on_cpu), @@ -3822,7 +3870,9 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s LOG_INFO("hires fix: loading model upscaler from '%s'", request.hires.model_path); hires_upscaler = std::make_unique(sd_ctx->sd->n_threads, false, - request.hires.upscale_tile_size); + request.hires.upscale_tile_size, + sd_ctx->sd->backend_spec, + sd_ctx->sd->params_backend_spec); const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram); hires_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes); if (!hires_upscaler->load_from_file(request.hires.model_path, diff --git a/src/t5.hpp b/src/t5.hpp index 71545e52..01c35d7d 100644 --- a/src/t5.hpp +++ b/src/t5.hpp @@ -321,11 +321,11 @@ struct T5Runner : public GGMLRunner { std::vector relative_position_bucket_vec; T5Runner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, bool is_umt5 = false) - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { if (is_umt5) { params.vocab_size = 256384; params.relative_attention = false; @@ -464,11 +464,11 @@ struct T5Embedder { T5Runner model; T5Embedder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "", bool is_umt5 = false) - : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) { + : model(backend, params_backend, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) { } void get_param_tensors(std::map& tensors, const std::string prefix) { @@ -576,7 +576,7 @@ struct T5Embedder { } } - std::shared_ptr t5 = std::make_shared(backend, false, tensor_storage_map, "", true); + std::shared_ptr t5 = std::make_shared(backend, backend, tensor_storage_map, "", true); t5->alloc_params_buffer(); std::map tensors; diff --git a/src/tae.hpp b/src/tae.hpp index 41b53515..823cff2d 100644 --- a/src/tae.hpp +++ b/src/tae.hpp @@ -541,14 +541,14 @@ struct TinyImageAutoEncoder : public VAE { bool decode_only = false; TinyImageAutoEncoder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, bool decoder_only = true, SDVersion version = VERSION_SD1) : decode_only(decoder_only), taesd(decoder_only, version), - VAE(version, backend, offload_params_to_cpu) { + VAE(version, backend, params_backend) { scale_input = false; taesd.init(params_ctx, tensor_storage_map, prefix); } @@ -603,14 +603,14 @@ struct TinyVideoAutoEncoder : public VAE { bool decode_only = false; TinyVideoAutoEncoder(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, bool decoder_only = true, SDVersion version = VERSION_WAN2) : decode_only(decoder_only), taehv(decoder_only, version), - VAE(version, backend, offload_params_to_cpu) { + VAE(version, backend, params_backend) { scale_input = false; taehv.init(params_ctx, tensor_storage_map, prefix); } diff --git a/src/unet.hpp b/src/unet.hpp index d7ea8c3f..a67fe076 100644 --- a/src/unet.hpp +++ b/src/unet.hpp @@ -603,11 +603,11 @@ struct UNetModelRunner : public GGMLRunner { UnetModelBlock unet; UNetModelRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, SDVersion version = VERSION_SD1) - : GGMLRunner(backend, offload_params_to_cpu), unet(version, tensor_storage_map) { + : GGMLRunner(backend, params_backend), unet(version, tensor_storage_map) { unet.init(params_ctx, tensor_storage_map, prefix); } diff --git a/src/upscaler.cpp b/src/upscaler.cpp index 25fc0c5d..1197ce35 100644 --- a/src/upscaler.cpp +++ b/src/upscaler.cpp @@ -4,12 +4,18 @@ #include "stable-diffusion.h" #include "util.h" +#include + UpscalerGGML::UpscalerGGML(int n_threads, bool direct, - int tile_size) + int tile_size, + std::string backend_spec, + std::string params_backend_spec) : n_threads(n_threads), direct(direct), - tile_size(tile_size) { + tile_size(tile_size), + backend_spec(std::move(backend_spec)), + params_backend_spec(std::move(params_backend_spec)) { } void UpscalerGGML::set_max_graph_vram_bytes(size_t max_vram_bytes) { @@ -24,19 +30,51 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path, int n_threads) { ggml_log_set(ggml_log_callback_default, nullptr); - backend = sd_get_default_backend(); + std::string error; + if (!backend_manager.init(backend_spec.c_str(), + params_backend_spec.c_str(), + offload_params_to_cpu, + false, + false, + false, + &error)) { + LOG_ERROR("upscaler backend config failed: %s", error.c_str()); + return false; + } + auto backend_for = [&](SDBackendModule module) { + ggml_backend_t module_backend = backend_manager.runtime_backend(module); + if (module_backend == nullptr) { + LOG_ERROR("failed to initialize %s backend", sd_backend_module_name(module)); + } + return module_backend; + }; + auto params_backend_for = [&](SDBackendModule module) { + ggml_backend_t module_backend = backend_manager.params_backend(module); + if (module_backend == nullptr) { + LOG_ERROR("failed to initialize %s params backend", sd_backend_module_name(module)); + } + return module_backend; + }; + auto ensure_backend_pair = [&](SDBackendModule module) { + if (backend_for(module) == nullptr) { + return false; + } + return params_backend_for(module) != nullptr; + }; + if (!ensure_backend_pair(SDBackendModule::UPSCALER)) { + return false; + } ModelLoader model_loader; if (!model_loader.init_from_file_and_convert_name(esrgan_path)) { LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str()); } model_loader.set_wtype_override(model_data_type); - if (!backend) { - LOG_DEBUG("Using CPU backend"); - backend = ggml_backend_cpu_init(); - } LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); - esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map()); + esrgan_upscaler = std::make_shared(backend_for(SDBackendModule::UPSCALER), + params_backend_for(SDBackendModule::UPSCALER), + tile_size, + model_loader.get_tensor_storage_map()); esrgan_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes); if (direct) { esrgan_upscaler->set_conv2d_direct_enabled(true); @@ -110,14 +148,16 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, bool offload_params_to_cpu, bool direct, int n_threads, - int tile_size) { + int tile_size, + const char* backend, + const char* params_backend) { upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t)); if (upscaler_ctx == nullptr) { return nullptr; } std::string esrgan_path(esrgan_path_c_str); - upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size); + upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size, SAFE_STR(backend), SAFE_STR(params_backend)); if (upscaler_ctx->upscaler == nullptr) { return nullptr; } diff --git a/src/upscaler.h b/src/upscaler.h index d667a6f1..e3967865 100644 --- a/src/upscaler.h +++ b/src/upscaler.h @@ -2,6 +2,7 @@ #define __SD_UPSCALER_H__ #include "esrgan.hpp" +#include "ggml_extend_backend.h" #include "stable-diffusion.h" #include "tensor.hpp" @@ -9,7 +10,7 @@ #include struct UpscalerGGML { - ggml_backend_t backend = nullptr; // general backend + SDBackendManager backend_manager; ggml_type model_data_type = GGML_TYPE_F16; std::shared_ptr esrgan_upscaler; std::string esrgan_path; @@ -17,10 +18,14 @@ struct UpscalerGGML { bool direct = false; int tile_size = 128; size_t max_graph_vram_bytes = 0; + std::string backend_spec; + std::string params_backend_spec; UpscalerGGML(int n_threads, - bool direct = false, - int tile_size = 128); + bool direct = false, + int tile_size = 128, + std::string backend_spec = "", + std::string params_backend_spec = ""); bool load_from_file(const std::string& esrgan_path, bool offload_params_to_cpu, diff --git a/src/util.cpp b/src/util.cpp index 586284c8..1c2e5e89 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -25,7 +25,7 @@ #include "ggml-backend.h" #include "ggml.h" -#include "ggml_extend_backend.hpp" +#include "ggml_extend_backend.h" #include "stable-diffusion.h" bool ends_with(const std::string& str, const std::string& ending) { @@ -758,76 +758,6 @@ std::vector> parse_prompt_attention(const std::str return res; } -// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc. -bool sd_backend_is(ggml_backend_t backend, const std::string& name) { - if (!backend) { - return false; - } - ggml_backend_dev_t dev = ggml_backend_get_device(backend); - if (!dev) - return false; - std::string dev_name = ggml_backend_dev_name(dev); - return dev_name.find(name) != std::string::npos; -} - -ggml_backend_t sd_get_default_backend() { - ggml_backend_load_all_once(); - static std::once_flag once; - std::call_once(once, []() { - size_t dev_count = ggml_backend_dev_count(); - if (dev_count == 0) { - LOG_ERROR("No devices found!"); - } else { - LOG_DEBUG("Found %zu backend devices:", dev_count); - for (size_t i = 0; i < dev_count; ++i) { - auto dev = ggml_backend_dev_get(i); - LOG_DEBUG("#%zu: %s", i, ggml_backend_dev_name(dev)); - } - } - }); - ggml_backend_t backend = nullptr; - const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE"); - if (SD_VK_DEVICE != nullptr) { - std::string sd_vk_device_str = SD_VK_DEVICE; - try { - unsigned long long device = std::stoull(sd_vk_device_str); - std::string vk_device_name = "Vulkan" + std::to_string(device); - if (backend_name_exists(vk_device_name)) { - LOG_INFO("Selecting %s as main device by env var SD_VK_DEVICE", vk_device_name.c_str()); - backend = init_named_backend(vk_device_name); - if (!backend) { - LOG_WARN("Device %s requested by SD_VK_DEVICE failed to init. Falling back to the default device.", vk_device_name.c_str()); - } - } else { - LOG_WARN("Device %s requested by SD_VK_DEVICE was not found. Falling back to the default device.", vk_device_name.c_str()); - } - } catch (const std::invalid_argument&) { - LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to the default device.", SD_VK_DEVICE); - } catch (const std::out_of_range&) { - LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to the default device.", SD_VK_DEVICE); - } - } - - if (!backend) { - std::string dev_name = get_default_backend_name(); - backend = init_named_backend(dev_name); - if (!backend && !dev_name.empty()) { - LOG_WARN("device %s failed to init", dev_name.c_str()); - } - } - - if (!backend) { - LOG_WARN("loading CPU backend"); - backend = ggml_backend_cpu_init(); - } - - if (ggml_backend_is_cpu(backend)) { - LOG_DEBUG("Using CPU backend"); - } - - return backend; -} - // namespace is needed to avoid conflicts with ggml_backend_extend.hpp namespace ggml_cpu { #include "ggml-cpu.h" diff --git a/src/util.h b/src/util.h index 628a1f9d..9843ae18 100644 --- a/src/util.h +++ b/src/util.h @@ -86,7 +86,6 @@ bool sd_should_preview_noisy(); // test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc. bool sd_backend_is(ggml_backend_t backend, const std::string& name); -ggml_backend_t sd_get_default_backend(); #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__) diff --git a/src/vae.hpp b/src/vae.hpp index 4e6fa714..d7e0fdee 100644 --- a/src/vae.hpp +++ b/src/vae.hpp @@ -62,8 +62,8 @@ protected: } public: - VAE(SDVersion version, ggml_backend_t backend, bool offload_params_to_cpu) - : version(version), GGMLRunner(backend, offload_params_to_cpu) {} + VAE(SDVersion version, ggml_backend_t backend, ggml_backend_t params_backend) + : version(version), GGMLRunner(backend, params_backend) {} int get_scale_factor() { int scale_factor = 8; @@ -219,8 +219,8 @@ public: }; struct FakeVAE : public VAE { - FakeVAE(SDVersion version, ggml_backend_t backend, bool offload_params_to_cpu) - : VAE(version, backend, offload_params_to_cpu) {} + FakeVAE(SDVersion version, ggml_backend_t backend, ggml_backend_t params_backend) + : VAE(version, backend, params_backend) {} int get_encoder_output_channels(int input_channels) { return input_channels; diff --git a/src/wan.hpp b/src/wan.hpp index 64890c04..ddd9680e 100644 --- a/src/wan.hpp +++ b/src/wan.hpp @@ -1126,12 +1126,12 @@ namespace WAN { WanVAE ae; WanVAERunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "", bool decode_only = false, SDVersion version = VERSION_WAN2) - : decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V), VAE(version, backend, offload_params_to_cpu) { + : decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V), VAE(version, backend, params_backend) { ae.init(params_ctx, tensor_storage_map, prefix); } @@ -1329,7 +1329,7 @@ namespace WAN { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr vae = std::make_shared(backend, false, String2TensorStorage{}, "", false, VERSION_WAN2_2_TI2V); + std::shared_ptr vae = std::make_shared(backend, backend, String2TensorStorage{}, "", false, VERSION_WAN2_2_TI2V); { LOG_INFO("loading from '%s'", file_path.c_str()); @@ -2094,11 +2094,11 @@ namespace WAN { SDVersion version; WanRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "", SDVersion version = VERSION_WAN2) - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { wan_params.num_layers = 0; for (auto pair : tensor_storage_map) { std::string tensor_name = pair.first; @@ -2346,7 +2346,7 @@ namespace WAN { } std::shared_ptr wan = std::make_shared(backend, - false, + backend, tensor_storage_map, "model.diffusion_model", VERSION_WAN2_2_TI2V); diff --git a/src/z_image.hpp b/src/z_image.hpp index 00b69c26..c0546931 100644 --- a/src/z_image.hpp +++ b/src/z_image.hpp @@ -473,11 +473,11 @@ namespace ZImage { SDVersion version; ZImageRunner(ggml_backend_t backend, - bool offload_params_to_cpu, + ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "", SDVersion version = VERSION_Z_IMAGE) - : GGMLRunner(backend, offload_params_to_cpu) { + : GGMLRunner(backend, params_backend) { z_image = ZImageModel(z_image_params); z_image.init(params_ctx, tensor_storage_map, prefix); } @@ -620,7 +620,7 @@ namespace ZImage { } std::shared_ptr z_image = std::make_shared(backend, - false, + backend, tensor_storage_map, "model.diffusion_model", VERSION_QWEN_IMAGE);