Compare commits

..

No commits in common. "master" and "master-8124588" have entirely different histories.

172 changed files with 87106 additions and 4380642 deletions

View File

@ -3,10 +3,11 @@ UseTab: Never
IndentWidth: 4
TabWidth: 4
AllowShortIfStatementsOnASingleLine: false
IndentCaseLabels: false
ColumnLimit: 0
AccessModifierOffset: -4
NamespaceIndentation: All
FixNamespaceComments: false
FixNamespaceComments: false
AlignAfterOpenBracket: true
AlignConsecutiveAssignments: true
IndentCaseLabels: true

View File

@ -1,10 +0,0 @@
Checks: >
modernize-make-shared,
modernize-use-nullptr,
modernize-use-override,
modernize-pass-by-value,
modernize-return-braced-init-list,
modernize-deprecated-headers,
HeaderFilterRegex: '^$'
WarningsAsErrors: ''
FormatStyle: none

View File

@ -1,73 +0,0 @@
name: 🐞 Bug Report
description: Report a bug or unexpected behavior
title: "[Bug] "
labels: ["bug"]
body:
- type: markdown
attributes:
value: |
Please use this template and include as many details as possible to help us reproduce and fix the issue.
- type: textarea
id: commit
attributes:
label: Git commit
description: Which commit are you trying to compile?
placeholder: |
$git rev-parse HEAD
40a6a8710ec15b1b5db6b5a098409f6bc8f654a4
validations:
required: true
- type: input
id: os
attributes:
label: Operating System & Version
placeholder: e.g. “Ubuntu 22.04”, “Windows 11 23H2”, “macOS 14.3”
validations:
required: true
- type: dropdown
id: backends
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [CPU, CUDA, HIP, Metal, Musa, SYCL, Vulkan, OpenCL]
multiple: true
validations:
required: true
- type: input
id: cmd_arguments
attributes:
label: Command-line arguments used
placeholder: The full command line you ran (with all flags)
validations:
required: true
- type: textarea
id: steps_to_reproduce
attributes:
label: Steps to reproduce
placeholder: A step-by-step list of what you did
validations:
required: true
- type: textarea
id: expected_behavior
attributes:
label: What you expected to happen
placeholder: Describe the expected behavior or result
validations:
required: true
- type: textarea
id: actual_behavior
attributes:
label: What actually happened
placeholder: Describe what you saw instead (errors, logs, crash, etc.)
validations:
required: true
- type: textarea
id: logs_and_errors
attributes:
label: Logs / error messages / stack trace
placeholder: Paste complete logs or error output
- type: textarea
id: additional_info
attributes:
label: Additional context / environment details
placeholder: e.g. CPU model, GPU, RAM, model file versions, quantization type, etc.

View File

@ -1,33 +0,0 @@
name: 💡 Feature Request
description: Suggest a new feature or improvement
title: "[Feature] "
labels: ["enhancement"]
body:
- type: markdown
attributes:
value: |
Thank you for suggesting an improvement! Please fill in the fields below.
- type: input
id: summary
attributes:
label: Feature Summary
placeholder: A one-line summary of the feature youd like
validations:
required: true
- type: textarea
id: description
attributes:
label: Detailed Description
placeholder: What problem does this solve? How do you expect it to work?
validations:
required: true
- type: textarea
id: alternatives
attributes:
label: Alternatives you considered
placeholder: Any alternative designs or workarounds you tried
- type: textarea
id: additional_context
attributes:
label: Additional context
placeholder: Any extra information (use cases, related functionalities, constraints)

View File

@ -4,36 +4,17 @@ on:
workflow_dispatch: # allows manual triggering
inputs:
create_release:
description: "Create new release"
description: 'Create new release'
required: true
type: boolean
push:
branches:
- master
- ci
paths:
[
".github/workflows/**",
"**/CMakeLists.txt",
"**/Makefile",
"**/*.h",
"**/*.hpp",
"**/*.c",
"**/*.cpp",
"**/*.cu",
]
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
pull_request:
types: [opened, synchronize, reopened]
paths:
[
"**/CMakeLists.txt",
"**/Makefile",
"**/*.h",
"**/*.hpp",
"**/*.c",
"**/*.cpp",
"**/*.cu",
]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@ -49,6 +30,7 @@ jobs:
with:
submodules: recursive
- name: Dependencies
id: depends
run: |
@ -60,37 +42,14 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON
cmake ..
cmake --build . --config Release
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
run: |
echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
#- name: Test
#id: cmake_test
#run: |
#cd build
#ctest --verbose --timeout 900
macOS-latest-cmake:
runs-on: macos-latest
@ -104,8 +63,9 @@ jobs:
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew install zip
brew update
- name: Build
id: cmake_build
@ -113,59 +73,30 @@ jobs:
sysctl -a
mkdir build
cd build
cmake .. -DGGML_AVX2=ON -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" -DSD_BUILD_SHARED_LIBS=ON
cmake ..
cmake --build . --config Release
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
run: |
echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
echo "OS_NAME=`sw_vers -productName`" >> "$GITHUB_OUTPUT"
echo "OS_VERSION=`sw_vers -productVersion`" >> "$GITHUB_OUTPUT"
echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
#- name: Test
#id: cmake_test
#run: |
#cd build
#ctest --verbose --timeout 900
windows-latest-cmake:
runs-on: windows-2025
env:
VULKAN_VERSION: 1.4.328.1
runs-on: windows-latest
strategy:
matrix:
include:
- build: "noavx"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
- build: "avx2"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "avx"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX=ON -DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
- build: "avx512"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "cuda12"
defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120'"
- build: 'vulkan'
defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: 'noavx'
defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
- build: 'avx2'
defines: '-DGGML_AVX2=ON'
- build: 'avx'
defines: '-DGGML_AVX2=OFF'
- build: 'avx512'
defines: '-DGGML_AVX512=ON'
steps:
- name: Clone
id: checkout
@ -173,24 +104,6 @@ jobs:
with:
submodules: recursive
- name: Install cuda-toolkit
id: cuda-toolkit
if: ${{ matrix.build == 'cuda12' }}
uses: Jimver/cuda-toolkit@v0.2.22
with:
cuda: "12.8.1"
method: "network"
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'vulkan' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
- name: Build
id: cmake_build
run: |
@ -212,6 +125,12 @@ jobs:
& $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
.\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
#- name: Test
#id: cmake_test
#run: |
#cd build
#ctest -C Release --verbose --timeout 900
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -221,145 +140,17 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
$filePath = ".\build\bin\Release\*"
if (Test-Path $filePath) {
echo "Exists at path $filePath"
Copy-Item ggml/LICENSE .\build\bin\Release\ggml.txt
Copy-Item LICENSE .\build\bin\Release\stable-diffusion.cpp.txt
} elseif (Test-Path ".\build\bin\stable-diffusion.dll") {
$filePath = ".\build\bin\*"
echo "Exists at path $filePath"
Copy-Item ggml/LICENSE .\build\bin\ggml.txt
Copy-Item LICENSE .\build\bin\stable-diffusion.cpp.txt
} else {
ls .\build\bin
throw "Can't find stable-diffusion.dll"
}
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip $filePath
- name: Copy and pack Cuda runtime
id: pack_cuda_runtime
if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
$dst='.\build\bin\cudart\'
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-sd-bin-win-cu12-x64.zip $dst\*
- name: Upload Cuda runtime
if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
uses: actions/upload-artifact@v4
with:
name: sd-cudart-sd-bin-win-cu12-x64.zip
path: |
cudart-sd-bin-win-cu12-x64.zip
Copy-Item ggml/LICENSE .\build\bin\Release\ggml.txt
Copy-Item LICENSE .\build\bin\Release\stable-diffusion.cpp.txt
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
windows-latest-cmake-hip:
runs-on: windows-2022
env:
HIPSDK_INSTALLER_VERSION: "25.Q3"
GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
- name: Cache ROCm Installation
id: cache-rocm
uses: actions/cache@v4
with:
path: C:\Program Files\AMD\ROCm
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64
evict-old-files: 1d
- name: Install ROCm
if: steps.cache-rocm.outputs.cache-hit != 'true'
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
$completed = $proc.WaitForExit(600000)
if (-not $completed) {
Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
$proc.Kill()
exit 1
}
if ($proc.ExitCode -ne 0) {
Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
exit 1
}
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
run: |
# Find and test ROCm installation
$clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
if (-not $clangPath) {
Write-Error "ROCm installation not found"
exit 1
}
& $clangPath.FullName --version
# Set HIP_PATH environment variable for later steps
echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV
- name: Build
run: |
mkdir build
cd build
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake .. `
-G "Unix Makefiles" `
-DSD_HIPBLAS=ON `
-DSD_BUILD_SHARED_LIBS=ON `
-DGGML_NATIVE=OFF `
-DCMAKE_C_COMPILER=clang `
-DCMAKE_CXX_COMPILER=clang++ `
-DCMAKE_BUILD_TYPE=Release `
-DGPU_TARGETS="${{ env.GPU_TARGETS }}"
cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Pack artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
md "build\bin\rocblas\library\"
md "build\bin\hipblaslt\library"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -369,26 +160,11 @@ jobs:
- ubuntu-latest-cmake
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-hip
steps:
- name: Clone
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v4
with:
path: ./artifact
pattern: sd-*
merge-multiple: true
- name: Get commit count
id: commit_count
run: |
echo "count=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
uses: actions/download-artifact@v3
- name: Get commit hash
id: commit
@ -396,16 +172,14 @@ jobs:
- name: Create release
id: create_release
if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
uses: anzz1/action-create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ format('{0}-{1}-{2}', env.BRANCH_NAME, steps.commit_count.outputs.count, steps.commit.outputs.short) }}
tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
- name: Upload release
id: upload_release
if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
uses: actions/github-script@v3
with:
github-token: ${{secrets.GITHUB_TOKEN}}

9
.gitignore vendored
View File

@ -1,15 +1,12 @@
build*/
cmake-build-*/
test/
.vscode/
.idea/
.cache/
*.swp
.vscode/
*.bat
*.bin
*.exe
*.gguf
output*.png
models*
*.log
preview.png
output.png
models/*

2
.gitmodules vendored
View File

@ -1,3 +1,3 @@
[submodule "ggml"]
path = ggml
url = https://github.com/ggml-org/ggml.git
url = https://github.com/FSSRepo/ggml.git

View File

@ -24,168 +24,37 @@ endif()
# general
#option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE})
option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE})
option(SD_CUDA "sd: cuda backend" OFF)
option(SD_HIPBLAS "sd: rocm backend" OFF)
option(SD_METAL "sd: metal backend" OFF)
option(SD_VULKAN "sd: vulkan backend" OFF)
option(SD_OPENCL "sd: opencl backend" OFF)
option(SD_SYCL "sd: sycl backend" OFF)
option(SD_MUSA "sd: musa backend" OFF)
option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF)
option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
option(SD_CUBLAS "sd: cuda backend" OFF)
option(SD_FLASH_ATTN "sd: use flash attention for x4 less memory usage" OFF)
option(BUILD_SHARED_LIBS "sd: build shared libs" OFF)
#option(SD_BUILD_SERVER "sd: build server example" ON)
if(SD_CUDA)
message("-- Use CUDA as backend stable-diffusion")
set(GGML_CUDA ON)
add_definitions(-DSD_USE_CUDA)
if(SD_CUBLAS)
message("Use CUBLAS as backend stable-diffusion")
set(GGML_CUBLAS ON)
add_definitions(-DSD_USE_CUBLAS)
endif()
if(SD_METAL)
message("-- Use Metal as backend stable-diffusion")
set(GGML_METAL ON)
add_definitions(-DSD_USE_METAL)
if(SD_FLASH_ATTN)
message("Use Flash Attention for memory optimization")
add_definitions(-DSD_USE_FLASH_ATTENTION)
endif()
if (SD_VULKAN)
message("-- Use Vulkan as backend stable-diffusion")
set(GGML_VULKAN ON)
add_definitions(-DSD_USE_VULKAN)
endif ()
if (SD_OPENCL)
message("-- Use OpenCL as backend stable-diffusion")
set(GGML_OPENCL ON)
add_definitions(-DSD_USE_OPENCL)
endif ()
if (SD_HIPBLAS)
message("-- Use HIPBLAS as backend stable-diffusion")
set(GGML_HIP ON)
add_definitions(-DSD_USE_CUDA)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
endif ()
if(SD_MUSA)
message("-- Use MUSA as backend stable-diffusion")
set(GGML_MUSA ON)
add_definitions(-DSD_USE_CUDA)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
endif()
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
# deps
add_subdirectory(ggml)
set(SD_LIB stable-diffusion)
file(GLOB SD_LIB_SOURCES
"*.h"
"*.cpp"
"*.hpp"
)
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
if(GIT_EXE)
execute_process(COMMAND ${GIT_EXE} describe --tags --abbrev=7 --dirty=+
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE SDCPP_BUILD_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE SDCPP_BUILD_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
endif()
if(NOT SDCPP_BUILD_VERSION)
set(SDCPP_BUILD_VERSION unknown)
endif()
message(STATUS "stable-diffusion.cpp version ${SDCPP_BUILD_VERSION}")
if(NOT SDCPP_BUILD_COMMIT)
set(SDCPP_BUILD_COMMIT unknown)
endif()
message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
set_property(
SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp
APPEND PROPERTY COMPILE_DEFINITIONS
SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
)
if(SD_BUILD_SHARED_LIBS)
message("-- Build shared library")
message(${SD_LIB_SOURCES})
if(NOT SD_BUILD_SHARED_GGML_LIB)
set(BUILD_SHARED_LIBS OFF)
endif()
add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
add_definitions(-DSD_BUILD_SHARED_LIB)
target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
else()
message("-- Build static library")
if(NOT SD_BUILD_SHARED_GGML_LIB)
set(BUILD_SHARED_LIBS OFF)
endif()
add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
endif()
if(SD_SYCL)
message("-- Use SYCL as backend stable-diffusion")
set(GGML_SYCL ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
add_definitions(-DSD_USE_SYCL)
# disable fast-math on host, see:
# https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html
if (WIN32)
set(SYCL_COMPILE_OPTIONS /fp:precise)
else()
set(SYCL_COMPILE_OPTIONS -fp-model=precise)
endif()
message("-- Turn off fast-math for host in SYCL backend")
target_compile_options(${SD_LIB} PRIVATE ${SYCL_COMPILE_OPTIONS})
endif()
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
if (NOT SD_USE_SYSTEM_GGML)
# see https://github.com/ggerganov/ggml/pull/682
add_definitions(-DGGML_MAX_NAME=128)
endif()
# deps
# Only add ggml if it hasn't been added yet
if (NOT TARGET ggml)
if (SD_USE_SYSTEM_GGML)
find_package(ggml REQUIRED)
if (NOT ggml_FOUND)
message(FATAL_ERROR "System-installed GGML library not found.")
endif()
add_library(ggml ALIAS ggml::ggml)
else()
add_subdirectory(ggml)
endif()
endif()
add_subdirectory(thirdparty)
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)
add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp)
target_link_libraries(${SD_LIB} PUBLIC ggml)
target_include_directories(${SD_LIB} PUBLIC .)
target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
add_subdirectory(common)
if (SD_BUILD_EXAMPLES)
add_subdirectory(examples)
endif()
set(SD_PUBLIC_HEADERS stable-diffusion.h)
set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}")
install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER)

View File

@ -1,21 +1,16 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake
RUN apt-get update && apt-get install -y build-essential git cmake
WORKDIR /sd.cpp
COPY . .
RUN cmake . -B ./build
RUN cmake --build ./build --config Release --parallel
RUN mkdir build && cd build && cmake .. && cmake --build . --config Release
FROM ubuntu:$UBUNTU_VERSION AS runtime
RUN apt-get update && \
apt-get install --yes --no-install-recommends libgomp1 && \
apt-get clean
FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /sd.cpp/build/bin/sd /sd

View File

@ -1,23 +0,0 @@
ARG MUSA_VERSION=rc4.2.0
ARG UBUNTU_VERSION=22.04
FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 as build
RUN apt-get update && apt-get install -y ccache cmake git
WORKDIR /sd.cpp
COPY . .
RUN mkdir build && cd build && \
cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
-DCMAKE_C_FLAGS="${CMAKE_C_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
-DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
-DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build . --config Release
FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 as runtime
COPY --from=build /sd.cpp/build/bin/sd /sd
ENTRYPOINT [ "/sd" ]

View File

@ -1,19 +0,0 @@
ARG SYCL_VERSION=2025.1.0-0
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS build
RUN apt-get update && apt-get install -y cmake
WORKDIR /sd.cpp
COPY . .
RUN mkdir build && cd build && \
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build . --config Release -j$(nproc)
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime
COPY --from=build /sd.cpp/build/bin/sd /sd
ENTRYPOINT [ "/sd" ]

360
README.md
View File

@ -1,86 +1,28 @@
<p align="center">
<img src="./assets/logo.png" width="360x">
<img src="./assets/a%20lovely%20cat.png" width="256x">
</p>
# stable-diffusion.cpp
<div align="center">
<a href="https://trendshift.io/repositories/9714" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9714" alt="leejet%2Fstable-diffusion.cpp | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</div>
Diffusion model(SD,Flux,Wan,...) inference in pure C/C++
***Note that this project is under active development. \
API and command-line option may change frequently.***
## 🔥Important News
* **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image**
👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020)
* **2025/11/30** 🚀 stable-diffusion.cpp now supports **FLUX.2-dev**
👉 Details: [PR #1016](https://github.com/leejet/stable-diffusion.cpp/pull/1016)
* **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**
👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)
* **2025/10/12** 🚀 stable-diffusion.cpp now supports **Qwen-Image**
👉 Details: [PR #851](https://github.com/leejet/stable-diffusion.cpp/pull/851)
* **2025/09/14** 🚀 stable-diffusion.cpp now supports **Wan2.1 Vace**
👉 Details: [PR #819](https://github.com/leejet/stable-diffusion.cpp/pull/819)
* **2025/09/06** 🚀 stable-diffusion.cpp now supports **Wan2.1 / Wan2.2**
👉 Details: [PR #778](https://github.com/leejet/stable-diffusion.cpp/pull/778)
Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in pure C/C++
## Features
- Plain C/C++ implementation based on [ggml](https://github.com/ggml-org/ggml), working in the same way as [llama.cpp](https://github.com/ggml-org/llama.cpp)
- Super lightweight and without external dependencies
- Supported models
- Image Models
- SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
- SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
- [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
- [SD3/SD3.5](./docs/sd3.md)
- [FlUX.1-dev/FlUX.1-schnell](./docs/flux.md)
- [FLUX.2-dev](./docs/flux2.md)
- [Chroma](./docs/chroma.md)
- [Chroma1-Radiance](./docs/chroma_radiance.md)
- [Qwen Image](./docs/qwen_image.md)
- [Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
- Video Models
- [Wan2.1/Wan2.2](./docs/wan.md)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
- Control Net support with SD 1.5
- LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
- Latent Consistency Models support (LCM/LCM-LoRA)
- Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
- Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
- Supported backends
- CPU (AVX, AVX2 and AVX512 support for x86 architectures)
- CUDA
- Vulkan
- Metal
- OpenCL
- SYCL
- Supported weight formats
- Pytorch checkpoint (`.ckpt` or `.pth`)
- Safetensors (`./safetensors`)
- GGUF (`.gguf`)
- Supported platforms
- Linux
- Mac OS
- Windows
- Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
- Flash Attention for memory usage optimization
- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
- Super lightweight and without external dependencies.
- 16-bit, 32-bit float support
- 4-bit, 5-bit and 8-bit integer quantization support
- Accelerated memory-efficient CPU inference
- Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image, enabling Flash Attention just requires ~1.8GB.
- AVX, AVX2 and AVX512 support for x86 architectures
- SD1.x and SD2.x support
- Full CUDA backend for GPU acceleration, for now just for float16 and float32 models. There are some issues with quantized models and CUDA; it will be fixed in the future.
- Flash Attention for memory usage optimization (only cpu for now).
- Original `txt2img` and `img2img` mode
- Negative prompt
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
- VAE tiling processing for reduce memory usage
- LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
- Latent Consistency Models support (LCM/LCM-LoRA)
- Sampling method
- `Euler A`
- `Euler`
@ -90,83 +32,224 @@ API and command-line option may change frequently.***
- [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
- `DPM++ 2S a`
- [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
- Cross-platform reproducibility
- `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
- `--rng cpu`, consistent with the `comfyui RNG`
- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
- Embedds generation parameters into png output as webui-compatible text string
- Supported platforms
- Linux
- Mac OS
- Windows
- Android (via Termux)
## Quick Start
### TODO
### Get the sd executable
- [ ] More sampling methods
- [ ] Make inference faster
- The current implementation of ggml_conv_2d is slow and has high memory usage
- [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
- [ ] Implement BPE Tokenizer
- [ ] Add [TAESD](https://github.com/madebyollin/taesd) for faster VAE decoding
- [ ] k-quants support
- Download pre-built binaries from the [releases page](https://github.com/leejet/stable-diffusion.cpp/releases)
- Or build from source by following the [build guide](./docs/build.md)
## Usage
### Download model weights
### Get the Code
- download weights(.ckpt or .safetensors or .gguf). For example
- Stable Diffusion v1.5 from https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5
```sh
curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
```
### Generate an image with just one command
```sh
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
```
git clone --recursive https://github.com/leejet/stable-diffusion.cpp
cd stable-diffusion.cpp
```
***For detailed command-line arguments, check out [cli doc](./examples/cli/README.md).***
- If you have already cloned the repository, you can use the following command to update the repository to the latest code.
## Performance
```
cd stable-diffusion.cpp
git pull origin master
git submodule init
git submodule update
```
If you want to improve performance or reduce VRAM/RAM usage, please refer to [performance guide](./docs/performance.md).
### Convert weights
## More Guides
- download original weights(.ckpt or .safetensors). For example
- Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
- Stable Diffuison v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
- [SD1.x/SD2.x/SDXL](./docs/sd.md)
- [SD3/SD3.5](./docs/sd3.md)
- [FlUX.1-dev/FlUX.1-schnell](./docs/flux.md)
- [FLUX.2-dev](./docs/flux2.md)
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Chroma](./docs/chroma.md)
- [🔥Qwen Image](./docs/qwen_image.md)
- [🔥Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
- [🔥Wan2.1/Wan2.2](./docs/wan.md)
- [🔥Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
- [Using ESRGAN to upscale results](./docs/esrgan.md)
- [Using TAESD to faster decoding](./docs/taesd.md)
- [Docker](./docs/docker.md)
- [Quantization and GGUF](./docs/quantization_and_gguf.md)
```shell
curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
# curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-nonema-pruned.safetensors
```
## Bindings
- convert weights to gguf model format
These projects wrap `stable-diffusion.cpp` for easier use in other languages/frameworks.
```shell
./bin/convert sd-v1-4.ckpt -t f16
```
* Golang (non-cgo): [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
* Golang (cgo): [Binozo/GoStableDiffusion](https://github.com/Binozo/GoStableDiffusion)
* C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET)
* Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python)
* Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs)
* Flutter/Dart: [rmatif/Local-Diffusion](https://github.com/rmatif/Local-Diffusion)
### Quantization
## UIs
You can specify the output model format using the `--type` or `-t` parameter
These projects use `stable-diffusion.cpp` as a backend for their image generation.
- `f16` for 16-bit floating-point
- `f32` for 32-bit floating-point
- `q8_0` for 8-bit integer quantization
- `q5_0` or `q5_1` for 5-bit integer quantization
- `q4_0` or `q4_1` for 4-bit integer quantization
- [Jellybox](https://jellybox.com)
- [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
- [Stable Diffusion CLI-GUI](https://github.com/piallai/stable-diffusion.cpp)
- [Local Diffusion](https://github.com/rmatif/Local-Diffusion)
- [sd.cpp-webui](https://github.com/daniandtheweb/sd.cpp-webui)
- [LocalAI](https://github.com/mudler/LocalAI)
- [Neural-Pixel](https://github.com/Luiz-Alcantara/Neural-Pixel)
- [KoboldCpp](https://github.com/LostRuins/koboldcpp)
### Build
#### Build from scratch
```shell
mkdir build
cd build
cmake ..
cmake --build . --config Release
```
##### Using OpenBLAS
```
cmake .. -DGGML_OPENBLAS=ON
cmake --build . --config Release
```
##### Using CUBLAS
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
```
cmake .. -DSD_CUBLAS=ON
cmake --build . --config Release
```
### Using Flash Attention
Enabling flash attention reduces memory usage by at least 400 MB. At the moment, it is not supported when CUBLAS is enabled because the kernel implementation is missing.
```
cmake .. -DSD_FLASH_ATTN=ON
cmake --build . --config Release
```
### Run
```
usage: ./bin/sd [arguments]
arguments:
-h, --help show this help message and exit
-M, --mode [txt2img or img2img] generation mode (default: txt2img)
-t, --threads N number of threads to use during computation (default: -1).
If threads <= 0, then threads will be set to the number of CPU physical cores
-m, --model [MODEL] path to model
--lora-model-dir [DIR] lora model directory
-i, --init-img [IMAGE] path to the input image, required by img2img
-o, --output OUTPUT path to write result image to (default: .\output.png)
-p, --prompt [PROMPT] the prompt to render
-n, --negative-prompt PROMPT the negative prompt (default: "")
--cfg-scale SCALE unconditional guidance scale: (default: 7.0)
--strength STRENGTH strength for noising/unnoising (default: 0.75)
1.0 corresponds to full destruction of information in init image
-H, --height H image height, in pixel space (default: 512)
-W, --width W image width, in pixel space (default: 512)
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}
sampling method (default: "euler_a")
--steps STEPS number of sample steps (default: 20)
--rng {std_default, cuda} RNG (default: cuda)
-s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
-b, --batch-count COUNT number of images to generate.
--schedule {discrete, karras} Denoiser sigma schedule (default: discrete)
-v, --verbose print extra info
```
#### txt2img example
```
./bin/sd -m ../sd-v1-4-f16.gguf -p "a lovely cat"
```
Using formats of different precisions will yield results of varying quality.
| f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
| ---- |---- |---- |---- |---- |---- |---- |
| ![](./assets/f32.png) |![](./assets/f16.png) |![](./assets/q8_0.png) |![](./assets/q5_0.png) |![](./assets/q5_1.png) |![](./assets/q4_0.png) |![](./assets/q4_1.png) |
#### img2img example
- `./output.png` is the image generated from the above txt2img pipeline
```
./bin/sd --mode img2img -m ../models/sd-v1-4-f16.gguf -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
```
<p align="center">
<img src="./assets/img2img_output.png" width="256x">
</p>
#### with LoRA
- convert lora weights to gguf model format
```shell
bin/convert [lora path] -t f16
# For example, bin/convert marblesh.safetensors -t f16
```
- You can specify the directory where the lora weights are stored via `--lora-model-dir`. If not specified, the default is the current working directory.
- LoRA is specified via prompt, just like [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora).
Here's a simple example:
```
./bin/sd -m ../models/v1-5-pruned-emaonly-f16.gguf -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
```
`../models/marblesh.gguf` will be applied to the model
#### LCM/LCM-LoRA
- Download LCM-LoRA form https://huggingface.co/latent-consistency/lcm-lora-sdv1-5
- Specify LCM-LoRA by adding `<lora:lcm-lora-sdv1-5:1>` to prompt
- It's advisable to set `--cfg-scale` to `1.0` instead of the default `7.0`. For `--steps`, a range of `2-8` steps is recommended. For `--sampling-method`, `lcm`/`euler_a` is recommended.
Here's a simple example:
```
./bin/sd -m ../models/v1-5-pruned-emaonly-f16.gguf -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
```
| without LCM-LoRA (--cfg-scale 7) | with LCM-LoRA (--cfg-scale 1) |
| ---- |---- |
| ![](./assets/without_lcm.png) |![](./assets/with_lcm.png) |
### Docker
#### Building using Docker
```shell
docker build -t sd .
```
#### Run
```shell
docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
# For example
# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4-f16.gguf -p "a lovely cat" -v -o /output/output.png
```
## Memory/Disk Requirements
| precision | f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
| ---- | ---- |---- |---- |---- |---- |---- |---- |
| **Disk** | 2.7G | 2.0G | 1.7G | 1.6G | 1.6G | 1.5G | 1.5G |
| **Memory** (txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
| **Memory** (txt2img - 512 x 512) *with Flash Attention* | ~2.4G | ~1.9G | ~1.6G | ~1.5G | ~1.5G | ~1.5G | ~1.5G |
## Contributors
@ -174,22 +257,11 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
[![Contributors](https://contrib.rocks/image?repo=leejet/stable-diffusion.cpp)](https://github.com/leejet/stable-diffusion.cpp/graphs/contributors)
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=leejet/stable-diffusion.cpp&type=Date)](https://star-history.com/#leejet/stable-diffusion.cpp&Date)
## References
- [ggml](https://github.com/ggml-org/ggml)
- [diffusers](https://github.com/huggingface/diffusers)
- [ggml](https://github.com/ggerganov/ggml)
- [stable-diffusion](https://github.com/CompVis/stable-diffusion)
- [sd3-ref](https://github.com/Stability-AI/sd3-ref)
- [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui)
- [ComfyUI](https://github.com/comfyanonymous/ComfyUI)
- [k-diffusion](https://github.com/crowsonkb/k-diffusion)
- [latent-consistency-model](https://github.com/luosiallen/latent-consistency-model)
- [generative-models](https://github.com/Stability-AI/generative-models/)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker)
- [Wan2.1](https://github.com/Wan-Video/Wan2.1)
- [Wan2.2](https://github.com/Wan-Video/Wan2.2)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.2 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 477 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 539 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 416 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 490 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 464 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 468 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 566 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 475 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 481 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 496 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 556 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 401 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 311 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 88 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 72 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 107 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 54 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 68 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 54 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 457 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 415 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.8 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.7 MiB

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 594 KiB

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 MiB

1003
clip.hpp

File diff suppressed because it is too large Load Diff

View File

@ -1,591 +0,0 @@
#ifndef __COMMON_HPP__
#define __COMMON_HPP__
#include "ggml_extend.hpp"
class DownSampleBlock : public GGMLBlock {
protected:
int channels;
int out_channels;
bool vae_downsample;
public:
DownSampleBlock(int channels,
int out_channels,
bool vae_downsample = false)
: channels(channels),
out_channels(out_channels),
vae_downsample(vae_downsample) {
if (vae_downsample) {
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}));
} else {
blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}));
}
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
if (vae_downsample) {
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0);
x = conv->forward(ctx, x);
} else {
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
x = conv->forward(ctx, x);
}
return x; // [N, out_channels, h/2, w/2]
}
};
class UpSampleBlock : public GGMLBlock {
protected:
int channels;
int out_channels;
public:
UpSampleBlock(int channels,
int out_channels)
: channels(channels),
out_channels(out_channels) {
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
return x;
}
};
class ResBlock : public GGMLBlock {
protected:
// network hparams
int64_t channels; // model_channels * (1, 1, 1, 2, 2, 4, 4, 4)
int64_t emb_channels; // time_embed_dim
int64_t out_channels; // mult * model_channels
std::pair<int, int> kernel_size;
int dims;
bool skip_t_emb;
bool exchange_temb_dims;
std::shared_ptr<GGMLBlock> conv_nd(int dims,
int64_t in_channels,
int64_t out_channels,
std::pair<int, int> kernel_size,
std::pair<int, int> padding) {
GGML_ASSERT(dims == 2 || dims == 3);
if (dims == 3) {
return std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(in_channels, out_channels, kernel_size.first, 1, padding.first));
} else {
return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
}
}
public:
ResBlock(int64_t channels,
int64_t emb_channels,
int64_t out_channels,
std::pair<int, int> kernel_size = {3, 3},
int dims = 2,
bool exchange_temb_dims = false,
bool skip_t_emb = false)
: channels(channels),
emb_channels(emb_channels),
out_channels(out_channels),
kernel_size(kernel_size),
dims(dims),
skip_t_emb(skip_t_emb),
exchange_temb_dims(exchange_temb_dims) {
std::pair<int, int> padding = {kernel_size.first / 2, kernel_size.second / 2};
blocks["in_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(channels));
// in_layer_1 is nn.SILU()
blocks["in_layers.2"] = conv_nd(dims, channels, out_channels, kernel_size, padding);
if (!skip_t_emb) {
// emb_layer_0 is nn.SILU()
blocks["emb_layers.1"] = std::shared_ptr<GGMLBlock>(new Linear(emb_channels, out_channels));
}
blocks["out_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
// out_layer_1 is nn.SILU()
// out_layer_2 is nn.Dropout(), skip for inference
blocks["out_layers.3"] = conv_nd(dims, out_channels, out_channels, kernel_size, padding);
if (out_channels != channels) {
blocks["skip_connection"] = conv_nd(dims, channels, out_channels, {1, 1}, {0, 0});
}
}
virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) {
// For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
// [N, c, t, h, w] => [N, c, t, h * w]
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
// emb: [N, emb_channels] if dims == 2 else [N, t, emb_channels]
auto in_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["in_layers.0"]);
auto in_layers_2 = std::dynamic_pointer_cast<UnaryBlock>(blocks["in_layers.2"]);
auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);
if (emb == nullptr) {
GGML_ASSERT(skip_t_emb);
}
// in_layers
auto h = in_layers_0->forward(ctx, x);
h = ggml_silu_inplace(ctx->ggml_ctx, h);
h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
// emb_layers
if (!skip_t_emb) {
auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
auto emb_out = ggml_silu(ctx->ggml_ctx, emb);
emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels]
if (dims == 2) {
emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
} else {
emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
if (exchange_temb_dims) {
// emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
emb_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
}
}
h = ggml_add(ctx->ggml_ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
// out_layers
h = out_layers_0->forward(ctx, h);
h = ggml_silu_inplace(ctx->ggml_ctx, h);
// dropout, skip for inference
h = out_layers_3->forward(ctx, h);
// skip connection
if (out_channels != channels) {
auto skip_connection = std::dynamic_pointer_cast<UnaryBlock>(blocks["skip_connection"]);
x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
h = ggml_add(ctx->ggml_ctx, h, x);
return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
};
class GEGLU : public UnaryBlock {
protected:
int64_t dim_in;
int64_t dim_out;
public:
GEGLU(int64_t dim_in, int64_t dim_out)
: dim_in(dim_in), dim_out(dim_out) {
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out * 2));
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out]
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
x = proj->forward(ctx, x); // [ne3, ne2, ne1, dim_out*2]
auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0);
x = x_vec[0]; // [ne3, ne2, ne1, dim_out]
auto gate = x_vec[1]; // [ne3, ne2, ne1, dim_out]
gate = ggml_gelu_inplace(ctx->ggml_ctx, gate);
x = ggml_mul(ctx->ggml_ctx, x, gate); // [ne3, ne2, ne1, dim_out]
return x;
}
};
class GELU : public UnaryBlock {
public:
GELU(int64_t dim_in, int64_t dim_out, bool bias = true) {
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out]
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
x = proj->forward(ctx, x);
x = ggml_gelu_inplace(ctx->ggml_ctx, x);
return x;
}
};
class FeedForward : public GGMLBlock {
public:
enum class Activation {
GEGLU,
GELU
};
FeedForward(int64_t dim,
int64_t dim_out,
int64_t mult = 4,
Activation activation = Activation::GEGLU,
bool precision_fix = false) {
int64_t inner_dim = dim * mult;
if (activation == Activation::GELU) {
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
} else {
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
}
// net_1 is nn.Dropout(), skip for inference
bool force_prec_f32 = false;
float scale = 1.f;
if (precision_fix) {
scale = 1.f / 128.f;
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
}
// The purpose of the scale here is to prevent NaN issues in certain situations.
// For example, when using Vulkan without enabling force_prec_f32,
// or when using CUDA but the weights are k-quants.
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale));
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim]
// return: [ne3, ne2, ne1, dim_out]
auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]
return x;
}
};
class CrossAttention : public GGMLBlock {
protected:
int64_t query_dim;
int64_t context_dim;
int64_t n_head;
int64_t d_head;
public:
CrossAttention(int64_t query_dim,
int64_t context_dim,
int64_t n_head,
int64_t d_head)
: n_head(n_head),
d_head(d_head),
query_dim(query_dim),
context_dim(context_dim) {
int64_t inner_dim = d_head * n_head;
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, query_dim));
// to_out_1 is nn.Dropout(), skip for inference
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context) {
// x: [N, n_token, query_dim]
// context: [N, n_context, context_dim]
// return: [N, n_token, query_dim]
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
auto to_k = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
auto to_v = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
int64_t n = x->ne[2];
int64_t n_token = x->ne[1];
int64_t n_context = context->ne[1];
int64_t inner_dim = d_head * n_head;
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
return x;
}
};
class BasicTransformerBlock : public GGMLBlock {
protected:
int64_t n_head;
int64_t d_head;
bool ff_in;
public:
BasicTransformerBlock(int64_t dim,
int64_t n_head,
int64_t d_head,
int64_t context_dim,
bool ff_in = false)
: n_head(n_head), d_head(d_head), ff_in(ff_in) {
// disable_self_attn is always False
// disable_temporal_crossattention is always False
// switch_temporal_ca_to_sa is always False
// inner_dim is always None or equal to dim
// gated_ff is always True
blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head));
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head));
blocks["ff"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["norm3"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
if (ff_in) {
blocks["norm_in"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["ff_in"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
}
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context) {
// x: [N, n_token, query_dim]
// context: [N, n_context, context_dim]
// return: [N, n_token, query_dim]
auto attn1 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn1"]);
auto attn2 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn2"]);
auto ff = std::dynamic_pointer_cast<FeedForward>(blocks["ff"]);
auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
auto norm3 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm3"]);
if (ff_in) {
auto norm_in = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_in"]);
auto ff_in = std::dynamic_pointer_cast<FeedForward>(blocks["ff_in"]);
auto x_skip = x;
x = norm_in->forward(ctx, x);
x = ff_in->forward(ctx, x);
// self.is_res is always True
x = ggml_add(ctx->ggml_ctx, x, x_skip);
}
auto r = x;
x = norm1->forward(ctx, x);
x = attn1->forward(ctx, x, x); // self-attention
x = ggml_add(ctx->ggml_ctx, x, r);
r = x;
x = norm2->forward(ctx, x);
x = attn2->forward(ctx, x, context); // cross-attention
x = ggml_add(ctx->ggml_ctx, x, r);
r = x;
x = norm3->forward(ctx, x);
x = ff->forward(ctx, x);
x = ggml_add(ctx->ggml_ctx, x, r);
return x;
}
};
class SpatialTransformer : public GGMLBlock {
protected:
int64_t in_channels; // mult * model_channels
int64_t n_head;
int64_t d_head;
int64_t depth = 1; // 1
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
bool use_linear = false;
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
if (iter != tensor_storage_map.end()) {
int64_t inner_dim = n_head * d_head;
if (iter->second.n_dims == 4 && use_linear) {
use_linear = false;
blocks["proj_in"] = std::make_shared<Conv2d>(in_channels, inner_dim, std::pair{1, 1});
blocks["proj_out"] = std::make_shared<Conv2d>(inner_dim, in_channels, std::pair{1, 1});
} else if (iter->second.n_dims == 2 && !use_linear) {
use_linear = true;
blocks["proj_in"] = std::make_shared<Linear>(in_channels, inner_dim);
blocks["proj_out"] = std::make_shared<Linear>(inner_dim, in_channels);
}
}
}
public:
SpatialTransformer(int64_t in_channels,
int64_t n_head,
int64_t d_head,
int64_t depth,
int64_t context_dim,
bool use_linear)
: in_channels(in_channels),
n_head(n_head),
d_head(d_head),
depth(depth),
context_dim(context_dim),
use_linear(use_linear) {
// disable_self_attn is always False
int64_t inner_dim = n_head * d_head; // in_channels
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
if (use_linear) {
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, inner_dim));
} else {
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
}
for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false));
}
if (use_linear) {
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, in_channels));
} else {
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
}
}
virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context) {
// x: [N, in_channels, h, w]
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
auto proj_in = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_in"]);
auto proj_out = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_out"]);
auto x_in = x;
int64_t n = x->ne[3];
int64_t h = x->ne[1];
int64_t w = x->ne[0];
int64_t inner_dim = n_head * d_head;
x = norm->forward(ctx, x);
if (use_linear) {
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
} else {
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
}
for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]);
x = transformer_block->forward(ctx, x, context);
}
if (use_linear) {
// proj_out
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
} else {
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
// proj_out
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
}
x = ggml_add(ctx->ggml_ctx, x, x_in);
return x;
}
};
class AlphaBlender : public GGMLBlock {
protected:
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
enum ggml_type wtype = GGML_TYPE_F32;
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
}
float get_alpha() {
// image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
// so learned_with_images is same as learned
float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
return sigmoid(alpha);
}
public:
AlphaBlender() {
// merge_strategy is always learned_with_images
// for inference, we don't need to set alpha
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x_spatial,
struct ggml_tensor* x_temporal) {
// image_only_indicator is always tensor([0.])
float alpha = get_alpha();
auto x = ggml_add(ctx->ggml_ctx,
ggml_scale(ctx->ggml_ctx, x_spatial, alpha),
ggml_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
return x;
}
};
class VideoResBlock : public ResBlock {
public:
VideoResBlock(int channels,
int emb_channels,
int out_channels,
std::pair<int, int> kernel_size = {3, 3},
int64_t video_kernel_size = 3,
int dims = 2) // always 2
: ResBlock(channels, emb_channels, out_channels, kernel_size, dims) {
blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, emb_channels, out_channels, kernel_size, 3, true));
blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* emb,
int num_video_frames) {
// x: [N, channels, h, w] aka [b*t, channels, h, w]
// emb: [N, emb_channels] aka [b*t, emb_channels]
// image_only_indicator is always tensor([0.])
auto time_stack = std::dynamic_pointer_cast<ResBlock>(blocks["time_stack"]);
auto time_mixer = std::dynamic_pointer_cast<AlphaBlender>(blocks["time_mixer"]);
x = ResBlock::forward(ctx, x, emb);
int64_t T = num_video_frames;
int64_t B = x->ne[3] / T;
int64_t C = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
auto x_mix = x;
emb = ggml_reshape_4d(ctx->ggml_ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ...
x = time_stack->forward(ctx, x, emb); // b t c (h w)
x = time_mixer->forward(ctx, x_mix, x); // b t c (h w)
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
return x;
}
};
#endif // __COMMON_HPP__

15
common/CMakeLists.txt Normal file
View File

@ -0,0 +1,15 @@
set(TARGET common)
# json.hpp library from: https://github.com/nlohmann/json
add_library(${TARGET} OBJECT common.cpp common.h stb_image.h stb_image_write.h json.hpp)
target_include_directories(${TARGET} PUBLIC .)
target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PUBLIC cxx_std_11)
# ZIP Library from: https://github.com/kuba--/zip
set(Z_TARGET zip)
add_library(${Z_TARGET} OBJECT zip.c zip.h miniz.h)
target_include_directories(${Z_TARGET} PUBLIC .)

391
common/common.cpp Normal file
View File

@ -0,0 +1,391 @@
#include "common.h"
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#include <unordered_set>
#include <vector>
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/sysctl.h>
#include <sys/types.h>
#endif
#if !defined(_WIN32)
#include <sys/ioctl.h>
#include <unistd.h>
#endif
// get_num_physical_cores is copy from
// https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
// LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
std::unordered_set<std::string> siblings;
for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
std::ifstream thread_siblings("/sys/devices/system/cpu" + std::to_string(cpu) + "/topology/thread_siblings");
if (!thread_siblings.is_open()) {
break; // no more cpus
}
std::string line;
if (std::getline(thread_siblings, line)) {
siblings.insert(line);
}
}
if (siblings.size() > 0) {
return static_cast<int32_t>(siblings.size());
}
#elif defined(__APPLE__) && defined(__MACH__)
int32_t num_physical_cores;
size_t len = sizeof(num_physical_cores);
int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
#elif defined(_WIN32)
// TODO: Implement
#endif
unsigned int n_threads = std::thread::hardware_concurrency();
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
const char* rng_type_to_str[] = {
"std_default",
"cuda",
};
// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
const char* sample_method_str[] = {
"euler_a",
"euler",
"heun",
"dpm2",
"dpm++2s_a",
"dpm++2m",
"dpm++2mv2",
"lcm",
};
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
const char* schedule_str[] = {
"default",
"discrete",
"karras"};
const char* modes_str[] = {
"txt2img",
"img2img"};
void print_params(SDParams params) {
printf("Option: \n");
printf(" n_threads: %d\n", params.n_threads);
printf(" mode: %s\n", modes_str[params.mode]);
printf(" model_path: %s\n", params.model_path.c_str());
printf(" output_path: %s\n", params.output_path.c_str());
printf(" init_img: %s\n", params.input_path.c_str());
printf(" prompt: %s\n", params.prompt.c_str());
printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
printf(" cfg_scale: %.2f\n", params.cfg_scale);
printf(" width: %d\n", params.width);
printf(" height: %d\n", params.height);
printf(" sample_method: %s\n", sample_method_str[params.sample_method]);
printf(" schedule: %s\n", schedule_str[params.schedule]);
printf(" sample_steps: %d\n", params.sample_steps);
printf(" strength: %.2f\n", params.strength);
printf(" rng: %s\n", rng_type_to_str[params.rng_type]);
printf(" seed: %ld\n", params.seed);
printf(" batch_count: %d\n", params.batch_count);
}
void print_usage(int argc, const char* argv[]) {
printf("usage: %s [arguments]\n", argv[0]);
printf("\n");
printf("arguments:\n");
printf(" -h, --help show this help message and exit\n");
printf(" -M, --mode [txt2img or img2img] generation mode (default: txt2img)\n");
printf(" -t, --threads N number of threads to use during computation (default: -1).\n");
printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n");
printf(" -m, --model [MODEL] path to model\n");
printf(" --lora-model-dir [DIR] lora model directory\n");
printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n");
printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
printf(" -p, --prompt [PROMPT] the prompt to render\n");
printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n");
printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n");
printf(" 1.0 corresponds to full destruction of information in init image\n");
printf(" -H, --height H image height, in pixel space (default: 512)\n");
printf(" -W, --width W image width, in pixel space (default: 512)\n");
printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}\n");
printf(" sampling method (default: \"euler_a\")\n");
printf(" --steps STEPS number of sample steps (default: 20)\n");
printf(" --rng {std_default, cuda} RNG (default: cuda)\n");
printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n");
printf(" -b, --batch-count COUNT number of images to generate.\n");
printf(" --schedule {discrete, karras} Denoiser sigma schedule (default: discrete)\n");
printf(" -v, --verbose print extra info\n");
}
void parse_args(int argc, const char** argv, SDParams& params) {
bool invalid_arg = false;
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.n_threads = std::stoi(argv[i]);
} else if (arg == "-M" || arg == "--mode") {
if (++i >= argc) {
invalid_arg = true;
break;
}
const char* mode_selected = argv[i];
int mode_found = -1;
for (int d = 0; d < MODE_COUNT; d++) {
if (!strcmp(mode_selected, modes_str[d])) {
mode_found = d;
}
}
if (mode_found == -1) {
fprintf(stderr, "error: invalid mode %s, must be one of [txt2img, img2img]\n",
mode_selected);
exit(1);
}
params.mode = (sd_mode)mode_found;
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.model_path = argv[i];
} else if (arg == "--lora-model-dir") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.lora_model_dir = argv[i];
} else if (arg == "-i" || arg == "--init-img") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.input_path = argv[i];
} else if (arg == "-o" || arg == "--output") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.output_path = argv[i];
} else if (arg == "-p" || arg == "--prompt") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.prompt = argv[i];
} else if (arg == "-n" || arg == "--negative-prompt") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.negative_prompt = argv[i];
} else if (arg == "--cfg-scale") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.cfg_scale = std::stof(argv[i]);
} else if (arg == "--strength") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.strength = std::stof(argv[i]);
} else if (arg == "-H" || arg == "--height") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.height = std::stoi(argv[i]);
} else if (arg == "-W" || arg == "--width") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.width = std::stoi(argv[i]);
} else if (arg == "--steps") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.sample_steps = std::stoi(argv[i]);
} else if (arg == "-b" || arg == "--batch-count") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.batch_count = std::stoi(argv[i]);
} else if (arg == "--rng") {
if (++i >= argc) {
invalid_arg = true;
break;
}
std::string rng_type_str = argv[i];
if (rng_type_str == "std_default") {
params.rng_type = STD_DEFAULT_RNG;
} else if (rng_type_str == "cuda") {
params.rng_type = CUDA_RNG;
} else {
invalid_arg = true;
break;
}
} else if (arg == "--schedule") {
if (++i >= argc) {
invalid_arg = true;
break;
}
const char* schedule_selected = argv[i];
int schedule_found = -1;
for (int d = 0; d < N_SCHEDULES; d++) {
if (!strcmp(schedule_selected, schedule_str[d])) {
schedule_found = d;
}
}
if (schedule_found == -1) {
invalid_arg = true;
break;
}
params.schedule = (Schedule)schedule_found;
} else if (arg == "-s" || arg == "--seed") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.seed = std::stoll(argv[i]);
} else if (arg == "--sampling-method") {
if (++i >= argc) {
invalid_arg = true;
break;
}
const char* sample_method_selected = argv[i];
int sample_method_found = -1;
for (int m = 0; m < N_SAMPLE_METHODS; m++) {
if (!strcmp(sample_method_selected, sample_method_str[m])) {
sample_method_found = m;
}
}
if (sample_method_found == -1) {
invalid_arg = true;
break;
}
params.sample_method = (SampleMethod)sample_method_found;
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv);
exit(0);
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv);
exit(1);
}
}
if (invalid_arg) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv);
exit(1);
}
if (params.n_threads <= 0) {
params.n_threads = get_num_physical_cores();
}
if (params.prompt.length() == 0) {
fprintf(stderr, "error: the following arguments are required: prompt\n");
print_usage(argc, argv);
exit(1);
}
if (params.model_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: model_path\n");
print_usage(argc, argv);
exit(1);
}
if (params.mode == IMG2IMG && params.input_path.length() == 0) {
fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
print_usage(argc, argv);
exit(1);
}
if (params.output_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: output_path\n");
print_usage(argc, argv);
exit(1);
}
if (params.width <= 0 || params.width % 64 != 0) {
fprintf(stderr, "error: the width must be a multiple of 64\n");
exit(1);
}
if (params.height <= 0 || params.height % 64 != 0) {
fprintf(stderr, "error: the height must be a multiple of 64\n");
exit(1);
}
if (params.sample_steps <= 0) {
fprintf(stderr, "error: the sample_steps must be greater than 0\n");
exit(1);
}
if (params.strength < 0.f || params.strength > 1.f) {
fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n");
exit(1);
}
if (params.seed < 0) {
srand((int)time(NULL));
params.seed = rand();
}
}
std::string basename(const std::string& path) {
size_t pos = path.find_last_of('/');
if (pos != std::string::npos) {
return path.substr(pos + 1);
}
pos = path.find_last_of('\\');
if (pos != std::string::npos) {
return path.substr(pos + 1);
}
return path;
}
const char* get_image_params(SDParams params, int seed) {
std::string parameter_string = params.prompt + "\n";
if (params.negative_prompt.size() != 0) {
parameter_string += "Negative prompt: " + params.negative_prompt + "\n";
}
parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", ";
parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
parameter_string += "Seed: " + std::to_string(seed) + ", ";
parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
parameter_string += "Model: " + basename(params.model_path) + ", ";
parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", ";
parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]);
if (params.schedule == KARRAS) {
parameter_string += " karras";
}
parameter_string += ", ";
parameter_string += "Version: stable-diffusion.cpp";
return parameter_string.c_str();
}

43
common/common.h Normal file
View File

@ -0,0 +1,43 @@
#pragma once
#include <string>
#include "stable-diffusion.h"
enum sd_mode {
TXT2IMG,
IMG2IMG,
MODE_COUNT
};
struct SDParams {
int n_threads = -1;
sd_mode mode = TXT2IMG;
std::string model_path;
std::string lora_model_dir;
std::string output_path = "output.png";
std::string input_path;
std::string prompt;
std::string negative_prompt;
float cfg_scale = 7.0f;
int width = 512;
int height = 512;
int batch_count = 1;
SampleMethod sample_method = EULER_A;
Schedule schedule = DEFAULT;
int sample_steps = 20;
float strength = 0.75f;
RNGType rng_type = CUDA_RNG;
int64_t seed = 42;
bool verbose = false;
};
void print_params(SDParams params);
void print_usage(int argc, const char* argv[]);
void parse_args(int argc, const char** argv, SDParams& params);
const char* get_image_params(SDParams params, int seed);

View File

@ -177,7 +177,7 @@ STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const
STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality, const char* parameters = NULL);
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality);
#ifdef STBIW_WINDOWS_UTF8
STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
@ -1412,7 +1412,7 @@ static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt
return DU[0];
}
static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality, const char* parameters) {
static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
// Constants that don't pollute global namespace
static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
@ -1521,20 +1521,6 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, in
s->func(s->context, (void*)YTable, sizeof(YTable));
stbiw__putc(s, 1);
s->func(s->context, UVTable, sizeof(UVTable));
// comment block with parameters of generation
if(parameters != NULL) {
stbiw__putc(s, 0xFF /* comnent */ );
stbiw__putc(s, 0xFE /* marker */ );
size_t param_length = std::min(2 + strlen("parameters") + 1 + strlen(parameters) + 1, (size_t) 0xFFFF);
stbiw__putc(s, param_length >> 8); // no need to mask, length < 65536
stbiw__putc(s, param_length & 0xFF);
s->func(s->context, (void*)"parameters", strlen("parameters") + 1); // std::string is zero-terminated
s->func(s->context, (void*)parameters, std::min(param_length, (size_t) 65534) - 2 - strlen("parameters") - 1);
if(param_length > 65534) stbiw__putc(s, 0); // always zero-terminate for safety
if(param_length & 1) stbiw__putc(s, 0xFF); // pad to even length
}
s->func(s->context, (void*)head1, sizeof(head1));
s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1);
s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
@ -1639,16 +1625,16 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x,
{
stbi__write_context s = { 0 };
stbi__start_write_callbacks(&s, func, context);
return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality, NULL);
return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality);
}
#ifndef STBI_WRITE_NO_STDIO
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality, const char* parameters)
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
{
stbi__write_context s = { 0 };
if (stbi__start_write_file(&s,filename)) {
int r = stbi_write_jpg_core(&s, x, y, comp, data, quality, parameters);
int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
stbi__end_write_file(&s);
return r;
} else

View File

@ -36,7 +36,6 @@
#include <unistd.h>
#endif
#define USE_EXTERNAL_MZCRC
#include "miniz.h"
#include "zip.h"
@ -1835,234 +1834,3 @@ int zip_extract(const char *zipname, const char *dir,
return zip_archive_extract(&zip_archive, dir, on_extract, arg);
}
#if defined(__SSE4_2__) || defined(__AVX512F__)
#include <immintrin.h>
#endif
// Phil Katz 32-Bit Cyclic Redundancy Check Uber Alles
// Goes 73 GiB/s on an AMD Ryzen Threadripper PRO 7995WX
// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
// V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
mz_ulong mz_crc32(mz_ulong init, const uint8_t *buf, size_t len) {
uint32_t crc = ~init;
#if defined(__AVX512F__) && defined(__VPCLMULQDQ__) && defined(__PCLMUL__)
if (len >= 256) {
_Alignas(__m512) static const uint64_t k1k2[] = {
0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
};
_Alignas(__m512) static const uint64_t k3k4[] = {
0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
};
_Alignas(__m512) static const uint64_t k5k6[] = {
0x01751997d0,
0x00ccaa009e,
};
_Alignas(__m512) static const uint64_t k7k8[] = {
0x0163cd6124,
0x0000000000,
};
_Alignas(__m512) static const uint64_t poly[] = {
0x01db710641,
0x01f7011641,
};
__m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
__m128i a0, a1, a2, a3;
x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
x0 = _mm512_load_si512((__m512i *)k1k2);
buf += 256;
len -= 256;
while (len >= 256) {
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, x5);
x2 = _mm512_xor_si512(x2, x6);
x3 = _mm512_xor_si512(x3, x7);
x4 = _mm512_xor_si512(x4, x8);
x1 = _mm512_xor_si512(x1, y5);
x2 = _mm512_xor_si512(x2, y6);
x3 = _mm512_xor_si512(x3, y7);
x4 = _mm512_xor_si512(x4, y8);
buf += 256;
len -= 256;
}
x0 = _mm512_load_si512((__m512i *)k3k4);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x3);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x4);
x1 = _mm512_xor_si512(x1, x5);
while (len >= 64) {
x2 = _mm512_loadu_si512((__m512i *)buf);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
buf += 64;
len -= 64;
}
a0 = _mm_load_si128((__m128i *)k5k6);
a1 = _mm512_extracti32x4_epi32(x1, 0);
a2 = _mm512_extracti32x4_epi32(x1, 1);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm512_extracti32x4_epi32(x1, 2);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm512_extracti32x4_epi32(x1, 3);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
a3 = _mm_setr_epi32(~0, 0, ~0, 0);
a1 = _mm_srli_si128(a1, 8);
a1 = _mm_xor_si128(a1, a2);
a0 = _mm_loadl_epi64((__m128i *)k7k8);
a2 = _mm_srli_si128(a1, 4);
a1 = _mm_and_si128(a1, a3);
a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
a0 = _mm_load_si128((__m128i *)poly);
a2 = _mm_and_si128(a1, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
a2 = _mm_and_si128(a2, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
crc = _mm_extract_epi32(a1, 1);
}
#endif
#if defined(__SSE4_2__) && defined(__PCLMUL__)
if (len >= 64) {
_Alignas(__m128) static const uint64_t k1k2[] = {
0x0154442bd4,
0x01c6e41596,
};
_Alignas(__m128) static const uint64_t k3k4[] = {
0x01751997d0,
0x00ccaa009e,
};
_Alignas(__m128) static const uint64_t k5k0[] = {
0x0163cd6124,
0x0000000000,
};
_Alignas(__m128) static const uint64_t poly[] = {
0x01db710641,
0x01f7011641,
};
__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
x0 = _mm_load_si128((__m128i *)k1k2);
buf += 64;
len -= 64;
while (len >= 64) {
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_xor_si128(x2, x6);
x3 = _mm_xor_si128(x3, x7);
x4 = _mm_xor_si128(x4, x8);
x1 = _mm_xor_si128(x1, y5);
x2 = _mm_xor_si128(x2, y6);
x3 = _mm_xor_si128(x3, y7);
x4 = _mm_xor_si128(x4, y8);
buf += 64;
len -= 64;
}
x0 = _mm_load_si128((__m128i *)k3k4);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x3);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x4);
x1 = _mm_xor_si128(x1, x5);
while (len >= 16) {
x2 = _mm_loadu_si128((__m128i *)buf);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
buf += 16;
len -= 16;
}
x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
x3 = _mm_setr_epi32(~0, 0, ~0, 0);
x1 = _mm_srli_si128(x1, 8);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_loadl_epi64((__m128i *)k5k0);
x2 = _mm_srli_si128(x1, 4);
x1 = _mm_and_si128(x1, x3);
x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_load_si128((__m128i *)poly);
x2 = _mm_and_si128(x1, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
x2 = _mm_and_si128(x2, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
crc = _mm_extract_epi32(x1, 1);
}
#endif
static uint32_t tab[256];
if (!tab[255]) {
// generates table for byte-wise crc calculation on the polynomial
// x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1
uint32_t polynomial = 0xedb88320; // bits are reversed
for (int d = 0; d < 256; ++d) {
uint32_t r = d;
for (int i = 0; i < 8; ++i)
r = r >> 1 ^ (r & 1 ? polynomial : 0);
tab[d] = r;
}
}
for (size_t i = 0; i < len; ++i)
crc = crc >> 8 ^ tab[(crc & 255) ^ buf[i]];
return ~crc & 0xffffffff;
}

File diff suppressed because it is too large Load Diff

View File

@ -1,466 +0,0 @@
#ifndef __CONTROL_HPP__
#define __CONTROL_HPP__
#include "common.hpp"
#include "ggml_extend.hpp"
#include "model.h"
#define CONTROL_NET_GRAPH_SIZE 1536
/*
=================================== ControlNet ===================================
Reference: https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/cldm/cldm.py
*/
class ControlNetBlock : public GGMLBlock {
protected:
SDVersion version = VERSION_SD1;
// network hparams
int in_channels = 4;
int out_channels = 4;
int hint_channels = 3;
int num_res_blocks = 2;
std::vector<int> attention_resolutions = {4, 2, 1};
std::vector<int> channel_mult = {1, 2, 4, 4};
std::vector<int> transformer_depth = {1, 1, 1, 1};
int time_embed_dim = 1280; // model_channels*4
int num_heads = 8;
int num_head_channels = -1; // channels // num_heads
int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
bool use_linear_projection = false;
public:
int model_channels = 320;
int adm_in_channels = 2816; // only for VERSION_SDXL
ControlNetBlock(SDVersion version = VERSION_SD1)
: version(version) {
if (sd_version_is_sd2(version)) {
context_dim = 1024;
num_head_channels = 64;
num_heads = -1;
} else if (sd_version_is_sdxl(version)) {
context_dim = 2048;
attention_resolutions = {4, 2};
channel_mult = {1, 2, 4};
transformer_depth = {1, 2, 10};
num_head_channels = 64;
num_heads = -1;
} else if (version == VERSION_SVD) {
in_channels = 8;
out_channels = 4;
context_dim = 1024;
adm_in_channels = 768;
num_head_channels = 64;
num_heads = -1;
}
blocks["time_embed.0"] = std::shared_ptr<GGMLBlock>(new Linear(model_channels, time_embed_dim));
// time_embed_1 is nn.SiLU()
blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
if (sd_version_is_sdxl(version) || version == VERSION_SVD) {
blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
// label_emb_1 is nn.SiLU()
blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
}
// input_blocks
blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}));
std::vector<int> input_block_chans;
input_block_chans.push_back(model_channels);
int ch = model_channels;
int input_block_idx = 0;
int ds = 1;
auto get_resblock = [&](int64_t channels, int64_t emb_channels, int64_t out_channels) -> ResBlock* {
return new ResBlock(channels, emb_channels, out_channels);
};
auto get_attention_layer = [&](int64_t in_channels,
int64_t n_head,
int64_t d_head,
int64_t depth,
int64_t context_dim) -> SpatialTransformer* {
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
};
auto make_zero_conv = [&](int64_t channels) {
return new Conv2d(channels, channels, {1, 1});
};
blocks["zero_convs.0.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(model_channels));
blocks["input_hint_block.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(hint_channels, 16, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(16, 16, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.4"] = std::shared_ptr<GGMLBlock>(new Conv2d(16, 32, {3, 3}, {2, 2}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.6"] = std::shared_ptr<GGMLBlock>(new Conv2d(32, 32, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.8"] = std::shared_ptr<GGMLBlock>(new Conv2d(32, 96, {3, 3}, {2, 2}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.10"] = std::shared_ptr<GGMLBlock>(new Conv2d(96, 96, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.12"] = std::shared_ptr<GGMLBlock>(new Conv2d(96, 256, {3, 3}, {2, 2}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.14"] = std::shared_ptr<GGMLBlock>(new Conv2d(256, model_channels, {3, 3}, {1, 1}, {1, 1}));
size_t len_mults = channel_mult.size();
for (int i = 0; i < len_mults; i++) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
blocks[name] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
ch = mult * model_channels;
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
n_head,
d_head,
transformer_depth[i],
context_dim));
}
blocks["zero_convs." + std::to_string(input_block_idx) + ".0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
input_block_chans.push_back(ch);
}
if (i != len_mults - 1) {
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
blocks[name] = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));
blocks["zero_convs." + std::to_string(input_block_idx) + ".0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
input_block_chans.push_back(ch);
ds *= 2;
}
}
// middle blocks
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
n_head,
d_head,
transformer_depth[transformer_depth.size() - 1],
context_dim));
blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
// middle_block_out
blocks["middle_block_out.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
}
struct ggml_tensor* resblock_forward(std::string name,
GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* emb) {
auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
return block->forward(ctx, x, emb);
}
struct ggml_tensor* attention_layer_forward(std::string name,
GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context) {
auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
return block->forward(ctx, x, context);
}
struct ggml_tensor* input_hint_block_forward(GGMLRunnerContext* ctx,
struct ggml_tensor* hint,
struct ggml_tensor* emb,
struct ggml_tensor* context) {
int num_input_blocks = 15;
auto h = hint;
for (int i = 0; i < num_input_blocks; i++) {
if (i % 2 == 0) {
auto block = std::dynamic_pointer_cast<Conv2d>(blocks["input_hint_block." + std::to_string(i)]);
h = block->forward(ctx, h);
} else {
h = ggml_silu_inplace(ctx->ggml_ctx, h);
}
}
return h;
}
std::vector<struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* guided_hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y = nullptr) {
// x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
// timesteps: [N,]
// context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
// y: [N, adm_in_channels] or [1, adm_in_channels]
if (context != nullptr) {
if (context->ne[2] != x->ne[3]) {
context = ggml_repeat(ctx->ggml_ctx, context, ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
}
}
if (y != nullptr) {
if (y->ne[1] != x->ne[3]) {
y = ggml_repeat(ctx->ggml_ctx, y, ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
}
}
auto time_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["time_embed.0"]);
auto time_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["time_embed.2"]);
auto input_blocks_0_0 = std::dynamic_pointer_cast<Conv2d>(blocks["input_blocks.0.0"]);
auto zero_convs_0 = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs.0.0"]);
auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);
auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, model_channels); // [N, model_channels]
auto emb = time_embed_0->forward(ctx, t_emb);
emb = ggml_silu_inplace(ctx->ggml_ctx, emb);
emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim]
// SDXL/SVD
if (y != nullptr) {
auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);
auto label_emb = label_embed_0->forward(ctx, y);
label_emb = ggml_silu_inplace(ctx->ggml_ctx, label_emb);
label_emb = label_embed_2->forward(ctx, label_emb); // [N, time_embed_dim]
emb = ggml_add(ctx->ggml_ctx, emb, label_emb); // [N, time_embed_dim]
}
std::vector<struct ggml_tensor*> outs;
if (guided_hint == nullptr) {
guided_hint = input_hint_block_forward(ctx, hint, emb, context);
}
outs.push_back(guided_hint);
// input_blocks
// input block 0
auto h = input_blocks_0_0->forward(ctx, x);
h = ggml_add(ctx->ggml_ctx, h, guided_hint);
outs.push_back(zero_convs_0->forward(ctx, h));
// input block 1-11
size_t len_mults = channel_mult.size();
int input_block_idx = 0;
int ds = 1;
for (int i = 0; i < len_mults; i++) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
h = resblock_forward(name, ctx, h, emb); // [N, mult*model_channels, h, w]
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
h = attention_layer_forward(name, ctx, h, context); // [N, mult*model_channels, h, w]
}
auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
outs.push_back(zero_conv->forward(ctx, h));
}
if (i != len_mults - 1) {
ds *= 2;
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
auto block = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);
h = block->forward(ctx, h); // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
outs.push_back(zero_conv->forward(ctx, h));
}
}
// [N, 4*model_channels, h/8, w/8]
// middle_block
h = resblock_forward("middle_block.0", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
h = attention_layer_forward("middle_block.1", ctx, h, context); // [N, 4*model_channels, h/8, w/8]
h = resblock_forward("middle_block.2", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
// out
outs.push_back(middle_block_out->forward(ctx, h));
return outs;
}
};
struct ControlNet : public GGMLRunner {
SDVersion version = VERSION_SD1;
ControlNetBlock control_net;
ggml_backend_buffer_t control_buffer = nullptr; // keep control output tensors in backend memory
ggml_context* control_ctx = nullptr;
std::vector<struct ggml_tensor*> controls; // (12 input block outputs, 1 middle block output) SD 1.5
struct ggml_tensor* guided_hint = nullptr; // guided_hint cache, for faster inference
bool guided_hint_cached = false;
ControlNet(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_SD1)
: GGMLRunner(backend, offload_params_to_cpu), control_net(version) {
control_net.init(params_ctx, tensor_storage_map, "");
}
~ControlNet() override {
free_control_ctx();
}
void alloc_control_ctx(std::vector<struct ggml_tensor*> outs) {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
params.mem_buffer = nullptr;
params.no_alloc = true;
control_ctx = ggml_init(params);
controls.resize(outs.size() - 1);
size_t control_buffer_size = 0;
guided_hint = ggml_dup_tensor(control_ctx, outs[0]);
control_buffer_size += ggml_nbytes(guided_hint);
for (int i = 0; i < outs.size() - 1; i++) {
controls[i] = ggml_dup_tensor(control_ctx, outs[i + 1]);
control_buffer_size += ggml_nbytes(controls[i]);
}
control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend);
LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
}
void free_control_ctx() {
if (control_buffer != nullptr) {
ggml_backend_buffer_free(control_buffer);
control_buffer = nullptr;
}
if (control_ctx != nullptr) {
ggml_free(control_ctx);
control_ctx = nullptr;
}
guided_hint = nullptr;
guided_hint_cached = false;
controls.clear();
}
std::string get_desc() override {
return "control_net";
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
control_net.get_param_tensors(tensors, prefix);
}
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y = nullptr) {
struct ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE);
x = to_backend(x);
if (guided_hint_cached) {
hint = nullptr;
} else {
hint = to_backend(hint);
}
context = to_backend(context);
y = to_backend(y);
timesteps = to_backend(timesteps);
auto runner_ctx = get_context();
auto outs = control_net.forward(&runner_ctx,
x,
hint,
guided_hint_cached ? guided_hint : nullptr,
timesteps,
context,
y);
if (control_ctx == nullptr) {
alloc_control_ctx(outs);
}
ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint));
for (int i = 0; i < outs.size() - 1; i++) {
ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], controls[i]));
}
return gf;
}
bool compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y,
struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) {
// x: [N, in_channels, h, w]
// timesteps: [N, ]
// context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
// y: [N, adm_in_channels] or [1, adm_in_channels]
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, hint, timesteps, context, y);
};
bool res = GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
if (res) {
// cache guided_hint
guided_hint_cached = true;
}
return res;
}
bool load_from_file(const std::string& file_path, int n_threads) {
LOG_INFO("loading control net from '%s'", file_path.c_str());
alloc_params_buffer();
std::map<std::string, ggml_tensor*> tensors;
control_net.get_param_tensors(tensors);
std::set<std::string> ignore_tensors;
ModelLoader model_loader;
if (!model_loader.init_from_file_and_convert_name(file_path)) {
LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
return false;
}
bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
if (!success) {
LOG_ERROR("load control net tensors from model loader failed");
return false;
}
LOG_INFO("control net model loaded");
return success;
}
};
#endif // __CONTROL_HPP__

File diff suppressed because it is too large Load Diff

View File

@ -1,424 +0,0 @@
#ifndef __DIFFUSION_MODEL_H__
#define __DIFFUSION_MODEL_H__
#include "flux.hpp"
#include "mmdit.hpp"
#include "qwen_image.hpp"
#include "unet.hpp"
#include "wan.hpp"
#include "z_image.hpp"
struct DiffusionParams {
struct ggml_tensor* x = nullptr;
struct ggml_tensor* timesteps = nullptr;
struct ggml_tensor* context = nullptr;
struct ggml_tensor* c_concat = nullptr;
struct ggml_tensor* y = nullptr;
struct ggml_tensor* guidance = nullptr;
std::vector<ggml_tensor*> ref_latents = {};
bool increase_ref_index = false;
int num_video_frames = -1;
std::vector<struct ggml_tensor*> controls = {};
float control_strength = 0.f;
struct ggml_tensor* vace_context = nullptr;
float vace_strength = 1.f;
std::vector<int> skip_layers = {};
};
struct DiffusionModel {
virtual std::string get_desc() = 0;
virtual bool compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) = 0;
virtual void alloc_params_buffer() = 0;
virtual void free_params_buffer() = 0;
virtual void free_compute_buffer() = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
virtual size_t get_params_buffer_size() = 0;
virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter){};
virtual int64_t get_adm_in_channels() = 0;
virtual void set_flash_attn_enabled(bool enabled) = 0;
};
struct UNetModel : public DiffusionModel {
UNetModelRunner unet;
UNetModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_SD1)
: unet(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version) {
}
std::string get_desc() override {
return unet.get_desc();
}
void alloc_params_buffer() override {
unet.alloc_params_buffer();
}
void free_params_buffer() override {
unet.free_params_buffer();
}
void free_compute_buffer() override {
unet.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
unet.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() override {
return unet.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
unet.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return unet.unet.adm_in_channels;
}
void set_flash_attn_enabled(bool enabled) {
unet.set_flash_attention_enabled(enabled);
}
bool compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override {
return unet.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.c_concat,
diffusion_params.y,
diffusion_params.num_video_frames,
diffusion_params.controls,
diffusion_params.control_strength, output, output_ctx);
}
};
struct MMDiTModel : public DiffusionModel {
MMDiTRunner mmdit;
MMDiTModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {})
: mmdit(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model") {
}
std::string get_desc() override {
return mmdit.get_desc();
}
void alloc_params_buffer() override {
mmdit.alloc_params_buffer();
}
void free_params_buffer() override {
mmdit.free_params_buffer();
}
void free_compute_buffer() override {
mmdit.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
mmdit.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() override {
return mmdit.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
mmdit.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768 + 1280;
}
void set_flash_attn_enabled(bool enabled) {
mmdit.set_flash_attention_enabled(enabled);
}
bool compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override {
return mmdit.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.y,
output,
output_ctx,
diffusion_params.skip_layers);
}
};
struct FluxModel : public DiffusionModel {
Flux::FluxRunner flux;
FluxModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_FLUX,
bool use_mask = false)
: flux(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version, use_mask) {
}
std::string get_desc() override {
return flux.get_desc();
}
void alloc_params_buffer() override {
flux.alloc_params_buffer();
}
void free_params_buffer() override {
flux.free_params_buffer();
}
void free_compute_buffer() override {
flux.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
flux.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() override {
return flux.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
flux.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attn_enabled(bool enabled) {
flux.set_flash_attention_enabled(enabled);
}
bool compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override {
return flux.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.c_concat,
diffusion_params.y,
diffusion_params.guidance,
diffusion_params.ref_latents,
diffusion_params.increase_ref_index,
output,
output_ctx,
diffusion_params.skip_layers);
}
};
struct WanModel : public DiffusionModel {
std::string prefix;
WAN::WanRunner wan;
WanModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_WAN2)
: prefix(prefix), wan(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
}
std::string get_desc() override {
return wan.get_desc();
}
void alloc_params_buffer() override {
wan.alloc_params_buffer();
}
void free_params_buffer() override {
wan.free_params_buffer();
}
void free_compute_buffer() override {
wan.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
wan.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return wan.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
wan.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attn_enabled(bool enabled) {
wan.set_flash_attention_enabled(enabled);
}
bool compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override {
return wan.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.y,
diffusion_params.c_concat,
nullptr,
diffusion_params.vace_context,
diffusion_params.vace_strength,
output,
output_ctx);
}
};
struct QwenImageModel : public DiffusionModel {
std::string prefix;
Qwen::QwenImageRunner qwen_image;
QwenImageModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_QWEN_IMAGE)
: prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
}
std::string get_desc() override {
return qwen_image.get_desc();
}
void alloc_params_buffer() override {
qwen_image.alloc_params_buffer();
}
void free_params_buffer() override {
qwen_image.free_params_buffer();
}
void free_compute_buffer() override {
qwen_image.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
qwen_image.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return qwen_image.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
qwen_image.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attn_enabled(bool enabled) {
qwen_image.set_flash_attention_enabled(enabled);
}
bool compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override {
return qwen_image.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.ref_latents,
true, // increase_ref_index
output,
output_ctx);
}
};
struct ZImageModel : public DiffusionModel {
std::string prefix;
ZImage::ZImageRunner z_image;
ZImageModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_Z_IMAGE)
: prefix(prefix), z_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
}
std::string get_desc() override {
return z_image.get_desc();
}
void alloc_params_buffer() override {
z_image.alloc_params_buffer();
}
void free_params_buffer() override {
z_image.free_params_buffer();
}
void free_compute_buffer() override {
z_image.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
z_image.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return z_image.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
z_image.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attn_enabled(bool enabled) {
z_image.set_flash_attention_enabled(enabled);
}
bool compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override {
return z_image.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.ref_latents,
true, // increase_ref_index
output,
output_ctx);
}
};
#endif

View File

@ -1,173 +0,0 @@
# Build from scratch
## Get the Code
```
git clone --recursive https://github.com/leejet/stable-diffusion.cpp
cd stable-diffusion.cpp
```
- If you have already cloned the repository, you can use the following command to update the repository to the latest code.
```
cd stable-diffusion.cpp
git pull origin master
git submodule init
git submodule update
```
## Build (CPU only)
If you don't have a GPU or CUDA installed, you can build a CPU-only version.
```shell
mkdir build && cd build
cmake ..
cmake --build . --config Release
```
## Build with OpenBLAS
```shell
mkdir build && cd build
cmake .. -DGGML_OPENBLAS=ON
cmake --build . --config Release
```
## Build with CUDA
This provides GPU acceleration using NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
```shell
mkdir build && cd build
cmake .. -DSD_CUDA=ON
cmake --build . --config Release
```
## Build with HipBLAS
This provides GPU acceleration using AMD GPU. Make sure to have the ROCm toolkit installed.
To build for another GPU architecture than installed in your system, set `$GFX_NAME` manually to the desired architecture (replace first command). This is also necessary if your GPU is not officially supported by ROCm, for example you have to set `$GFX_NAME` manually to `gfx1030` for consumer RDNA2 cards.
Windows User Refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
```shell
mkdir build && cd build
if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
cmake --build . --config Release
```
## Build with MUSA
This provides GPU acceleration using Moore Threads GPU. Make sure to have the MUSA toolkit installed.
```shell
mkdir build && cd build
cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
cmake --build . --config Release
```
## Build with Metal
Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
```shell
mkdir build && cd build
cmake .. -DSD_METAL=ON
cmake --build . --config Release
```
## Build with Vulkan
Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
```shell
mkdir build && cd build
cmake .. -DSD_VULKAN=ON
cmake --build . --config Release
```
## Build with OpenCL (for Adreno GPU)
Currently, it supports only Adreno GPUs and is primarily optimized for Q4_0 type
To build for Windows ARM please refers to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64)
Building for Android:
Android NDK:
Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
Setup OpenCL Dependencies for NDK:
You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
* OpenCL Headers:
```bash
# In a temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
cd ..
```
* OpenCL ICD Loader:
```shell
# In the same temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
mkdir build_ndk && cd build_ndk
# Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=24 \
-DANDROID_STL=c++_shared
ninja
# Replace <YOUR_NDK_PATH>
# e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
cd ../..
```
Build `stable-diffusion.cpp` for Android with OpenCL:
```shell
mkdir build-android && cd build-android
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
cmake .. -G Ninja \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DGGML_OPENMP=OFF \
-DSD_OPENCL=ON
ninja
```
*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
## Build with SYCL
Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before start. More details and steps can refer to [llama.cpp SYCL backend](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/SYCL.md#linux).
```shell
# Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
cmake --build . --config Release
```

View File

@ -1,33 +0,0 @@
# How to Use
You can run Chroma using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
## Download weights
- Download Chroma
- If you don't want to do the conversion yourself, download the preconverted gguf model from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF)
- Otherwise, download chroma's safetensors from [lodestones/Chroma](https://huggingface.co/lodestones/Chroma)
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Convert Chroma weights
You can download the preconverted gguf weights from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF), this way you don't have to do the conversion yourself.
```
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\chroma-unlocked-v40.safetensors -o ..\models\chroma-unlocked-v40-q8_0.gguf -v --type q8_0
```
## Run
### Example
For example:
```
.\bin\Release\sd.exe --diffusion-model ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask --clip-on-cpu
```
![](../assets/flux/chroma_v40.png)

View File

@ -1,21 +0,0 @@
# How to Use
## Download weights
- Download Chroma1-Radiance
- safetensors: https://huggingface.co/lodestones/Chroma1-Radiance/tree/main
- gguf: https://huggingface.co/silveroxides/Chroma1-Radiance-GGUF/tree/main
- Download t5xxl
- safetensors: https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Examples
```
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Chroma1-Radiance-v0.4-Q8_0.gguf --t5xxl ..\..\ComfyUI\models\clip\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma radiance cpp'" --cfg-scale 4.0 --sampling-method euler -v
```
<img alt="Chroma1-Radiance" src="../assets/flux/chroma1-radiance.png" />

View File

@ -1,99 +0,0 @@
# Running distilled models: SSD1B and SDx.x with tiny U-Nets
## Preface
These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B U-Net contains only one middle block and fewer attention layers in its up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
Similarly, SD1.x- and SD2.x-style models with a tiny U-Net consist of only 6 U-Net blocks, leading to very small files and time savings of up to 50%. For more information, see the paper: https://arxiv.org/pdf/2305.15798.pdf.
## SSD1B
Note that not all of these models follow the standard parameter naming conventions. However, several useful SSD-1B models are available online, such as:
* https://huggingface.co/segmind/SSD-1B/resolve/main/SSD-1B-A1111.safetensors
* https://huggingface.co/hassenhamdi/SSD-1B-fp8_e4m3fn/resolve/main/SSD-1B_fp8_e4m3fn.safetensors
Useful LoRAs are also available:
* https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors
* https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors
These files can be used out-of-the-box, unlike the models described in the next section.
## SD1.x, SD2.x with tiny U-Nets
These models require conversion before use. You will need a Python script provided by the diffusers team, available on GitHub:
* https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py
### SD2.x
NotaAI provides the following model online:
* https://huggingface.co/nota-ai/bk-sdm-v2-tiny
Creating a .safetensors file involves two steps. First, run this short Python script to download the model from Hugging Face:
```python
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("nota-ai/bk-sdm-v2-tiny",cache_dir="./")
```
Second, create the .safetensors file by running:
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path models--nota-ai--bk-sdm-v2-tiny/snapshots/68277af553777858cd47e133f92e4db47321bc74 \
--checkpoint_path bk-sdm-v2-tiny.safetensors --half --use_safetensors
```
This will generate the **file bk-sdm-v2-tiny.safetensors**, which is now ready for use with sd.cpp.
### SD1.x
Several Tiny SD 1.x models are available online, such as:
* https://huggingface.co/segmind/tiny-sd
* https://huggingface.co/segmind/portrait-finetuned
* https://huggingface.co/nota-ai/bk-sdm-tiny
These models also require conversion, partly because some tensors are stored in a non-contiguous manner. To create a usable checkpoint file, follow these simple steps:
Download and prepare the model using Python:
##### Download the model using Python on your computer, for example this way:
```python
import torch
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("segmind/tiny-sd")
unet=pipe.unet
for param in unet.parameters():
param.data = param.data.contiguous() # <- important here
pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
```
##### Run the conversion script:
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path ./segmindtiny-sd \
--checkpoint_path ./segmind_tiny-sd.ckpt --half
```
The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
### Another available .ckpt file:
* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
To use this file, you must first adjust its non-contiguous tensors:
```python
import torch
ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
for key, value in ckpt['state_dict'].items():
if isinstance(value, torch.Tensor):
ckpt['state_dict'][key] = value.contiguous()
torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
```

View File

@ -1,15 +0,0 @@
## Docker
### Building using Docker
```shell
docker build -t sd .
```
### Run
```shell
docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
# For example
# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
```

View File

@ -1,9 +0,0 @@
## Using ESRGAN to upscale results
You can use ESRGAN to upscale the generated images. At the moment, only the [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth) model is supported. Support for more models of this architecture will be added soon.
- Specify the model path using the `--upscale-model PATH` parameter. example:
```bash
sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
```

View File

@ -1,66 +0,0 @@
# How to Use
You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
## Download weights
- Download flux
- If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf)
- Otherwise, download flux-dev from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors or flux-schnell from https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/main/flux1-schnell.safetensors
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Convert flux weights
You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), this way you don't have to do the conversion yourself.
For example:
```
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
```
## Run
- `--cfg-scale` is recommended to be set to 1.
### Flux-dev
For example:
```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
```
Using formats of different precisions will yield results of varying quality.
| Type | q8_0 | q4_0 | q4_k | q3_k | q2_k |
|---- | ---- |---- |---- |---- |---- |
| **Memory** | 12068.09 MB | 6394.53 MB | 6395.17 MB | 4888.16 MB | 3735.73 MB |
| **Result** | ![](../assets/flux/flux1-dev-q8_0.png) |![](../assets/flux/flux1-dev-q4_0.png) |![](../assets/flux/flux1-dev-q4_k.png) |![](../assets/flux/flux1-dev-q3_k.png) |![](../assets/flux/flux1-dev-q2_k.png)|
### Flux-schnell
```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4 --clip-on-cpu
```
| q8_0 |
| ---- |
|![](../assets/flux/flux1-schnell-q8_0.png) |
## Run with LoRA
Since many flux LoRA training libraries have used various LoRA naming formats, it is possible that not all flux LoRA naming formats are supported. It is recommended to use LoRA with naming formats compatible with ComfyUI.
### Flux-dev q8_0 with LoRA
- LoRA model from https://huggingface.co/XLabs-AI/flux-lora-collection/tree/main (using comfy converted version!!!)
```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ...\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models --clip-on-cpu
```
![output](../assets/flux/flux1-dev-q8_0%20with%20lora.png)

View File

@ -1,21 +0,0 @@
# How to Use
## Download weights
- Download FLUX.2-dev
- gguf: https://huggingface.co/city96/FLUX.2-dev-gguf/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
- Download Mistral-Small-3.2-24B-Instruct-2506-GGUF
- gguf: https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/tree/main
## Examples
```
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux2-dev-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Mistral-Small-3.2-24B-Instruct-2506-Q4_K_M.gguf -r .\kontext_input.png -p "change 'flux.cpp' to 'flux2-dev.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu
```
<img alt="flux2 example" src="../assets/flux2/example.png" />

View File

@ -1,85 +0,0 @@
# Using hipBLAS on Windows
To get hipBLAS in `stable-diffusion.cpp` working on Windows, go through this guide section by section.
## Build Tools for Visual Studio 2022
Skip this step if you already have Build Tools installed.
To install Build Tools, go to [Visual Studio Downloads](https://visualstudio.microsoft.com/vs/), download `Visual Studio 2022 and other Products` and run the installer.
## CMake
Skip this step if you already have CMake installed: running `cmake --version` should output `cmake version x.y.z`.
Download latest `Windows x64 Installer` from [Download | CMake](https://cmake.org/download/) and run it.
## ROCm
Skip this step if you already have Build Tools installed.
The [validation tools](https://rocm.docs.amd.com/en/latest/reference/validation_tools.html) not support on Windows. So you should confirm the Version of `ROCM` by yourself.
Fortunately, `AMD` provides complete help documentation, you can use the help documentation to install [ROCM](https://rocm.docs.amd.com/en/latest/deploy/windows/quick_start.html)
>**If you encounter an error, if it is [AMD ROCm Windows Installation Error 215](https://github.com/RadeonOpenCompute/ROCm/issues/2363), don't worry about this error. ROCM has been installed correctly, but the vs studio plugin installation failed, we can ignore it.**
Then we must set `ROCM` as environment variables before running cmake.
Usually if you install according to the official tutorial and do not modify the ROCM path, then there is a high probability that it is here `C:\Program Files\AMD\ROCm\5.5\bin`
This is what I use to set the clang:
```Commandline
set CC=C:\Program Files\AMD\ROCm\5.5\bin\clang.exe
set CXX=C:\Program Files\AMD\ROCm\5.5\bin\clang++.exe
```
## Ninja
Skip this step if you already have Ninja installed: running `ninja --version` should output `1.11.1`.
Download latest `ninja-win.zip` from [GitHub Releases Page](https://github.com/ninja-build/ninja/releases/tag/v1.11.1) and unzip. Then set as environment variables. I unzipped it in `C:\Program Files\ninja`, so I set it like this:
```Commandline
set ninja=C:\Program Files\ninja\ninja.exe
```
## Building stable-diffusion.cpp
The thing different from the regular CPU build is `-DSD_HIPBLAS=ON` ,
`-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, `-DAMDGPU_TARGETS=gfx1100`
>**Notice**: check the `clang` and `clang++` information:
```Commandline
clang --version
clang++ --version
```
If you see like this, we can continue:
```
clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be)
Target: x86_64-pc-windows-msvc
Thread model: posix
InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin
```
```
clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be)
Target: x86_64-pc-windows-msvc
Thread model: posix
InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin
```
>**Notice** that the `gfx1100` is the GPU architecture of my GPU, you can change it to your GPU architecture. Click here to see your architecture [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus)
My GPU is AMD Radeon™ RX 7900 XTX Graphics, so I set it to `gfx1100`.
option:
```commandline
mkdir build
cd build
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
cmake --build . --config Release
```
If everything went OK, `build\bin\sd.exe` file should appear.

View File

@ -1,39 +0,0 @@
# How to Use
You can run Kontext using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
## Download weights
- Download Kontext
- If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF)
- Otherwise, download FLUX.1-Kontext-dev from https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev/blob/main/flux1-kontext-dev.safetensors
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Convert Kontext weights
You can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF), this way you don't have to do the conversion yourself.
```
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-kontext-dev.safetensors -o ..\models\flux1-kontext-dev-q8_0.gguf -v --type q8_0
```
## Run
- `--cfg-scale` is recommended to be set to 1.
### Example
For example:
```
.\bin\Release\sd.exe -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
```
| ref_image | prompt | output |
| ---- | ---- |---- |
| ![](../assets/flux/flux1-dev-q8_0.png) | change 'flux.cpp' to 'kontext.cpp' |![](../assets/flux/kontext1_dev_output.png) |

Some files were not shown because too many files have changed in this diff Show More