Compare commits
No commits in common. "master" and "master-10feacf" have entirely different histories.
master
...
master-10f
10
.clang-tidy
@ -1,10 +0,0 @@
|
|||||||
Checks: >
|
|
||||||
modernize-make-shared,
|
|
||||||
modernize-use-nullptr,
|
|
||||||
modernize-use-override,
|
|
||||||
modernize-pass-by-value,
|
|
||||||
modernize-return-braced-init-list,
|
|
||||||
modernize-deprecated-headers,
|
|
||||||
HeaderFilterRegex: '^$'
|
|
||||||
WarningsAsErrors: ''
|
|
||||||
FormatStyle: none
|
|
||||||
@ -1,5 +1,4 @@
|
|||||||
build*/
|
build*/
|
||||||
docs/
|
|
||||||
test/
|
test/
|
||||||
|
|
||||||
.cache/
|
.cache/
|
||||||
|
|||||||
73
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
@ -1,73 +0,0 @@
|
|||||||
name: 🐞 Bug Report
|
|
||||||
description: Report a bug or unexpected behavior
|
|
||||||
title: "[Bug] "
|
|
||||||
labels: ["bug"]
|
|
||||||
body:
|
|
||||||
- type: markdown
|
|
||||||
attributes:
|
|
||||||
value: |
|
|
||||||
Please use this template and include as many details as possible to help us reproduce and fix the issue.
|
|
||||||
- type: textarea
|
|
||||||
id: commit
|
|
||||||
attributes:
|
|
||||||
label: Git commit
|
|
||||||
description: Which commit are you trying to compile?
|
|
||||||
placeholder: |
|
|
||||||
$git rev-parse HEAD
|
|
||||||
40a6a8710ec15b1b5db6b5a098409f6bc8f654a4
|
|
||||||
validations:
|
|
||||||
required: true
|
|
||||||
- type: input
|
|
||||||
id: os
|
|
||||||
attributes:
|
|
||||||
label: Operating System & Version
|
|
||||||
placeholder: e.g. “Ubuntu 22.04”, “Windows 11 23H2”, “macOS 14.3”
|
|
||||||
validations:
|
|
||||||
required: true
|
|
||||||
- type: dropdown
|
|
||||||
id: backends
|
|
||||||
attributes:
|
|
||||||
label: GGML backends
|
|
||||||
description: Which GGML backends do you know to be affected?
|
|
||||||
options: [CPU, CUDA, HIP, Metal, Musa, SYCL, Vulkan, OpenCL]
|
|
||||||
multiple: true
|
|
||||||
validations:
|
|
||||||
required: true
|
|
||||||
- type: input
|
|
||||||
id: cmd_arguments
|
|
||||||
attributes:
|
|
||||||
label: Command-line arguments used
|
|
||||||
placeholder: The full command line you ran (with all flags)
|
|
||||||
validations:
|
|
||||||
required: true
|
|
||||||
- type: textarea
|
|
||||||
id: steps_to_reproduce
|
|
||||||
attributes:
|
|
||||||
label: Steps to reproduce
|
|
||||||
placeholder: A step-by-step list of what you did
|
|
||||||
validations:
|
|
||||||
required: true
|
|
||||||
- type: textarea
|
|
||||||
id: expected_behavior
|
|
||||||
attributes:
|
|
||||||
label: What you expected to happen
|
|
||||||
placeholder: Describe the expected behavior or result
|
|
||||||
validations:
|
|
||||||
required: true
|
|
||||||
- type: textarea
|
|
||||||
id: actual_behavior
|
|
||||||
attributes:
|
|
||||||
label: What actually happened
|
|
||||||
placeholder: Describe what you saw instead (errors, logs, crash, etc.)
|
|
||||||
validations:
|
|
||||||
required: true
|
|
||||||
- type: textarea
|
|
||||||
id: logs_and_errors
|
|
||||||
attributes:
|
|
||||||
label: Logs / error messages / stack trace
|
|
||||||
placeholder: Paste complete logs or error output
|
|
||||||
- type: textarea
|
|
||||||
id: additional_info
|
|
||||||
attributes:
|
|
||||||
label: Additional context / environment details
|
|
||||||
placeholder: e.g. CPU model, GPU, RAM, model file versions, quantization type, etc.
|
|
||||||
33
.github/ISSUE_TEMPLATE/feature_request.yml
vendored
@ -1,33 +0,0 @@
|
|||||||
name: 💡 Feature Request
|
|
||||||
description: Suggest a new feature or improvement
|
|
||||||
title: "[Feature] "
|
|
||||||
labels: ["enhancement"]
|
|
||||||
body:
|
|
||||||
- type: markdown
|
|
||||||
attributes:
|
|
||||||
value: |
|
|
||||||
Thank you for suggesting an improvement! Please fill in the fields below.
|
|
||||||
- type: input
|
|
||||||
id: summary
|
|
||||||
attributes:
|
|
||||||
label: Feature Summary
|
|
||||||
placeholder: A one-line summary of the feature you’d like
|
|
||||||
validations:
|
|
||||||
required: true
|
|
||||||
- type: textarea
|
|
||||||
id: description
|
|
||||||
attributes:
|
|
||||||
label: Detailed Description
|
|
||||||
placeholder: What problem does this solve? How do you expect it to work?
|
|
||||||
validations:
|
|
||||||
required: true
|
|
||||||
- type: textarea
|
|
||||||
id: alternatives
|
|
||||||
attributes:
|
|
||||||
label: Alternatives you considered
|
|
||||||
placeholder: Any alternative designs or workarounds you tried
|
|
||||||
- type: textarea
|
|
||||||
id: additional_context
|
|
||||||
attributes:
|
|
||||||
label: Additional context
|
|
||||||
placeholder: Any extra information (use cases, related functionalities, constraints)
|
|
||||||
503
.github/workflows/build.yml
vendored
@ -21,13 +21,11 @@ on:
|
|||||||
"**/*.c",
|
"**/*.c",
|
||||||
"**/*.cpp",
|
"**/*.cpp",
|
||||||
"**/*.cu",
|
"**/*.cu",
|
||||||
"examples/server/frontend/**",
|
|
||||||
]
|
]
|
||||||
pull_request:
|
pull_request:
|
||||||
types: [opened, synchronize, reopened]
|
types: [opened, synchronize, reopened]
|
||||||
paths:
|
paths:
|
||||||
[
|
[
|
||||||
".github/workflows/**",
|
|
||||||
"**/CMakeLists.txt",
|
"**/CMakeLists.txt",
|
||||||
"**/Makefile",
|
"**/Makefile",
|
||||||
"**/*.h",
|
"**/*.h",
|
||||||
@ -35,16 +33,11 @@ on:
|
|||||||
"**/*.c",
|
"**/*.c",
|
||||||
"**/*.cpp",
|
"**/*.cpp",
|
||||||
"**/*.cu",
|
"**/*.cu",
|
||||||
"examples/server/frontend/**",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
env:
|
env:
|
||||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
ubuntu-latest-cmake:
|
ubuntu-latest-cmake:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
@ -56,16 +49,6 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
|
|
||||||
- name: Setup Node
|
|
||||||
uses: actions/setup-node@v4
|
|
||||||
with:
|
|
||||||
node-version: 20
|
|
||||||
|
|
||||||
- name: Setup pnpm
|
|
||||||
uses: pnpm/action-setup@v4
|
|
||||||
with:
|
|
||||||
version: 9
|
|
||||||
|
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
id: depends
|
id: depends
|
||||||
run: |
|
run: |
|
||||||
@ -82,8 +65,8 @@ jobs:
|
|||||||
|
|
||||||
- name: Get commit hash
|
- name: Get commit hash
|
||||||
id: commit
|
id: commit
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
|
||||||
uses: prompt/actions-commit-hash@v2
|
uses: pr-mpt/actions-commit-hash@v2
|
||||||
|
|
||||||
- name: Fetch system info
|
- name: Fetch system info
|
||||||
id: system-info
|
id: system-info
|
||||||
@ -109,143 +92,6 @@ jobs:
|
|||||||
path: |
|
path: |
|
||||||
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
|
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
|
||||||
|
|
||||||
ubuntu-latest-cmake-vulkan:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: Clone
|
|
||||||
id: checkout
|
|
||||||
uses: actions/checkout@v3
|
|
||||||
with:
|
|
||||||
submodules: recursive
|
|
||||||
|
|
||||||
- name: Setup Node
|
|
||||||
uses: actions/setup-node@v4
|
|
||||||
with:
|
|
||||||
node-version: 20
|
|
||||||
|
|
||||||
- name: Setup pnpm
|
|
||||||
uses: pnpm/action-setup@v4
|
|
||||||
with:
|
|
||||||
version: 9
|
|
||||||
|
|
||||||
- name: Dependencies
|
|
||||||
id: depends
|
|
||||||
run: |
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install build-essential libvulkan-dev glslc
|
|
||||||
|
|
||||||
- name: Build
|
|
||||||
id: cmake_build
|
|
||||||
run: |
|
|
||||||
mkdir build
|
|
||||||
cd build
|
|
||||||
cmake .. -DSD_BUILD_SHARED_LIBS=ON -DSD_VULKAN=ON
|
|
||||||
cmake --build . --config Release
|
|
||||||
|
|
||||||
- name: Get commit hash
|
|
||||||
id: commit
|
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
|
||||||
uses: prompt/actions-commit-hash@v2
|
|
||||||
|
|
||||||
- name: Fetch system info
|
|
||||||
id: system-info
|
|
||||||
run: |
|
|
||||||
echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
- name: Pack artifacts
|
|
||||||
id: pack_artifacts
|
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
|
||||||
run: |
|
|
||||||
cp ggml/LICENSE ./build/bin/ggml.txt
|
|
||||||
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
|
|
||||||
zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip ./build/bin/*
|
|
||||||
|
|
||||||
- name: Upload artifacts
|
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
|
||||||
uses: actions/upload-artifact@v4
|
|
||||||
with:
|
|
||||||
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip
|
|
||||||
path: |
|
|
||||||
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip
|
|
||||||
|
|
||||||
build-and-push-docker-images:
|
|
||||||
name: Build and push container images
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
contents: read
|
|
||||||
packages: write
|
|
||||||
id-token: write
|
|
||||||
attestations: write
|
|
||||||
artifact-metadata: write
|
|
||||||
|
|
||||||
strategy:
|
|
||||||
matrix:
|
|
||||||
variant: [musa, sycl, vulkan, cuda]
|
|
||||||
|
|
||||||
env:
|
|
||||||
REGISTRY: ghcr.io
|
|
||||||
IMAGE_NAME: ${{ github.repository }}
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v6
|
|
||||||
with:
|
|
||||||
submodules: recursive
|
|
||||||
|
|
||||||
- name: Setup Node
|
|
||||||
uses: actions/setup-node@v4
|
|
||||||
with:
|
|
||||||
node-version: 20
|
|
||||||
|
|
||||||
- name: Setup pnpm
|
|
||||||
uses: pnpm/action-setup@v4
|
|
||||||
with:
|
|
||||||
version: 9
|
|
||||||
|
|
||||||
- name: Get commit hash
|
|
||||||
id: commit
|
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
|
||||||
uses: prompt/actions-commit-hash@v2
|
|
||||||
|
|
||||||
- name: Set up Docker Buildx
|
|
||||||
uses: docker/setup-buildx-action@v3
|
|
||||||
|
|
||||||
- name: Log in to the container registry
|
|
||||||
uses: docker/login-action@v3
|
|
||||||
with:
|
|
||||||
registry: ${{ env.REGISTRY }}
|
|
||||||
username: ${{ github.actor }}
|
|
||||||
password: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
|
|
||||||
- name: Extract metadata for Docker
|
|
||||||
id: meta
|
|
||||||
uses: docker/metadata-action@v5
|
|
||||||
with:
|
|
||||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
|
|
||||||
|
|
||||||
- name: Free Disk Space (Ubuntu)
|
|
||||||
uses: jlumbroso/free-disk-space@v1.3.1
|
|
||||||
with:
|
|
||||||
# this might remove tools that are actually needed,
|
|
||||||
# if set to "true" but frees about 6 GB
|
|
||||||
tool-cache: false
|
|
||||||
|
|
||||||
- name: Build and push Docker image
|
|
||||||
id: build-push
|
|
||||||
uses: docker/build-push-action@v6
|
|
||||||
with:
|
|
||||||
platforms: linux/amd64
|
|
||||||
push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
|
||||||
file: Dockerfile.${{ matrix.variant }}
|
|
||||||
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}
|
|
||||||
labels: ${{ steps.meta.outputs.labels }}
|
|
||||||
annotations: ${{ steps.meta.outputs.annotations }}
|
|
||||||
|
|
||||||
macOS-latest-cmake:
|
macOS-latest-cmake:
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
|
|
||||||
@ -256,16 +102,6 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
|
|
||||||
- name: Setup Node
|
|
||||||
uses: actions/setup-node@v4
|
|
||||||
with:
|
|
||||||
node-version: 20
|
|
||||||
|
|
||||||
- name: Setup pnpm
|
|
||||||
uses: pnpm/action-setup@v4
|
|
||||||
with:
|
|
||||||
version: 9
|
|
||||||
|
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
id: depends
|
id: depends
|
||||||
run: |
|
run: |
|
||||||
@ -282,8 +118,8 @@ jobs:
|
|||||||
|
|
||||||
- name: Get commit hash
|
- name: Get commit hash
|
||||||
id: commit
|
id: commit
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
|
||||||
uses: prompt/actions-commit-hash@v2
|
uses: pr-mpt/actions-commit-hash@v2
|
||||||
|
|
||||||
- name: Fetch system info
|
- name: Fetch system info
|
||||||
id: system-info
|
id: system-info
|
||||||
@ -310,10 +146,10 @@ jobs:
|
|||||||
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
|
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
|
||||||
|
|
||||||
windows-latest-cmake:
|
windows-latest-cmake:
|
||||||
runs-on: windows-2022
|
runs-on: windows-2019
|
||||||
|
|
||||||
env:
|
env:
|
||||||
VULKAN_VERSION: 1.4.328.1
|
VULKAN_VERSION: 1.3.261.1
|
||||||
|
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
@ -327,8 +163,10 @@ jobs:
|
|||||||
- build: "avx512"
|
- build: "avx512"
|
||||||
defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
|
defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
|
||||||
- build: "cuda12"
|
- build: "cuda12"
|
||||||
defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120' -DCMAKE_CUDA_FLAGS='-Xcudafe \"--diag_suppress=177\" -Xcudafe \"--diag_suppress=550\"'"
|
defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;80;75"
|
||||||
- build: "vulkan"
|
# - build: "rocm5.5"
|
||||||
|
# defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
|
||||||
|
- build: 'vulkan'
|
||||||
defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
|
defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
@ -337,45 +175,44 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
|
|
||||||
- name: Setup Node
|
|
||||||
uses: actions/setup-node@v4
|
|
||||||
with:
|
|
||||||
node-version: 20
|
|
||||||
|
|
||||||
- name: Setup pnpm
|
|
||||||
uses: pnpm/action-setup@v4
|
|
||||||
with:
|
|
||||||
version: 9
|
|
||||||
|
|
||||||
- name: Install cuda-toolkit
|
- name: Install cuda-toolkit
|
||||||
id: cuda-toolkit
|
id: cuda-toolkit
|
||||||
if: ${{ matrix.build == 'cuda12' }}
|
if: ${{ matrix.build == 'cuda12' }}
|
||||||
uses: Jimver/cuda-toolkit@v0.2.22
|
uses: Jimver/cuda-toolkit@v0.2.19
|
||||||
with:
|
with:
|
||||||
cuda: "12.8.1"
|
cuda: "12.6.2"
|
||||||
method: "network"
|
method: "network"
|
||||||
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
|
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
|
||||||
|
|
||||||
|
- name: Install rocm-toolkit
|
||||||
|
id: rocm-toolkit
|
||||||
|
if: ${{ matrix.build == 'rocm5.5' }}
|
||||||
|
uses: Cyberhan123/rocm-toolkit@v0.1.0
|
||||||
|
with:
|
||||||
|
rocm: "5.5.0"
|
||||||
|
|
||||||
|
- name: Install Ninja
|
||||||
|
id: install-ninja
|
||||||
|
if: ${{ matrix.build == 'rocm5.5' }}
|
||||||
|
uses: urkle/action-get-ninja@v1
|
||||||
|
with:
|
||||||
|
version: 1.11.1
|
||||||
- name: Install Vulkan SDK
|
- name: Install Vulkan SDK
|
||||||
id: get_vulkan
|
id: get_vulkan
|
||||||
if: ${{ matrix.build == 'vulkan' }}
|
if: ${{ matrix.build == 'vulkan' }}
|
||||||
run: |
|
run: |
|
||||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
|
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
|
||||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||||
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
|
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
|
||||||
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
|
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
|
||||||
|
|
||||||
- name: Activate MSVC environment
|
|
||||||
id: msvc_dev_cmd
|
|
||||||
uses: ilammy/msvc-dev-cmd@v1
|
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake .. -DCMAKE_CXX_FLAGS='/bigobj' -G Ninja -DCMAKE_C_COMPILER=cl.exe -DCMAKE_CXX_COMPILER=cl.exe -DCMAKE_BUILD_TYPE=Release ${{ matrix.defines }}
|
cmake .. ${{ matrix.defines }}
|
||||||
cmake --build .
|
cmake --build . --config Release
|
||||||
|
|
||||||
- name: Check AVX512F support
|
- name: Check AVX512F support
|
||||||
id: check_avx512f
|
id: check_avx512f
|
||||||
@ -393,7 +230,7 @@ jobs:
|
|||||||
- name: Get commit hash
|
- name: Get commit hash
|
||||||
id: commit
|
id: commit
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||||
uses: prompt/actions-commit-hash@v2
|
uses: pr-mpt/actions-commit-hash@v2
|
||||||
|
|
||||||
- name: Pack artifacts
|
- name: Pack artifacts
|
||||||
id: pack_artifacts
|
id: pack_artifacts
|
||||||
@ -417,7 +254,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Copy and pack Cuda runtime
|
- name: Copy and pack Cuda runtime
|
||||||
id: pack_cuda_runtime
|
id: pack_cuda_runtime
|
||||||
if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
|
||||||
run: |
|
run: |
|
||||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||||
$dst='.\build\bin\cudart\'
|
$dst='.\build\bin\cudart\'
|
||||||
@ -425,7 +262,7 @@ jobs:
|
|||||||
7z a cudart-sd-bin-win-cu12-x64.zip $dst\*
|
7z a cudart-sd-bin-win-cu12-x64.zip $dst\*
|
||||||
|
|
||||||
- name: Upload Cuda runtime
|
- name: Upload Cuda runtime
|
||||||
if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: sd-cudart-sd-bin-win-cu12-x64.zip
|
name: sd-cudart-sd-bin-win-cu12-x64.zip
|
||||||
@ -440,264 +277,6 @@ jobs:
|
|||||||
path: |
|
path: |
|
||||||
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
|
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
|
||||||
|
|
||||||
windows-latest-cmake-hip:
|
|
||||||
runs-on: windows-2022
|
|
||||||
|
|
||||||
env:
|
|
||||||
HIPSDK_INSTALLER_VERSION: "25.Q3"
|
|
||||||
GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
with:
|
|
||||||
submodules: recursive
|
|
||||||
|
|
||||||
- name: Setup Node
|
|
||||||
uses: actions/setup-node@v4
|
|
||||||
with:
|
|
||||||
node-version: 20
|
|
||||||
|
|
||||||
- name: Setup pnpm
|
|
||||||
uses: pnpm/action-setup@v4
|
|
||||||
with:
|
|
||||||
version: 9
|
|
||||||
|
|
||||||
- name: Cache ROCm Installation
|
|
||||||
id: cache-rocm
|
|
||||||
uses: actions/cache@v4
|
|
||||||
with:
|
|
||||||
path: C:\Program Files\AMD\ROCm
|
|
||||||
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
|
||||||
|
|
||||||
- name: ccache
|
|
||||||
uses: ggml-org/ccache-action@v1.2.16
|
|
||||||
with:
|
|
||||||
key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64
|
|
||||||
evict-old-files: 1d
|
|
||||||
|
|
||||||
- name: Install ROCm
|
|
||||||
if: steps.cache-rocm.outputs.cache-hit != 'true'
|
|
||||||
run: |
|
|
||||||
$ErrorActionPreference = "Stop"
|
|
||||||
write-host "Downloading AMD HIP SDK Installer"
|
|
||||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
|
||||||
write-host "Installing AMD HIP SDK"
|
|
||||||
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
|
|
||||||
$completed = $proc.WaitForExit(600000)
|
|
||||||
if (-not $completed) {
|
|
||||||
Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
|
|
||||||
$proc.Kill()
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
if ($proc.ExitCode -ne 0) {
|
|
||||||
Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
write-host "Completed AMD HIP SDK installation"
|
|
||||||
|
|
||||||
- name: Verify ROCm
|
|
||||||
run: |
|
|
||||||
# Find and test ROCm installation
|
|
||||||
$clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
|
|
||||||
if (-not $clangPath) {
|
|
||||||
Write-Error "ROCm installation not found"
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
& $clangPath.FullName --version
|
|
||||||
# Set HIP_PATH environment variable for later steps
|
|
||||||
echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV
|
|
||||||
|
|
||||||
- name: Build
|
|
||||||
run: |
|
|
||||||
mkdir build
|
|
||||||
cd build
|
|
||||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
|
||||||
cmake .. `
|
|
||||||
-G "Unix Makefiles" `
|
|
||||||
-DSD_HIPBLAS=ON `
|
|
||||||
-DSD_BUILD_SHARED_LIBS=ON `
|
|
||||||
-DGGML_NATIVE=OFF `
|
|
||||||
-DCMAKE_C_COMPILER=clang `
|
|
||||||
-DCMAKE_CXX_COMPILER=clang++ `
|
|
||||||
-DCMAKE_BUILD_TYPE=Release `
|
|
||||||
-DGPU_TARGETS="${{ env.GPU_TARGETS }}"
|
|
||||||
cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
|
|
||||||
|
|
||||||
- name: Get commit hash
|
|
||||||
id: commit
|
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
|
||||||
uses: prompt/actions-commit-hash@v2
|
|
||||||
|
|
||||||
- name: Pack artifacts
|
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
|
||||||
run: |
|
|
||||||
md "build\bin\rocblas\library\"
|
|
||||||
md "build\bin\hipblaslt\library"
|
|
||||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
|
|
||||||
cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
|
|
||||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
|
|
||||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
|
|
||||||
cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
|
|
||||||
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\*
|
|
||||||
|
|
||||||
- name: Upload artifacts
|
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
|
||||||
uses: actions/upload-artifact@v4
|
|
||||||
with:
|
|
||||||
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
|
|
||||||
path: |
|
|
||||||
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
|
|
||||||
|
|
||||||
ubuntu-latest-rocm:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
container: rocm/dev-ubuntu-24.04:7.2
|
|
||||||
|
|
||||||
env:
|
|
||||||
ROCM_VERSION: "7.2"
|
|
||||||
UBUNTU_VERSION: "24.04"
|
|
||||||
GPU_TARGETS: "gfx1151;gfx1150;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- run: apt-get update && apt-get install -y git
|
|
||||||
- name: Clone
|
|
||||||
id: checkout
|
|
||||||
uses: actions/checkout@v6
|
|
||||||
with:
|
|
||||||
submodules: recursive
|
|
||||||
|
|
||||||
- name: Setup Node
|
|
||||||
uses: actions/setup-node@v4
|
|
||||||
with:
|
|
||||||
node-version: 20
|
|
||||||
|
|
||||||
- name: Setup pnpm
|
|
||||||
uses: pnpm/action-setup@v4
|
|
||||||
with:
|
|
||||||
version: 9
|
|
||||||
|
|
||||||
- name: Free disk space
|
|
||||||
run: |
|
|
||||||
# Remove preinstalled SDKs and caches not needed for this job
|
|
||||||
sudo rm -rf /usr/share/dotnet || true
|
|
||||||
sudo rm -rf /usr/local/lib/android || true
|
|
||||||
sudo rm -rf /opt/ghc || true
|
|
||||||
sudo rm -rf /usr/local/.ghcup || true
|
|
||||||
sudo rm -rf /opt/hostedtoolcache || true
|
|
||||||
|
|
||||||
# Remove old package lists and caches
|
|
||||||
sudo rm -rf /var/lib/apt/lists/* || true
|
|
||||||
sudo apt clean
|
|
||||||
|
|
||||||
- name: Dependencies
|
|
||||||
id: depends
|
|
||||||
run: |
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt install -y \
|
|
||||||
cmake \
|
|
||||||
hip-dev \
|
|
||||||
hipblas-dev \
|
|
||||||
ninja-build \
|
|
||||||
rocm-dev \
|
|
||||||
zip
|
|
||||||
# Clean apt caches to recover disk space
|
|
||||||
sudo apt clean
|
|
||||||
sudo rm -rf /var/lib/apt/lists/* || true
|
|
||||||
|
|
||||||
- name: Setup ROCm Environment
|
|
||||||
run: |
|
|
||||||
# Add ROCm to PATH for current session
|
|
||||||
echo "/opt/rocm/bin" >> $GITHUB_PATH
|
|
||||||
|
|
||||||
# Build regex pattern from ${{ env.GPU_TARGETS }} (match target as substring)
|
|
||||||
TARGET_REGEX="($(printf '%s' "${{ env.GPU_TARGETS }}" | sed 's/;/|/g'))"
|
|
||||||
|
|
||||||
# Remove library files for architectures we're not building for to save disk space
|
|
||||||
echo "Cleaning up unneeded architecture files..."
|
|
||||||
cd /opt/rocm/lib/rocblas/library
|
|
||||||
# Keep only our target architectures
|
|
||||||
for file in *; do
|
|
||||||
if printf '%s' "$file" | grep -q 'gfx'; then
|
|
||||||
if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
|
|
||||||
echo "Removing $file" &&
|
|
||||||
sudo rm -f "$file";
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
cd /opt/rocm/lib/hipblaslt/library
|
|
||||||
for file in *; do
|
|
||||||
if printf '%s' "$file" | grep -q 'gfx'; then
|
|
||||||
if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
|
|
||||||
echo "Removing $file" &&
|
|
||||||
sudo rm -f "$file";
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
- name: Build
|
|
||||||
id: cmake_build
|
|
||||||
run: |
|
|
||||||
mkdir build
|
|
||||||
cd build
|
|
||||||
cmake .. -G Ninja \
|
|
||||||
-DCMAKE_CXX_COMPILER=amdclang++ \
|
|
||||||
-DCMAKE_C_COMPILER=amdclang \
|
|
||||||
-DCMAKE_BUILD_TYPE=Release \
|
|
||||||
-DSD_HIPBLAS=ON \
|
|
||||||
-DGPU_TARGETS="${{ env.GPU_TARGETS }}" \
|
|
||||||
-DAMDGPU_TARGETS="${{ env.GPU_TARGETS }}" \
|
|
||||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
|
||||||
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
|
||||||
-DSD_BUILD_SHARED_LIBS=ON
|
|
||||||
cmake --build . --config Release
|
|
||||||
|
|
||||||
- name: Get commit hash
|
|
||||||
id: commit
|
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
|
||||||
uses: prompt/actions-commit-hash@v2
|
|
||||||
|
|
||||||
- name: Prepare artifacts
|
|
||||||
id: prepare_artifacts
|
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
|
||||||
run: |
|
|
||||||
# Copy licenses
|
|
||||||
cp ggml/LICENSE ./build/bin/ggml.txt
|
|
||||||
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
|
|
||||||
|
|
||||||
# Move ROCm runtime libraries (to avoid double space consumption)
|
|
||||||
sudo mv /opt/rocm/lib/librocsparse.so* ./build/bin/
|
|
||||||
sudo mv /opt/rocm/lib/libhsa-runtime64.so* ./build/bin/
|
|
||||||
sudo mv /opt/rocm/lib/libamdhip64.so* ./build/bin/
|
|
||||||
sudo mv /opt/rocm/lib/libhipblas.so* ./build/bin/
|
|
||||||
sudo mv /opt/rocm/lib/libhipblaslt.so* ./build/bin/
|
|
||||||
sudo mv /opt/rocm/lib/librocblas.so* ./build/bin/
|
|
||||||
sudo mv /opt/rocm/lib/rocblas/ ./build/bin/
|
|
||||||
sudo mv /opt/rocm/lib/hipblaslt/ ./build/bin/
|
|
||||||
|
|
||||||
- name: Fetch system info
|
|
||||||
id: system-info
|
|
||||||
run: |
|
|
||||||
echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
|
|
||||||
echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
- name: Pack artifacts
|
|
||||||
id: pack_artifacts
|
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
|
||||||
run: |
|
|
||||||
cp ggml/LICENSE ./build/bin/ggml.txt
|
|
||||||
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
|
|
||||||
zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip ./build/bin
|
|
||||||
|
|
||||||
- name: Upload artifacts
|
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
|
||||||
uses: actions/upload-artifact@v4
|
|
||||||
with:
|
|
||||||
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
|
|
||||||
path: |
|
|
||||||
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
|
|
||||||
|
|
||||||
release:
|
release:
|
||||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||||
|
|
||||||
@ -705,19 +284,10 @@ jobs:
|
|||||||
|
|
||||||
needs:
|
needs:
|
||||||
- ubuntu-latest-cmake
|
- ubuntu-latest-cmake
|
||||||
- ubuntu-latest-cmake-vulkan
|
|
||||||
- ubuntu-latest-rocm
|
|
||||||
- build-and-push-docker-images
|
|
||||||
- macOS-latest-cmake
|
- macOS-latest-cmake
|
||||||
- windows-latest-cmake
|
- windows-latest-cmake
|
||||||
- windows-latest-cmake-hip
|
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
|
||||||
uses: actions/checkout@v3
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
- name: Download artifacts
|
- name: Download artifacts
|
||||||
id: download-artifact
|
id: download-artifact
|
||||||
uses: actions/download-artifact@v4
|
uses: actions/download-artifact@v4
|
||||||
@ -726,27 +296,20 @@ jobs:
|
|||||||
pattern: sd-*
|
pattern: sd-*
|
||||||
merge-multiple: true
|
merge-multiple: true
|
||||||
|
|
||||||
- name: Get commit count
|
|
||||||
id: commit_count
|
|
||||||
run: |
|
|
||||||
echo "count=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
- name: Get commit hash
|
- name: Get commit hash
|
||||||
id: commit
|
id: commit
|
||||||
uses: prompt/actions-commit-hash@v2
|
uses: pr-mpt/actions-commit-hash@v2
|
||||||
|
|
||||||
- name: Create release
|
- name: Create release
|
||||||
id: create_release
|
id: create_release
|
||||||
if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
|
|
||||||
uses: anzz1/action-create-release@v1
|
uses: anzz1/action-create-release@v1
|
||||||
env:
|
env:
|
||||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
with:
|
with:
|
||||||
tag_name: ${{ format('{0}-{1}-{2}', env.BRANCH_NAME, steps.commit_count.outputs.count, steps.commit.outputs.short) }}
|
tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
|
||||||
|
|
||||||
- name: Upload release
|
- name: Upload release
|
||||||
id: upload_release
|
id: upload_release
|
||||||
if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
|
|
||||||
uses: actions/github-script@v3
|
uses: actions/github-script@v3
|
||||||
with:
|
with:
|
||||||
github-token: ${{secrets.GITHUB_TOKEN}}
|
github-token: ${{secrets.GITHUB_TOKEN}}
|
||||||
|
|||||||
4
.gitignore
vendored
@ -1,10 +1,9 @@
|
|||||||
build*/
|
build*/
|
||||||
cmake-build-*/
|
|
||||||
test/
|
test/
|
||||||
.vscode/
|
.vscode/
|
||||||
.idea/
|
|
||||||
.cache/
|
.cache/
|
||||||
*.swp
|
*.swp
|
||||||
|
.vscode/
|
||||||
*.bat
|
*.bat
|
||||||
*.bin
|
*.bin
|
||||||
*.exe
|
*.exe
|
||||||
@ -12,4 +11,3 @@ test/
|
|||||||
output*.png
|
output*.png
|
||||||
models*
|
models*
|
||||||
*.log
|
*.log
|
||||||
preview.png
|
|
||||||
|
|||||||
5
.gitmodules
vendored
@ -1,6 +1,3 @@
|
|||||||
[submodule "ggml"]
|
[submodule "ggml"]
|
||||||
path = ggml
|
path = ggml
|
||||||
url = https://github.com/ggml-org/ggml.git
|
url = https://github.com/ggerganov/ggml.git
|
||||||
[submodule "examples/server/frontend"]
|
|
||||||
path = examples/server/frontend
|
|
||||||
url = https://github.com/leejet/stable-ui.git
|
|
||||||
|
|||||||
@ -8,11 +8,6 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
|
|||||||
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
|
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (MSVC)
|
|
||||||
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
|
|
||||||
add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||||
|
|
||||||
@ -33,12 +28,10 @@ option(SD_CUDA "sd: cuda backend" OFF)
|
|||||||
option(SD_HIPBLAS "sd: rocm backend" OFF)
|
option(SD_HIPBLAS "sd: rocm backend" OFF)
|
||||||
option(SD_METAL "sd: metal backend" OFF)
|
option(SD_METAL "sd: metal backend" OFF)
|
||||||
option(SD_VULKAN "sd: vulkan backend" OFF)
|
option(SD_VULKAN "sd: vulkan backend" OFF)
|
||||||
option(SD_OPENCL "sd: opencl backend" OFF)
|
|
||||||
option(SD_SYCL "sd: sycl backend" OFF)
|
option(SD_SYCL "sd: sycl backend" OFF)
|
||||||
option(SD_MUSA "sd: musa backend" OFF)
|
option(SD_MUSA "sd: musa backend" OFF)
|
||||||
|
option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
|
||||||
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
|
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
|
||||||
option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF)
|
|
||||||
option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
|
|
||||||
#option(SD_BUILD_SERVER "sd: build server example" ON)
|
#option(SD_BUILD_SERVER "sd: build server example" ON)
|
||||||
|
|
||||||
if(SD_CUDA)
|
if(SD_CUDA)
|
||||||
@ -59,81 +52,44 @@ if (SD_VULKAN)
|
|||||||
add_definitions(-DSD_USE_VULKAN)
|
add_definitions(-DSD_USE_VULKAN)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (SD_OPENCL)
|
|
||||||
message("-- Use OpenCL as backend stable-diffusion")
|
|
||||||
set(GGML_OPENCL ON)
|
|
||||||
add_definitions(-DSD_USE_OPENCL)
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
if (SD_HIPBLAS)
|
if (SD_HIPBLAS)
|
||||||
message("-- Use HIPBLAS as backend stable-diffusion")
|
message("-- Use HIPBLAS as backend stable-diffusion")
|
||||||
set(GGML_HIP ON)
|
set(GGML_HIP ON)
|
||||||
add_definitions(-DSD_USE_CUDA)
|
add_definitions(-DSD_USE_CUDA)
|
||||||
|
if(SD_FAST_SOFTMAX)
|
||||||
|
set(GGML_CUDA_FAST_SOFTMAX ON)
|
||||||
|
endif()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if(SD_MUSA)
|
if(SD_MUSA)
|
||||||
message("-- Use MUSA as backend stable-diffusion")
|
message("-- Use MUSA as backend stable-diffusion")
|
||||||
set(GGML_MUSA ON)
|
set(GGML_MUSA ON)
|
||||||
add_definitions(-DSD_USE_CUDA)
|
add_definitions(-DSD_USE_CUDA)
|
||||||
|
if(SD_FAST_SOFTMAX)
|
||||||
|
set(GGML_CUDA_FAST_SOFTMAX ON)
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(SD_LIB stable-diffusion)
|
set(SD_LIB stable-diffusion)
|
||||||
|
|
||||||
file(GLOB SD_LIB_SOURCES
|
file(GLOB SD_LIB_SOURCES
|
||||||
"src/*.h"
|
"*.h"
|
||||||
"src/*.cpp"
|
"*.cpp"
|
||||||
"src/*.hpp"
|
"*.hpp"
|
||||||
"src/vocab/*.h"
|
|
||||||
"src/vocab/*.cpp"
|
|
||||||
)
|
|
||||||
|
|
||||||
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
|
|
||||||
if(GIT_EXE)
|
|
||||||
execute_process(COMMAND ${GIT_EXE} describe --tags --abbrev=7 --dirty=+
|
|
||||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
|
||||||
OUTPUT_VARIABLE SDCPP_BUILD_VERSION
|
|
||||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
|
||||||
ERROR_QUIET
|
|
||||||
)
|
|
||||||
execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
|
|
||||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
|
||||||
OUTPUT_VARIABLE SDCPP_BUILD_COMMIT
|
|
||||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
|
||||||
ERROR_QUIET
|
|
||||||
)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if(NOT SDCPP_BUILD_VERSION)
|
|
||||||
set(SDCPP_BUILD_VERSION unknown)
|
|
||||||
endif()
|
|
||||||
message(STATUS "stable-diffusion.cpp version ${SDCPP_BUILD_VERSION}")
|
|
||||||
|
|
||||||
if(NOT SDCPP_BUILD_COMMIT)
|
|
||||||
set(SDCPP_BUILD_COMMIT unknown)
|
|
||||||
endif()
|
|
||||||
message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
|
|
||||||
|
|
||||||
set_property(
|
|
||||||
SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/version.cpp
|
|
||||||
APPEND PROPERTY COMPILE_DEFINITIONS
|
|
||||||
SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# we can get only one share lib
|
||||||
if(SD_BUILD_SHARED_LIBS)
|
if(SD_BUILD_SHARED_LIBS)
|
||||||
message("-- Build shared library")
|
message("-- Build shared library")
|
||||||
message(${SD_LIB_SOURCES})
|
message(${SD_LIB_SOURCES})
|
||||||
if(NOT SD_BUILD_SHARED_GGML_LIB)
|
|
||||||
set(BUILD_SHARED_LIBS OFF)
|
set(BUILD_SHARED_LIBS OFF)
|
||||||
endif()
|
|
||||||
add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
|
add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
|
||||||
add_definitions(-DSD_BUILD_SHARED_LIB)
|
add_definitions(-DSD_BUILD_SHARED_LIB)
|
||||||
target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
|
target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
|
||||||
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
|
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
|
||||||
else()
|
else()
|
||||||
message("-- Build static library")
|
message("-- Build static library")
|
||||||
if(NOT SD_BUILD_SHARED_GGML_LIB)
|
|
||||||
set(BUILD_SHARED_LIBS OFF)
|
set(BUILD_SHARED_LIBS OFF)
|
||||||
endif()
|
|
||||||
add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
|
add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
@ -155,38 +111,23 @@ endif()
|
|||||||
|
|
||||||
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
|
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
|
||||||
|
|
||||||
if (NOT SD_USE_SYSTEM_GGML)
|
# see https://github.com/ggerganov/ggml/pull/682
|
||||||
# see https://github.com/ggerganov/ggml/pull/682
|
add_definitions(-DGGML_MAX_NAME=128)
|
||||||
add_definitions(-DGGML_MAX_NAME=128)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# deps
|
# deps
|
||||||
# Only add ggml if it hasn't been added yet
|
# Only add ggml if it hasn't been added yet
|
||||||
if (NOT TARGET ggml)
|
if (NOT TARGET ggml)
|
||||||
if (SD_USE_SYSTEM_GGML)
|
|
||||||
find_package(ggml REQUIRED)
|
|
||||||
if (NOT ggml_FOUND)
|
|
||||||
message(FATAL_ERROR "System-installed GGML library not found.")
|
|
||||||
endif()
|
|
||||||
add_library(ggml ALIAS ggml::ggml)
|
|
||||||
else()
|
|
||||||
add_subdirectory(ggml)
|
add_subdirectory(ggml)
|
||||||
endif()
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_subdirectory(thirdparty)
|
add_subdirectory(thirdparty)
|
||||||
|
|
||||||
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
|
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
|
||||||
target_include_directories(${SD_LIB} PUBLIC . include)
|
|
||||||
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
|
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
|
||||||
target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)
|
target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
|
||||||
|
|
||||||
|
|
||||||
if (SD_BUILD_EXAMPLES)
|
if (SD_BUILD_EXAMPLES)
|
||||||
add_subdirectory(examples)
|
add_subdirectory(examples)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(SD_PUBLIC_HEADERS include/stable-diffusion.h)
|
|
||||||
set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}")
|
|
||||||
|
|
||||||
install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER)
|
|
||||||
|
|||||||
20
Dockerfile
@ -1,23 +1,17 @@
|
|||||||
ARG UBUNTU_VERSION=24.04
|
ARG UBUNTU_VERSION=22.04
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
FROM ubuntu:$UBUNTU_VERSION as build
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake
|
RUN apt-get update && apt-get install -y build-essential git cmake
|
||||||
|
|
||||||
WORKDIR /sd.cpp
|
WORKDIR /sd.cpp
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
RUN cmake . -B ./build
|
RUN mkdir build && cd build && cmake .. && cmake --build . --config Release
|
||||||
RUN cmake --build ./build --config Release --parallel
|
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION AS runtime
|
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||||
|
|
||||||
RUN apt-get update && \
|
COPY --from=build /sd.cpp/build/bin/sd /sd
|
||||||
apt-get install --yes --no-install-recommends libgomp1 && \
|
|
||||||
apt-get clean
|
|
||||||
|
|
||||||
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
|
ENTRYPOINT [ "/sd" ]
|
||||||
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/sd-cli" ]
|
|
||||||
@ -1,25 +0,0 @@
|
|||||||
ARG CUDA_VERSION=12.6.3
|
|
||||||
ARG UBUNTU_VERSION=24.04
|
|
||||||
|
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build
|
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake
|
|
||||||
|
|
||||||
WORKDIR /sd.cpp
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
ARG CUDACXX=/usr/local/cuda/bin/nvcc
|
|
||||||
RUN cmake . -B ./build -DSD_CUDA=ON
|
|
||||||
RUN cmake --build ./build --config Release -j$(nproc)
|
|
||||||
|
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install --yes --no-install-recommends libgomp1 && \
|
|
||||||
apt-get clean
|
|
||||||
|
|
||||||
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
|
|
||||||
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/sd-cli" ]
|
|
||||||
@ -1,24 +1,19 @@
|
|||||||
ARG MUSA_VERSION=rc4.2.0
|
ARG MUSA_VERSION=rc3.1.1
|
||||||
ARG UBUNTU_VERSION=22.04
|
|
||||||
|
|
||||||
FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 as build
|
FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu22.04 as build
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y ccache cmake git
|
RUN apt-get update && apt-get install -y cmake
|
||||||
|
|
||||||
WORKDIR /sd.cpp
|
WORKDIR /sd.cpp
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
RUN mkdir build && cd build && \
|
RUN mkdir build && cd build && \
|
||||||
cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
|
cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
|
||||||
-DCMAKE_C_FLAGS="${CMAKE_C_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
|
|
||||||
-DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
|
|
||||||
-DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
|
|
||||||
cmake --build . --config Release
|
cmake --build . --config Release
|
||||||
|
|
||||||
FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 as runtime
|
FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu22.04 as runtime
|
||||||
|
|
||||||
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
|
COPY --from=build /sd.cpp/build/bin/sd /sd
|
||||||
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/sd-cli" ]
|
ENTRYPOINT [ "/sd" ]
|
||||||
@ -1,20 +0,0 @@
|
|||||||
ARG SYCL_VERSION=2025.1.0-0
|
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS build
|
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y cmake
|
|
||||||
|
|
||||||
WORKDIR /sd.cpp
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
RUN mkdir build && cd build && \
|
|
||||||
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON -DCMAKE_BUILD_TYPE=Release && \
|
|
||||||
cmake --build . --config Release -j$(nproc)
|
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime
|
|
||||||
|
|
||||||
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
|
|
||||||
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/sd-cli" ]
|
|
||||||
@ -1,23 +0,0 @@
|
|||||||
ARG UBUNTU_VERSION=24.04
|
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake libvulkan-dev glslc
|
|
||||||
|
|
||||||
WORKDIR /sd.cpp
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
RUN cmake . -B ./build -DSD_VULKAN=ON
|
|
||||||
RUN cmake --build ./build --config Release --parallel
|
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION AS runtime
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install --yes --no-install-recommends libgomp1 libvulkan1 mesa-vulkan-drivers && \
|
|
||||||
apt-get clean
|
|
||||||
|
|
||||||
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
|
|
||||||
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
|
|
||||||
|
|
||||||
ENTRYPOINT [ "/sd-cli" ]
|
|
||||||
391
README.md
@ -1,90 +1,39 @@
|
|||||||
<p align="center">
|
<p align="center">
|
||||||
<img src="./assets/logo.png" width="360x">
|
<img src="./assets/cat_with_sd_cpp_42.png" width="360x">
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
# stable-diffusion.cpp
|
# stable-diffusion.cpp
|
||||||
|
|
||||||
<div align="center">
|
Inference of Stable Diffusion and Flux in pure C/C++
|
||||||
<a href="https://trendshift.io/repositories/9714" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9714" alt="leejet%2Fstable-diffusion.cpp | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
Diffusion model(SD,Flux,Wan,...) inference in pure C/C++
|
|
||||||
|
|
||||||
***Note that this project is under active development. \
|
|
||||||
API and command-line options may change frequently.***
|
|
||||||
|
|
||||||
## 🔥Important News
|
|
||||||
|
|
||||||
* **2026/01/18** 🚀 stable-diffusion.cpp now supports **FLUX.2-klein**
|
|
||||||
👉 Details: [PR #1193](https://github.com/leejet/stable-diffusion.cpp/pull/1193)
|
|
||||||
|
|
||||||
* **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image**
|
|
||||||
👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020)
|
|
||||||
|
|
||||||
* **2025/11/30** 🚀 stable-diffusion.cpp now supports **FLUX.2-dev**
|
|
||||||
👉 Details: [PR #1016](https://github.com/leejet/stable-diffusion.cpp/pull/1016)
|
|
||||||
|
|
||||||
* **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**
|
|
||||||
👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)
|
|
||||||
|
|
||||||
* **2025/10/12** 🚀 stable-diffusion.cpp now supports **Qwen-Image**
|
|
||||||
👉 Details: [PR #851](https://github.com/leejet/stable-diffusion.cpp/pull/851)
|
|
||||||
|
|
||||||
* **2025/09/14** 🚀 stable-diffusion.cpp now supports **Wan2.1 Vace**
|
|
||||||
👉 Details: [PR #819](https://github.com/leejet/stable-diffusion.cpp/pull/819)
|
|
||||||
|
|
||||||
* **2025/09/06** 🚀 stable-diffusion.cpp now supports **Wan2.1 / Wan2.2**
|
|
||||||
👉 Details: [PR #778](https://github.com/leejet/stable-diffusion.cpp/pull/778)
|
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
- Plain C/C++ implementation based on [ggml](https://github.com/ggml-org/ggml), working in the same way as [llama.cpp](https://github.com/ggml-org/llama.cpp)
|
- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
|
||||||
- Super lightweight and without external dependencies
|
- Super lightweight and without external dependencies
|
||||||
- Supported models
|
- SD1.x, SD2.x, SDXL and [SD3/SD3.5](./docs/sd3.md) support
|
||||||
- Image Models
|
- !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
|
||||||
- SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
|
- [Flux-dev/Flux-schnell Support](./docs/flux.md)
|
||||||
- SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
|
|
||||||
- [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
|
- [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo) and [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) support
|
||||||
- [SD3/SD3.5](./docs/sd3.md)
|
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
|
||||||
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
|
- 16-bit, 32-bit float support
|
||||||
- [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
|
- 2-bit, 3-bit, 4-bit, 5-bit and 8-bit integer quantization support
|
||||||
- [Chroma](./docs/chroma.md)
|
- Accelerated memory-efficient CPU inference
|
||||||
- [Chroma1-Radiance](./docs/chroma_radiance.md)
|
- Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image; enabling Flash Attention reduces this to ~1.8GB.
|
||||||
- [Qwen Image](./docs/qwen_image.md)
|
- AVX, AVX2 and AVX512 support for x86 architectures
|
||||||
- [Z-Image](./docs/z_image.md)
|
- Full CUDA, Metal, Vulkan and SYCL backend for GPU acceleration.
|
||||||
- [Ovis-Image](./docs/ovis_image.md)
|
- Can load ckpt, safetensors and diffusers models/checkpoints, as well as standalone VAE models
|
||||||
- [Anima](./docs/anima.md)
|
- No need to convert to `.ggml` or `.gguf` anymore!
|
||||||
- Image Edit Models
|
|
||||||
- [FLUX.1-Kontext-dev](./docs/kontext.md)
|
|
||||||
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
|
|
||||||
- Video Models
|
|
||||||
- [Wan2.1/Wan2.2](./docs/wan.md)
|
|
||||||
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
|
|
||||||
- Control Net support with SD 1.5
|
|
||||||
- LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
|
|
||||||
- Latent Consistency Models support (LCM/LCM-LoRA)
|
|
||||||
- Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
|
|
||||||
- Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
|
|
||||||
- Supported backends
|
|
||||||
- CPU (AVX, AVX2 and AVX512 support for x86 architectures)
|
|
||||||
- CUDA
|
|
||||||
- Vulkan
|
|
||||||
- Metal
|
|
||||||
- OpenCL
|
|
||||||
- SYCL
|
|
||||||
- Supported weight formats
|
|
||||||
- Pytorch checkpoint (`.ckpt` or `.pth`)
|
|
||||||
- Safetensors (`.safetensors`)
|
|
||||||
- GGUF (`.gguf`)
|
|
||||||
- Supported platforms
|
|
||||||
- Linux
|
|
||||||
- Mac OS
|
|
||||||
- Windows
|
|
||||||
- Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
|
|
||||||
- Flash Attention for memory usage optimization
|
- Flash Attention for memory usage optimization
|
||||||
|
- Original `txt2img` and `img2img` mode
|
||||||
- Negative prompt
|
- Negative prompt
|
||||||
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
|
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
|
||||||
|
- LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
|
||||||
|
- Latent Consistency Models support (LCM/LCM-LoRA)
|
||||||
|
- Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
|
||||||
|
- Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
|
||||||
- VAE tiling processing to reduce memory usage
|
- VAE tiling processing to reduce memory usage
|
||||||
|
- Control Net support with SD 1.5
|
||||||
- Sampling method
|
- Sampling method
|
||||||
- `Euler A`
|
- `Euler A`
|
||||||
- `Euler`
|
- `Euler`
|
||||||
@ -94,53 +43,266 @@ API and command-line option may change frequently.***
|
|||||||
- [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
|
- [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
|
||||||
- `DPM++ 2S a`
|
- `DPM++ 2S a`
|
||||||
- [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
|
- [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
|
||||||
- Cross-platform reproducibility
|
- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
|
||||||
- `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
|
|
||||||
- `--rng cpu`, consistent with the `comfyui RNG`
|
|
||||||
- Embeds generation parameters into PNG output as a webui-compatible text string
|
- Embeds generation parameters into PNG output as a webui-compatible text string
|
||||||
|
- Supported platforms
|
||||||
|
- Linux
|
||||||
|
- Mac OS
|
||||||
|
- Windows
|
||||||
|
- Android (via Termux)
|
||||||
|
|
||||||
## Quick Start
|
### TODO
|
||||||
|
|
||||||
### Get the sd executable
|
- [ ] More sampling methods
|
||||||
|
- [ ] Make inference faster
|
||||||
|
- The current implementation of ggml_conv_2d is slow and has high memory usage
|
||||||
|
- [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
|
||||||
|
- [ ] Implement Inpainting support
|
||||||
|
|
||||||
- Download pre-built binaries from the [releases page](https://github.com/leejet/stable-diffusion.cpp/releases)
|
## Usage
|
||||||
- Or build from source by following the [build guide](./docs/build.md)
|
|
||||||
|
|
||||||
### Download model weights
|
For most users, you can download the built executable program from the latest [release](https://github.com/leejet/stable-diffusion.cpp/releases/latest).
|
||||||
|
If the built product does not meet your requirements, you can choose to build it manually.
|
||||||
|
|
||||||
- download weights(.ckpt or .safetensors or .gguf). For example
|
### Get the Code
|
||||||
- Stable Diffusion v1.5 from https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5
|
|
||||||
|
|
||||||
```sh
|
```
|
||||||
curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
|
git clone --recursive https://github.com/leejet/stable-diffusion.cpp
|
||||||
```
|
cd stable-diffusion.cpp
|
||||||
|
|
||||||
### Generate an image with just one command
|
|
||||||
|
|
||||||
```sh
|
|
||||||
./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
|
|
||||||
```
|
```
|
||||||
|
|
||||||
***For detailed command-line arguments, check out [cli doc](./examples/cli/README.md).***
|
- If you have already cloned the repository, you can use the following command to update the repository to the latest code.
|
||||||
|
|
||||||
## Performance
|
```
|
||||||
|
cd stable-diffusion.cpp
|
||||||
|
git pull origin master
|
||||||
|
git submodule init
|
||||||
|
git submodule update
|
||||||
|
```
|
||||||
|
|
||||||
If you want to improve performance or reduce VRAM/RAM usage, please refer to [performance guide](./docs/performance.md).
|
### Download weights
|
||||||
|
|
||||||
|
- download original weights(.ckpt or .safetensors). For example
|
||||||
|
- Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
|
||||||
|
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
|
||||||
|
- Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
|
||||||
|
- Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
|
||||||
|
# curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
|
||||||
|
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-nonema-pruned.safetensors
|
||||||
|
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors
|
||||||
|
```
|
||||||
|
|
||||||
|
### Build
|
||||||
|
|
||||||
|
#### Build from scratch
|
||||||
|
|
||||||
|
```shell
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake ..
|
||||||
|
cmake --build . --config Release
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Using OpenBLAS
|
||||||
|
|
||||||
|
```
|
||||||
|
cmake .. -DGGML_OPENBLAS=ON
|
||||||
|
cmake --build . --config Release
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Using CUDA
|
||||||
|
|
||||||
|
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
|
||||||
|
|
||||||
|
```
|
||||||
|
cmake .. -DSD_CUDA=ON
|
||||||
|
cmake --build . --config Release
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Using HipBLAS
|
||||||
|
This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
|
||||||
|
|
||||||
|
Windows users: refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
|
||||||
|
|
||||||
|
```
|
||||||
|
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
|
||||||
|
cmake --build . --config Release
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Using MUSA
|
||||||
|
|
||||||
|
This provides BLAS acceleration using the MUSA cores of your Moore Threads GPU. Make sure to have the MUSA toolkit installed.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
|
||||||
|
cmake --build . --config Release
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Using Metal
|
||||||
|
|
||||||
|
Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
|
||||||
|
|
||||||
|
```
|
||||||
|
cmake .. -DSD_METAL=ON
|
||||||
|
cmake --build . --config Release
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Using Vulkan
|
||||||
|
|
||||||
|
Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
|
||||||
|
|
||||||
|
```
|
||||||
|
cmake .. -DSD_VULKAN=ON
|
||||||
|
cmake --build . --config Release
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Using SYCL
|
||||||
|
|
||||||
|
Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and the [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before you start. For more details and steps, refer to the [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux) documentation.
|
||||||
|
|
||||||
|
```
|
||||||
|
# Export relevant ENV variables
|
||||||
|
source /opt/intel/oneapi/setvars.sh
|
||||||
|
|
||||||
|
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||||
|
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||||
|
|
||||||
|
# Option 2: Use FP16
|
||||||
|
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
|
||||||
|
|
||||||
|
cmake --build . --config Release
|
||||||
|
```
|
||||||
|
|
||||||
|
Example of text2img by using SYCL backend:
|
||||||
|
|
||||||
|
- download `stable-diffusion` model weight, refer to [download-weight](#download-weights).
|
||||||
|
|
||||||
|
- run `./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors --cfg-scale 5 --steps 30 --sampling-method euler -H 1024 -W 1024 --seed 42 -p "fantasy medieval village world inside a glass sphere , high detail, fantasy, realistic, light effect, hyper detail, volumetric lighting, cinematic, macro, depth of field, blur, red light and clouds from the back, highly detailed epic cinematic concept art cg render made in maya, blender and photoshop, octane render, excellent composition, dynamic dramatic cinematic lighting, aesthetic, very inspirational, world inside a glass sphere by james gurney by artgerm with james jean, joe fenton and tristan eaton by ross tran, fine details, 4k resolution"`
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<img src="./assets/sycl_sd3_output.png" width="360x">
|
||||||
|
</p>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
##### Using Flash Attention
|
||||||
|
|
||||||
|
Enabling flash attention for the diffusion model reduces memory usage by varying amounts of MB.
|
||||||
|
e.g.:
|
||||||
|
- flux 768x768 ~600mb
|
||||||
|
- SD2 768x768 ~1400mb
|
||||||
|
|
||||||
|
For most backends, it slows things down, but for cuda it generally speeds it up too.
|
||||||
|
At the moment, it is only supported for some models and some backends (like cpu, cuda/rocm, metal).
|
||||||
|
|
||||||
|
Run by adding `--diffusion-fa` to the arguments and watch for:
|
||||||
|
```
|
||||||
|
[INFO ] stable-diffusion.cpp:312 - Using flash attention in the diffusion model
|
||||||
|
```
|
||||||
|
and the compute buffer shrink in the debug log:
|
||||||
|
```
|
||||||
|
[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run
|
||||||
|
|
||||||
|
```
|
||||||
|
usage: ./bin/sd [arguments]
|
||||||
|
|
||||||
|
arguments:
|
||||||
|
-h, --help show this help message and exit
|
||||||
|
-M, --mode [MODE] run mode (txt2img or img2img or convert, default: txt2img)
|
||||||
|
-t, --threads N number of threads to use during computation (default: -1)
|
||||||
|
If threads <= 0, then threads will be set to the number of CPU physical cores
|
||||||
|
-m, --model [MODEL] path to full model
|
||||||
|
--diffusion-model path to the standalone diffusion model
|
||||||
|
--clip_l path to the clip-l text encoder
|
||||||
|
--clip_g path to the clip-g text encoder
|
||||||
|
--t5xxl path to the t5xxl text encoder
|
||||||
|
--vae [VAE] path to vae
|
||||||
|
--taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
|
||||||
|
--control-net [CONTROL_PATH] path to control net model
|
||||||
|
--embd-dir [EMBEDDING_PATH] path to embeddings
|
||||||
|
--stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings
|
||||||
|
--input-id-images-dir [DIR] path to PHOTOMAKER input id images dir
|
||||||
|
--normalize-input normalize PHOTOMAKER input id images
|
||||||
|
--upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generation; only RealESRGAN_x4plus_anime_6B is supported for now
|
||||||
|
--upscale-repeats Run the ESRGAN upscaler this many times (default 1)
|
||||||
|
--type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_k, q3_k, q4_k)
|
||||||
|
If not specified, the default is the type of the weight file
|
||||||
|
--lora-model-dir [DIR] lora model directory
|
||||||
|
-i, --init-img [IMAGE] path to the input image, required by img2img
|
||||||
|
--control-image [IMAGE] path to image condition, control net
|
||||||
|
-o, --output OUTPUT path to write result image to (default: ./output.png)
|
||||||
|
-p, --prompt [PROMPT] the prompt to render
|
||||||
|
-n, --negative-prompt PROMPT the negative prompt (default: "")
|
||||||
|
--cfg-scale SCALE unconditional guidance scale: (default: 7.0)
|
||||||
|
--skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])
|
||||||
|
--skip-layer-start START SLG enabling point: (default: 0.01)
|
||||||
|
--skip-layer-end END SLG disabling point: (default: 0.2)
|
||||||
|
SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
|
||||||
|
--strength STRENGTH strength for noising/unnoising (default: 0.75)
|
||||||
|
--style-ratio STYLE-RATIO strength for keeping input identity (default: 20%)
|
||||||
|
--control-strength STRENGTH strength to apply Control Net (default: 0.9)
|
||||||
|
1.0 corresponds to full destruction of information in init image
|
||||||
|
-H, --height H image height, in pixel space (default: 512)
|
||||||
|
-W, --width W image width, in pixel space (default: 512)
|
||||||
|
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm}
|
||||||
|
sampling method (default: "euler_a")
|
||||||
|
--steps STEPS number of sample steps (default: 20)
|
||||||
|
--rng {std_default, cuda} RNG (default: cuda)
|
||||||
|
-s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
|
||||||
|
-b, --batch-count COUNT number of images to generate
|
||||||
|
--schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete)
|
||||||
|
--clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
|
||||||
|
<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
|
||||||
|
--vae-tiling process vae in tiles to reduce memory usage
|
||||||
|
--vae-on-cpu keep vae in cpu (for low vram)
|
||||||
|
--clip-on-cpu keep clip in cpu (for low vram)
|
||||||
|
--diffusion-fa use flash attention in the diffusion model (for low vram)
|
||||||
|
Might lower quality, since it implies converting k and v to f16.
|
||||||
|
This might crash if it is not supported by the backend.
|
||||||
|
--control-net-cpu keep controlnet in cpu (for low vram)
|
||||||
|
--canny apply canny preprocessor (edge detection)
|
||||||
|
--color Colors the logging tags according to level
|
||||||
|
-v, --verbose print extra info
|
||||||
|
```
|
||||||
|
|
||||||
|
#### txt2img example
|
||||||
|
|
||||||
|
```sh
|
||||||
|
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
|
||||||
|
# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
|
||||||
|
# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
|
||||||
|
# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v
|
||||||
|
# ./bin/sd --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
|
||||||
|
# ./bin/sd -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
|
||||||
|
```
|
||||||
|
|
||||||
|
Using formats of different precisions will yield results of varying quality.
|
||||||
|
|
||||||
|
| f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
|
||||||
|
| ---- |---- |---- |---- |---- |---- |---- |
|
||||||
|
|  | | | | | | |
|
||||||
|
|
||||||
|
#### img2img example
|
||||||
|
|
||||||
|
- `./output.png` is the image generated from the above txt2img pipeline
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
./bin/sd --mode img2img -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
|
||||||
|
```
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<img src="./assets/img2img_output.png" width="256x">
|
||||||
|
</p>
|
||||||
|
|
||||||
## More Guides
|
## More Guides
|
||||||
|
|
||||||
- [SD1.x/SD2.x/SDXL](./docs/sd.md)
|
|
||||||
- [SD3/SD3.5](./docs/sd3.md)
|
|
||||||
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
|
|
||||||
- [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
|
|
||||||
- [FLUX.1-Kontext-dev](./docs/kontext.md)
|
|
||||||
- [Chroma](./docs/chroma.md)
|
|
||||||
- [🔥Qwen Image](./docs/qwen_image.md)
|
|
||||||
- [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
|
|
||||||
- [🔥Wan2.1/Wan2.2](./docs/wan.md)
|
|
||||||
- [🔥Z-Image](./docs/z_image.md)
|
|
||||||
- [Ovis-Image](./docs/ovis_image.md)
|
|
||||||
- [Anima](./docs/anima.md)
|
|
||||||
- [LoRA](./docs/lora.md)
|
- [LoRA](./docs/lora.md)
|
||||||
- [LCM/LCM-LoRA](./docs/lcm.md)
|
- [LCM/LCM-LoRA](./docs/lcm.md)
|
||||||
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
|
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
|
||||||
@ -148,18 +310,15 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
|
|||||||
- [Using TAESD to faster decoding](./docs/taesd.md)
|
- [Using TAESD to faster decoding](./docs/taesd.md)
|
||||||
- [Docker](./docs/docker.md)
|
- [Docker](./docs/docker.md)
|
||||||
- [Quantization and GGUF](./docs/quantization_and_gguf.md)
|
- [Quantization and GGUF](./docs/quantization_and_gguf.md)
|
||||||
- [Inference acceleration via caching](./docs/caching.md)
|
|
||||||
|
|
||||||
## Bindings
|
## Bindings
|
||||||
|
|
||||||
These projects wrap `stable-diffusion.cpp` for easier use in other languages/frameworks.
|
These projects wrap `stable-diffusion.cpp` for easier use in other languages/frameworks.
|
||||||
|
|
||||||
* Golang (non-cgo): [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
|
* Golang: [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
|
||||||
* Golang (cgo): [Binozo/GoStableDiffusion](https://github.com/Binozo/GoStableDiffusion)
|
|
||||||
* C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET)
|
* C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET)
|
||||||
* Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python)
|
* Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python)
|
||||||
* Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs)
|
* Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs)
|
||||||
* Flutter/Dart: [rmatif/Local-Diffusion](https://github.com/rmatif/Local-Diffusion)
|
|
||||||
|
|
||||||
## UIs
|
## UIs
|
||||||
|
|
||||||
@ -168,11 +327,6 @@ These projects use `stable-diffusion.cpp` as a backend for their image generatio
|
|||||||
- [Jellybox](https://jellybox.com)
|
- [Jellybox](https://jellybox.com)
|
||||||
- [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
|
- [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
|
||||||
- [Stable Diffusion CLI-GUI](https://github.com/piallai/stable-diffusion.cpp)
|
- [Stable Diffusion CLI-GUI](https://github.com/piallai/stable-diffusion.cpp)
|
||||||
- [Local Diffusion](https://github.com/rmatif/Local-Diffusion)
|
|
||||||
- [sd.cpp-webui](https://github.com/daniandtheweb/sd.cpp-webui)
|
|
||||||
- [LocalAI](https://github.com/mudler/LocalAI)
|
|
||||||
- [Neural-Pixel](https://github.com/Luiz-Alcantara/Neural-Pixel)
|
|
||||||
- [KoboldCpp](https://github.com/LostRuins/koboldcpp)
|
|
||||||
|
|
||||||
## Contributors
|
## Contributors
|
||||||
|
|
||||||
@ -186,8 +340,7 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
|
|||||||
|
|
||||||
## References
|
## References
|
||||||
|
|
||||||
- [ggml](https://github.com/ggml-org/ggml)
|
- [ggml](https://github.com/ggerganov/ggml)
|
||||||
- [diffusers](https://github.com/huggingface/diffusers)
|
|
||||||
- [stable-diffusion](https://github.com/CompVis/stable-diffusion)
|
- [stable-diffusion](https://github.com/CompVis/stable-diffusion)
|
||||||
- [sd3-ref](https://github.com/Stability-AI/sd3-ref)
|
- [sd3-ref](https://github.com/Stability-AI/sd3-ref)
|
||||||
- [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
|
- [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
|
||||||
@ -197,5 +350,3 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
|
|||||||
- [latent-consistency-model](https://github.com/luosiallen/latent-consistency-model)
|
- [latent-consistency-model](https://github.com/luosiallen/latent-consistency-model)
|
||||||
- [generative-models](https://github.com/Stability-AI/generative-models/)
|
- [generative-models](https://github.com/Stability-AI/generative-models/)
|
||||||
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker)
|
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker)
|
||||||
- [Wan2.1](https://github.com/Wan-Video/Wan2.1)
|
|
||||||
- [Wan2.2](https://github.com/Wan-Video/Wan2.2)
|
|
||||||
|
|||||||
|
Before Width: | Height: | Size: 230 KiB |
|
Before Width: | Height: | Size: 477 KiB |
|
Before Width: | Height: | Size: 539 KiB |
|
Before Width: | Height: | Size: 496 KiB |
|
Before Width: | Height: | Size: 556 KiB |
|
Before Width: | Height: | Size: 510 KiB |
|
Before Width: | Height: | Size: 455 KiB |
|
Before Width: | Height: | Size: 511 KiB |
|
Before Width: | Height: | Size: 491 KiB |
|
Before Width: | Height: | Size: 464 KiB |
|
Before Width: | Height: | Size: 552 KiB |
BIN
assets/logo.png
|
Before Width: | Height: | Size: 1.0 MiB |
|
Before Width: | Height: | Size: 401 KiB |
|
Before Width: | Height: | Size: 1.4 MiB |
|
Before Width: | Height: | Size: 457 KiB |
|
Before Width: | Height: | Size: 415 KiB |
|
Before Width: | Height: | Size: 450 KiB |
|
Before Width: | Height: | Size: 594 KiB |
|
Before Width: | Height: | Size: 870 KiB |
|
Before Width: | Height: | Size: 1.0 MiB |
|
Before Width: | Height: | Size: 1.1 MiB |
|
Before Width: | Height: | Size: 1.1 MiB |
|
Before Width: | Height: | Size: 1.0 MiB |
|
Before Width: | Height: | Size: 1.0 MiB |
|
Before Width: | Height: | Size: 1.0 MiB |
|
Before Width: | Height: | Size: 1.0 MiB |
|
Before Width: | Height: | Size: 1.0 MiB |
@ -3,12 +3,35 @@
|
|||||||
|
|
||||||
#include "ggml_extend.hpp"
|
#include "ggml_extend.hpp"
|
||||||
#include "model.h"
|
#include "model.h"
|
||||||
#include "tokenize_util.h"
|
|
||||||
#include "vocab/vocab.h"
|
|
||||||
|
|
||||||
/*================================================== CLIPTokenizer ===================================================*/
|
/*================================================== CLIPTokenizer ===================================================*/
|
||||||
|
|
||||||
__STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
|
std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
|
||||||
|
std::regex re("<lora:([^:]+):([^>]+)>");
|
||||||
|
std::smatch matches;
|
||||||
|
std::unordered_map<std::string, float> filename2multiplier;
|
||||||
|
|
||||||
|
while (std::regex_search(text, matches, re)) {
|
||||||
|
std::string filename = matches[1].str();
|
||||||
|
float multiplier = std::stof(matches[2].str());
|
||||||
|
|
||||||
|
text = std::regex_replace(text, re, "", std::regex_constants::format_first_only);
|
||||||
|
|
||||||
|
if (multiplier == 0.f) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (filename2multiplier.find(filename) == filename2multiplier.end()) {
|
||||||
|
filename2multiplier[filename] = multiplier;
|
||||||
|
} else {
|
||||||
|
filename2multiplier[filename] += multiplier;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return std::make_pair(filename2multiplier, text);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
|
||||||
std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
|
std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
|
||||||
std::set<int> byte_set;
|
std::set<int> byte_set;
|
||||||
for (int b = static_cast<int>('!'); b <= static_cast<int>('~'); ++b) {
|
for (int b = static_cast<int>('!'); b <= static_cast<int>('~'); ++b) {
|
||||||
@ -49,8 +72,6 @@ private:
|
|||||||
int encoder_len;
|
int encoder_len;
|
||||||
int bpe_len;
|
int bpe_len;
|
||||||
|
|
||||||
std::vector<std::string> special_tokens;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
const std::string UNK_TOKEN = "<|endoftext|>";
|
const std::string UNK_TOKEN = "<|endoftext|>";
|
||||||
const std::string BOS_TOKEN = "<|startoftext|>";
|
const std::string BOS_TOKEN = "<|startoftext|>";
|
||||||
@ -96,25 +117,14 @@ private:
|
|||||||
return pairs;
|
return pairs;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_special_token(const std::string& token) {
|
|
||||||
for (auto& special_token : special_tokens) {
|
|
||||||
if (special_token == token) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "")
|
CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "")
|
||||||
: PAD_TOKEN_ID(pad_token_id) {
|
: PAD_TOKEN_ID(pad_token_id) {
|
||||||
if (merges_utf8_str.size() > 0) {
|
if (merges_utf8_str.size() > 0) {
|
||||||
load_from_merges(merges_utf8_str);
|
load_from_merges(merges_utf8_str);
|
||||||
} else {
|
} else {
|
||||||
load_from_merges(load_clip_merges());
|
load_from_merges(ModelLoader::load_merges());
|
||||||
}
|
}
|
||||||
add_special_token("<|startoftext|>");
|
|
||||||
add_special_token("<|endoftext|>");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void load_from_merges(const std::string& merges_utf8_str) {
|
void load_from_merges(const std::string& merges_utf8_str) {
|
||||||
@ -169,9 +179,9 @@ public:
|
|||||||
|
|
||||||
auto it = encoder.find(utf8_to_utf32("img</w>"));
|
auto it = encoder.find(utf8_to_utf32("img</w>"));
|
||||||
if (it != encoder.end()) {
|
if (it != encoder.end()) {
|
||||||
LOG_DEBUG("trigger word img already in vocab");
|
LOG_DEBUG(" trigger word img already in vocab");
|
||||||
} else {
|
} else {
|
||||||
LOG_DEBUG("trigger word img not in vocab yet");
|
LOG_DEBUG(" trigger word img not in vocab yet");
|
||||||
}
|
}
|
||||||
|
|
||||||
int rank = 0;
|
int rank = 0;
|
||||||
@ -191,10 +201,6 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void add_special_token(const std::string& token) {
|
|
||||||
special_tokens.push_back(token);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::u32string bpe(const std::u32string& token) {
|
std::u32string bpe(const std::u32string& token) {
|
||||||
std::vector<std::u32string> word;
|
std::vector<std::u32string> word;
|
||||||
|
|
||||||
@ -297,7 +303,7 @@ public:
|
|||||||
size_t max_length = 0,
|
size_t max_length = 0,
|
||||||
bool padding = false) {
|
bool padding = false) {
|
||||||
if (max_length > 0 && padding) {
|
if (max_length > 0 && padding) {
|
||||||
size_t n = static_cast<size_t>(std::ceil(tokens.size() * 1.0 / (max_length - 2)));
|
size_t n = std::ceil(tokens.size() * 1.0 / (max_length - 2));
|
||||||
if (n == 0) {
|
if (n == 0) {
|
||||||
n = 1;
|
n = 1;
|
||||||
}
|
}
|
||||||
@ -373,54 +379,25 @@ public:
|
|||||||
return trim(text);
|
return trim(text);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::string> token_split(const std::string& text) {
|
|
||||||
std::regex pat(R"('s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
|
|
||||||
std::regex::icase);
|
|
||||||
std::sregex_iterator iter(text.begin(), text.end(), pat);
|
|
||||||
std::sregex_iterator end;
|
|
||||||
|
|
||||||
std::vector<std::string> result;
|
|
||||||
for (; iter != end; ++iter) {
|
|
||||||
result.emplace_back(iter->str());
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) {
|
std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) {
|
||||||
std::string original_text = text;
|
std::string original_text = text;
|
||||||
std::vector<int32_t> bpe_tokens;
|
std::vector<int32_t> bpe_tokens;
|
||||||
text = whitespace_clean(text);
|
text = whitespace_clean(text);
|
||||||
std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });
|
std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });
|
||||||
|
|
||||||
|
std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
|
||||||
|
std::regex::icase);
|
||||||
|
|
||||||
|
std::smatch matches;
|
||||||
std::string str = text;
|
std::string str = text;
|
||||||
std::vector<std::string> token_strs;
|
std::vector<std::string> token_strs;
|
||||||
|
while (std::regex_search(str, matches, pat)) {
|
||||||
auto splited_texts = split_with_special_tokens(text, special_tokens);
|
bool skip = on_new_token_cb(str, bpe_tokens);
|
||||||
|
|
||||||
for (auto& splited_text : splited_texts) {
|
|
||||||
LOG_DEBUG("token %s", splited_text.c_str());
|
|
||||||
if (is_special_token(splited_text)) {
|
|
||||||
LOG_DEBUG("special %s", splited_text.c_str());
|
|
||||||
bool skip = on_new_token_cb(splited_text, bpe_tokens);
|
|
||||||
if (skip) {
|
if (skip) {
|
||||||
token_strs.push_back(splited_text);
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
continue;
|
for (auto& token : matches) {
|
||||||
}
|
std::string token_str = token.str();
|
||||||
|
|
||||||
auto tokens = token_split(splited_text);
|
|
||||||
for (auto& token : tokens) {
|
|
||||||
if (on_new_token_cb != nullptr) {
|
|
||||||
bool skip = on_new_token_cb(token, bpe_tokens);
|
|
||||||
if (skip) {
|
|
||||||
token_strs.push_back(token);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string token_str = token;
|
|
||||||
std::u32string utf32_token;
|
std::u32string utf32_token;
|
||||||
for (int i = 0; i < token_str.length(); i++) {
|
for (int i = 0; i < token_str.length(); i++) {
|
||||||
unsigned char b = token_str[i];
|
unsigned char b = token_str[i];
|
||||||
@ -440,13 +417,14 @@ public:
|
|||||||
bpe_tokens.push_back(encoder[bpe_str]);
|
bpe_tokens.push_back(encoder[bpe_str]);
|
||||||
token_strs.push_back(utf32_to_utf8(bpe_str));
|
token_strs.push_back(utf32_to_utf8(bpe_str));
|
||||||
}
|
}
|
||||||
|
str = matches.suffix();
|
||||||
}
|
}
|
||||||
// std::stringstream ss;
|
std::stringstream ss;
|
||||||
// ss << "[";
|
ss << "[";
|
||||||
// for (auto token : token_strs) {
|
for (auto token : token_strs) {
|
||||||
// ss << "\"" << token << "\", ";
|
ss << "\"" << token << "\", ";
|
||||||
// }
|
}
|
||||||
// ss << "]";
|
ss << "]";
|
||||||
// LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
|
// LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
|
||||||
// printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
|
// printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
|
||||||
return bpe_tokens;
|
return bpe_tokens;
|
||||||
@ -473,16 +451,16 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
// x: [N, n_token, d_model]
|
// x: [N, n_token, d_model]
|
||||||
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
|
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
|
||||||
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
|
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
|
||||||
|
|
||||||
x = fc1->forward(ctx, x);
|
x = fc1->forward(ctx, x);
|
||||||
if (use_gelu) {
|
if (use_gelu) {
|
||||||
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
|
x = ggml_gelu_inplace(ctx, x);
|
||||||
} else {
|
} else {
|
||||||
x = ggml_ext_gelu_quick(ctx->ggml_ctx, x, true);
|
x = ggml_gelu_quick_inplace(ctx, x);
|
||||||
}
|
}
|
||||||
x = fc2->forward(ctx, x);
|
x = fc2->forward(ctx, x);
|
||||||
return x;
|
return x;
|
||||||
@ -498,12 +476,11 @@ protected:
|
|||||||
public:
|
public:
|
||||||
CLIPLayer(int64_t d_model,
|
CLIPLayer(int64_t d_model,
|
||||||
int64_t n_head,
|
int64_t n_head,
|
||||||
int64_t intermediate_size,
|
int64_t intermediate_size)
|
||||||
bool proj_in = false)
|
|
||||||
: d_model(d_model),
|
: d_model(d_model),
|
||||||
n_head(n_head),
|
n_head(n_head),
|
||||||
intermediate_size(intermediate_size) {
|
intermediate_size(intermediate_size) {
|
||||||
blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true, proj_in));
|
blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true));
|
||||||
|
|
||||||
blocks["layer_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
|
blocks["layer_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
|
||||||
blocks["layer_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
|
blocks["layer_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
|
||||||
@ -511,40 +488,36 @@ public:
|
|||||||
blocks["mlp"] = std::shared_ptr<GGMLBlock>(new CLIPMLP(d_model, intermediate_size));
|
blocks["mlp"] = std::shared_ptr<GGMLBlock>(new CLIPMLP(d_model, intermediate_size));
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* mask = nullptr) {
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, bool mask = true) {
|
||||||
// x: [N, n_token, d_model]
|
// x: [N, n_token, d_model]
|
||||||
auto self_attn = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]);
|
auto self_attn = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]);
|
||||||
auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]);
|
auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]);
|
||||||
auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm2"]);
|
auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm2"]);
|
||||||
auto mlp = std::dynamic_pointer_cast<CLIPMLP>(blocks["mlp"]);
|
auto mlp = std::dynamic_pointer_cast<CLIPMLP>(blocks["mlp"]);
|
||||||
|
|
||||||
x = ggml_add(ctx->ggml_ctx, x, self_attn->forward(ctx, layer_norm1->forward(ctx, x), mask));
|
x = ggml_add(ctx, x, self_attn->forward(ctx, layer_norm1->forward(ctx, x), mask));
|
||||||
x = ggml_add(ctx->ggml_ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
|
x = ggml_add(ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct CLIPEncoder : public GGMLBlock {
|
struct CLIPEncoder : public GGMLBlock {
|
||||||
protected:
|
protected:
|
||||||
int n_layer;
|
int64_t n_layer;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
CLIPEncoder(int n_layer,
|
CLIPEncoder(int64_t n_layer,
|
||||||
int64_t d_model,
|
int64_t d_model,
|
||||||
int64_t n_head,
|
int64_t n_head,
|
||||||
int64_t intermediate_size,
|
int64_t intermediate_size)
|
||||||
bool proj_in = false)
|
|
||||||
: n_layer(n_layer) {
|
: n_layer(n_layer) {
|
||||||
for (int i = 0; i < n_layer; i++) {
|
for (int i = 0; i < n_layer; i++) {
|
||||||
std::string name = "layers." + std::to_string(i);
|
std::string name = "layers." + std::to_string(i);
|
||||||
blocks[name] = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size, proj_in));
|
blocks[name] = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, int clip_skip = -1, bool mask = true) {
|
||||||
ggml_tensor* x,
|
|
||||||
ggml_tensor* mask = nullptr,
|
|
||||||
int clip_skip = -1) {
|
|
||||||
// x: [N, n_token, d_model]
|
// x: [N, n_token, d_model]
|
||||||
int layer_idx = n_layer - 1;
|
int layer_idx = n_layer - 1;
|
||||||
// LOG_DEBUG("clip_skip %d", clip_skip);
|
// LOG_DEBUG("clip_skip %d", clip_skip);
|
||||||
@ -571,17 +544,11 @@ protected:
|
|||||||
int64_t embed_dim;
|
int64_t embed_dim;
|
||||||
int64_t vocab_size;
|
int64_t vocab_size;
|
||||||
int64_t num_positions;
|
int64_t num_positions;
|
||||||
bool force_clip_f32;
|
|
||||||
|
|
||||||
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
|
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
|
||||||
enum ggml_type token_wtype = GGML_TYPE_F32;
|
enum ggml_type token_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
|
||||||
if (!force_clip_f32) {
|
enum ggml_type position_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
|
||||||
token_wtype = get_type(prefix + "token_embedding.weight", tensor_storage_map, GGML_TYPE_F32);
|
|
||||||
if (!support_get_rows(token_wtype)) {
|
|
||||||
token_wtype = GGML_TYPE_F32;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
enum ggml_type position_wtype = GGML_TYPE_F32;
|
|
||||||
params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
|
params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
|
||||||
params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
|
params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
|
||||||
}
|
}
|
||||||
@ -589,32 +556,30 @@ protected:
|
|||||||
public:
|
public:
|
||||||
CLIPEmbeddings(int64_t embed_dim,
|
CLIPEmbeddings(int64_t embed_dim,
|
||||||
int64_t vocab_size = 49408,
|
int64_t vocab_size = 49408,
|
||||||
int64_t num_positions = 77,
|
int64_t num_positions = 77)
|
||||||
bool force_clip_f32 = false)
|
|
||||||
: embed_dim(embed_dim),
|
: embed_dim(embed_dim),
|
||||||
vocab_size(vocab_size),
|
vocab_size(vocab_size),
|
||||||
num_positions(num_positions),
|
num_positions(num_positions) {
|
||||||
force_clip_f32(force_clip_f32) {
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* get_token_embed_weight() {
|
struct ggml_tensor* get_token_embed_weight() {
|
||||||
return params["token_embedding.weight"];
|
return params["token_embedding.weight"];
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
struct ggml_tensor* forward(struct ggml_context* ctx,
|
||||||
ggml_tensor* input_ids,
|
struct ggml_tensor* input_ids,
|
||||||
ggml_tensor* custom_embed_weight) {
|
struct ggml_tensor* custom_embed_weight) {
|
||||||
// input_ids: [N, n_token]
|
// input_ids: [N, n_token]
|
||||||
auto token_embed_weight = params["token_embedding.weight"];
|
auto token_embed_weight = params["token_embedding.weight"];
|
||||||
auto position_embed_weight = params["position_embedding.weight"];
|
auto position_embed_weight = params["position_embedding.weight"];
|
||||||
|
|
||||||
GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]);
|
GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]);
|
||||||
input_ids = ggml_reshape_3d(ctx->ggml_ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
|
input_ids = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
|
||||||
auto token_embedding = ggml_get_rows(ctx->ggml_ctx, custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids);
|
auto token_embedding = ggml_get_rows(ctx, custom_embed_weight != NULL ? custom_embed_weight : token_embed_weight, input_ids);
|
||||||
token_embedding = ggml_reshape_3d(ctx->ggml_ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);
|
token_embedding = ggml_reshape_3d(ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);
|
||||||
|
|
||||||
// token_embedding + position_embedding
|
// token_embedding + position_embedding
|
||||||
auto x = ggml_add(ctx->ggml_ctx,
|
auto x = ggml_add(ctx,
|
||||||
token_embedding,
|
token_embedding,
|
||||||
position_embed_weight); // [N, n_token, embed_dim]
|
position_embed_weight); // [N, n_token, embed_dim]
|
||||||
return x;
|
return x;
|
||||||
@ -624,16 +589,15 @@ public:
|
|||||||
class CLIPVisionEmbeddings : public GGMLBlock {
|
class CLIPVisionEmbeddings : public GGMLBlock {
|
||||||
protected:
|
protected:
|
||||||
int64_t embed_dim;
|
int64_t embed_dim;
|
||||||
int num_channels;
|
int64_t num_channels;
|
||||||
int patch_size;
|
int64_t patch_size;
|
||||||
int image_size;
|
int64_t image_size;
|
||||||
int num_patches;
|
int64_t num_patches;
|
||||||
int64_t num_positions;
|
int64_t num_positions;
|
||||||
|
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
|
||||||
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
|
enum ggml_type patch_wtype = GGML_TYPE_F16; // tensor_types.find(prefix + "patch_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "patch_embedding.weight"] : GGML_TYPE_F16;
|
||||||
enum ggml_type patch_wtype = GGML_TYPE_F16;
|
enum ggml_type class_wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "class_embedding") != tensor_types.end() ? tensor_types[prefix + "class_embedding"] : GGML_TYPE_F32;
|
||||||
enum ggml_type class_wtype = GGML_TYPE_F32;
|
enum ggml_type position_wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
|
||||||
enum ggml_type position_wtype = GGML_TYPE_F32;
|
|
||||||
|
|
||||||
params["patch_embedding.weight"] = ggml_new_tensor_4d(ctx, patch_wtype, patch_size, patch_size, num_channels, embed_dim);
|
params["patch_embedding.weight"] = ggml_new_tensor_4d(ctx, patch_wtype, patch_size, patch_size, num_channels, embed_dim);
|
||||||
params["class_embedding"] = ggml_new_tensor_1d(ctx, class_wtype, embed_dim);
|
params["class_embedding"] = ggml_new_tensor_1d(ctx, class_wtype, embed_dim);
|
||||||
@ -642,9 +606,9 @@ protected:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
CLIPVisionEmbeddings(int64_t embed_dim,
|
CLIPVisionEmbeddings(int64_t embed_dim,
|
||||||
int num_channels = 3,
|
int64_t num_channels = 3,
|
||||||
int patch_size = 14,
|
int64_t patch_size = 14,
|
||||||
int image_size = 224)
|
int64_t image_size = 224)
|
||||||
: embed_dim(embed_dim),
|
: embed_dim(embed_dim),
|
||||||
num_channels(num_channels),
|
num_channels(num_channels),
|
||||||
patch_size(patch_size),
|
patch_size(patch_size),
|
||||||
@ -653,7 +617,7 @@ public:
|
|||||||
num_positions = num_patches + 1;
|
num_positions = num_patches + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* pixel_values) {
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) {
|
||||||
// pixel_values: [N, num_channels, image_size, image_size]
|
// pixel_values: [N, num_channels, image_size, image_size]
|
||||||
// return: [N, num_positions, embed_dim]
|
// return: [N, num_positions, embed_dim]
|
||||||
GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels);
|
GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels);
|
||||||
@ -663,20 +627,20 @@ public:
|
|||||||
auto position_embed_weight = params["position_embedding.weight"];
|
auto position_embed_weight = params["position_embedding.weight"];
|
||||||
|
|
||||||
// concat(patch_embedding, class_embedding) + position_embedding
|
// concat(patch_embedding, class_embedding) + position_embedding
|
||||||
ggml_tensor* patch_embedding;
|
struct ggml_tensor* patch_embedding;
|
||||||
int64_t N = pixel_values->ne[3];
|
int64_t N = pixel_values->ne[3];
|
||||||
patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
|
patch_embedding = ggml_nn_conv_2d(ctx, pixel_values, patch_embed_weight, NULL, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
|
||||||
patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches]
|
patch_embedding = ggml_reshape_3d(ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches]
|
||||||
patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim]
|
patch_embedding = ggml_cont(ctx, ggml_permute(ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim]
|
||||||
patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1]
|
patch_embedding = ggml_reshape_4d(ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1]
|
||||||
|
|
||||||
ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N);
|
struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, N);
|
||||||
class_embedding = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding); // [N, embed_dim]
|
class_embedding = ggml_repeat(ctx, class_embed_weight, class_embedding); // [N, embed_dim]
|
||||||
class_embedding = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
|
class_embedding = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
|
||||||
|
|
||||||
ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
|
struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
|
||||||
x = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
|
x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
|
||||||
x = ggml_add(ctx->ggml_ctx, x, position_embed_weight);
|
x = ggml_add(ctx, x, position_embed_weight);
|
||||||
return x; // [N, num_positions, embed_dim]
|
return x; // [N, num_positions, embed_dim]
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -693,9 +657,9 @@ enum CLIPVersion {
|
|||||||
|
|
||||||
class CLIPTextModel : public GGMLBlock {
|
class CLIPTextModel : public GGMLBlock {
|
||||||
protected:
|
protected:
|
||||||
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
|
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
|
||||||
if (version == OPEN_CLIP_VIT_BIGG_14) {
|
if (version == OPEN_CLIP_VIT_BIGG_14) {
|
||||||
enum ggml_type wtype = GGML_TYPE_F32;
|
enum ggml_type wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "text_projection") != tensor_types.end() ? tensor_types[prefix + "text_projection"] : GGML_TYPE_F32;
|
||||||
params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
|
params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -710,12 +674,12 @@ public:
|
|||||||
int32_t n_head = 12;
|
int32_t n_head = 12;
|
||||||
int32_t n_layer = 12; // num_hidden_layers
|
int32_t n_layer = 12; // num_hidden_layers
|
||||||
int32_t projection_dim = 1280; // only for OPEN_CLIP_VIT_BIGG_14
|
int32_t projection_dim = 1280; // only for OPEN_CLIP_VIT_BIGG_14
|
||||||
|
int32_t clip_skip = -1;
|
||||||
bool with_final_ln = true;
|
bool with_final_ln = true;
|
||||||
|
|
||||||
CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
|
CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
|
||||||
bool with_final_ln = true,
|
int clip_skip_value = -1,
|
||||||
bool force_clip_f32 = false,
|
bool with_final_ln = true)
|
||||||
bool proj_in = false)
|
|
||||||
: version(version), with_final_ln(with_final_ln) {
|
: version(version), with_final_ln(with_final_ln) {
|
||||||
if (version == OPEN_CLIP_VIT_H_14) {
|
if (version == OPEN_CLIP_VIT_H_14) {
|
||||||
hidden_size = 1024;
|
hidden_size = 1024;
|
||||||
@ -728,42 +692,48 @@ public:
|
|||||||
n_head = 20;
|
n_head = 20;
|
||||||
n_layer = 32;
|
n_layer = 32;
|
||||||
}
|
}
|
||||||
|
set_clip_skip(clip_skip_value);
|
||||||
|
|
||||||
blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
|
blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
|
||||||
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
|
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
|
||||||
blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
|
blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* get_token_embed_weight() {
|
void set_clip_skip(int skip) {
|
||||||
|
if (skip <= 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
clip_skip = skip;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* get_token_embed_weight() {
|
||||||
auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
|
auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
|
||||||
return embeddings->get_token_embed_weight();
|
return embeddings->get_token_embed_weight();
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
struct ggml_tensor* forward(struct ggml_context* ctx,
|
||||||
ggml_tensor* input_ids,
|
struct ggml_tensor* input_ids,
|
||||||
ggml_tensor* tkn_embeddings,
|
struct ggml_tensor* tkn_embeddings,
|
||||||
ggml_tensor* mask = nullptr,
|
|
||||||
size_t max_token_idx = 0,
|
size_t max_token_idx = 0,
|
||||||
bool return_pooled = false,
|
bool return_pooled = false) {
|
||||||
int clip_skip = -1) {
|
|
||||||
// input_ids: [N, n_token]
|
// input_ids: [N, n_token]
|
||||||
auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
|
auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
|
||||||
auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
|
auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
|
||||||
auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]);
|
auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]);
|
||||||
|
|
||||||
auto x = embeddings->forward(ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size]
|
auto x = embeddings->forward(ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size]
|
||||||
x = encoder->forward(ctx, x, mask, return_pooled ? -1 : clip_skip);
|
x = encoder->forward(ctx, x, return_pooled ? -1 : clip_skip, true);
|
||||||
if (return_pooled || with_final_ln) {
|
if (return_pooled || with_final_ln) {
|
||||||
x = final_layer_norm->forward(ctx, x);
|
x = final_layer_norm->forward(ctx, x);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (return_pooled) {
|
if (return_pooled) {
|
||||||
auto text_projection = params["text_projection"];
|
auto text_projection = params["text_projection"];
|
||||||
ggml_tensor* pooled = ggml_view_1d(ctx->ggml_ctx, x, hidden_size, x->nb[1] * max_token_idx);
|
ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
|
||||||
if (text_projection != nullptr) {
|
if (text_projection != NULL) {
|
||||||
pooled = ggml_ext_linear(ctx->ggml_ctx, pooled, text_projection, nullptr);
|
pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL);
|
||||||
} else {
|
} else {
|
||||||
LOG_DEBUG("identity projection");
|
LOG_DEBUG("Missing text_projection matrix, assuming identity...");
|
||||||
}
|
}
|
||||||
return pooled; // [hidden_size, 1, 1]
|
return pooled; // [hidden_size, 1, 1]
|
||||||
}
|
}
|
||||||
@ -785,7 +755,7 @@ public:
|
|||||||
int32_t n_layer = 24;
|
int32_t n_layer = 24;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool proj_in = false) {
|
CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14) {
|
||||||
if (version == OPEN_CLIP_VIT_H_14) {
|
if (version == OPEN_CLIP_VIT_H_14) {
|
||||||
hidden_size = 1280;
|
hidden_size = 1280;
|
||||||
intermediate_size = 5120;
|
intermediate_size = 5120;
|
||||||
@ -800,14 +770,11 @@ public:
|
|||||||
|
|
||||||
blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPVisionEmbeddings(hidden_size, num_channels, patch_size, image_size));
|
blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPVisionEmbeddings(hidden_size, num_channels, patch_size, image_size));
|
||||||
blocks["pre_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
|
blocks["pre_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
|
||||||
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
|
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
|
||||||
blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
|
blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values, bool return_pooled = true) {
|
||||||
ggml_tensor* pixel_values,
|
|
||||||
bool return_pooled = true,
|
|
||||||
int clip_skip = -1) {
|
|
||||||
// pixel_values: [N, num_channels, image_size, image_size]
|
// pixel_values: [N, num_channels, image_size, image_size]
|
||||||
auto embeddings = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
|
auto embeddings = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
|
||||||
auto pre_layernorm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
|
auto pre_layernorm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
|
||||||
@ -816,15 +783,14 @@ public:
|
|||||||
|
|
||||||
auto x = embeddings->forward(ctx, pixel_values); // [N, num_positions, embed_dim]
|
auto x = embeddings->forward(ctx, pixel_values); // [N, num_positions, embed_dim]
|
||||||
x = pre_layernorm->forward(ctx, x);
|
x = pre_layernorm->forward(ctx, x);
|
||||||
x = encoder->forward(ctx, x, nullptr, clip_skip);
|
x = encoder->forward(ctx, x, -1, false);
|
||||||
|
// print_ggml_tensor(x, true, "ClipVisionModel x: ");
|
||||||
auto last_hidden_state = x;
|
auto last_hidden_state = x;
|
||||||
|
|
||||||
x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size]
|
x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size]
|
||||||
|
|
||||||
GGML_ASSERT(x->ne[3] == 1);
|
GGML_ASSERT(x->ne[3] == 1);
|
||||||
if (return_pooled) {
|
if (return_pooled) {
|
||||||
ggml_tensor* pooled = ggml_cont(ctx->ggml_ctx, ggml_view_2d(ctx->ggml_ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
|
ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
|
||||||
return pooled; // [N, hidden_size]
|
return pooled; // [N, hidden_size]
|
||||||
} else {
|
} else {
|
||||||
// return x; // [N, n_token, hidden_size]
|
// return x; // [N, n_token, hidden_size]
|
||||||
@ -839,8 +805,8 @@ protected:
|
|||||||
int64_t out_features;
|
int64_t out_features;
|
||||||
bool transpose_weight;
|
bool transpose_weight;
|
||||||
|
|
||||||
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
|
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
|
||||||
enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
|
enum ggml_type wtype = tensor_types.find(prefix + "weight") != tensor_types.end() ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
|
||||||
if (transpose_weight) {
|
if (transpose_weight) {
|
||||||
params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
|
params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
|
||||||
} else {
|
} else {
|
||||||
@ -856,12 +822,12 @@ public:
|
|||||||
out_features(out_features),
|
out_features(out_features),
|
||||||
transpose_weight(transpose_weight) {}
|
transpose_weight(transpose_weight) {}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
ggml_tensor* w = params["weight"];
|
struct ggml_tensor* w = params["weight"];
|
||||||
if (transpose_weight) {
|
if (transpose_weight) {
|
||||||
w = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, w));
|
w = ggml_cont(ctx, ggml_transpose(ctx, w));
|
||||||
}
|
}
|
||||||
return ggml_ext_linear(ctx->ggml_ctx, x, w, nullptr);
|
return ggml_nn_linear(ctx, x, w, NULL);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -873,8 +839,7 @@ public:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
CLIPVisionModelProjection(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
|
CLIPVisionModelProjection(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
|
||||||
bool transpose_proj_w = false,
|
bool transpose_proj_w = false) {
|
||||||
bool proj_in = false) {
|
|
||||||
if (version == OPEN_CLIP_VIT_H_14) {
|
if (version == OPEN_CLIP_VIT_H_14) {
|
||||||
hidden_size = 1280;
|
hidden_size = 1280;
|
||||||
projection_dim = 1024;
|
projection_dim = 1024;
|
||||||
@ -882,94 +847,75 @@ public:
|
|||||||
hidden_size = 1664;
|
hidden_size = 1664;
|
||||||
}
|
}
|
||||||
|
|
||||||
blocks["vision_model"] = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version, proj_in));
|
blocks["vision_model"] = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version));
|
||||||
blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
|
blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) {
|
||||||
ggml_tensor* pixel_values,
|
|
||||||
bool return_pooled = true,
|
|
||||||
int clip_skip = -1) {
|
|
||||||
// pixel_values: [N, num_channels, image_size, image_size]
|
// pixel_values: [N, num_channels, image_size, image_size]
|
||||||
// return: [N, projection_dim] if return_pooled else [N, n_token, hidden_size]
|
// return: [N, projection_dim]
|
||||||
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
|
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
|
||||||
auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
|
auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
|
||||||
|
|
||||||
auto x = vision_model->forward(ctx, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size]
|
auto x = vision_model->forward(ctx, pixel_values); // [N, hidden_size]
|
||||||
|
|
||||||
if (return_pooled) {
|
|
||||||
x = visual_projection->forward(ctx, x); // [N, projection_dim]
|
x = visual_projection->forward(ctx, x); // [N, projection_dim]
|
||||||
}
|
|
||||||
|
|
||||||
return x;
|
return x; // [N, projection_dim]
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct CLIPTextModelRunner : public GGMLRunner {
|
struct CLIPTextModelRunner : public GGMLRunner {
|
||||||
CLIPTextModel model;
|
CLIPTextModel model;
|
||||||
|
|
||||||
std::vector<float> attention_mask_vec;
|
|
||||||
|
|
||||||
CLIPTextModelRunner(ggml_backend_t backend,
|
CLIPTextModelRunner(ggml_backend_t backend,
|
||||||
bool offload_params_to_cpu,
|
std::map<std::string, enum ggml_type>& tensor_types,
|
||||||
const String2TensorStorage& tensor_storage_map,
|
|
||||||
const std::string prefix,
|
const std::string prefix,
|
||||||
CLIPVersion version = OPENAI_CLIP_VIT_L_14,
|
CLIPVersion version = OPENAI_CLIP_VIT_L_14,
|
||||||
bool with_final_ln = true,
|
int clip_skip_value = 1,
|
||||||
bool force_clip_f32 = false)
|
bool with_final_ln = true)
|
||||||
: GGMLRunner(backend, offload_params_to_cpu) {
|
: GGMLRunner(backend), model(version, clip_skip_value, with_final_ln) {
|
||||||
bool proj_in = false;
|
model.init(params_ctx, tensor_types, prefix);
|
||||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
|
||||||
if (!starts_with(name, prefix)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (contains(name, "self_attn.in_proj")) {
|
|
||||||
proj_in = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
model = CLIPTextModel(version, with_final_ln, force_clip_f32, proj_in);
|
|
||||||
model.init(params_ctx, tensor_storage_map, prefix);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string get_desc() override {
|
std::string get_desc() {
|
||||||
return "clip";
|
return "clip";
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
|
void set_clip_skip(int clip_skip) {
|
||||||
|
model.set_clip_skip(clip_skip);
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
|
||||||
model.get_param_tensors(tensors, prefix);
|
model.get_param_tensors(tensors, prefix);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
struct ggml_tensor* forward(struct ggml_context* ctx,
|
||||||
ggml_tensor* input_ids,
|
struct ggml_tensor* input_ids,
|
||||||
ggml_tensor* embeddings,
|
struct ggml_tensor* embeddings,
|
||||||
ggml_tensor* mask,
|
|
||||||
size_t max_token_idx = 0,
|
size_t max_token_idx = 0,
|
||||||
bool return_pooled = false,
|
bool return_pooled = false) {
|
||||||
int clip_skip = -1) {
|
|
||||||
size_t N = input_ids->ne[1];
|
size_t N = input_ids->ne[1];
|
||||||
size_t n_token = input_ids->ne[0];
|
size_t n_token = input_ids->ne[0];
|
||||||
if (input_ids->ne[0] > model.n_token) {
|
if (input_ids->ne[0] > model.n_token) {
|
||||||
GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
|
GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
|
||||||
input_ids = ggml_reshape_2d(ctx->ggml_ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
|
input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
|
||||||
}
|
}
|
||||||
|
|
||||||
return model.forward(ctx, input_ids, embeddings, mask, max_token_idx, return_pooled, clip_skip);
|
return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_cgraph* build_graph(ggml_tensor* input_ids,
|
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
|
||||||
int num_custom_embeddings = 0,
|
int num_custom_embeddings = 0,
|
||||||
void* custom_embeddings_data = nullptr,
|
void* custom_embeddings_data = NULL,
|
||||||
size_t max_token_idx = 0,
|
size_t max_token_idx = 0,
|
||||||
bool return_pooled = false,
|
bool return_pooled = false) {
|
||||||
int clip_skip = -1) {
|
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
|
||||||
ggml_cgraph* gf = new_graph_custom(2048);
|
|
||||||
|
|
||||||
input_ids = to_backend(input_ids);
|
input_ids = to_backend(input_ids);
|
||||||
|
|
||||||
ggml_tensor* embeddings = nullptr;
|
struct ggml_tensor* embeddings = NULL;
|
||||||
|
|
||||||
if (num_custom_embeddings > 0 && custom_embeddings_data != nullptr) {
|
if (num_custom_embeddings > 0 && custom_embeddings_data != NULL) {
|
||||||
auto token_embed_weight = model.get_token_embed_weight();
|
auto token_embed_weight = model.get_token_embed_weight();
|
||||||
auto custom_embeddings = ggml_new_tensor_2d(compute_ctx,
|
auto custom_embeddings = ggml_new_tensor_2d(compute_ctx,
|
||||||
token_embed_weight->type,
|
token_embed_weight->type,
|
||||||
@ -981,42 +927,25 @@ struct CLIPTextModelRunner : public GGMLRunner {
|
|||||||
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
|
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
int n_tokens = static_cast<int>(input_ids->ne[0]);
|
struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled);
|
||||||
attention_mask_vec.resize(n_tokens * n_tokens);
|
|
||||||
for (int i0 = 0; i0 < n_tokens; i0++) {
|
|
||||||
for (int i1 = 0; i1 < n_tokens; i1++) {
|
|
||||||
float value = 0.f;
|
|
||||||
if (i0 > i1) {
|
|
||||||
value = -INFINITY;
|
|
||||||
}
|
|
||||||
attention_mask_vec[i1 * n_tokens + i0] = value;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
auto attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, n_tokens, n_tokens);
|
|
||||||
set_backend_tensor_data(attention_mask, attention_mask_vec.data());
|
|
||||||
|
|
||||||
auto runner_ctx = get_context();
|
|
||||||
|
|
||||||
ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, attention_mask, max_token_idx, return_pooled, clip_skip);
|
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, hidden_states);
|
ggml_build_forward_expand(gf, hidden_states);
|
||||||
|
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool compute(const int n_threads,
|
void compute(const int n_threads,
|
||||||
ggml_tensor* input_ids,
|
struct ggml_tensor* input_ids,
|
||||||
int num_custom_embeddings,
|
int num_custom_embeddings,
|
||||||
void* custom_embeddings_data,
|
void* custom_embeddings_data,
|
||||||
size_t max_token_idx,
|
size_t max_token_idx,
|
||||||
bool return_pooled,
|
bool return_pooled,
|
||||||
int clip_skip,
|
|
||||||
ggml_tensor** output,
|
ggml_tensor** output,
|
||||||
ggml_context* output_ctx = nullptr) {
|
ggml_context* output_ctx = NULL) {
|
||||||
auto get_graph = [&]() -> ggml_cgraph* {
|
auto get_graph = [&]() -> struct ggml_cgraph* {
|
||||||
return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
|
return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
|
||||||
};
|
};
|
||||||
return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
|
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef __COMMON_BLOCK_HPP__
|
#ifndef __COMMON_HPP__
|
||||||
#define __COMMON_BLOCK_HPP__
|
#define __COMMON_HPP__
|
||||||
|
|
||||||
#include "ggml_extend.hpp"
|
#include "ggml_extend.hpp"
|
||||||
|
|
||||||
@ -23,12 +23,12 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
// x: [N, channels, h, w]
|
// x: [N, channels, h, w]
|
||||||
if (vae_downsample) {
|
if (vae_downsample) {
|
||||||
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
|
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
|
||||||
|
|
||||||
x = ggml_ext_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
|
x = ggml_pad(ctx, x, 1, 1, 0, 0);
|
||||||
x = conv->forward(ctx, x);
|
x = conv->forward(ctx, x);
|
||||||
} else {
|
} else {
|
||||||
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
|
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
|
||||||
@ -52,11 +52,11 @@ public:
|
|||||||
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
|
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
// x: [N, channels, h, w]
|
// x: [N, channels, h, w]
|
||||||
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
|
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
|
||||||
|
|
||||||
x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
|
x = ggml_upscale(ctx, x, 2); // [N, channels, h*2, w*2]
|
||||||
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
|
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
@ -80,7 +80,7 @@ protected:
|
|||||||
std::pair<int, int> padding) {
|
std::pair<int, int> padding) {
|
||||||
GGML_ASSERT(dims == 2 || dims == 3);
|
GGML_ASSERT(dims == 2 || dims == 3);
|
||||||
if (dims == 3) {
|
if (dims == 3) {
|
||||||
return std::shared_ptr<GGMLBlock>(new Conv3d(in_channels, out_channels, {kernel_size.first, 1, 1}, {1, 1, 1}, {padding.first, 0, 0}));
|
return std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(in_channels, out_channels, kernel_size.first, 1, padding.first));
|
||||||
} else {
|
} else {
|
||||||
return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
|
return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
|
||||||
}
|
}
|
||||||
@ -121,7 +121,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* emb = nullptr) {
|
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = NULL) {
|
||||||
// For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
|
// For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
|
||||||
// [N, c, t, h, w] => [N, c, t, h * w]
|
// [N, c, t, h, w] => [N, c, t, h * w]
|
||||||
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
|
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
|
||||||
@ -131,38 +131,38 @@ public:
|
|||||||
auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
|
auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
|
||||||
auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);
|
auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);
|
||||||
|
|
||||||
if (emb == nullptr) {
|
if (emb == NULL) {
|
||||||
GGML_ASSERT(skip_t_emb);
|
GGML_ASSERT(skip_t_emb);
|
||||||
}
|
}
|
||||||
|
|
||||||
// in_layers
|
// in_layers
|
||||||
auto h = in_layers_0->forward(ctx, x);
|
auto h = in_layers_0->forward(ctx, x);
|
||||||
h = ggml_silu_inplace(ctx->ggml_ctx, h);
|
h = ggml_silu_inplace(ctx, h);
|
||||||
h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||||
|
|
||||||
// emb_layers
|
// emb_layers
|
||||||
if (!skip_t_emb) {
|
if (!skip_t_emb) {
|
||||||
auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
|
auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
|
||||||
|
|
||||||
auto emb_out = ggml_silu(ctx->ggml_ctx, emb);
|
auto emb_out = ggml_silu(ctx, emb);
|
||||||
emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels]
|
emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels]
|
||||||
|
|
||||||
if (dims == 2) {
|
if (dims == 2) {
|
||||||
emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
|
emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
|
||||||
} else {
|
} else {
|
||||||
emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
|
emb_out = ggml_reshape_4d(ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
|
||||||
if (exchange_temb_dims) {
|
if (exchange_temb_dims) {
|
||||||
// emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
|
// emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
|
||||||
emb_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
|
emb_out = ggml_cont(ctx, ggml_permute(ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
h = ggml_add(ctx->ggml_ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
h = ggml_add(ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||||
}
|
}
|
||||||
|
|
||||||
// out_layers
|
// out_layers
|
||||||
h = out_layers_0->forward(ctx, h);
|
h = out_layers_0->forward(ctx, h);
|
||||||
h = ggml_silu_inplace(ctx->ggml_ctx, h);
|
h = ggml_silu_inplace(ctx, h);
|
||||||
// dropout, skip for inference
|
// dropout, skip for inference
|
||||||
h = out_layers_3->forward(ctx, h);
|
h = out_layers_3->forward(ctx, h);
|
||||||
|
|
||||||
@ -172,97 +172,67 @@ public:
|
|||||||
x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||||
}
|
}
|
||||||
|
|
||||||
h = ggml_add(ctx->ggml_ctx, h, x);
|
h = ggml_add(ctx, h, x);
|
||||||
return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class GEGLU : public UnaryBlock {
|
class GEGLU : public GGMLBlock {
|
||||||
protected:
|
protected:
|
||||||
int64_t dim_in;
|
int64_t dim_in;
|
||||||
int64_t dim_out;
|
int64_t dim_out;
|
||||||
|
|
||||||
|
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
|
||||||
|
enum ggml_type wtype = (tensor_types.find(prefix + "proj.weight") != tensor_types.end()) ? tensor_types[prefix + "proj.weight"] : GGML_TYPE_F32;
|
||||||
|
enum ggml_type bias_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "proj.bias") != tensor_types.end()) ? tensor_types[prefix + "proj.bias"] : GGML_TYPE_F32;
|
||||||
|
params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
|
||||||
|
params["proj.bias"] = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
|
||||||
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
GEGLU(int64_t dim_in, int64_t dim_out)
|
GEGLU(int64_t dim_in, int64_t dim_out)
|
||||||
: dim_in(dim_in), dim_out(dim_out) {
|
: dim_in(dim_in), dim_out(dim_out) {}
|
||||||
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out * 2));
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
// x: [ne3, ne2, ne1, dim_in]
|
// x: [ne3, ne2, ne1, dim_in]
|
||||||
// return: [ne3, ne2, ne1, dim_out]
|
// return: [ne3, ne2, ne1, dim_out]
|
||||||
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
|
struct ggml_tensor* w = params["proj.weight"];
|
||||||
|
struct ggml_tensor* b = params["proj.bias"];
|
||||||
|
|
||||||
x = proj->forward(ctx, x); // [ne3, ne2, ne1, dim_out*2]
|
auto x_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0); // [dim_out, dim_in]
|
||||||
auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0, false);
|
auto x_b = ggml_view_1d(ctx, b, b->ne[0] / 2, 0); // [dim_out, dim_in]
|
||||||
x = x_vec[0]; // [ne3, ne2, ne1, dim_out]
|
auto gate_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2); // [dim_out, ]
|
||||||
auto gate = x_vec[1]; // [ne3, ne2, ne1, dim_out]
|
auto gate_b = ggml_view_1d(ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2); // [dim_out, ]
|
||||||
|
|
||||||
gate = ggml_cont(ctx->ggml_ctx, gate);
|
auto x_in = x;
|
||||||
|
x = ggml_nn_linear(ctx, x_in, x_w, x_b); // [ne3, ne2, ne1, dim_out]
|
||||||
|
auto gate = ggml_nn_linear(ctx, x_in, gate_w, gate_b); // [ne3, ne2, ne1, dim_out]
|
||||||
|
|
||||||
gate = ggml_ext_gelu(ctx->ggml_ctx, gate, true);
|
gate = ggml_gelu_inplace(ctx, gate);
|
||||||
|
|
||||||
x = ggml_mul(ctx->ggml_ctx, x, gate); // [ne3, ne2, ne1, dim_out]
|
x = ggml_mul(ctx, x, gate); // [ne3, ne2, ne1, dim_out]
|
||||||
|
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class GELU : public UnaryBlock {
|
|
||||||
public:
|
|
||||||
GELU(int64_t dim_in, int64_t dim_out, bool bias = true) {
|
|
||||||
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
|
|
||||||
// x: [ne3, ne2, ne1, dim_in]
|
|
||||||
// return: [ne3, ne2, ne1, dim_out]
|
|
||||||
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
|
|
||||||
|
|
||||||
x = proj->forward(ctx, x);
|
|
||||||
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
class FeedForward : public GGMLBlock {
|
class FeedForward : public GGMLBlock {
|
||||||
public:
|
public:
|
||||||
enum class Activation {
|
|
||||||
GEGLU,
|
|
||||||
GELU
|
|
||||||
};
|
|
||||||
FeedForward(int64_t dim,
|
FeedForward(int64_t dim,
|
||||||
int64_t dim_out,
|
int64_t dim_out,
|
||||||
int64_t mult = 4,
|
int64_t mult = 4) {
|
||||||
Activation activation = Activation::GEGLU,
|
|
||||||
bool precision_fix = false) {
|
|
||||||
int64_t inner_dim = dim * mult;
|
int64_t inner_dim = dim * mult;
|
||||||
if (activation == Activation::GELU) {
|
|
||||||
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
|
|
||||||
} else {
|
|
||||||
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
|
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
|
||||||
}
|
|
||||||
|
|
||||||
// net_1 is nn.Dropout(), skip for inference
|
// net_1 is nn.Dropout(), skip for inference
|
||||||
bool force_prec_f32 = false;
|
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
|
||||||
float scale = 1.f;
|
|
||||||
if (precision_fix) {
|
|
||||||
scale = 1.f / 128.f;
|
|
||||||
#ifdef SD_USE_VULKAN
|
|
||||||
force_prec_f32 = true;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
// The purpose of the scale here is to prevent NaN issues in certain situations.
|
|
||||||
// For example, when using Vulkan without enabling force_prec_f32,
|
|
||||||
// or when using CUDA but the weights are k-quants.
|
|
||||||
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
// x: [ne3, ne2, ne1, dim]
|
// x: [ne3, ne2, ne1, dim]
|
||||||
// return: [ne3, ne2, ne1, dim_out]
|
// return: [ne3, ne2, ne1, dim_out]
|
||||||
|
|
||||||
auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
|
auto net_0 = std::dynamic_pointer_cast<GEGLU>(blocks["net.0"]);
|
||||||
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
|
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
|
||||||
|
|
||||||
x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
|
x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
|
||||||
@ -277,16 +247,19 @@ protected:
|
|||||||
int64_t context_dim;
|
int64_t context_dim;
|
||||||
int64_t n_head;
|
int64_t n_head;
|
||||||
int64_t d_head;
|
int64_t d_head;
|
||||||
|
bool flash_attn;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
CrossAttention(int64_t query_dim,
|
CrossAttention(int64_t query_dim,
|
||||||
int64_t context_dim,
|
int64_t context_dim,
|
||||||
int64_t n_head,
|
int64_t n_head,
|
||||||
int64_t d_head)
|
int64_t d_head,
|
||||||
|
bool flash_attn = false)
|
||||||
: n_head(n_head),
|
: n_head(n_head),
|
||||||
d_head(d_head),
|
d_head(d_head),
|
||||||
query_dim(query_dim),
|
query_dim(query_dim),
|
||||||
context_dim(context_dim) {
|
context_dim(context_dim),
|
||||||
|
flash_attn(flash_attn) {
|
||||||
int64_t inner_dim = d_head * n_head;
|
int64_t inner_dim = d_head * n_head;
|
||||||
|
|
||||||
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
|
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
|
||||||
@ -297,9 +270,7 @@ public:
|
|||||||
// to_out_1 is nn.Dropout(), skip for inference
|
// to_out_1 is nn.Dropout(), skip for inference
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
|
||||||
ggml_tensor* x,
|
|
||||||
ggml_tensor* context) {
|
|
||||||
// x: [N, n_token, query_dim]
|
// x: [N, n_token, query_dim]
|
||||||
// context: [N, n_context, context_dim]
|
// context: [N, n_context, context_dim]
|
||||||
// return: [N, n_token, query_dim]
|
// return: [N, n_token, query_dim]
|
||||||
@ -317,7 +288,7 @@ public:
|
|||||||
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
|
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
|
||||||
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
|
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
|
||||||
|
|
||||||
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
|
x = ggml_nn_attention_ext(ctx, q, k, v, n_head, NULL, false, false, flash_attn); // [N, n_token, inner_dim]
|
||||||
|
|
||||||
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
|
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
|
||||||
return x;
|
return x;
|
||||||
@ -335,15 +306,16 @@ public:
|
|||||||
int64_t n_head,
|
int64_t n_head,
|
||||||
int64_t d_head,
|
int64_t d_head,
|
||||||
int64_t context_dim,
|
int64_t context_dim,
|
||||||
bool ff_in = false)
|
bool ff_in = false,
|
||||||
|
bool flash_attn = false)
|
||||||
: n_head(n_head), d_head(d_head), ff_in(ff_in) {
|
: n_head(n_head), d_head(d_head), ff_in(ff_in) {
|
||||||
// disable_self_attn is always False
|
// disable_self_attn is always False
|
||||||
// disable_temporal_crossattention is always False
|
// disable_temporal_crossattention is always False
|
||||||
// switch_temporal_ca_to_sa is always False
|
// switch_temporal_ca_to_sa is always False
|
||||||
// inner_dim is always None or equal to dim
|
// inner_dim is always None or equal to dim
|
||||||
// gated_ff is always True
|
// gated_ff is always True
|
||||||
blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head));
|
blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head, flash_attn));
|
||||||
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head));
|
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head, flash_attn));
|
||||||
blocks["ff"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
|
blocks["ff"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
|
||||||
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||||
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||||
@ -355,9 +327,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
|
||||||
ggml_tensor* x,
|
|
||||||
ggml_tensor* context) {
|
|
||||||
// x: [N, n_token, query_dim]
|
// x: [N, n_token, query_dim]
|
||||||
// context: [N, n_context, context_dim]
|
// context: [N, n_context, context_dim]
|
||||||
// return: [N, n_token, query_dim]
|
// return: [N, n_token, query_dim]
|
||||||
@ -377,21 +347,21 @@ public:
|
|||||||
x = norm_in->forward(ctx, x);
|
x = norm_in->forward(ctx, x);
|
||||||
x = ff_in->forward(ctx, x);
|
x = ff_in->forward(ctx, x);
|
||||||
// self.is_res is always True
|
// self.is_res is always True
|
||||||
x = ggml_add(ctx->ggml_ctx, x, x_skip);
|
x = ggml_add(ctx, x, x_skip);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto r = x;
|
auto r = x;
|
||||||
x = norm1->forward(ctx, x);
|
x = norm1->forward(ctx, x);
|
||||||
x = attn1->forward(ctx, x, x); // self-attention
|
x = attn1->forward(ctx, x, x); // self-attention
|
||||||
x = ggml_add(ctx->ggml_ctx, x, r);
|
x = ggml_add(ctx, x, r);
|
||||||
r = x;
|
r = x;
|
||||||
x = norm2->forward(ctx, x);
|
x = norm2->forward(ctx, x);
|
||||||
x = attn2->forward(ctx, x, context); // cross-attention
|
x = attn2->forward(ctx, x, context); // cross-attention
|
||||||
x = ggml_add(ctx->ggml_ctx, x, r);
|
x = ggml_add(ctx, x, r);
|
||||||
r = x;
|
r = x;
|
||||||
x = norm3->forward(ctx, x);
|
x = norm3->forward(ctx, x);
|
||||||
x = ff->forward(ctx, x);
|
x = ff->forward(ctx, x);
|
||||||
x = ggml_add(ctx->ggml_ctx, x, r);
|
x = ggml_add(ctx, x, r);
|
||||||
|
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
@ -404,23 +374,6 @@ protected:
|
|||||||
int64_t d_head;
|
int64_t d_head;
|
||||||
int64_t depth = 1; // 1
|
int64_t depth = 1; // 1
|
||||||
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
|
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
|
||||||
bool use_linear = false;
|
|
||||||
|
|
||||||
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
|
|
||||||
auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
|
|
||||||
if (iter != tensor_storage_map.end()) {
|
|
||||||
int64_t inner_dim = n_head * d_head;
|
|
||||||
if (iter->second.n_dims == 4 && use_linear) {
|
|
||||||
use_linear = false;
|
|
||||||
blocks["proj_in"] = std::make_shared<Conv2d>(in_channels, inner_dim, std::pair{1, 1});
|
|
||||||
blocks["proj_out"] = std::make_shared<Conv2d>(inner_dim, in_channels, std::pair{1, 1});
|
|
||||||
} else if (iter->second.n_dims == 2 && !use_linear) {
|
|
||||||
use_linear = true;
|
|
||||||
blocks["proj_in"] = std::make_shared<Linear>(in_channels, inner_dim);
|
|
||||||
blocks["proj_out"] = std::make_shared<Linear>(inner_dim, in_channels);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
SpatialTransformer(int64_t in_channels,
|
SpatialTransformer(int64_t in_channels,
|
||||||
@ -428,42 +381,32 @@ public:
|
|||||||
int64_t d_head,
|
int64_t d_head,
|
||||||
int64_t depth,
|
int64_t depth,
|
||||||
int64_t context_dim,
|
int64_t context_dim,
|
||||||
bool use_linear)
|
bool flash_attn = false)
|
||||||
: in_channels(in_channels),
|
: in_channels(in_channels),
|
||||||
n_head(n_head),
|
n_head(n_head),
|
||||||
d_head(d_head),
|
d_head(d_head),
|
||||||
depth(depth),
|
depth(depth),
|
||||||
context_dim(context_dim),
|
context_dim(context_dim) {
|
||||||
use_linear(use_linear) {
|
// We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
|
||||||
// disable_self_attn is always False
|
// disable_self_attn is always False
|
||||||
int64_t inner_dim = n_head * d_head; // in_channels
|
int64_t inner_dim = n_head * d_head; // in_channels
|
||||||
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
|
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
|
||||||
if (use_linear) {
|
|
||||||
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, inner_dim));
|
|
||||||
} else {
|
|
||||||
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
|
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < depth; i++) {
|
for (int i = 0; i < depth; i++) {
|
||||||
std::string name = "transformer_blocks." + std::to_string(i);
|
std::string name = "transformer_blocks." + std::to_string(i);
|
||||||
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false));
|
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (use_linear) {
|
|
||||||
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, in_channels));
|
|
||||||
} else {
|
|
||||||
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
|
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
virtual ggml_tensor* forward(GGMLRunnerContext* ctx,
|
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
|
||||||
ggml_tensor* x,
|
|
||||||
ggml_tensor* context) {
|
|
||||||
// x: [N, in_channels, h, w]
|
// x: [N, in_channels, h, w]
|
||||||
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
|
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
|
||||||
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
|
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
|
||||||
auto proj_in = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_in"]);
|
auto proj_in = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
|
||||||
auto proj_out = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_out"]);
|
auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
|
||||||
|
|
||||||
auto x_in = x;
|
auto x_in = x;
|
||||||
int64_t n = x->ne[3];
|
int64_t n = x->ne[3];
|
||||||
@ -472,15 +415,10 @@ public:
|
|||||||
int64_t inner_dim = n_head * d_head;
|
int64_t inner_dim = n_head * d_head;
|
||||||
|
|
||||||
x = norm->forward(ctx, x);
|
x = norm->forward(ctx, x);
|
||||||
if (use_linear) {
|
|
||||||
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
|
|
||||||
x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
|
|
||||||
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
|
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
|
||||||
} else {
|
|
||||||
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
|
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
|
||||||
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
|
x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
|
||||||
x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < depth; i++) {
|
for (int i = 0; i < depth; i++) {
|
||||||
std::string name = "transformer_blocks." + std::to_string(i);
|
std::string name = "transformer_blocks." + std::to_string(i);
|
||||||
@ -489,37 +427,29 @@ public:
|
|||||||
x = transformer_block->forward(ctx, x, context);
|
x = transformer_block->forward(ctx, x, context);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (use_linear) {
|
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
|
||||||
// proj_out
|
x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
|
||||||
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
|
|
||||||
|
|
||||||
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
|
|
||||||
x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
|
|
||||||
} else {
|
|
||||||
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
|
|
||||||
x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
|
|
||||||
|
|
||||||
// proj_out
|
// proj_out
|
||||||
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
|
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
|
||||||
}
|
|
||||||
|
|
||||||
x = ggml_add(ctx->ggml_ctx, x, x_in);
|
x = ggml_add(ctx, x, x_in);
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class AlphaBlender : public GGMLBlock {
|
class AlphaBlender : public GGMLBlock {
|
||||||
protected:
|
protected:
|
||||||
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
|
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
|
||||||
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
|
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
|
||||||
enum ggml_type wtype = GGML_TYPE_F32;
|
enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.ypes.find(prefix + "mix_factor") != tensor_types.end()) ? tensor_types[prefix + "mix_factor"] : GGML_TYPE_F32;
|
||||||
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
|
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
float get_alpha() {
|
float get_alpha() {
|
||||||
// image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
|
// image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
|
||||||
// so learned_with_images is same as learned
|
// so learned_with_images is same as learned
|
||||||
float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
|
float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]);
|
||||||
return sigmoid(alpha);
|
return sigmoid(alpha);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -530,23 +460,23 @@ public:
|
|||||||
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
|
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
struct ggml_tensor* forward(struct ggml_context* ctx,
|
||||||
ggml_tensor* x_spatial,
|
struct ggml_tensor* x_spatial,
|
||||||
ggml_tensor* x_temporal) {
|
struct ggml_tensor* x_temporal) {
|
||||||
// image_only_indicator is always tensor([0.])
|
// image_only_indicator is always tensor([0.])
|
||||||
float alpha = get_alpha();
|
float alpha = get_alpha();
|
||||||
auto x = ggml_add(ctx->ggml_ctx,
|
auto x = ggml_add(ctx,
|
||||||
ggml_ext_scale(ctx->ggml_ctx, x_spatial, alpha),
|
ggml_scale(ctx, x_spatial, alpha),
|
||||||
ggml_ext_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
|
ggml_scale(ctx, x_temporal, 1.0f - alpha));
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class VideoResBlock : public ResBlock {
|
class VideoResBlock : public ResBlock {
|
||||||
public:
|
public:
|
||||||
VideoResBlock(int64_t channels,
|
VideoResBlock(int channels,
|
||||||
int64_t emb_channels,
|
int emb_channels,
|
||||||
int64_t out_channels,
|
int out_channels,
|
||||||
std::pair<int, int> kernel_size = {3, 3},
|
std::pair<int, int> kernel_size = {3, 3},
|
||||||
int64_t video_kernel_size = 3,
|
int64_t video_kernel_size = 3,
|
||||||
int dims = 2) // always 2
|
int dims = 2) // always 2
|
||||||
@ -555,9 +485,9 @@ public:
|
|||||||
blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
|
blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
struct ggml_tensor* forward(struct ggml_context* ctx,
|
||||||
ggml_tensor* x,
|
struct ggml_tensor* x,
|
||||||
ggml_tensor* emb,
|
struct ggml_tensor* emb,
|
||||||
int num_video_frames) {
|
int num_video_frames) {
|
||||||
// x: [N, channels, h, w] aka [b*t, channels, h, w]
|
// x: [N, channels, h, w] aka [b*t, channels, h, w]
|
||||||
// emb: [N, emb_channels] aka [b*t, emb_channels]
|
// emb: [N, emb_channels] aka [b*t, emb_channels]
|
||||||
@ -573,21 +503,21 @@ public:
|
|||||||
int64_t H = x->ne[1];
|
int64_t H = x->ne[1];
|
||||||
int64_t W = x->ne[0];
|
int64_t W = x->ne[0];
|
||||||
|
|
||||||
x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
|
x = ggml_reshape_4d(ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
|
||||||
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
|
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
|
||||||
auto x_mix = x;
|
auto x_mix = x;
|
||||||
|
|
||||||
emb = ggml_reshape_4d(ctx->ggml_ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ...
|
emb = ggml_reshape_4d(ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ...
|
||||||
|
|
||||||
x = time_stack->forward(ctx, x, emb); // b t c (h w)
|
x = time_stack->forward(ctx, x, emb); // b t c (h w)
|
||||||
|
|
||||||
x = time_mixer->forward(ctx, x_mix, x); // b t c (h w)
|
x = time_mixer->forward(ctx, x_mix, x); // b t c (h w)
|
||||||
|
|
||||||
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
|
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
|
||||||
x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
|
x = ggml_reshape_4d(ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
|
||||||
|
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // __COMMON_BLOCK_HPP__
|
#endif // __COMMON_HPP__
|
||||||
1221
conditioner.hpp
Normal file
@ -1,7 +1,8 @@
|
|||||||
#ifndef __CONTROL_HPP__
|
#ifndef __CONTROL_HPP__
|
||||||
#define __CONTROL_HPP__
|
#define __CONTROL_HPP__
|
||||||
|
|
||||||
#include "common_block.hpp"
|
#include "common.hpp"
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
#include "model.h"
|
#include "model.h"
|
||||||
|
|
||||||
#define CONTROL_NET_GRAPH_SIZE 1536
|
#define CONTROL_NET_GRAPH_SIZE 1536
|
||||||
@ -26,7 +27,6 @@ protected:
|
|||||||
int num_heads = 8;
|
int num_heads = 8;
|
||||||
int num_head_channels = -1; // channels // num_heads
|
int num_head_channels = -1; // channels // num_heads
|
||||||
int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
|
int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
|
||||||
bool use_linear_projection = false;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
int model_channels = 320;
|
int model_channels = 320;
|
||||||
@ -82,7 +82,7 @@ public:
|
|||||||
int64_t d_head,
|
int64_t d_head,
|
||||||
int64_t depth,
|
int64_t depth,
|
||||||
int64_t context_dim) -> SpatialTransformer* {
|
int64_t context_dim) -> SpatialTransformer* {
|
||||||
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
|
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
|
||||||
};
|
};
|
||||||
|
|
||||||
auto make_zero_conv = [&](int64_t channels) {
|
auto make_zero_conv = [&](int64_t channels) {
|
||||||
@ -164,26 +164,26 @@ public:
|
|||||||
blocks["middle_block_out.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
|
blocks["middle_block_out.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* resblock_forward(std::string name,
|
struct ggml_tensor* resblock_forward(std::string name,
|
||||||
GGMLRunnerContext* ctx,
|
struct ggml_context* ctx,
|
||||||
ggml_tensor* x,
|
struct ggml_tensor* x,
|
||||||
ggml_tensor* emb) {
|
struct ggml_tensor* emb) {
|
||||||
auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
|
auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
|
||||||
return block->forward(ctx, x, emb);
|
return block->forward(ctx, x, emb);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* attention_layer_forward(std::string name,
|
struct ggml_tensor* attention_layer_forward(std::string name,
|
||||||
GGMLRunnerContext* ctx,
|
struct ggml_context* ctx,
|
||||||
ggml_tensor* x,
|
struct ggml_tensor* x,
|
||||||
ggml_tensor* context) {
|
struct ggml_tensor* context) {
|
||||||
auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
|
auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
|
||||||
return block->forward(ctx, x, context);
|
return block->forward(ctx, x, context);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* input_hint_block_forward(GGMLRunnerContext* ctx,
|
struct ggml_tensor* input_hint_block_forward(struct ggml_context* ctx,
|
||||||
ggml_tensor* hint,
|
struct ggml_tensor* hint,
|
||||||
ggml_tensor* emb,
|
struct ggml_tensor* emb,
|
||||||
ggml_tensor* context) {
|
struct ggml_tensor* context) {
|
||||||
int num_input_blocks = 15;
|
int num_input_blocks = 15;
|
||||||
auto h = hint;
|
auto h = hint;
|
||||||
for (int i = 0; i < num_input_blocks; i++) {
|
for (int i = 0; i < num_input_blocks; i++) {
|
||||||
@ -192,32 +192,32 @@ public:
|
|||||||
|
|
||||||
h = block->forward(ctx, h);
|
h = block->forward(ctx, h);
|
||||||
} else {
|
} else {
|
||||||
h = ggml_silu_inplace(ctx->ggml_ctx, h);
|
h = ggml_silu_inplace(ctx, h);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return h;
|
return h;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<ggml_tensor*> forward(GGMLRunnerContext* ctx,
|
std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx,
|
||||||
ggml_tensor* x,
|
struct ggml_tensor* x,
|
||||||
ggml_tensor* hint,
|
struct ggml_tensor* hint,
|
||||||
ggml_tensor* guided_hint,
|
struct ggml_tensor* guided_hint,
|
||||||
ggml_tensor* timesteps,
|
struct ggml_tensor* timesteps,
|
||||||
ggml_tensor* context,
|
struct ggml_tensor* context,
|
||||||
ggml_tensor* y = nullptr) {
|
struct ggml_tensor* y = NULL) {
|
||||||
// x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
|
// x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
|
||||||
// timesteps: [N,]
|
// timesteps: [N,]
|
||||||
// context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
|
// context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
|
||||||
// y: [N, adm_in_channels] or [1, adm_in_channels]
|
// y: [N, adm_in_channels] or [1, adm_in_channels]
|
||||||
if (context != nullptr) {
|
if (context != NULL) {
|
||||||
if (context->ne[2] != x->ne[3]) {
|
if (context->ne[2] != x->ne[3]) {
|
||||||
context = ggml_repeat(ctx->ggml_ctx, context, ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
|
context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (y != nullptr) {
|
if (y != NULL) {
|
||||||
if (y->ne[1] != x->ne[3]) {
|
if (y->ne[1] != x->ne[3]) {
|
||||||
y = ggml_repeat(ctx->ggml_ctx, y, ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
|
y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -228,27 +228,27 @@ public:
|
|||||||
|
|
||||||
auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);
|
auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);
|
||||||
|
|
||||||
auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, model_channels); // [N, model_channels]
|
auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels); // [N, model_channels]
|
||||||
|
|
||||||
auto emb = time_embed_0->forward(ctx, t_emb);
|
auto emb = time_embed_0->forward(ctx, t_emb);
|
||||||
emb = ggml_silu_inplace(ctx->ggml_ctx, emb);
|
emb = ggml_silu_inplace(ctx, emb);
|
||||||
emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim]
|
emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim]
|
||||||
|
|
||||||
// SDXL/SVD
|
// SDXL/SVD
|
||||||
if (y != nullptr) {
|
if (y != NULL) {
|
||||||
auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
|
auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
|
||||||
auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);
|
auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);
|
||||||
|
|
||||||
auto label_emb = label_embed_0->forward(ctx, y);
|
auto label_emb = label_embed_0->forward(ctx, y);
|
||||||
label_emb = ggml_silu_inplace(ctx->ggml_ctx, label_emb);
|
label_emb = ggml_silu_inplace(ctx, label_emb);
|
||||||
label_emb = label_embed_2->forward(ctx, label_emb); // [N, time_embed_dim]
|
label_emb = label_embed_2->forward(ctx, label_emb); // [N, time_embed_dim]
|
||||||
|
|
||||||
emb = ggml_add(ctx->ggml_ctx, emb, label_emb); // [N, time_embed_dim]
|
emb = ggml_add(ctx, emb, label_emb); // [N, time_embed_dim]
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<ggml_tensor*> outs;
|
std::vector<struct ggml_tensor*> outs;
|
||||||
|
|
||||||
if (guided_hint == nullptr) {
|
if (guided_hint == NULL) {
|
||||||
guided_hint = input_hint_block_forward(ctx, hint, emb, context);
|
guided_hint = input_hint_block_forward(ctx, hint, emb, context);
|
||||||
}
|
}
|
||||||
outs.push_back(guided_hint);
|
outs.push_back(guided_hint);
|
||||||
@ -257,7 +257,7 @@ public:
|
|||||||
|
|
||||||
// input block 0
|
// input block 0
|
||||||
auto h = input_blocks_0_0->forward(ctx, x);
|
auto h = input_blocks_0_0->forward(ctx, x);
|
||||||
h = ggml_add(ctx->ggml_ctx, h, guided_hint);
|
h = ggml_add(ctx, h, guided_hint);
|
||||||
outs.push_back(zero_convs_0->forward(ctx, h));
|
outs.push_back(zero_convs_0->forward(ctx, h));
|
||||||
|
|
||||||
// input block 1-11
|
// input block 1-11
|
||||||
@ -310,28 +310,27 @@ struct ControlNet : public GGMLRunner {
|
|||||||
SDVersion version = VERSION_SD1;
|
SDVersion version = VERSION_SD1;
|
||||||
ControlNetBlock control_net;
|
ControlNetBlock control_net;
|
||||||
|
|
||||||
ggml_backend_buffer_t control_buffer = nullptr; // keep control output tensors in backend memory
|
ggml_backend_buffer_t control_buffer = NULL; // keep control output tensors in backend memory
|
||||||
ggml_context* control_ctx = nullptr;
|
ggml_context* control_ctx = NULL;
|
||||||
std::vector<ggml_tensor*> controls; // (12 input block outputs, 1 middle block output) SD 1.5
|
std::vector<struct ggml_tensor*> controls; // (12 input block outputs, 1 middle block output) SD 1.5
|
||||||
ggml_tensor* guided_hint = nullptr; // guided_hint cache, for faster inference
|
struct ggml_tensor* guided_hint = NULL; // guided_hint cache, for faster inference
|
||||||
bool guided_hint_cached = false;
|
bool guided_hint_cached = false;
|
||||||
|
|
||||||
ControlNet(ggml_backend_t backend,
|
ControlNet(ggml_backend_t backend,
|
||||||
bool offload_params_to_cpu,
|
std::map<std::string, enum ggml_type>& tensor_types,
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
|
||||||
SDVersion version = VERSION_SD1)
|
SDVersion version = VERSION_SD1)
|
||||||
: GGMLRunner(backend, offload_params_to_cpu), control_net(version) {
|
: GGMLRunner(backend), control_net(version) {
|
||||||
control_net.init(params_ctx, tensor_storage_map, "");
|
control_net.init(params_ctx, tensor_types, "");
|
||||||
}
|
}
|
||||||
|
|
||||||
~ControlNet() override {
|
~ControlNet() {
|
||||||
free_control_ctx();
|
free_control_ctx();
|
||||||
}
|
}
|
||||||
|
|
||||||
void alloc_control_ctx(std::vector<ggml_tensor*> outs) {
|
void alloc_control_ctx(std::vector<struct ggml_tensor*> outs) {
|
||||||
ggml_init_params params;
|
struct ggml_init_params params;
|
||||||
params.mem_size = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
|
params.mem_size = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
|
||||||
params.mem_buffer = nullptr;
|
params.mem_buffer = NULL;
|
||||||
params.no_alloc = true;
|
params.no_alloc = true;
|
||||||
control_ctx = ggml_init(params);
|
control_ctx = ggml_init(params);
|
||||||
|
|
||||||
@ -347,43 +346,43 @@ struct ControlNet : public GGMLRunner {
|
|||||||
control_buffer_size += ggml_nbytes(controls[i]);
|
control_buffer_size += ggml_nbytes(controls[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend);
|
control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, backend);
|
||||||
|
|
||||||
LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
|
LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
|
||||||
}
|
}
|
||||||
|
|
||||||
void free_control_ctx() {
|
void free_control_ctx() {
|
||||||
if (control_buffer != nullptr) {
|
if (control_buffer != NULL) {
|
||||||
ggml_backend_buffer_free(control_buffer);
|
ggml_backend_buffer_free(control_buffer);
|
||||||
control_buffer = nullptr;
|
control_buffer = NULL;
|
||||||
}
|
}
|
||||||
if (control_ctx != nullptr) {
|
if (control_ctx != NULL) {
|
||||||
ggml_free(control_ctx);
|
ggml_free(control_ctx);
|
||||||
control_ctx = nullptr;
|
control_ctx = NULL;
|
||||||
}
|
}
|
||||||
guided_hint = nullptr;
|
guided_hint = NULL;
|
||||||
guided_hint_cached = false;
|
guided_hint_cached = false;
|
||||||
controls.clear();
|
controls.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string get_desc() override {
|
std::string get_desc() {
|
||||||
return "control_net";
|
return "control_net";
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
|
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
|
||||||
control_net.get_param_tensors(tensors, prefix);
|
control_net.get_param_tensors(tensors, prefix);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_cgraph* build_graph(ggml_tensor* x,
|
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
|
||||||
ggml_tensor* hint,
|
struct ggml_tensor* hint,
|
||||||
ggml_tensor* timesteps,
|
struct ggml_tensor* timesteps,
|
||||||
ggml_tensor* context,
|
struct ggml_tensor* context,
|
||||||
ggml_tensor* y = nullptr) {
|
struct ggml_tensor* y = NULL) {
|
||||||
ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE);
|
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);
|
||||||
|
|
||||||
x = to_backend(x);
|
x = to_backend(x);
|
||||||
if (guided_hint_cached) {
|
if (guided_hint_cached) {
|
||||||
hint = nullptr;
|
hint = NULL;
|
||||||
} else {
|
} else {
|
||||||
hint = to_backend(hint);
|
hint = to_backend(hint);
|
||||||
}
|
}
|
||||||
@ -391,17 +390,15 @@ struct ControlNet : public GGMLRunner {
|
|||||||
y = to_backend(y);
|
y = to_backend(y);
|
||||||
timesteps = to_backend(timesteps);
|
timesteps = to_backend(timesteps);
|
||||||
|
|
||||||
auto runner_ctx = get_context();
|
auto outs = control_net.forward(compute_ctx,
|
||||||
|
|
||||||
auto outs = control_net.forward(&runner_ctx,
|
|
||||||
x,
|
x,
|
||||||
hint,
|
hint,
|
||||||
guided_hint_cached ? guided_hint : nullptr,
|
guided_hint_cached ? guided_hint : NULL,
|
||||||
timesteps,
|
timesteps,
|
||||||
context,
|
context,
|
||||||
y);
|
y);
|
||||||
|
|
||||||
if (control_ctx == nullptr) {
|
if (control_ctx == NULL) {
|
||||||
alloc_control_ctx(outs);
|
alloc_control_ctx(outs);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -413,31 +410,27 @@ struct ControlNet : public GGMLRunner {
|
|||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool compute(int n_threads,
|
void compute(int n_threads,
|
||||||
ggml_tensor* x,
|
struct ggml_tensor* x,
|
||||||
ggml_tensor* hint,
|
struct ggml_tensor* hint,
|
||||||
ggml_tensor* timesteps,
|
struct ggml_tensor* timesteps,
|
||||||
ggml_tensor* context,
|
struct ggml_tensor* context,
|
||||||
ggml_tensor* y,
|
struct ggml_tensor* y,
|
||||||
ggml_tensor** output = nullptr,
|
struct ggml_tensor** output = NULL,
|
||||||
ggml_context* output_ctx = nullptr) {
|
struct ggml_context* output_ctx = NULL) {
|
||||||
// x: [N, in_channels, h, w]
|
// x: [N, in_channels, h, w]
|
||||||
// timesteps: [N, ]
|
// timesteps: [N, ]
|
||||||
// context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
|
// context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
|
||||||
// y: [N, adm_in_channels] or [1, adm_in_channels]
|
// y: [N, adm_in_channels] or [1, adm_in_channels]
|
||||||
auto get_graph = [&]() -> ggml_cgraph* {
|
auto get_graph = [&]() -> struct ggml_cgraph* {
|
||||||
return build_graph(x, hint, timesteps, context, y);
|
return build_graph(x, hint, timesteps, context, y);
|
||||||
};
|
};
|
||||||
|
|
||||||
bool res = GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
|
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
|
||||||
if (res) {
|
|
||||||
// cache guided_hint
|
|
||||||
guided_hint_cached = true;
|
guided_hint_cached = true;
|
||||||
}
|
}
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool load_from_file(const std::string& file_path, int n_threads) {
|
bool load_from_file(const std::string& file_path) {
|
||||||
LOG_INFO("loading control net from '%s'", file_path.c_str());
|
LOG_INFO("loading control net from '%s'", file_path.c_str());
|
||||||
alloc_params_buffer();
|
alloc_params_buffer();
|
||||||
std::map<std::string, ggml_tensor*> tensors;
|
std::map<std::string, ggml_tensor*> tensors;
|
||||||
@ -445,12 +438,12 @@ struct ControlNet : public GGMLRunner {
|
|||||||
std::set<std::string> ignore_tensors;
|
std::set<std::string> ignore_tensors;
|
||||||
|
|
||||||
ModelLoader model_loader;
|
ModelLoader model_loader;
|
||||||
if (!model_loader.init_from_file_and_convert_name(file_path)) {
|
if (!model_loader.init_from_file(file_path)) {
|
||||||
LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
|
LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
|
bool success = model_loader.load_tensors(tensors, backend, ignore_tensors);
|
||||||
|
|
||||||
if (!success) {
|
if (!success) {
|
||||||
LOG_ERROR("load control net tensors from model loader failed");
|
LOG_ERROR("load control net tensors from model loader failed");
|
||||||
182
diffusion_model.hpp
Normal file
@ -0,0 +1,182 @@
|
|||||||
|
#ifndef __DIFFUSION_MODEL_H__
|
||||||
|
#define __DIFFUSION_MODEL_H__
|
||||||
|
|
||||||
|
#include "flux.hpp"
|
||||||
|
#include "mmdit.hpp"
|
||||||
|
#include "unet.hpp"
|
||||||
|
|
||||||
|
struct DiffusionModel {
|
||||||
|
virtual void compute(int n_threads,
|
||||||
|
struct ggml_tensor* x,
|
||||||
|
struct ggml_tensor* timesteps,
|
||||||
|
struct ggml_tensor* context,
|
||||||
|
struct ggml_tensor* c_concat,
|
||||||
|
struct ggml_tensor* y,
|
||||||
|
struct ggml_tensor* guidance,
|
||||||
|
int num_video_frames = -1,
|
||||||
|
std::vector<struct ggml_tensor*> controls = {},
|
||||||
|
float control_strength = 0.f,
|
||||||
|
struct ggml_tensor** output = NULL,
|
||||||
|
struct ggml_context* output_ctx = NULL,
|
||||||
|
std::vector<int> skip_layers = std::vector<int>()) = 0;
|
||||||
|
virtual void alloc_params_buffer() = 0;
|
||||||
|
virtual void free_params_buffer() = 0;
|
||||||
|
virtual void free_compute_buffer() = 0;
|
||||||
|
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
|
||||||
|
virtual size_t get_params_buffer_size() = 0;
|
||||||
|
virtual int64_t get_adm_in_channels() = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct UNetModel : public DiffusionModel {
|
||||||
|
UNetModelRunner unet;
|
||||||
|
|
||||||
|
UNetModel(ggml_backend_t backend,
|
||||||
|
std::map<std::string, enum ggml_type>& tensor_types,
|
||||||
|
SDVersion version = VERSION_SD1,
|
||||||
|
bool flash_attn = false)
|
||||||
|
: unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
|
||||||
|
}
|
||||||
|
|
||||||
|
void alloc_params_buffer() {
|
||||||
|
unet.alloc_params_buffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
void free_params_buffer() {
|
||||||
|
unet.free_params_buffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
void free_compute_buffer() {
|
||||||
|
unet.free_compute_buffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
|
||||||
|
unet.get_param_tensors(tensors, "model.diffusion_model");
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_buffer_size() {
|
||||||
|
return unet.get_params_buffer_size();
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t get_adm_in_channels() {
|
||||||
|
return unet.unet.adm_in_channels;
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute(int n_threads,
|
||||||
|
struct ggml_tensor* x,
|
||||||
|
struct ggml_tensor* timesteps,
|
||||||
|
struct ggml_tensor* context,
|
||||||
|
struct ggml_tensor* c_concat,
|
||||||
|
struct ggml_tensor* y,
|
||||||
|
struct ggml_tensor* guidance,
|
||||||
|
int num_video_frames = -1,
|
||||||
|
std::vector<struct ggml_tensor*> controls = {},
|
||||||
|
float control_strength = 0.f,
|
||||||
|
struct ggml_tensor** output = NULL,
|
||||||
|
struct ggml_context* output_ctx = NULL,
|
||||||
|
std::vector<int> skip_layers = std::vector<int>()) {
|
||||||
|
(void)skip_layers; // SLG doesn't work with UNet models
|
||||||
|
return unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct MMDiTModel : public DiffusionModel {
|
||||||
|
MMDiTRunner mmdit;
|
||||||
|
|
||||||
|
MMDiTModel(ggml_backend_t backend,
|
||||||
|
std::map<std::string, enum ggml_type>& tensor_types)
|
||||||
|
: mmdit(backend, tensor_types, "model.diffusion_model") {
|
||||||
|
}
|
||||||
|
|
||||||
|
void alloc_params_buffer() {
|
||||||
|
mmdit.alloc_params_buffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
void free_params_buffer() {
|
||||||
|
mmdit.free_params_buffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
void free_compute_buffer() {
|
||||||
|
mmdit.free_compute_buffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
|
||||||
|
mmdit.get_param_tensors(tensors, "model.diffusion_model");
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_buffer_size() {
|
||||||
|
return mmdit.get_params_buffer_size();
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t get_adm_in_channels() {
|
||||||
|
return 768 + 1280;
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute(int n_threads,
|
||||||
|
struct ggml_tensor* x,
|
||||||
|
struct ggml_tensor* timesteps,
|
||||||
|
struct ggml_tensor* context,
|
||||||
|
struct ggml_tensor* c_concat,
|
||||||
|
struct ggml_tensor* y,
|
||||||
|
struct ggml_tensor* guidance,
|
||||||
|
int num_video_frames = -1,
|
||||||
|
std::vector<struct ggml_tensor*> controls = {},
|
||||||
|
float control_strength = 0.f,
|
||||||
|
struct ggml_tensor** output = NULL,
|
||||||
|
struct ggml_context* output_ctx = NULL,
|
||||||
|
std::vector<int> skip_layers = std::vector<int>()) {
|
||||||
|
return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx, skip_layers);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct FluxModel : public DiffusionModel {
|
||||||
|
Flux::FluxRunner flux;
|
||||||
|
|
||||||
|
FluxModel(ggml_backend_t backend,
|
||||||
|
std::map<std::string, enum ggml_type>& tensor_types,
|
||||||
|
SDVersion version = VERSION_FLUX,
|
||||||
|
bool flash_attn = false)
|
||||||
|
: flux(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
|
||||||
|
}
|
||||||
|
|
||||||
|
void alloc_params_buffer() {
|
||||||
|
flux.alloc_params_buffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
void free_params_buffer() {
|
||||||
|
flux.free_params_buffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
void free_compute_buffer() {
|
||||||
|
flux.free_compute_buffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
|
||||||
|
flux.get_param_tensors(tensors, "model.diffusion_model");
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_buffer_size() {
|
||||||
|
return flux.get_params_buffer_size();
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t get_adm_in_channels() {
|
||||||
|
return 768;
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute(int n_threads,
|
||||||
|
struct ggml_tensor* x,
|
||||||
|
struct ggml_tensor* timesteps,
|
||||||
|
struct ggml_tensor* context,
|
||||||
|
struct ggml_tensor* c_concat,
|
||||||
|
struct ggml_tensor* y,
|
||||||
|
struct ggml_tensor* guidance,
|
||||||
|
int num_video_frames = -1,
|
||||||
|
std::vector<struct ggml_tensor*> controls = {},
|
||||||
|
float control_strength = 0.f,
|
||||||
|
struct ggml_tensor** output = NULL,
|
||||||
|
struct ggml_context* output_ctx = NULL,
|
||||||
|
std::vector<int> skip_layers = std::vector<int>()) {
|
||||||
|
return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, output, output_ctx, skip_layers);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
@ -1,21 +0,0 @@
|
|||||||
# How to Use
|
|
||||||
|
|
||||||
## Download weights
|
|
||||||
|
|
||||||
- Download Anima
|
|
||||||
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/Bedovyy/Anima-GGUF/tree/main
|
|
||||||
- gguf Anima2: https://huggingface.co/JusteLeo/Anima2-GGUF/tree/main
|
|
||||||
- Download vae
|
|
||||||
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/vae
|
|
||||||
- Download Qwen3-0.6B-Base
|
|
||||||
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/text_encoders
|
|
||||||
- gguf: https://huggingface.co/mradermacher/Qwen3-0.6B-Base-GGUF/tree/main
|
|
||||||
|
|
||||||
## Examples
|
|
||||||
|
|
||||||
```sh
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\anima-preview.safetensors --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_06b_base.safetensors -p "a lovely cat holding a sign says 'anima.cpp'" --cfg-scale 6.0 --sampling-method euler -v --offload-to-cpu --diffusion-fa
|
|
||||||
```
|
|
||||||
|
|
||||||
<img alt="anima image example" src="../assets/anima/example.png" />
|
|
||||||
173
docs/build.md
@ -1,173 +0,0 @@
|
|||||||
# Build from scratch
|
|
||||||
|
|
||||||
## Get the Code
|
|
||||||
|
|
||||||
```
|
|
||||||
git clone --recursive https://github.com/leejet/stable-diffusion.cpp
|
|
||||||
cd stable-diffusion.cpp
|
|
||||||
```
|
|
||||||
|
|
||||||
- If you have already cloned the repository, you can use the following command to update the repository to the latest code.
|
|
||||||
|
|
||||||
```
|
|
||||||
cd stable-diffusion.cpp
|
|
||||||
git pull origin master
|
|
||||||
git submodule init
|
|
||||||
git submodule update
|
|
||||||
```
|
|
||||||
|
|
||||||
## Build (CPU only)
|
|
||||||
|
|
||||||
If you don't have a GPU or CUDA installed, you can build a CPU-only version.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
mkdir build && cd build
|
|
||||||
cmake ..
|
|
||||||
cmake --build . --config Release
|
|
||||||
```
|
|
||||||
|
|
||||||
## Build with OpenBLAS
|
|
||||||
|
|
||||||
```shell
|
|
||||||
mkdir build && cd build
|
|
||||||
cmake .. -DGGML_OPENBLAS=ON
|
|
||||||
cmake --build . --config Release
|
|
||||||
```
|
|
||||||
|
|
||||||
## Build with CUDA
|
|
||||||
|
|
||||||
This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can install it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or download it from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). At least 4 GB of VRAM is recommended.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
mkdir build && cd build
|
|
||||||
cmake .. -DSD_CUDA=ON
|
|
||||||
cmake --build . --config Release
|
|
||||||
```
|
|
||||||
|
|
||||||
## Build with HipBLAS
|
|
||||||
|
|
||||||
This provides GPU acceleration using an AMD GPU. Make sure to have the ROCm toolkit installed.
|
|
||||||
To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture (replace first command). This is also necessary if your GPU is not officially supported by ROCm; for example, you have to set `$GFX_NAME` manually to `gfx1030` for consumer RDNA2 cards.
|
|
||||||
|
|
||||||
Windows users: refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
mkdir build && cd build
|
|
||||||
if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
|
|
||||||
if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
|
|
||||||
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
|
|
||||||
cmake --build . --config Release
|
|
||||||
```
|
|
||||||
|
|
||||||
## Build with MUSA
|
|
||||||
|
|
||||||
This provides GPU acceleration using a Moore Threads GPU. Make sure to have the MUSA toolkit installed.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
mkdir build && cd build
|
|
||||||
cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
|
|
||||||
cmake --build . --config Release
|
|
||||||
```
|
|
||||||
|
|
||||||
## Build with Metal
|
|
||||||
|
|
||||||
Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
mkdir build && cd build
|
|
||||||
cmake .. -DSD_METAL=ON
|
|
||||||
cmake --build . --config Release
|
|
||||||
```
|
|
||||||
|
|
||||||
## Build with Vulkan
|
|
||||||
|
|
||||||
Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
mkdir build && cd build
|
|
||||||
cmake .. -DSD_VULKAN=ON
|
|
||||||
cmake --build . --config Release
|
|
||||||
```
|
|
||||||
|
|
||||||
## Build with OpenCL (for Adreno GPU)
|
|
||||||
|
|
||||||
Currently, it supports only Adreno GPUs and is primarily optimized for the Q4_0 type.
|
|
||||||
|
|
||||||
To build for Windows ARM, please refer to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64).
|
|
||||||
|
|
||||||
Building for Android:
|
|
||||||
|
|
||||||
Android NDK:
|
|
||||||
Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
|
|
||||||
|
|
||||||
Setup OpenCL Dependencies for NDK:
|
|
||||||
|
|
||||||
You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
|
|
||||||
|
|
||||||
* OpenCL Headers:
|
|
||||||
```bash
|
|
||||||
# In a temporary working directory
|
|
||||||
git clone https://github.com/KhronosGroup/OpenCL-Headers
|
|
||||||
cd OpenCL-Headers
|
|
||||||
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
|
|
||||||
# e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
|
|
||||||
sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
|
|
||||||
cd ..
|
|
||||||
```
|
|
||||||
|
|
||||||
* OpenCL ICD Loader:
|
|
||||||
```shell
|
|
||||||
# In the same temporary working directory
|
|
||||||
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
|
|
||||||
cd OpenCL-ICD-Loader
|
|
||||||
mkdir build_ndk && cd build_ndk
|
|
||||||
|
|
||||||
# Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
|
|
||||||
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
|
|
||||||
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
|
|
||||||
-DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
|
|
||||||
-DANDROID_ABI=arm64-v8a \
|
|
||||||
-DANDROID_PLATFORM=24 \
|
|
||||||
-DANDROID_STL=c++_shared
|
|
||||||
|
|
||||||
ninja
|
|
||||||
# Replace <YOUR_NDK_PATH>
|
|
||||||
# e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
|
|
||||||
sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
|
|
||||||
cd ../..
|
|
||||||
```
|
|
||||||
|
|
||||||
Build `stable-diffusion.cpp` for Android with OpenCL:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
mkdir build-android && cd build-android
|
|
||||||
|
|
||||||
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
|
|
||||||
# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
|
|
||||||
cmake .. -G Ninja \
|
|
||||||
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
|
|
||||||
-DANDROID_ABI=arm64-v8a \
|
|
||||||
-DANDROID_PLATFORM=android-28 \
|
|
||||||
-DGGML_OPENMP=OFF \
|
|
||||||
-DSD_OPENCL=ON
|
|
||||||
|
|
||||||
ninja
|
|
||||||
```
|
|
||||||
*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
|
|
||||||
|
|
||||||
## Build with SYCL
|
|
||||||
|
|
||||||
Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and the [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before you start. For more details and steps, refer to the [llama.cpp SYCL backend](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/SYCL.md#linux) documentation.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
# Export relevant ENV variables
|
|
||||||
source /opt/intel/oneapi/setvars.sh
|
|
||||||
|
|
||||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
|
||||||
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
|
||||||
|
|
||||||
# Option 2: Use FP16
|
|
||||||
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
|
|
||||||
|
|
||||||
cmake --build . --config Release
|
|
||||||
```
|
|
||||||
141
docs/caching.md
@ -1,141 +0,0 @@
|
|||||||
## Caching
|
|
||||||
|
|
||||||
Caching methods accelerate diffusion inference by reusing intermediate computations when changes between steps are small.
|
|
||||||
|
|
||||||
### Cache Modes
|
|
||||||
|
|
||||||
| Mode | Target | Description |
|
|
||||||
|------|--------|-------------|
|
|
||||||
| `ucache` | UNET models | Condition-level caching with error tracking |
|
|
||||||
| `easycache` | DiT models | Condition-level cache |
|
|
||||||
| `dbcache` | DiT models | Block-level L1 residual threshold |
|
|
||||||
| `taylorseer` | DiT models | Taylor series approximation |
|
|
||||||
| `cache-dit` | DiT models | Combined DBCache + TaylorSeer |
|
|
||||||
| `spectrum` | UNET models | Chebyshev + Taylor output forecasting |
|
|
||||||
|
|
||||||
### UCache (UNET Models)
|
|
||||||
|
|
||||||
UCache caches the residual difference (output - input) and reuses it when input changes are below threshold.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
sd-cli -m model.safetensors -p "a cat" --cache-mode ucache --cache-option "threshold=1.5"
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Parameters
|
|
||||||
|
|
||||||
| Parameter | Description | Default |
|
|
||||||
|-----------|-------------|---------|
|
|
||||||
| `threshold` | Error threshold for reuse decision | 1.0 |
|
|
||||||
| `start` | Start caching at this percent of steps | 0.15 |
|
|
||||||
| `end` | Stop caching at this percent of steps | 0.95 |
|
|
||||||
| `decay` | Error decay rate (0-1) | 1.0 |
|
|
||||||
| `relative` | Scale threshold by output norm (0/1) | 1 |
|
|
||||||
| `reset` | Reset error after computing (0/1) | 1 |
|
|
||||||
|
|
||||||
#### Reset Parameter
|
|
||||||
|
|
||||||
The `reset` parameter controls error accumulation behavior:
|
|
||||||
|
|
||||||
- `reset=1` (default): Resets accumulated error after each computed step. More aggressive caching, works well with most samplers.
|
|
||||||
- `reset=0`: Keeps error accumulated. More conservative, recommended for `euler_a` sampler.
|
|
||||||
|
|
||||||
### EasyCache (DiT Models)
|
|
||||||
|
|
||||||
Condition-level caching for DiT models. Caches and reuses outputs when input changes are below threshold.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
--cache-mode easycache --cache-option "threshold=0.3"
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Parameters
|
|
||||||
|
|
||||||
| Parameter | Description | Default |
|
|
||||||
|-----------|-------------|---------|
|
|
||||||
| `threshold` | Input change threshold for reuse | 0.2 |
|
|
||||||
| `start` | Start caching at this percent of steps | 0.15 |
|
|
||||||
| `end` | Stop caching at this percent of steps | 0.95 |
|
|
||||||
|
|
||||||
### Cache-DIT (DiT Models)
|
|
||||||
|
|
||||||
For DiT models like FLUX and QWEN, use block-level caching modes.
|
|
||||||
|
|
||||||
#### DBCache
|
|
||||||
|
|
||||||
Caches blocks based on L1 residual difference threshold:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
--cache-mode dbcache --cache-option "threshold=0.25,warmup=4"
|
|
||||||
```
|
|
||||||
|
|
||||||
#### TaylorSeer
|
|
||||||
|
|
||||||
Uses Taylor series approximation to predict block outputs:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
--cache-mode taylorseer
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Cache-DIT (Combined)
|
|
||||||
|
|
||||||
Combines DBCache and TaylorSeer:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
--cache-mode cache-dit
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Parameters
|
|
||||||
|
|
||||||
| Parameter | Description | Default |
|
|
||||||
|-----------|-------------|---------|
|
|
||||||
| `Fn` | Front blocks to always compute | 8 |
|
|
||||||
| `Bn` | Back blocks to always compute | 0 |
|
|
||||||
| `threshold` | L1 residual difference threshold | 0.08 |
|
|
||||||
| `warmup` | Steps before caching starts | 8 |
|
|
||||||
|
|
||||||
#### SCM Options
|
|
||||||
|
|
||||||
Steps Computation Mask controls which steps can be cached:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
--scm-mask "1,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1"
|
|
||||||
```
|
|
||||||
|
|
||||||
Mask values: `1` = compute, `0` = can cache.
|
|
||||||
|
|
||||||
| Policy | Description |
|
|
||||||
|--------|-------------|
|
|
||||||
| `dynamic` | Check threshold before caching |
|
|
||||||
| `static` | Always cache on cacheable steps |
|
|
||||||
|
|
||||||
```bash
|
|
||||||
--scm-policy dynamic
|
|
||||||
```
|
|
||||||
|
|
||||||
### Spectrum (UNET Models)
|
|
||||||
|
|
||||||
Spectrum uses Chebyshev polynomial fitting blended with Taylor extrapolation to predict denoised outputs, skipping entire UNet forward passes. Based on the paper [Spectrum: Adaptive Spectral Feature Forecasting for Efficient Diffusion Sampling](https://github.com/tingyu215/Spectrum).
|
|
||||||
|
|
||||||
```bash
|
|
||||||
sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Parameters
|
|
||||||
|
|
||||||
| Parameter | Description | Default |
|
|
||||||
|-----------|-------------|---------|
|
|
||||||
| `w` | Chebyshev vs Taylor blend weight (0=Taylor, 1=Chebyshev) | 0.40 |
|
|
||||||
| `m` | Chebyshev polynomial degree | 3 |
|
|
||||||
| `lam` | Ridge regression regularization | 1.0 |
|
|
||||||
| `window` | Initial window size (compute every N steps) | 2 |
|
|
||||||
| `flex` | Window growth per computed step after warmup | 0.50 |
|
|
||||||
| `warmup` | Steps to always compute before caching starts | 4 |
|
|
||||||
| `stop` | Stop caching at this fraction of total steps | 0.9 |
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
### Performance Tips
|
|
||||||
|
|
||||||
- Start with default thresholds and adjust based on output quality
|
|
||||||
- Lower threshold = better quality, less speedup
|
|
||||||
- Higher threshold = more speedup, potential quality loss
|
|
||||||
- More steps generally means more caching opportunities
|
|
||||||
@ -1,33 +0,0 @@
|
|||||||
# How to Use
|
|
||||||
|
|
||||||
You can run Chroma using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
|
|
||||||
|
|
||||||
## Download weights
|
|
||||||
|
|
||||||
- Download Chroma
|
|
||||||
- If you don't want to do the conversion yourself, download the preconverted gguf model from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF)
|
|
||||||
- Otherwise, download chroma's safetensors from [lodestones/Chroma](https://huggingface.co/lodestones/Chroma)
|
|
||||||
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
|
|
||||||
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
|
|
||||||
|
|
||||||
## Convert Chroma weights
|
|
||||||
|
|
||||||
If you don't want to do the conversion yourself, you can download the preconverted gguf weights from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF). Otherwise, convert the safetensors weights like this:
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\chroma-unlocked-v40.safetensors -o ..\models\chroma-unlocked-v40-q8_0.gguf -v --type q8_0
|
|
||||||
```
|
|
||||||
|
|
||||||
## Run
|
|
||||||
|
|
||||||
### Example
|
|
||||||
For example:
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask --clip-on-cpu
|
|
||||||
```
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -1,21 +0,0 @@
|
|||||||
# How to Use
|
|
||||||
|
|
||||||
## Download weights
|
|
||||||
|
|
||||||
- Download Chroma1-Radiance
|
|
||||||
- safetensors: https://huggingface.co/lodestones/Chroma1-Radiance/tree/main
|
|
||||||
- gguf: https://huggingface.co/silveroxides/Chroma1-Radiance-GGUF/tree/main
|
|
||||||
|
|
||||||
- Download t5xxl
|
|
||||||
- safetensors: https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
|
|
||||||
|
|
||||||
## Examples
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Chroma1-Radiance-v0.4-Q8_0.gguf --t5xxl ..\..\ComfyUI\models\clip\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma radiance cpp'" --cfg-scale 4.0 --sampling-method euler -v
|
|
||||||
```
|
|
||||||
|
|
||||||
<img alt="Chroma1-Radiance" src="../assets/flux/chroma1-radiance.png" />
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -1,137 +0,0 @@
|
|||||||
# Running distilled models: SSD1B, Vega and SDx.x with tiny U-Nets
|
|
||||||
|
|
||||||
## Preface
|
|
||||||
|
|
||||||
These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B and Vega U-Nets contain only one middle block and fewer attention layers in their up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
|
|
||||||
Similarly, SD1.x- and SD2.x-style models with a tiny U-Net consist of only 6 U-Net blocks, leading to very small files and time savings of up to 50%. For more information, see the paper: https://arxiv.org/pdf/2305.15798.pdf.
|
|
||||||
|
|
||||||
## SSD1B
|
|
||||||
|
|
||||||
Note that not all of these models follow the standard parameter naming conventions. However, several useful SSD-1B models are available online, such as:
|
|
||||||
|
|
||||||
* https://huggingface.co/segmind/SSD-1B/resolve/main/SSD-1B-A1111.safetensors
|
|
||||||
* https://huggingface.co/hassenhamdi/SSD-1B-fp8_e4m3fn/resolve/main/SSD-1B_fp8_e4m3fn.safetensors
|
|
||||||
|
|
||||||
Useful LoRAs are also available:
|
|
||||||
|
|
||||||
* https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors
|
|
||||||
* https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors
|
|
||||||
|
|
||||||
## Vega
|
|
||||||
|
|
||||||
Segmind's Vega model is available online here:
|
|
||||||
|
|
||||||
* https://huggingface.co/segmind/Segmind-Vega/resolve/main/segmind-vega.safetensors
|
|
||||||
|
|
||||||
VegaRT is an example of an LCM-LoRA:
|
|
||||||
|
|
||||||
* https://huggingface.co/segmind/Segmind-VegaRT/resolve/main/pytorch_lora_weights.safetensors
|
|
||||||
|
|
||||||
Both files can be used out-of-the-box, unlike the models described in the next sections.
|
|
||||||
|
|
||||||
|
|
||||||
## SD1.x, SD2.x with tiny U-Nets
|
|
||||||
|
|
||||||
These models require conversion before use. You will need a Python script provided by the diffusers team, available on GitHub:
|
|
||||||
|
|
||||||
* https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py
|
|
||||||
|
|
||||||
### SD2.x
|
|
||||||
|
|
||||||
NotaAI provides the following model online:
|
|
||||||
|
|
||||||
* https://huggingface.co/nota-ai/bk-sdm-v2-tiny
|
|
||||||
|
|
||||||
Creating a .safetensors file involves two steps. First, run this short Python script to download the model from Hugging Face:
|
|
||||||
|
|
||||||
```python
|
|
||||||
from diffusers import StableDiffusionPipeline
|
|
||||||
pipe = StableDiffusionPipeline.from_pretrained("nota-ai/bk-sdm-v2-tiny",cache_dir="./")
|
|
||||||
```
|
|
||||||
|
|
||||||
Second, create the .safetensors file by running:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python convert_diffusers_to_original_stable_diffusion.py \
|
|
||||||
--model_path models--nota-ai--bk-sdm-v2-tiny/snapshots/68277af553777858cd47e133f92e4db47321bc74 \
|
|
||||||
--checkpoint_path bk-sdm-v2-tiny.safetensors --half --use_safetensors
|
|
||||||
```
|
|
||||||
|
|
||||||
This will generate the file **bk-sdm-v2-tiny.safetensors**, which is now ready for use with sd.cpp.
|
|
||||||
|
|
||||||
### SD1.x
|
|
||||||
|
|
||||||
Several Tiny SD 1.x models are available online, such as:
|
|
||||||
|
|
||||||
* https://huggingface.co/segmind/tiny-sd
|
|
||||||
* https://huggingface.co/segmind/portrait-finetuned
|
|
||||||
* https://huggingface.co/nota-ai/bk-sdm-tiny
|
|
||||||
|
|
||||||
These models also require conversion, partly because some tensors are stored in a non-contiguous manner. To create a usable checkpoint file, follow these simple steps:
|
|
||||||
Download and prepare the model using Python:
|
|
||||||
|
|
||||||
##### Download the model using Python on your computer, for example this way:
|
|
||||||
|
|
||||||
```python
|
|
||||||
import torch
|
|
||||||
from diffusers import StableDiffusionPipeline
|
|
||||||
pipe = StableDiffusionPipeline.from_pretrained("segmind/tiny-sd")
|
|
||||||
unet=pipe.unet
|
|
||||||
for param in unet.parameters():
|
|
||||||
    param.data = param.data.contiguous() # <- important here
|
|
||||||
pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Run the conversion script:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python convert_diffusers_to_original_stable_diffusion.py \
|
|
||||||
--model_path ./segmindtiny-sd \
|
|
||||||
--checkpoint_path ./segmind_tiny-sd.ckpt --half
|
|
||||||
```
|
|
||||||
|
|
||||||
The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
|
|
||||||
|
|
||||||
|
|
||||||
##### Another available .ckpt file:
|
|
||||||
|
|
||||||
* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
|
|
||||||
|
|
||||||
To use this file, you must first adjust its non-contiguous tensors:
|
|
||||||
|
|
||||||
```python
|
|
||||||
import torch
|
|
||||||
ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
|
|
||||||
for key, value in ckpt['state_dict'].items():
|
|
||||||
    if isinstance(value, torch.Tensor):
|
|
||||||
        ckpt['state_dict'][key] = value.contiguous()
|
|
||||||
torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
### SDXS-512
|
|
||||||
|
|
||||||
Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details, read the paper: https://arxiv.org/pdf/2403.16627. Once again, the authors removed additional blocks from the U-Net and, unlike other SD1 models, they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE.
|
|
||||||
|
|
||||||
##### 1. Download the diffusers model from Hugging Face using Python:
|
|
||||||
|
|
||||||
```python
|
|
||||||
from diffusers import StableDiffusionPipeline
|
|
||||||
pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
|
|
||||||
pipe.save_pretrained(save_directory="sdxs")
|
|
||||||
```
|
|
||||||
##### 2. Create a safetensors file
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python convert_diffusers_to_original_stable_diffusion.py \
|
|
||||||
--model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
|
|
||||||
```
|
|
||||||
|
|
||||||
##### 3. Run the model as follows:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
|
|
||||||
--cfg-scale 1 --steps 1
|
|
||||||
```
|
|
||||||
|
|
||||||
Both options, `--cfg-scale 1` and `--steps 1`, are mandatory here.
|
|
||||||
@ -1,39 +1,15 @@
|
|||||||
# Docker
|
## Docker
|
||||||
|
|
||||||
## Run CLI
|
### Building using Docker
|
||||||
|
|
||||||
```shell
|
|
||||||
docker run --rm -v /path/to/models:/models -v /path/to/output/:/output ghcr.io/leejet/stable-diffusion.cpp:master [args...]
|
|
||||||
# For example
|
|
||||||
# docker run --rm -v ./models:/models -v ./build:/output ghcr.io/leejet/stable-diffusion.cpp:master -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
|
|
||||||
```
|
|
||||||
|
|
||||||
## Run server
|
|
||||||
|
|
||||||
```shell
|
|
||||||
docker run --rm --init -v /path/to/models:/models -v /path/to/output/:/output -p "1234:1234" --entrypoint "/sd-server" ghcr.io/leejet/stable-diffusion.cpp:master [args...]
|
|
||||||
# For example
|
|
||||||
# docker run --rm --init -v ./models:/models -v ./build:/output -p "1234:1234" --entrypoint "/sd-server" ghcr.io/leejet/stable-diffusion.cpp:master -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
|
|
||||||
```
|
|
||||||
|
|
||||||
## Building using Docker
|
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
docker build -t sd .
|
docker build -t sd .
|
||||||
```
|
```
|
||||||
|
|
||||||
## Building variants using Docker
|
### Run
|
||||||
|
|
||||||
Vulkan:
|
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
docker build -f Dockerfile.vulkan -t sd .
|
docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
|
||||||
```
|
|
||||||
|
|
||||||
## Run locally built image's CLI
|
|
||||||
|
|
||||||
```shell
|
|
||||||
docker run --rm -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
|
|
||||||
# For example
|
# For example
|
||||||
# docker run --rm -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
|
# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
|
||||||
```
|
```
|
||||||
@ -1,9 +1,9 @@
|
|||||||
## Using ESRGAN to upscale results
|
## Using ESRGAN to upscale results
|
||||||
|
|
||||||
You can use ESRGAN—such as the model [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth)—to upscale the generated images and improve their overall resolution and clarity.
|
You can use ESRGAN to upscale the generated images. At the moment, only the [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth) model is supported. Support for more models of this architecture will be added soon.
|
||||||
|
|
||||||
- Specify the model path using the `--upscale-model PATH` parameter. example:
|
- Specify the model path using the `--upscale-model PATH` parameter. example:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
|
sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
|
||||||
```
|
```
|
||||||
|
|||||||
10
docs/flux.md
@ -15,9 +15,9 @@ You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB
|
|||||||
|
|
||||||
You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), this way you don't have to do the conversion yourself.
|
You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), this way you don't have to do the conversion yourself.
|
||||||
|
|
||||||
For example:
|
Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully developed. Therefore, we need to convert flux to gguf format here, which also saves VRAM. For example:
|
||||||
```
|
```
|
||||||
.\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
|
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
|
||||||
```
|
```
|
||||||
|
|
||||||
## Run
|
## Run
|
||||||
@ -28,7 +28,7 @@ For example:
|
|||||||
For example:
|
For example:
|
||||||
|
|
||||||
```
|
```
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
|
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
|
||||||
```
|
```
|
||||||
|
|
||||||
Using formats of different precisions will yield results of varying quality.
|
Using formats of different precisions will yield results of varying quality.
|
||||||
@ -44,7 +44,7 @@ Using formats of different precisions will yield results of varying quality.
|
|||||||
|
|
||||||
|
|
||||||
```
|
```
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4 --clip-on-cpu
|
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4
|
||||||
```
|
```
|
||||||
|
|
||||||
| q8_0 |
|
| q8_0 |
|
||||||
@ -60,7 +60,7 @@ Since many flux LoRA training libraries have used various LoRA naming formats, i
|
|||||||
- LoRA model from https://huggingface.co/XLabs-AI/flux-lora-collection/tree/main (using comfy converted version!!!)
|
- LoRA model from https://huggingface.co/XLabs-AI/flux-lora-collection/tree/main (using comfy converted version!!!)
|
||||||
|
|
||||||
```
|
```
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models --clip-on-cpu
|
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models
|
||||||
```
|
```
|
||||||
|
|
||||||

|

|
||||||
|
|||||||
@ -1,92 +0,0 @@
|
|||||||
# How to Use
|
|
||||||
|
|
||||||
## Flux.2-dev
|
|
||||||
|
|
||||||
### Download weights
|
|
||||||
|
|
||||||
- Download FLUX.2-dev
|
|
||||||
- gguf: https://huggingface.co/city96/FLUX.2-dev-gguf/tree/main
|
|
||||||
- Download vae
|
|
||||||
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
|
|
||||||
- Download Mistral-Small-3.2-24B-Instruct-2506-GGUF
|
|
||||||
- gguf: https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/tree/main
|
|
||||||
|
|
||||||
### Examples
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux2-dev-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Mistral-Small-3.2-24B-Instruct-2506-Q4_K_M.gguf -r .\kontext_input.png -p "change 'flux.cpp' to 'flux2-dev.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu
|
|
||||||
```
|
|
||||||
|
|
||||||
<img alt="flux2 example" src="../assets/flux2/example.png" />
|
|
||||||
|
|
||||||
## Flux.2 klein 4B / Flux.2 klein base 4B
|
|
||||||
|
|
||||||
### Download weights
|
|
||||||
|
|
||||||
- Download FLUX.2-klein-4B
|
|
||||||
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-4B
|
|
||||||
- gguf: https://huggingface.co/leejet/FLUX.2-klein-4B-GGUF/tree/main
|
|
||||||
- Download FLUX.2-klein-base-4B
|
|
||||||
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-base-4B
|
|
||||||
- gguf: https://huggingface.co/leejet/FLUX.2-klein-base-4B-GGUF/tree/main
|
|
||||||
- Download vae
|
|
||||||
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
|
|
||||||
- Download Qwen3 4b
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/flux2-klein-4B/tree/main/split_files/text_encoders
|
|
||||||
- gguf: https://huggingface.co/unsloth/Qwen3-4B-GGUF/tree/main
|
|
||||||
|
|
||||||
### Examples
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 4 -v --offload-to-cpu --diffusion-fa
|
|
||||||
```
|
|
||||||
|
|
||||||
<img alt="flux2-klein-4b" src="../assets/flux2/flux2-klein-4b.png" />
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -r .\kontext_input.png -p "change 'flux.cpp' to 'klein.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu --steps 4
|
|
||||||
```
|
|
||||||
|
|
||||||
<img alt="flux2-klein-4b-edit" src="../assets/flux2/flux2-klein-4b-edit.png" />
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-base-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "a lovely cat" --cfg-scale 4.0 --steps 20 -v --offload-to-cpu --diffusion-fa
|
|
||||||
```
|
|
||||||
|
|
||||||
<img alt="flux2-klein-base-4b" src="../assets/flux2/flux2-klein-base-4b.png" />
|
|
||||||
|
|
||||||
## Flux.2 klein 9B / Flux.2 klein base 9B
|
|
||||||
|
|
||||||
### Download weights
|
|
||||||
|
|
||||||
- Download FLUX.2-klein-9B
|
|
||||||
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-9B
|
|
||||||
- gguf: https://huggingface.co/leejet/FLUX.2-klein-9B-GGUF/tree/main
|
|
||||||
- Download FLUX.2-klein-base-9B
|
|
||||||
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-base-9B
|
|
||||||
- gguf: https://huggingface.co/leejet/FLUX.2-klein-base-9B-GGUF/tree/main
|
|
||||||
- Download vae
|
|
||||||
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
|
|
||||||
- Download Qwen3 8B
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/flux2-klein-9B/tree/main/split_files/text_encoders
|
|
||||||
- gguf: https://huggingface.co/unsloth/Qwen3-8B-GGUF/tree/main
|
|
||||||
|
|
||||||
### Examples
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 4 -v --offload-to-cpu --diffusion-fa
|
|
||||||
```
|
|
||||||
|
|
||||||
<img alt="flux2-klein-9b" src="../assets/flux2/flux2-klein-9b.png" />
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -r .\kontext_input.png -p "change 'flux.cpp' to 'klein.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu --steps 4
|
|
||||||
```
|
|
||||||
|
|
||||||
<img alt="flux2-klein-9b-edit" src="../assets/flux2/flux2-klein-9b-edit.png" />
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-base-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -p "a lovely cat" --cfg-scale 4.0 --steps 20 -v --offload-to-cpu --diffusion-fa
|
|
||||||
```
|
|
||||||
|
|
||||||
<img alt="flux2-klein-base-9b" src="../assets/flux2/flux2-klein-base-9b.png" />
|
|
||||||
@ -82,4 +82,4 @@ cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_H
|
|||||||
cmake --build . --config Release
|
cmake --build . --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
If everything went OK, `build\bin\sd-cli.exe` file should appear.
|
If everything went OK, `build\bin\sd.exe` file should appear.
|
||||||
|
|||||||
@ -1,39 +0,0 @@
|
|||||||
# How to Use
|
|
||||||
|
|
||||||
You can run Kontext using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
|
|
||||||
|
|
||||||
## Download weights
|
|
||||||
|
|
||||||
- Download Kontext
|
|
||||||
- If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF)
|
|
||||||
- Otherwise, download FLUX.1-Kontext-dev from https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev/blob/main/flux1-kontext-dev.safetensors
|
|
||||||
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
|
|
||||||
- Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
|
|
||||||
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
|
|
||||||
|
|
||||||
## Convert Kontext weights
|
|
||||||
|
|
||||||
If you don't want to do the conversion yourself, you can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF). Otherwise, convert the safetensors weights like this:
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-kontext-dev.safetensors -o ..\models\flux1-kontext-dev-q8_0.gguf -v --type q8_0
|
|
||||||
```
|
|
||||||
|
|
||||||
## Run
|
|
||||||
|
|
||||||
- `--cfg-scale` is recommended to be set to 1.
|
|
||||||
|
|
||||||
### Example
|
|
||||||
For example:
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
| ref_image | prompt | output |
|
|
||||||
| ---- | ---- |---- |
|
|
||||||
|  | change 'flux.cpp' to 'kontext.cpp' | |
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -7,7 +7,7 @@
|
|||||||
Here's a simple example:
|
Here's a simple example:
|
||||||
|
|
||||||
```
|
```
|
||||||
./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
|
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
|
||||||
```
|
```
|
||||||
|
|
||||||
| without LCM-LoRA (--cfg-scale 7) | with LCM-LoRA (--cfg-scale 1) |
|
| without LCM-LoRA (--cfg-scale 7) | with LCM-LoRA (--cfg-scale 1) |
|
||||||
|
|||||||
15
docs/lora.md
@ -7,20 +7,7 @@
|
|||||||
Here's a simple example:
|
Here's a simple example:
|
||||||
|
|
||||||
```
|
```
|
||||||
./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
|
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
|
||||||
```
|
```
|
||||||
|
|
||||||
`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
|
`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
|
||||||
|
|
||||||
# Lora Apply Mode
|
|
||||||
|
|
||||||
There are two ways to apply LoRA: **immediately** and **at_runtime**. You can specify it using the `--lora-apply-mode` parameter.
|
|
||||||
|
|
||||||
By default, the mode is selected automatically:
|
|
||||||
|
|
||||||
* If the model weights contain any quantized parameters, the **at_runtime** mode is used;
|
|
||||||
* Otherwise, the **immediately** mode is used.
|
|
||||||
|
|
||||||
The **immediately** mode may have precision and compatibility issues with quantized parameters, but it usually offers faster inference speed and, in some cases, lower memory usage.
|
|
||||||
In contrast, the **at_runtime** mode provides better compatibility and higher precision, but inference may be slower and memory usage may be higher in some cases.
|
|
||||||
|
|
||||||
|
|||||||
@ -1,19 +0,0 @@
|
|||||||
# How to Use
|
|
||||||
|
|
||||||
## Download weights
|
|
||||||
|
|
||||||
- Download Ovis-Image-7B
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/leejet/Ovis-Image-7B-GGUF
|
|
||||||
- Download vae
|
|
||||||
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
|
|
||||||
- Download Ovis 2.5
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/text_encoders
|
|
||||||
|
|
||||||
## Examples
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ovis_image-Q4_0.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\ovis_2.5.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa
|
|
||||||
```
|
|
||||||
|
|
||||||
<img alt="ovis image example" src="../assets/ovis_image/example.png" />
|
|
||||||
@ -1,26 +0,0 @@
|
|||||||
## Use Flash Attention to save memory and improve speed.
|
|
||||||
|
|
||||||
Enabling flash attention for the diffusion model reduces memory usage; the amount saved (in MB) varies by model and resolution.
|
|
||||||
For example:
|
|
||||||
- flux 768x768 ~600mb
|
|
||||||
- SD2 768x768 ~1400mb
|
|
||||||
|
|
||||||
On most backends it slows generation down, but on CUDA it generally speeds generation up as well.
|
|
||||||
At the moment, it is only supported for some models and some backends (like cpu, cuda/rocm, metal).
|
|
||||||
|
|
||||||
Run by adding `--diffusion-fa` to the arguments and watch for:
|
|
||||||
```
|
|
||||||
[INFO ] stable-diffusion.cpp:312 - Using flash attention in the diffusion model
|
|
||||||
```
|
|
||||||
and for a smaller compute buffer size in the debug log:
|
|
||||||
```
|
|
||||||
[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Offload weights to the CPU to save VRAM without reducing generation speed.
|
|
||||||
|
|
||||||
Using `--offload-to-cpu` allows you to offload weights to the CPU, saving VRAM without reducing generation speed.
|
|
||||||
|
|
||||||
## Use quantization to reduce memory usage.
|
|
||||||
|
|
||||||
[quantization](./quantization_and_gguf.md)
|
|
||||||
@ -6,15 +6,16 @@ You can use [PhotoMaker](https://github.com/TencentARC/PhotoMaker) to personaliz
|
|||||||
|
|
||||||
Download PhotoMaker model file (in safetensor format) [here](https://huggingface.co/bssrdf/PhotoMaker). The official release of the model file (in .bin format) does not work with ```stablediffusion.cpp```.
|
Download PhotoMaker model file (in safetensor format) [here](https://huggingface.co/bssrdf/PhotoMaker). The official release of the model file (in .bin format) does not work with ```stablediffusion.cpp```.
|
||||||
|
|
||||||
- Specify the PhotoMaker model path using the `--photo-maker PATH` parameter.
|
- Specify the PhotoMaker model path using the `--stacked-id-embd-dir PATH` parameter.
|
||||||
- Specify the input images path using the `--pm-id-images-dir PATH` parameter.
|
- Specify the input images path using the `--input-id-images-dir PATH` parameter.
|
||||||
|
- input images **must** have the same width and height for preprocessing (to be improved)
|
||||||
|
|
||||||
In prompt, make sure you have a class word followed by the trigger word ```"img"``` (hard-coded for now). The class word could be one of ```"man, woman, girl, boy"```. If input ID images contain asian faces, add ```Asian``` before the class
|
In prompt, make sure you have a class word followed by the trigger word ```"img"``` (hard-coded for now). The class word could be one of ```"man, woman, girl, boy"```. If input ID images contain asian faces, add ```Asian``` before the class
|
||||||
word.
|
word.
|
||||||
|
|
||||||
Another PhotoMaker specific parameter:
|
Another PhotoMaker specific parameter:
|
||||||
|
|
||||||
- ```--pm-style-strength (0-100)%```: default is 20 and 10-20 typically gets good results. Lower ratio means more faithfully following input ID (not necessarily better quality).
|
- ```--style-ratio (0-100)%```: default is 20 and 10-20 typically gets good results. Lower ratio means more faithfully following input ID (not necessarily better quality).
|
||||||
|
|
||||||
Other parameters recommended for running Photomaker:
|
Other parameters recommended for running Photomaker:
|
||||||
|
|
||||||
@ -27,7 +28,7 @@ If on low memory GPUs (<= 8GB), recommend running with ```--vae-on-cpu``` option
|
|||||||
Example:
|
Example:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
bin/sd-cli -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --photo-maker ../models/photomaker-v1.safetensors --pm-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --pm-style-strength 10 --vae-on-cpu --steps 50
|
bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v1.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
|
||||||
```
|
```
|
||||||
|
|
||||||
## PhotoMaker Version 2
|
## PhotoMaker Version 2
|
||||||
@ -40,7 +41,7 @@ Running PMV2 is now a two-step process:
|
|||||||
```
|
```
|
||||||
python face_detect.py input_image_dir
|
python face_detect.py input_image_dir
|
||||||
```
|
```
|
||||||
An ```id_embeds.bin``` file will be generated in ```input_image_dir```
|
An ```id_embeds.safetensors``` file will be generated in ```input_image_dir```
|
||||||
|
|
||||||
**Note: this step only needs to be run once; the same ```id_embeds``` can be reused**
|
**Note: this step only needs to be run once; the same ```id_embeds``` can be reused**
|
||||||
|
|
||||||
@ -48,6 +49,6 @@ An ```id_embeds.bin``` file will be generated in ```input_images_dir```
|
|||||||
|
|
||||||
You can download ```photomaker-v2.safetensors``` from [here](https://huggingface.co/bssrdf/PhotoMakerV2)
|
You can download ```photomaker-v2.safetensors``` from [here](https://huggingface.co/bssrdf/PhotoMakerV2)
|
||||||
|
|
||||||
- All the command line parameters from Version 1 remain the same for Version 2 plus one extra pointing to a valid ```id_embeds``` file: --pm-id-embed-path [path_to_id_embeds.bin]
|
- All the command line parameters from Version 1 remain the same for Version 2
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -23,5 +23,5 @@ You can also convert weights in the formats `ckpt/safetensors/diffusers` to gguf
|
|||||||
For example:
|
For example:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
./bin/sd-cli -M convert -m ../models/v1-5-pruned-emaonly.safetensors -o ../models/v1-5-pruned-emaonly.q8_0.gguf -v --type q8_0
|
./bin/sd -M convert -m ../models/v1-5-pruned-emaonly.safetensors -o ../models/v1-5-pruned-emaonly.q8_0.gguf -v --type q8_0
|
||||||
```
|
```
|
||||||
@ -1,23 +0,0 @@
|
|||||||
# How to Use
|
|
||||||
|
|
||||||
## Download weights
|
|
||||||
|
|
||||||
- Download Qwen Image
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/QuantStack/Qwen-Image-GGUF/tree/main
|
|
||||||
- Download vae
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/vae
|
|
||||||
- Download qwen_2.5_vl 7b
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/text_encoders
|
|
||||||
- gguf: https://huggingface.co/mradermacher/Qwen2.5-VL-7B-Instruct-GGUF/tree/main
|
|
||||||
|
|
||||||
## Examples
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线: 探索视觉生成基础模型的极限,开创理解与生成一体化的未来。二、Qwen-Image的模型特色:1、复杂文字渲染。支持中英渲染、自动布局; 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景:赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
|
|
||||||
```
|
|
||||||
|
|
||||||
<img alt="qwen example" src="../assets/qwen/example.png" />
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -1,48 +0,0 @@
|
|||||||
# How to Use
|
|
||||||
|
|
||||||
## Download weights
|
|
||||||
|
|
||||||
- Download Qwen Image
|
|
||||||
- Qwen Image Edit
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/QuantStack/Qwen-Image-Edit-GGUF/tree/main
|
|
||||||
- Qwen Image Edit 2509
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/QuantStack/Qwen-Image-Edit-2509-GGUF/tree/main
|
|
||||||
- Qwen Image Edit 2511
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/tree/main
|
|
||||||
- Download vae
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/vae
|
|
||||||
- Download qwen_2.5_vl 7b
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/text_encoders
|
|
||||||
- gguf: https://huggingface.co/mradermacher/Qwen2.5-VL-7B-Instruct-GGUF/tree/main
|
|
||||||
|
|
||||||
## Examples
|
|
||||||
|
|
||||||
### Qwen Image Edit
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
|
|
||||||
```
|
|
||||||
|
|
||||||
<img alt="qwen_image_edit" src="../assets/qwen/qwen_image_edit.png" />
|
|
||||||
|
|
||||||
|
|
||||||
### Qwen Image Edit 2509
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
|
|
||||||
```
|
|
||||||
|
|
||||||
<img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />
|
|
||||||
|
|
||||||
### Qwen Image Edit 2511
|
|
||||||
|
|
||||||
To use the new Qwen Image Edit 2511 mode, the `--qwen-image-zero-cond-t` flag must be enabled; otherwise, image editing quality will degrade significantly.
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-edit-2511-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --qwen-image-zero-cond-t
|
|
||||||
```
|
|
||||||
|
|
||||||
<img alt="qwen_image_edit_2511" src="../assets/qwen/qwen_image_edit_2511.png" />
|
|
||||||
37
docs/sd.md
@ -1,37 +0,0 @@
|
|||||||
## Download weights
|
|
||||||
|
|
||||||
- Download the original weights (.ckpt or .safetensors). For example:
|
|
||||||
- Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
|
|
||||||
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
|
|
||||||
- Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
|
|
||||||
- Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
|
|
||||||
|
|
||||||
### txt2img example
|
|
||||||
|
|
||||||
```sh
|
|
||||||
./bin/sd-cli -m ../models/sd-v1-4.ckpt -p "a lovely cat"
|
|
||||||
# ./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
|
|
||||||
# ./bin/sd-cli -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
|
|
||||||
# ./bin/sd-cli -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
|
|
||||||
# ./bin/sd-cli --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
|
|
||||||
# ./bin/sd-cli -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
|
|
||||||
```
|
|
||||||
|
|
||||||
Using formats of different precisions will yield results of varying quality.
|
|
||||||
|
|
||||||
| f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
|
|
||||||
| ---- |---- |---- |---- |---- |---- |---- |
|
|
||||||
|  | | | | | | |
|
|
||||||
|
|
||||||
### img2img example
|
|
||||||
|
|
||||||
- `./output.png` is the image generated from the above txt2img pipeline
|
|
||||||
|
|
||||||
|
|
||||||
```
|
|
||||||
./bin/sd-cli -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
|
|
||||||
```
|
|
||||||
|
|
||||||
<p align="center">
|
|
||||||
<img src="../assets/img2img_output.png" width="256x">
|
|
||||||
</p>
|
|
||||||
@ -14,7 +14,7 @@
|
|||||||
For example:
|
For example:
|
||||||
|
|
||||||
```
|
```
|
||||||
.\bin\Release\sd-cli.exe -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
|
.\bin\Release\sd.exe -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
|
||||||
```
|
```
|
||||||
|
|
||||||

|

|
||||||
@ -7,33 +7,11 @@ You can use TAESD to accelerate the decoding of latent images by following these
|
|||||||
Or curl
|
Or curl
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl -L -O https://huggingface.co/madebyollin/taesd/resolve/main/diffusion_pytorch_model.safetensors
|
curl -L -O https://huggingface.co/madebyollin/taesd/blob/main/diffusion_pytorch_model.safetensors
|
||||||
```
|
```
|
||||||
|
|
||||||
- Specify the model path using the `--taesd PATH` parameter. example:
|
- Specify the model path using the `--taesd PATH` parameter. example:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
|
sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
|
||||||
```
|
```
|
||||||
|
|
||||||
### Qwen-Image and wan (TAEHV)
|
|
||||||
|
|
||||||
sd.cpp also supports [TAEHV](https://github.com/madebyollin/taehv) (#937), which can be used for Qwen-Image and wan.
|
|
||||||
|
|
||||||
- For **Qwen-Image and wan2.1 and wan2.2-A14B**, download the wan2.1 tae [safetensors weights](https://github.com/madebyollin/taehv/blob/main/safetensors/taew2_1.safetensors)
|
|
||||||
|
|
||||||
Or curl
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl -L -O https://github.com/madebyollin/taehv/raw/refs/heads/main/safetensors/taew2_1.safetensors
|
|
||||||
```
|
|
||||||
|
|
||||||
- For **wan2.2-TI2V-5B**, use the wan2.2 tae [safetensors weights](https://github.com/madebyollin/taehv/blob/main/safetensors/taew2_2.safetensors)
|
|
||||||
|
|
||||||
Or curl
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl -L -O https://github.com/madebyollin/taehv/raw/refs/heads/main/safetensors/taew2_2.safetensors
|
|
||||||
```
|
|
||||||
|
|
||||||
Then simply replace `--vae xxx.safetensors` with `--tae xxx.safetensors` in the commands. If it still runs out of VRAM, add `--vae-conv-direct` to your command, though it might be slower.
|
|
||||||
|
|||||||
207
docs/wan.md
@ -1,207 +0,0 @@
|
|||||||
# How to Use
|
|
||||||
|
|
||||||
## Download weights
|
|
||||||
|
|
||||||
- Download Wan
|
|
||||||
- Wan2.1
|
|
||||||
- Wan2.1 T2V 1.3B
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
|
|
||||||
- Wan2.1 T2V 14B
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/city96/Wan2.1-T2V-14B-gguf/tree/main
|
|
||||||
- Wan2.1 I2V 14B 480P
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/city96/Wan2.1-I2V-14B-480P-gguf/tree/main
|
|
||||||
- Wan2.1 I2V 14B 720P
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/city96/Wan2.1-I2V-14B-720P-gguf/tree/main
|
|
||||||
- Wan2.1 FLF2V 14B 720P
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/city96/Wan2.1-FLF2V-14B-720P-gguf/tree/main
|
|
||||||
- Wan2.1 VACE 1.3B
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/calcuis/wan-1.3b-gguf/tree/main
|
|
||||||
- Wan2.1 VACE 14B
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/QuantStack/Wan2.1_14B_VACE-GGUF/tree/main
|
|
||||||
- Wan2.2
|
|
||||||
- Wan2.2 TI2V 5B
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/QuantStack/Wan2.2-TI2V-5B-GGUF/tree/main
|
|
||||||
- Wan2.2 T2V A14B
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/QuantStack/Wan2.2-T2V-A14B-GGUF/tree/main
|
|
||||||
- Wan2.2 I2V A14B
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/QuantStack/Wan2.2-I2V-A14B-GGUF/tree/main
|
|
||||||
- Download vae
|
|
||||||
- wan_2.1_vae (for all Wan models except Wan2.2 TI2V 5B)
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors
|
|
||||||
- wan_2.2_vae (for Wan2.2 TI2V 5B only)
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/blob/main/split_files/vae/wan2.2_vae.safetensors
|
|
||||||
|
|
||||||
> The Wan models' VAE requires a lot of VRAM! If you do not have enough VRAM, please try tae instead, though the results may be poorer. For tae usage, please refer to [taesd](taesd.md)
|
|
||||||
|
|
||||||
- Download umt5_xxl
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/text_encoders/umt5_xxl_fp16.safetensors
|
|
||||||
- gguf: https://huggingface.co/city96/umt5-xxl-encoder-gguf/tree/main
|
|
||||||
|
|
||||||
- Download clip_vision_h (for Wan2.1 I2V/FLF2V only)
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/clip_vision/clip_vision_h.safetensors
|
|
||||||
|
|
||||||
|
|
||||||
## Examples
|
|
||||||
|
|
||||||
### Wan2.1 T2V 1.3B
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1_t2v_1.3B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部, 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --flow-shift 3.0
|
|
||||||
```
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.1_1.3B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
|
|
||||||
### Wan2.1 T2V 14B
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-t2v-14b-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
|
|
||||||
```
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.1_14B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### Wan2.1 I2V 14B
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-i2v-14b-480p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
|
|
||||||
```
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.1_14B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
|
|
||||||
### Wan2.2 T2V A14B
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
|
|
||||||
```
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.2_14B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
|
|
||||||
### Wan2.2 I2V A14B
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
|
|
||||||
```
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.2_14B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
|
|
||||||
### Wan2.2 T2V A14B T2I
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --flow-shift 3.0
|
|
||||||
```
|
|
||||||
|
|
||||||
<img width="832" height="480" alt="Wan2 2_14B_t2i" src="../assets/wan/Wan2.2_14B_t2i.png" />
|
|
||||||
|
|
||||||
### Wan2.2 T2V 14B with Lora
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat<lora:wan2.2_t2v_lightx2v_4steps_lora_v1.1_low_noise:1><lora:|high_noise|wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise:1>" --cfg-scale 3.5 --sampling-method euler --steps 4 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 4 -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --lora-model-dir ..\..\ComfyUI\models\loras --video-frames 33 --flow-shift 3.0
|
|
||||||
```
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.2_14B_t2v_lora.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### Wan2.2 TI2V 5B
|
|
||||||
|
|
||||||
#### T2V
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
|
|
||||||
```
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.2_5B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
|
|
||||||
#### I2V
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
|
|
||||||
```
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.2_5B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
|
|
||||||
### Wan2.1 FLF2V 14B
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-flf2v-14b-720p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.1_14B_flf2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
|
|
||||||
### Wan2.2 FLF2V 14B
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -p "glass flower blossom" -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
|
|
||||||
```
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.2_14B_flf2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
|
|
||||||
### Wan2.1 VACE 1.3B
|
|
||||||
|
|
||||||
#### T2V
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部, 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 1 --offload-to-cpu
|
|
||||||
```
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.1_1.3B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
|
|
||||||
|
|
||||||
#### R2V
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部, 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
|
|
||||||
```
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.1_1.3B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
|
|
||||||
|
|
||||||
#### V2V
|
|
||||||
|
|
||||||
```
|
|
||||||
mkdir post+depth
|
|
||||||
ffmpeg -i ..\..\ComfyUI\input\post+depth.mp4 -qscale:v 1 -vf fps=8 post+depth\frame_%04d.jpg
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close - up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white. transparent feel and a dreamyl atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部, 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
|
|
||||||
```
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.1_1.3B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
|
|
||||||
### Wan2.1 VACE 14B
|
|
||||||
|
|
||||||
#### T2V
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部, 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --offload-to-cpu
|
|
||||||
```
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.1_14B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
|
|
||||||
|
|
||||||
#### R2V
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部, 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
|
|
||||||
```
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.1_14B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#### V2V
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close - up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white. transparent feel and a dreamyl atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部, 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
|
|
||||||
```
|
|
||||||
|
|
||||||
<video src=../assets/wan/Wan2.1_14B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
|
|
||||||
@ -1,41 +0,0 @@
|
|||||||
# How to Use
|
|
||||||
|
|
||||||
You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or even less.
|
|
||||||
|
|
||||||
## Download weights
|
|
||||||
|
|
||||||
- Download Z-Image-Turbo
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/leejet/Z-Image-Turbo-GGUF/tree/main
|
|
||||||
- Download Z-Image
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/z_image/tree/main/split_files/diffusion_models
|
|
||||||
- gguf: https://huggingface.co/unsloth/Z-Image-GGUF/tree/main
|
|
||||||
- Download vae
|
|
||||||
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
|
|
||||||
- Download Qwen3 4b
|
|
||||||
- safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/text_encoders
|
|
||||||
- gguf: https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/tree/main
|
|
||||||
|
|
||||||
## Examples
|
|
||||||
|
|
||||||
### Z-Image-Turbo
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
|
|
||||||
```
|
|
||||||
|
|
||||||
<img width="256" alt="z-image example" src="../assets/z_image/q3_K.png" />
|
|
||||||
|
|
||||||
### Z-Image-Base
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\z_image_bf16.safetensors --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
|
|
||||||
```
|
|
||||||
|
|
||||||
<img width="256" alt="z-image example" src="../assets/z_image/base_bf16.png" />
|
|
||||||
|
|
||||||
## Comparison of Different Quantization Types
|
|
||||||
|
|
||||||
| bf16 | q8_0 | q6_K | q5_0 | q4_K | q4_0 | q3_K | q2_K|
|
|
||||||
|---|---|---|---|---|---|---|---|
|
|
||||||
| <img width="256" alt="bf16" src="../assets/z_image/bf16.png" /> | <img width="256" alt="q8_0" src="../assets/z_image/q8_0.png" /> | <img width="256" alt="q6_K" src="../assets/z_image/q6_K.png" /> | <img width="256" alt="q5_0" src="../assets/z_image/q5_0.png" /> | <img width="256" alt="q4_K" src="../assets/z_image/q4_K.png" /> | <img width="256" alt="q4_0" src="../assets/z_image/q4_0.png" /> | <img width="256" alt="q3_K" src="../assets/z_image/q3_K.png" /> | <img width="256" alt="q2_K" src="../assets/z_image/q2_K.png" /> |
|
|
||||||
197
esrgan.hpp
Normal file
@ -0,0 +1,197 @@
|
|||||||
|
#ifndef __ESRGAN_HPP__
|
||||||
|
#define __ESRGAN_HPP__
|
||||||
|
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
#include "model.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
=================================== ESRGAN ===================================
|
||||||
|
References:
|
||||||
|
https://github.com/xinntao/Real-ESRGAN/blob/master/inference_realesrgan.py
|
||||||
|
https://github.com/XPixelGroup/BasicSR/blob/v1.4.2/basicsr/archs/rrdbnet_arch.py
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
class ResidualDenseBlock : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
int num_feat;
|
||||||
|
int num_grow_ch;
|
||||||
|
|
||||||
|
public:
|
||||||
|
ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32)
|
||||||
|
: num_feat(num_feat), num_grow_ch(num_grow_ch) {
|
||||||
|
blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
blocks["conv3"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
blocks["conv4"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
return ggml_leaky_relu(ctx, x, 0.2f, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [n, num_feat, h, w]
|
||||||
|
// return: [n, num_feat, h, w]
|
||||||
|
|
||||||
|
auto conv1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv1"]);
|
||||||
|
auto conv2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv2"]);
|
||||||
|
auto conv3 = std::dynamic_pointer_cast<Conv2d>(blocks["conv3"]);
|
||||||
|
auto conv4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv4"]);
|
||||||
|
auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);
|
||||||
|
|
||||||
|
auto x1 = lrelu(ctx, conv1->forward(ctx, x));
|
||||||
|
auto x_cat = ggml_concat(ctx, x, x1, 2);
|
||||||
|
auto x2 = lrelu(ctx, conv2->forward(ctx, x_cat));
|
||||||
|
x_cat = ggml_concat(ctx, x_cat, x2, 2);
|
||||||
|
auto x3 = lrelu(ctx, conv3->forward(ctx, x_cat));
|
||||||
|
x_cat = ggml_concat(ctx, x_cat, x3, 2);
|
||||||
|
auto x4 = lrelu(ctx, conv4->forward(ctx, x_cat));
|
||||||
|
x_cat = ggml_concat(ctx, x_cat, x4, 2);
|
||||||
|
auto x5 = conv5->forward(ctx, x_cat);
|
||||||
|
|
||||||
|
x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);
|
||||||
|
return x5;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class RRDB : public GGMLBlock {
|
||||||
|
public:
|
||||||
|
RRDB(int num_feat, int num_grow_ch = 32) {
|
||||||
|
blocks["rdb1"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
|
||||||
|
blocks["rdb2"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
|
||||||
|
blocks["rdb3"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [n, num_feat, h, w]
|
||||||
|
// return: [n, num_feat, h, w]
|
||||||
|
|
||||||
|
auto rdb1 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb1"]);
|
||||||
|
auto rdb2 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb2"]);
|
||||||
|
auto rdb3 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb3"]);
|
||||||
|
|
||||||
|
auto out = rdb1->forward(ctx, x);
|
||||||
|
out = rdb2->forward(ctx, out);
|
||||||
|
out = rdb3->forward(ctx, out);
|
||||||
|
|
||||||
|
out = ggml_add(ctx, ggml_scale(ctx, out, 0.2f), x);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class RRDBNet : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
int scale = 4; // default RealESRGAN_x4plus_anime_6B
|
||||||
|
int num_block = 6; // default RealESRGAN_x4plus_anime_6B
|
||||||
|
int num_in_ch = 3;
|
||||||
|
int num_out_ch = 3;
|
||||||
|
int num_feat = 64; // default RealESRGAN_x4plus_anime_6B
|
||||||
|
int num_grow_ch = 32; // default RealESRGAN_x4plus_anime_6B
|
||||||
|
|
||||||
|
public:
|
||||||
|
RRDBNet() {
|
||||||
|
blocks["conv_first"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
for (int i = 0; i < num_block; i++) {
|
||||||
|
std::string name = "body." + std::to_string(i);
|
||||||
|
blocks[name] = std::shared_ptr<GGMLBlock>(new RRDB(num_feat, num_grow_ch));
|
||||||
|
}
|
||||||
|
blocks["conv_body"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
// upsample
|
||||||
|
blocks["conv_up1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
blocks["conv_up2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
blocks["conv_hr"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
blocks["conv_last"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
return ggml_leaky_relu(ctx, x, 0.2f, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [n, num_in_ch, h, w]
|
||||||
|
// return: [n, num_out_ch, h*4, w*4]
|
||||||
|
auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]);
|
||||||
|
auto conv_body = std::dynamic_pointer_cast<Conv2d>(blocks["conv_body"]);
|
||||||
|
auto conv_up1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
|
||||||
|
auto conv_up2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
|
||||||
|
auto conv_hr = std::dynamic_pointer_cast<Conv2d>(blocks["conv_hr"]);
|
||||||
|
auto conv_last = std::dynamic_pointer_cast<Conv2d>(blocks["conv_last"]);
|
||||||
|
|
||||||
|
auto feat = conv_first->forward(ctx, x);
|
||||||
|
auto body_feat = feat;
|
||||||
|
for (int i = 0; i < num_block; i++) {
|
||||||
|
std::string name = "body." + std::to_string(i);
|
||||||
|
auto block = std::dynamic_pointer_cast<RRDB>(blocks[name]);
|
||||||
|
|
||||||
|
body_feat = block->forward(ctx, body_feat);
|
||||||
|
}
|
||||||
|
body_feat = conv_body->forward(ctx, body_feat);
|
||||||
|
feat = ggml_add(ctx, feat, body_feat);
|
||||||
|
// upsample
|
||||||
|
feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2)));
|
||||||
|
feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2)));
|
||||||
|
auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ESRGAN : public GGMLRunner {
|
||||||
|
RRDBNet rrdb_net;
|
||||||
|
int scale = 4;
|
||||||
|
int tile_size = 128; // avoid cuda OOM for 4gb VRAM
|
||||||
|
|
||||||
|
ESRGAN(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
|
||||||
|
: GGMLRunner(backend) {
|
||||||
|
rrdb_net.init(params_ctx, tensor_types, "");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string get_desc() {
|
||||||
|
return "esrgan";
|
||||||
|
}
|
||||||
|
|
||||||
|
bool load_from_file(const std::string& file_path) {
|
||||||
|
LOG_INFO("loading esrgan from '%s'", file_path.c_str());
|
||||||
|
|
||||||
|
alloc_params_buffer();
|
||||||
|
std::map<std::string, ggml_tensor*> esrgan_tensors;
|
||||||
|
rrdb_net.get_param_tensors(esrgan_tensors);
|
||||||
|
|
||||||
|
ModelLoader model_loader;
|
||||||
|
if (!model_loader.init_from_file(file_path)) {
|
||||||
|
LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool success = model_loader.load_tensors(esrgan_tensors, backend);
|
||||||
|
|
||||||
|
if (!success) {
|
||||||
|
LOG_ERROR("load esrgan tensors from model loader failed");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_INFO("esrgan model loaded");
|
||||||
|
return success;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
|
||||||
|
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
|
||||||
|
x = to_backend(x);
|
||||||
|
struct ggml_tensor* out = rrdb_net.forward(compute_ctx, x);
|
||||||
|
ggml_build_forward_expand(gf, out);
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute(const int n_threads,
|
||||||
|
struct ggml_tensor* x,
|
||||||
|
ggml_tensor** output,
|
||||||
|
ggml_context* output_ctx = NULL) {
|
||||||
|
auto get_graph = [&]() -> struct ggml_cgraph* {
|
||||||
|
return build_graph(x);
|
||||||
|
};
|
||||||
|
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // __ESRGAN_HPP__
|
||||||
@ -1,4 +1,3 @@
|
|||||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||||
|
|
||||||
add_subdirectory(cli)
|
add_subdirectory(cli)
|
||||||
add_subdirectory(server)
|
|
||||||
@ -1,6 +1,6 @@
|
|||||||
set(TARGET sd-cli)
|
set(TARGET sd)
|
||||||
|
|
||||||
add_executable(${TARGET} main.cpp)
|
add_executable(${TARGET} main.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PUBLIC c_std_11 cxx_std_17)
|
target_compile_features(${TARGET} PUBLIC cxx_std_11)
|
||||||
@ -1,149 +0,0 @@
|
|||||||
# Run
|
|
||||||
|
|
||||||
```
|
|
||||||
usage: ./bin/sd-cli [options]
|
|
||||||
|
|
||||||
CLI Options:
|
|
||||||
-o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image sequences (default:
|
|
||||||
./output.png) (eg. output_%03d.png)
|
|
||||||
--preview-path <string> path to write preview image to (default: ./preview.png)
|
|
||||||
--preview-interval <int> interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
|
|
||||||
every step)
|
|
||||||
--output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
|
|
||||||
--canny apply canny preprocessor (edge detection)
|
|
||||||
--convert-name convert tensor name (for convert mode)
|
|
||||||
-v, --verbose print extra info
|
|
||||||
--color colors the logging tags according to level
|
|
||||||
--taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae)
|
|
||||||
--preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs
|
|
||||||
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
|
|
||||||
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
|
|
||||||
-h, --help show this help message and exit
|
|
||||||
|
|
||||||
Context Options:
|
|
||||||
-m, --model <string> path to full model
|
|
||||||
--clip_l <string> path to the clip-l text encoder
|
|
||||||
--clip_g <string> path to the clip-g text encoder
|
|
||||||
--clip_vision <string> path to the clip-vision encoder
|
|
||||||
--t5xxl <string> path to the t5xxl text encoder
|
|
||||||
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
|
|
||||||
--llm_vision <string> path to the llm vit
|
|
||||||
--qwen2vl <string> alias of --llm. Deprecated.
|
|
||||||
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
|
|
||||||
--diffusion-model <string> path to the standalone diffusion model
|
|
||||||
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model
|
|
||||||
--vae <string> path to standalone vae model
|
|
||||||
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
|
|
||||||
--tae <string> alias of --taesd
|
|
||||||
--control-net <string> path to control net model
|
|
||||||
--embd-dir <string> embeddings directory
|
|
||||||
--lora-model-dir <string> lora model directory
|
|
||||||
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
|
|
||||||
--photo-maker <string> path to PHOTOMAKER model
|
|
||||||
--upscale-model <string> path to esrgan model.
|
|
||||||
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
|
|
||||||
CPU physical cores
|
|
||||||
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
|
|
||||||
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
|
|
||||||
--vae-tiling process vae in tiles to reduce memory usage
|
|
||||||
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
|
|
||||||
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
|
|
||||||
--mmap whether to memory-map model
|
|
||||||
--control-net-cpu keep controlnet in cpu (for low vram)
|
|
||||||
--clip-on-cpu keep clip in cpu (for low vram)
|
|
||||||
--vae-on-cpu keep vae in cpu (for low vram)
|
|
||||||
--fa use flash attention
|
|
||||||
--diffusion-fa use flash attention in the diffusion model only
|
|
||||||
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
|
|
||||||
--vae-conv-direct use ggml_conv2d_direct in the vae model
|
|
||||||
--circular enable circular padding for convolutions
|
|
||||||
--circularx enable circular RoPE wrapping on x-axis (width) only
|
|
||||||
--circulary enable circular RoPE wrapping on y-axis (height) only
|
|
||||||
--chroma-disable-dit-mask disable dit mask for chroma
|
|
||||||
--qwen-image-zero-cond-t enable zero_cond_t for qwen image
|
|
||||||
--chroma-enable-t5-mask enable t5 mask for chroma
|
|
||||||
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
|
|
||||||
type of the weight file
|
|
||||||
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
|
|
||||||
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
|
|
||||||
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
|
|
||||||
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
|
|
||||||
contain any quantized parameters, the at_runtime mode will be used; otherwise,
|
|
||||||
immediately will be used. The immediately mode may have precision and
|
|
||||||
compatibility issues with quantized parameters, but it usually offers faster inference
|
|
||||||
speed and, in some cases, lower memory usage. The at_runtime mode, on the
|
|
||||||
other hand, is exactly the opposite.
|
|
||||||
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
|
|
||||||
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
|
|
||||||
(overrides --vae-tile-size)
|
|
||||||
|
|
||||||
Generation Options:
|
|
||||||
-p, --prompt <string> the prompt to render
|
|
||||||
-n, --negative-prompt <string> the negative prompt (default: "")
|
|
||||||
-i, --init-img <string> path to the init image
|
|
||||||
--end-img <string> path to the end image, required by flf2v
|
|
||||||
--mask <string> path to the mask image
|
|
||||||
--control-image <string> path to control image, control net
|
|
||||||
--control-video <string> path to control video frames. It must be a directory path. The video frames inside should be stored as images in
|
|
||||||
lexicographical (character) order. For example, if the control video path is
|
|
||||||
`frames`, the directory contains images such as 00.png, 01.png, etc.
|
|
||||||
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir
|
|
||||||
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
|
|
||||||
-H, --height <int> image height, in pixel space (default: 512)
|
|
||||||
-W, --width <int> image width, in pixel space (default: 512)
|
|
||||||
--steps <int> number of sample steps (default: 20)
|
|
||||||
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
|
|
||||||
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
|
|
||||||
will be 1 for SD1.x, 2 for SD2.x
|
|
||||||
-b, --batch-count <int> batch count
|
|
||||||
--video-frames <int> video frames (default: 1)
|
|
||||||
--fps <int> fps (default: 24)
|
|
||||||
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
|
|
||||||
NitroSD-Vibrant
|
|
||||||
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
|
|
||||||
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
|
|
||||||
--cfg-scale <float> unconditional guidance scale: (default: 7.0)
|
|
||||||
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
|
|
||||||
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
|
|
||||||
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
|
|
||||||
medium
|
|
||||||
--skip-layer-start <float> SLG enabling point (default: 0.01)
|
|
||||||
--skip-layer-end <float> SLG disabling point (default: 0.2)
|
|
||||||
--eta <float> eta in DDIM, only for DDIM and TCD (default: 0)
|
|
||||||
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
|
|
||||||
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
|
|
||||||
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
|
|
||||||
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
|
|
||||||
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
|
|
||||||
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
|
|
||||||
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
|
|
||||||
--high-noise-eta <float> (high noise) eta in DDIM, only for DDIM and TCD (default: 0)
|
|
||||||
--strength <float> strength for noising/unnoising (default: 0.75)
|
|
||||||
--pm-style-strength <float>
|
|
||||||
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
|
|
||||||
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
|
|
||||||
--vace-strength <float> wan vace strength
|
|
||||||
--increase-ref-index automatically increase the indices of reference images based on the order they are listed (starting with 1).
|
|
||||||
--disable-auto-resize-ref-image disable auto resize of ref images
|
|
||||||
-s, --seed RNG seed (default: 42, use random seed for < 0)
|
|
||||||
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
|
|
||||||
tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
|
|
||||||
otherwise)
|
|
||||||
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
|
|
||||||
ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
|
|
||||||
euler_a otherwise
|
|
||||||
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
|
|
||||||
kl_optimal, lcm, bong_tangent], default: discrete
|
|
||||||
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
|
|
||||||
--skip-layers layers to skip for SLG steps (default: [7,8,9])
|
|
||||||
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
|
|
||||||
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
|
|
||||||
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level),
|
|
||||||
'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
|
|
||||||
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
|
|
||||||
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=;
|
|
||||||
spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples:
|
|
||||||
"threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2"
|
|
||||||
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
|
|
||||||
--scm-policy SCM policy: 'dynamic' (default) or 'static'
|
|
||||||
```
|
|
||||||
@ -1,217 +0,0 @@
|
|||||||
#ifndef __AVI_WRITER_H__
|
|
||||||
#define __AVI_WRITER_H__
|
|
||||||
|
|
||||||
#include <cstdint>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <cstring>
|
|
||||||
|
|
||||||
#include "stable-diffusion.h"
|
|
||||||
|
|
||||||
#ifndef INCLUDE_STB_IMAGE_WRITE_H
|
|
||||||
#include "stb_image_write.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
uint32_t offset;
|
|
||||||
uint32_t size;
|
|
||||||
} avi_index_entry;
|
|
||||||
|
|
||||||
// Write a 32-bit unsigned integer to `f` as four bytes, least-significant
// byte first. The bytes are assembled explicitly so the on-disk order is
// little-endian regardless of host byte order. The fwrite result is not
// checked; callers that care can test ferror(f) afterwards.
// Declared inline because this definition lives in a header.
inline void write_u32_le(FILE* f, uint32_t val) {
    const uint8_t bytes[4] = {
        (uint8_t)(val & 0xFFu),
        (uint8_t)((val >> 8) & 0xFFu),
        (uint8_t)((val >> 16) & 0xFFu),
        (uint8_t)((val >> 24) & 0xFFu),
    };
    fwrite(bytes, 1, sizeof(bytes), f);
}
|
|
||||||
|
|
||||||
// Write a 16-bit unsigned integer to `f` as two bytes, least-significant
// byte first. The bytes are assembled explicitly so the on-disk order is
// little-endian regardless of host byte order. The fwrite result is not
// checked; callers that care can test ferror(f) afterwards.
// Declared inline because this definition lives in a header.
inline void write_u16_le(FILE* f, uint16_t val) {
    const uint8_t bytes[2] = {
        (uint8_t)(val & 0xFFu),
        (uint8_t)((val >> 8) & 0xFFu),
    };
    fwrite(bytes, 1, sizeof(bytes), f);
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Create an MJPG AVI file from an array of sd_image_t images.
|
|
||||||
* Images are encoded to JPEG using stb_image_write.
|
|
||||||
*
|
|
||||||
* @param filename Output AVI file name.
|
|
||||||
* @param images Array of input images.
|
|
||||||
* @param num_images Number of images in the array.
|
|
||||||
* @param fps Frames per second for the video.
|
|
||||||
* @param quality JPEG quality (0-100).
|
|
||||||
* @return 0 on success, -1 on failure.
|
|
||||||
*/
|
|
||||||
int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality = 90) {
|
|
||||||
if (num_images == 0) {
|
|
||||||
fprintf(stderr, "Error: Image array is empty.\n");
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
FILE* f = fopen(filename, "wb");
|
|
||||||
if (!f) {
|
|
||||||
perror("Error opening file for writing");
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t width = images[0].width;
|
|
||||||
uint32_t height = images[0].height;
|
|
||||||
uint32_t channels = images[0].channel;
|
|
||||||
if (channels != 3 && channels != 4) {
|
|
||||||
fprintf(stderr, "Error: Unsupported channel count: %u\n", channels);
|
|
||||||
fclose(f);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// --- RIFF AVI Header ---
|
|
||||||
fwrite("RIFF", 4, 1, f);
|
|
||||||
long riff_size_pos = ftell(f);
|
|
||||||
write_u32_le(f, 0); // Placeholder for file size
|
|
||||||
fwrite("AVI ", 4, 1, f);
|
|
||||||
|
|
||||||
// 'hdrl' LIST (header list)
|
|
||||||
fwrite("LIST", 4, 1, f);
|
|
||||||
write_u32_le(f, 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40);
|
|
||||||
fwrite("hdrl", 4, 1, f);
|
|
||||||
|
|
||||||
// 'avih' chunk (AVI main header)
|
|
||||||
fwrite("avih", 4, 1, f);
|
|
||||||
write_u32_le(f, 56);
|
|
||||||
write_u32_le(f, 1000000 / fps); // Microseconds per frame
|
|
||||||
write_u32_le(f, 0); // Max bytes per second
|
|
||||||
write_u32_le(f, 0); // Padding granularity
|
|
||||||
write_u32_le(f, 0x110); // Flags (HASINDEX | ISINTERLEAVED)
|
|
||||||
write_u32_le(f, num_images); // Total frames
|
|
||||||
write_u32_le(f, 0); // Initial frames
|
|
||||||
write_u32_le(f, 1); // Number of streams
|
|
||||||
write_u32_le(f, width * height * 3); // Suggested buffer size
|
|
||||||
write_u32_le(f, width);
|
|
||||||
write_u32_le(f, height);
|
|
||||||
write_u32_le(f, 0); // Reserved
|
|
||||||
write_u32_le(f, 0); // Reserved
|
|
||||||
write_u32_le(f, 0); // Reserved
|
|
||||||
write_u32_le(f, 0); // Reserved
|
|
||||||
|
|
||||||
// 'strl' LIST (stream list)
|
|
||||||
fwrite("LIST", 4, 1, f);
|
|
||||||
write_u32_le(f, 4 + 8 + 56 + 8 + 40);
|
|
||||||
fwrite("strl", 4, 1, f);
|
|
||||||
|
|
||||||
// 'strh' chunk (stream header)
|
|
||||||
fwrite("strh", 4, 1, f);
|
|
||||||
write_u32_le(f, 56);
|
|
||||||
fwrite("vids", 4, 1, f); // Stream type: video
|
|
||||||
fwrite("MJPG", 4, 1, f); // Codec: Motion JPEG
|
|
||||||
write_u32_le(f, 0); // Flags
|
|
||||||
write_u16_le(f, 0); // Priority
|
|
||||||
write_u16_le(f, 0); // Language
|
|
||||||
write_u32_le(f, 0); // Initial frames
|
|
||||||
write_u32_le(f, 1); // Scale
|
|
||||||
write_u32_le(f, fps); // Rate
|
|
||||||
write_u32_le(f, 0); // Start
|
|
||||||
write_u32_le(f, num_images); // Length
|
|
||||||
write_u32_le(f, width * height * 3); // Suggested buffer size
|
|
||||||
write_u32_le(f, (uint32_t)-1); // Quality
|
|
||||||
write_u32_le(f, 0); // Sample size
|
|
||||||
write_u16_le(f, 0); // rcFrame.left
|
|
||||||
write_u16_le(f, 0); // rcFrame.top
|
|
||||||
write_u16_le(f, 0); // rcFrame.right
|
|
||||||
write_u16_le(f, 0); // rcFrame.bottom
|
|
||||||
|
|
||||||
// 'strf' chunk (stream format: BITMAPINFOHEADER)
|
|
||||||
fwrite("strf", 4, 1, f);
|
|
||||||
write_u32_le(f, 40);
|
|
||||||
write_u32_le(f, 40); // biSize
|
|
||||||
write_u32_le(f, width);
|
|
||||||
write_u32_le(f, height);
|
|
||||||
write_u16_le(f, 1); // biPlanes
|
|
||||||
write_u16_le(f, 24); // biBitCount
|
|
||||||
fwrite("MJPG", 4, 1, f); // biCompression (FOURCC)
|
|
||||||
write_u32_le(f, width * height * 3); // biSizeImage
|
|
||||||
write_u32_le(f, 0); // XPelsPerMeter
|
|
||||||
write_u32_le(f, 0); // YPelsPerMeter
|
|
||||||
write_u32_le(f, 0); // Colors used
|
|
||||||
write_u32_le(f, 0); // Colors important
|
|
||||||
|
|
||||||
// 'movi' LIST (video frames)
|
|
||||||
// long movi_list_pos = ftell(f);
|
|
||||||
fwrite("LIST", 4, 1, f);
|
|
||||||
long movi_size_pos = ftell(f);
|
|
||||||
write_u32_le(f, 0); // Placeholder for movi size
|
|
||||||
fwrite("movi", 4, 1, f);
|
|
||||||
|
|
||||||
avi_index_entry* index = (avi_index_entry*)malloc(sizeof(avi_index_entry) * num_images);
|
|
||||||
if (!index) {
|
|
||||||
fclose(f);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Encode and write each frame as JPEG
|
|
||||||
struct {
|
|
||||||
uint8_t* buf;
|
|
||||||
size_t size;
|
|
||||||
} jpeg_data;
|
|
||||||
|
|
||||||
for (int i = 0; i < num_images; i++) {
|
|
||||||
jpeg_data.buf = nullptr;
|
|
||||||
jpeg_data.size = 0;
|
|
||||||
|
|
||||||
// Callback function to collect JPEG data into memory
|
|
||||||
auto write_to_buf = [](void* context, void* data, int size) {
|
|
||||||
auto jd = (decltype(jpeg_data)*)context;
|
|
||||||
jd->buf = (uint8_t*)realloc(jd->buf, jd->size + size);
|
|
||||||
memcpy(jd->buf + jd->size, data, size);
|
|
||||||
jd->size += size;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Encode to JPEG in memory
|
|
||||||
stbi_write_jpg_to_func(
|
|
||||||
write_to_buf,
|
|
||||||
&jpeg_data,
|
|
||||||
images[i].width,
|
|
||||||
images[i].height,
|
|
||||||
channels,
|
|
||||||
images[i].data,
|
|
||||||
quality);
|
|
||||||
|
|
||||||
// Write '00dc' chunk (video frame)
|
|
||||||
fwrite("00dc", 4, 1, f);
|
|
||||||
write_u32_le(f, (uint32_t)jpeg_data.size);
|
|
||||||
index[i].offset = ftell(f) - 8;
|
|
||||||
index[i].size = (uint32_t)jpeg_data.size;
|
|
||||||
fwrite(jpeg_data.buf, 1, jpeg_data.size, f);
|
|
||||||
|
|
||||||
// Align to even byte size
|
|
||||||
if (jpeg_data.size % 2)
|
|
||||||
fputc(0, f);
|
|
||||||
|
|
||||||
free(jpeg_data.buf);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Finalize 'movi' size
|
|
||||||
long cur_pos = ftell(f);
|
|
||||||
long movi_size = cur_pos - movi_size_pos - 4;
|
|
||||||
fseek(f, movi_size_pos, SEEK_SET);
|
|
||||||
write_u32_le(f, movi_size);
|
|
||||||
fseek(f, cur_pos, SEEK_SET);
|
|
||||||
|
|
||||||
// Write 'idx1' index
|
|
||||||
fwrite("idx1", 4, 1, f);
|
|
||||||
write_u32_le(f, num_images * 16);
|
|
||||||
for (int i = 0; i < num_images; i++) {
|
|
||||||
fwrite("00dc", 4, 1, f);
|
|
||||||
write_u32_le(f, 0x10);
|
|
||||||
write_u32_le(f, index[i].offset);
|
|
||||||
write_u32_le(f, index[i].size);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Finalize RIFF size
|
|
||||||
cur_pos = ftell(f);
|
|
||||||
long file_size = cur_pos - riff_size_pos - 4;
|
|
||||||
fseek(f, riff_size_pos, SEEK_SET);
|
|
||||||
write_u32_le(f, file_size);
|
|
||||||
fseek(f, cur_pos, SEEK_SET);
|
|
||||||
|
|
||||||
fclose(f);
|
|
||||||
free(index);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // __AVI_WRITER_H__
|
|
||||||
@ -1,73 +0,0 @@
|
|||||||
set(TARGET sd-server)
|
|
||||||
|
|
||||||
option(SD_SERVER_BUILD_FRONTEND "Build server frontend with pnpm" ON)
|
|
||||||
|
|
||||||
set(FRONTEND_DIR "${CMAKE_CURRENT_SOURCE_DIR}/frontend")
|
|
||||||
set(GENERATED_HTML_HEADER "${FRONTEND_DIR}/dist/gen_index_html.h")
|
|
||||||
|
|
||||||
set(HAVE_FRONTEND_BUILD OFF)
|
|
||||||
|
|
||||||
if(SD_SERVER_BUILD_FRONTEND AND EXISTS "${FRONTEND_DIR}")
|
|
||||||
if(WIN32)
|
|
||||||
find_program(PNPM_EXECUTABLE NAMES pnpm.cmd pnpm)
|
|
||||||
else()
|
|
||||||
find_program(PNPM_EXECUTABLE NAMES pnpm)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if(PNPM_EXECUTABLE)
|
|
||||||
message(STATUS "Frontend dir found: ${FRONTEND_DIR}")
|
|
||||||
message(STATUS "pnpm found: ${PNPM_EXECUTABLE}")
|
|
||||||
|
|
||||||
set(HAVE_FRONTEND_BUILD ON)
|
|
||||||
|
|
||||||
add_custom_target(${TARGET}_frontend_install
|
|
||||||
COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" install
|
|
||||||
WORKING_DIRECTORY "${FRONTEND_DIR}"
|
|
||||||
COMMENT "Installing frontend dependencies"
|
|
||||||
VERBATIM
|
|
||||||
)
|
|
||||||
|
|
||||||
add_custom_target(${TARGET}_frontend_build
|
|
||||||
COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" run build
|
|
||||||
WORKING_DIRECTORY "${FRONTEND_DIR}"
|
|
||||||
COMMENT "Building frontend"
|
|
||||||
VERBATIM
|
|
||||||
)
|
|
||||||
|
|
||||||
add_custom_target(${TARGET}_frontend_header
|
|
||||||
COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" run build:header
|
|
||||||
WORKING_DIRECTORY "${FRONTEND_DIR}"
|
|
||||||
COMMENT "Generating gen_index_html.h"
|
|
||||||
VERBATIM
|
|
||||||
)
|
|
||||||
|
|
||||||
add_dependencies(${TARGET}_frontend_build ${TARGET}_frontend_install)
|
|
||||||
add_dependencies(${TARGET}_frontend_header ${TARGET}_frontend_build)
|
|
||||||
|
|
||||||
add_custom_target(${TARGET}_frontend
|
|
||||||
DEPENDS ${TARGET}_frontend_header
|
|
||||||
)
|
|
||||||
|
|
||||||
set_source_files_properties("${GENERATED_HTML_HEADER}" PROPERTIES GENERATED TRUE)
|
|
||||||
else()
|
|
||||||
message(WARNING "pnpm not found, frontend build disabled")
|
|
||||||
endif()
|
|
||||||
else()
|
|
||||||
message(STATUS "Frontend disabled or directory not found: ${FRONTEND_DIR}")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
add_executable(${TARGET} main.cpp)
|
|
||||||
|
|
||||||
if(HAVE_FRONTEND_BUILD)
|
|
||||||
add_dependencies(${TARGET} ${TARGET}_frontend)
|
|
||||||
target_sources(${TARGET} PRIVATE "${GENERATED_HTML_HEADER}")
|
|
||||||
target_include_directories(${TARGET} PRIVATE "${FRONTEND_DIR}/dist")
|
|
||||||
target_compile_definitions(${TARGET} PRIVATE HAVE_INDEX_HTML)
|
|
||||||
message(STATUS "HAVE_INDEX_HTML enabled")
|
|
||||||
else()
|
|
||||||
message(STATUS "HAVE_INDEX_HTML disabled")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
|
||||||
target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
|
|
||||||
target_compile_features(${TARGET} PUBLIC c_std_11 cxx_std_17)
|
|
||||||
@ -1,227 +0,0 @@
|
|||||||
# Frontend
|
|
||||||
|
|
||||||
## Build with Frontend
|
|
||||||
|
|
||||||
The server can optionally build the web frontend and embed it into the binary as `gen_index_html.h`.
|
|
||||||
|
|
||||||
### Requirements
|
|
||||||
|
|
||||||
Install the following tools:
|
|
||||||
|
|
||||||
* **Node.js** ≥ 22.18
|
|
||||||
https://nodejs.org/
|
|
||||||
|
|
||||||
* **pnpm** ≥ 10
|
|
||||||
Install via npm:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
npm install -g pnpm
|
|
||||||
```
|
|
||||||
|
|
||||||
Verify installation:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
node -v
|
|
||||||
pnpm -v
|
|
||||||
```
|
|
||||||
|
|
||||||
### Install frontend dependencies
|
|
||||||
|
|
||||||
Go to the frontend directory and install dependencies:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd examples/server/frontend
|
|
||||||
pnpm install
|
|
||||||
```
|
|
||||||
|
|
||||||
### Build the server with CMake
|
|
||||||
|
|
||||||
Enable the frontend build option when configuring CMake (`SD_SERVER_BUILD_FRONTEND` defaults to `ON`, so passing it explicitly is optional):
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cmake -B build -DSD_SERVER_BUILD_FRONTEND=ON
|
|
||||||
cmake --build build --config Release
|
|
||||||
```
|
|
||||||
|
|
||||||
If `pnpm` is available, the build system will automatically run `pnpm install`, followed by:
|
|
||||||
|
|
||||||
```
|
|
||||||
pnpm run build
|
|
||||||
pnpm run build:header
|
|
||||||
```
|
|
||||||
|
|
||||||
and embed the generated frontend into the server binary.
|
|
||||||
|
|
||||||
## Frontend Repository
|
|
||||||
|
|
||||||
The web frontend is maintained in a **separate repository**, https://github.com/leejet/stable-ui.
|
|
||||||
|
|
||||||
If you want to modify the UI or frontend logic, please submit pull requests to the **frontend repository**.
|
|
||||||
|
|
||||||
This repository (`stable-diffusion.cpp`) only vendors the frontend periodically. Changes from the frontend repo are synchronized:
|
|
||||||
|
|
||||||
* approximately **every 1–2 weeks**, or
|
|
||||||
* when there are **major frontend updates**
|
|
||||||
|
|
||||||
Because of this, frontend changes will **not appear here immediately** after being merged upstream.
|
|
||||||
|
|
||||||
## Using an external frontend
|
|
||||||
|
|
||||||
By default, the server uses the **embedded frontend** generated during the build (`gen_index_html.h`).
|
|
||||||
|
|
||||||
You can also serve a custom frontend file instead of the embedded one by using:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
--serve-html-path <path-to-index.html>
|
|
||||||
```
|
|
||||||
|
|
||||||
For example:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
sd-server --serve-html-path ./index.html
|
|
||||||
```
|
|
||||||
|
|
||||||
In this case, the server will load and serve the specified `index.html` file instead of the embedded frontend. This is useful when:
|
|
||||||
|
|
||||||
* developing or testing frontend changes
|
|
||||||
* using a custom UI
|
|
||||||
* avoiding rebuilding the binary after frontend modifications
|
|
||||||
|
|
||||||
# Run
|
|
||||||
|
|
||||||
```
|
|
||||||
usage: ./bin/sd-server [options]
|
|
||||||
|
|
||||||
Svr Options:
|
|
||||||
-l, --listen-ip <string> server listen ip (default: 127.0.0.1)
|
|
||||||
--serve-html-path <string> path to HTML file to serve at root (optional)
|
|
||||||
--listen-port <int> server listen port (default: 1234)
|
|
||||||
-v, --verbose print extra info
|
|
||||||
--color colors the logging tags according to level
|
|
||||||
-h, --help show this help message and exit
|
|
||||||
|
|
||||||
Context Options:
|
|
||||||
-m, --model <string> path to full model
|
|
||||||
--clip_l <string> path to the clip-l text encoder
|
|
||||||
--clip_g <string> path to the clip-g text encoder
|
|
||||||
--clip_vision <string> path to the clip-vision encoder
|
|
||||||
--t5xxl <string> path to the t5xxl text encoder
|
|
||||||
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
|
|
||||||
--llm_vision <string> path to the llm vit
|
|
||||||
--qwen2vl <string> alias of --llm. Deprecated.
|
|
||||||
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
|
|
||||||
--diffusion-model <string> path to the standalone diffusion model
|
|
||||||
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model
|
|
||||||
--vae <string> path to standalone vae model
|
|
||||||
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
|
|
||||||
--tae <string> alias of --taesd
|
|
||||||
--control-net <string> path to control net model
|
|
||||||
--embd-dir <string> embeddings directory
|
|
||||||
--lora-model-dir <string> lora model directory
|
|
||||||
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
|
|
||||||
--photo-maker <string> path to PHOTOMAKER model
|
|
||||||
--upscale-model <string> path to esrgan model.
|
|
||||||
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
|
|
||||||
CPU physical cores
|
|
||||||
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
|
|
||||||
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
|
|
||||||
--vae-tiling process vae in tiles to reduce memory usage
|
|
||||||
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
|
|
||||||
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
|
|
||||||
--mmap whether to memory-map model
|
|
||||||
--control-net-cpu keep controlnet in cpu (for low vram)
|
|
||||||
--clip-on-cpu keep clip in cpu (for low vram)
|
|
||||||
--vae-on-cpu keep vae in cpu (for low vram)
|
|
||||||
--fa use flash attention
|
|
||||||
--diffusion-fa use flash attention in the diffusion model only
|
|
||||||
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
|
|
||||||
--vae-conv-direct use ggml_conv2d_direct in the vae model
|
|
||||||
--circular enable circular padding for convolutions
|
|
||||||
--circularx enable circular RoPE wrapping on x-axis (width) only
|
|
||||||
--circulary enable circular RoPE wrapping on y-axis (height) only
|
|
||||||
--chroma-disable-dit-mask disable dit mask for chroma
|
|
||||||
--qwen-image-zero-cond-t enable zero_cond_t for qwen image
|
|
||||||
--chroma-enable-t5-mask enable t5 mask for chroma
|
|
||||||
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
|
|
||||||
type of the weight file
|
|
||||||
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
|
|
||||||
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
|
|
||||||
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
|
|
||||||
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
|
|
||||||
contain any quantized parameters, the at_runtime mode will be used; otherwise,
|
|
||||||
immediately will be used. The immediately mode may have precision and
|
|
||||||
compatibility issues with quantized parameters, but it usually offers faster inference
|
|
||||||
speed and, in some cases, lower memory usage. The at_runtime mode, on the
|
|
||||||
other hand, is exactly the opposite.
|
|
||||||
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
|
|
||||||
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
|
|
||||||
(overrides --vae-tile-size)
|
|
||||||
|
|
||||||
Default Generation Options:
|
|
||||||
-p, --prompt <string> the prompt to render
|
|
||||||
-n, --negative-prompt <string> the negative prompt (default: "")
|
|
||||||
-i, --init-img <string> path to the init image
|
|
||||||
--end-img <string> path to the end image, required by flf2v
|
|
||||||
--mask <string> path to the mask image
|
|
||||||
--control-image <string> path to control image, control net
|
|
||||||
--control-video <string> path to control video frames. It must be a directory path. The video frames inside should be stored as images in
|
|
||||||
lexicographical (character) order. For example, if the control video path is
|
|
||||||
`frames`, the directory contains images such as 00.png, 01.png, etc.
|
|
||||||
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir
|
|
||||||
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
|
|
||||||
-H, --height <int> image height, in pixel space (default: 512)
|
|
||||||
-W, --width <int> image width, in pixel space (default: 512)
|
|
||||||
--steps <int> number of sample steps (default: 20)
|
|
||||||
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
|
|
||||||
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
|
|
||||||
will be 1 for SD1.x, 2 for SD2.x
|
|
||||||
-b, --batch-count <int> batch count
|
|
||||||
--video-frames <int> video frames (default: 1)
|
|
||||||
--fps <int> fps (default: 24)
|
|
||||||
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
|
|
||||||
NitroSD-Vibrant
|
|
||||||
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
|
|
||||||
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
|
|
||||||
--cfg-scale <float> unconditional guidance scale: (default: 7.0)
|
|
||||||
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
|
|
||||||
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
|
|
||||||
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
|
|
||||||
medium
|
|
||||||
--skip-layer-start <float> SLG enabling point (default: 0.01)
|
|
||||||
--skip-layer-end <float> SLG disabling point (default: 0.2)
|
|
||||||
--eta <float> eta in DDIM, only for DDIM and TCD (default: 0)
|
|
||||||
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
|
|
||||||
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
|
|
||||||
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
|
|
||||||
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
|
|
||||||
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
|
|
||||||
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
|
|
||||||
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
|
|
||||||
--high-noise-eta <float> (high noise) eta in DDIM, only for DDIM and TCD (default: 0)
|
|
||||||
--strength <float> strength for noising/unnoising (default: 0.75)
|
|
||||||
--pm-style-strength <float>
|
|
||||||
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
|
|
||||||
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
|
|
||||||
--vace-strength <float> wan vace strength
|
|
||||||
--increase-ref-index automatically increase the indices of reference images based on the order they are listed (starting with 1).
|
|
||||||
--disable-auto-resize-ref-image disable auto resize of ref images
|
|
||||||
-s, --seed RNG seed (default: 42, use random seed for < 0)
|
|
||||||
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
|
|
||||||
tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
|
|
||||||
otherwise)
|
|
||||||
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
|
|
||||||
ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
|
|
||||||
euler_a otherwise
|
|
||||||
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
|
|
||||||
kl_optimal, lcm, bong_tangent], default: discrete
|
|
||||||
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
|
|
||||||
--skip-layers layers to skip for SLG steps (default: [7,8,9])
|
|
||||||
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
|
|
||||||
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
|
|
||||||
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
|
|
||||||
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
|
|
||||||
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
|
|
||||||
"threshold=0.25" or "threshold=1.5,reset=0"
|
|
||||||
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
|
|
||||||
--scm-policy SCM policy: 'dynamic' (default) or 'static'
|
|
||||||
```
|
|
||||||
@ -1 +0,0 @@
|
|||||||
Subproject commit 1a34176cd6d39ad3a226b2b69047e71f6797f6bc
|
|
||||||