Compare commits


4 Commits

| Author | SHA1 | Message | Date |
| ------ | ---- | ------- | ---- |
| leejet | c429774fe9 | prevent errors when loading models without 'alphas_cumprod' | 2024-01-01 16:17:33 +08:00 |
| leejet | 66be6e2e58 | Merge branch 'master' into refector | 2024-01-01 14:58:02 +08:00 |
| leejet | 93eb2fc404 | avoid build failure on gcc | 2023-12-31 23:10:24 +08:00 |
| leejet | e2a33d4ee7 | refactor: reorganize code and use c api | 2023-12-31 23:00:24 +08:00 |
86 changed files with 4925 additions and 2446335 deletions


@@ -3,6 +3,7 @@ UseTab: Never
IndentWidth: 4
TabWidth: 4
AllowShortIfStatementsOnASingleLine: false
IndentCaseLabels: false
ColumnLimit: 0
AccessModifierOffset: -4
NamespaceIndentation: All
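To apply this configuration locally, a typical invocation is shown below (a sketch: it assumes `clang-format` is on your PATH and deliberately skips the `ggml/` submodule, which carries its own style):

```sh
# Reformat the project's C/C++ sources in place using the .clang-format above
find . \( -name '*.cpp' -o -name '*.hpp' -o -name '*.c' -o -name '*.h' \) \
  -not -path './ggml/*' -print0 | xargs -0 clang-format -i
```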


@@ -4,36 +4,17 @@ on:
workflow_dispatch: # allows manual triggering
inputs:
create_release:
description: "Create new release"
description: 'Create new release'
required: true
type: boolean
push:
branches:
- master
- ci
paths:
[
".github/workflows/**",
"**/CMakeLists.txt",
"**/Makefile",
"**/*.h",
"**/*.hpp",
"**/*.c",
"**/*.cpp",
"**/*.cu",
]
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
pull_request:
types: [opened, synchronize, reopened]
paths:
[
"**/CMakeLists.txt",
"**/Makefile",
"**/*.h",
"**/*.hpp",
"**/*.c",
"**/*.cpp",
"**/*.cu",
]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -49,6 +30,7 @@ jobs:
with:
submodules: recursive
- name: Dependencies
id: depends
run: |
@@ -60,37 +42,14 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON
cmake ..
cmake --build . --config Release
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
run: |
echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
#- name: Test
#id: cmake_test
#run: |
#cd build
#ctest --verbose --timeout 900
macOS-latest-cmake:
runs-on: macos-latest
@@ -104,8 +63,9 @@ jobs:
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew install zip
brew update
- name: Build
id: cmake_build
@@ -113,61 +73,30 @@ jobs:
sysctl -a
mkdir build
cd build
cmake .. -DGGML_AVX2=ON -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" -DSD_BUILD_SHARED_LIBS=ON
cmake ..
cmake --build . --config Release
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
run: |
echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
echo "OS_NAME=`sw_vers -productName`" >> "$GITHUB_OUTPUT"
echo "OS_VERSION=`sw_vers -productVersion`" >> "$GITHUB_OUTPUT"
echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
#- name: Test
#id: cmake_test
#run: |
#cd build
#ctest --verbose --timeout 900
windows-latest-cmake:
runs-on: windows-2025
env:
VULKAN_VERSION: 1.3.261.1
runs-on: windows-latest
strategy:
matrix:
include:
- build: "noavx"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
- build: "avx2"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "avx"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX=ON -DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
- build: "avx512"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "cuda12"
defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;80;75"
# - build: "rocm5.5"
# defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
- build: 'vulkan'
defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: 'noavx'
defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
- build: 'avx2'
defines: '-DGGML_AVX2=ON'
- build: 'avx'
defines: '-DGGML_AVX2=OFF'
- build: 'avx512'
defines: '-DGGML_AVX512=ON'
steps:
- name: Clone
id: checkout
@@ -175,37 +104,6 @@ jobs:
with:
submodules: recursive
- name: Install cuda-toolkit
id: cuda-toolkit
if: ${{ matrix.build == 'cuda12' }}
uses: Jimver/cuda-toolkit@v0.2.19
with:
cuda: "12.6.2"
method: "network"
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
- name: Install rocm-toolkit
id: rocm-toolkit
if: ${{ matrix.build == 'rocm5.5' }}
uses: Cyberhan123/rocm-toolkit@v0.1.0
with:
rocm: "5.5.0"
- name: Install Ninja
id: install-ninja
if: ${{ matrix.build == 'rocm5.5' }}
uses: urkle/action-get-ninja@v1
with:
version: 1.11.1
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'vulkan' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
- name: Build
id: cmake_build
run: |
@@ -227,6 +125,12 @@ jobs:
& $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
.\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
#- name: Test
#id: cmake_test
#run: |
#cd build
#ctest -C Release --verbose --timeout 900
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -236,44 +140,14 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
$filePath = ".\build\bin\Release\*"
if (Test-Path $filePath) {
echo "Exists at path $filePath"
Copy-Item ggml/LICENSE .\build\bin\Release\ggml.txt
Copy-Item LICENSE .\build\bin\Release\stable-diffusion.cpp.txt
} elseif (Test-Path ".\build\bin\stable-diffusion.dll") {
$filePath = ".\build\bin\*"
echo "Exists at path $filePath"
Copy-Item ggml/LICENSE .\build\bin\ggml.txt
Copy-Item LICENSE .\build\bin\stable-diffusion.cpp.txt
} else {
ls .\build\bin
throw "Can't find stable-diffusion.dll"
}
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip $filePath
- name: Copy and pack Cuda runtime
id: pack_cuda_runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
$dst='.\build\bin\cudart\'
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-sd-bin-win-cu12-x64.zip $dst\*
- name: Upload Cuda runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-cudart-sd-bin-win-cu12-x64.zip
path: |
cudart-sd-bin-win-cu12-x64.zip
Copy-Item ggml/LICENSE .\build\bin\Release\ggml.txt
Copy-Item LICENSE .\build\bin\Release\stable-diffusion.cpp.txt
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
@@ -290,11 +164,7 @@ jobs:
steps:
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v4
with:
path: ./artifact
pattern: sd-*
merge-multiple: true
uses: actions/download-artifact@v3
- name: Get commit hash
id: commit

.gitignore (1 change)

@@ -10,4 +10,5 @@ test/
*.gguf
output*.png
models*
!taesd-model.gguf
*.log

.gitmodules (2 changes)

@@ -1,3 +1,3 @@
[submodule "ggml"]
path = ggml
url = https://github.com/ggerganov/ggml.git
url = https://github.com/leejet/ggml.git
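Since this changes the `ggml` submodule URL, existing checkouts need to resync it; the standard git commands are:

```sh
git submodule sync --recursive          # pick up the new URL from .gitmodules
git submodule update --init --recursive # fetch the submodule from the new remote
```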


@@ -24,111 +24,44 @@ endif()
# general
#option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE})
option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE})
option(SD_CUDA "sd: cuda backend" OFF)
option(SD_HIPBLAS "sd: rocm backend" OFF)
option(SD_CUBLAS "sd: cuda backend" OFF)
option(SD_METAL "sd: metal backend" OFF)
option(SD_VULKAN "sd: vulkan backend" OFF)
option(SD_OPENCL "sd: opencl backend" OFF)
option(SD_SYCL "sd: sycl backend" OFF)
option(SD_MUSA "sd: musa backend" OFF)
option(SD_FLASH_ATTN "sd: use flash attention for x4 less memory usage" OFF)
option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
option(BUILD_SHARED_LIBS "sd: build shared libs" OFF)
#option(SD_BUILD_SERVER "sd: build server example" ON)
if(SD_CUDA)
message("-- Use CUDA as backend stable-diffusion")
set(GGML_CUDA ON)
add_definitions(-DSD_USE_CUDA)
if(SD_CUBLAS)
message("Use CUBLAS as backend stable-diffusion")
set(GGML_CUBLAS ON)
add_definitions(-DSD_USE_CUBLAS)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
endif()
if(SD_METAL)
message("-- Use Metal as backend stable-diffusion")
message("Use Metal as backend stable-diffusion")
set(GGML_METAL ON)
add_definitions(-DSD_USE_METAL)
endif()
if (SD_VULKAN)
message("-- Use Vulkan as backend stable-diffusion")
set(GGML_VULKAN ON)
add_definitions(-DSD_USE_VULKAN)
endif ()
if (SD_OPENCL)
message("-- Use OpenCL as backend stable-diffusion")
set(GGML_OPENCL ON)
add_definitions(-DSD_USE_OPENCL)
endif ()
if (SD_HIPBLAS)
message("-- Use HIPBLAS as backend stable-diffusion")
set(GGML_HIP ON)
add_definitions(-DSD_USE_CUDA)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
endif ()
if(SD_MUSA)
message("-- Use MUSA as backend stable-diffusion")
set(GGML_MUSA ON)
add_definitions(-DSD_USE_CUDA)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
if(SD_FLASH_ATTN)
message("Use Flash Attention for memory optimization")
add_definitions(-DSD_USE_FLASH_ATTENTION)
endif()
set(SD_LIB stable-diffusion)
file(GLOB SD_LIB_SOURCES
"*.h"
"*.cpp"
"*.hpp"
)
# we can only get one shared lib
if(SD_BUILD_SHARED_LIBS)
message("-- Build shared library")
message(${SD_LIB_SOURCES})
set(BUILD_SHARED_LIBS OFF)
add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
add_definitions(-DSD_BUILD_SHARED_LIB)
target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
else()
message("-- Build static library")
set(BUILD_SHARED_LIBS OFF)
add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
endif()
if(SD_SYCL)
message("-- Use SYCL as backend stable-diffusion")
set(GGML_SYCL ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
add_definitions(-DSD_USE_SYCL)
# disable fast-math on host, see:
# https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html
if (WIN32)
set(SYCL_COMPILE_OPTIONS /fp:precise)
else()
set(SYCL_COMPILE_OPTIONS -fp-model=precise)
endif()
message("-- Turn off fast-math for host in SYCL backend")
target_compile_options(${SD_LIB} PRIVATE ${SYCL_COMPILE_OPTIONS})
endif()
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
# see https://github.com/ggerganov/ggml/pull/682
add_definitions(-DGGML_MAX_NAME=128)
# deps
# Only add ggml if it hasn't been added yet
if (NOT TARGET ggml)
add_subdirectory(ggml)
endif()
add_subdirectory(ggml)
add_subdirectory(thirdparty)
set(SD_LIB stable-diffusion)
add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp model.h model.cpp util.h util.cpp upscaler.cpp
ggml_extend.hpp clip.hpp common.hpp unet.hpp tae.hpp esrgan.hpp lora.hpp denoiser.hpp rng.hpp rng_philox.hpp)
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
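For reference, a typical out-of-tree configure-and-build against this CMakeLists looks like the sketch below; the backend flag is illustrative, and the option names differ between the two sides of this diff (`SD_CUDA` vs `SD_CUBLAS`), so use the ones present in your checkout:

```sh
mkdir build && cd build
cmake .. -DSD_CUBLAS=ON    # or -DSD_METAL=ON on macOS; plain `cmake ..` for CPU-only
cmake --build . --config Release
```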


@@ -1,22 +0,0 @@
ARG MUSA_VERSION=rc3.1.1
FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu22.04 as build
RUN apt-get update && apt-get install -y ccache cmake git
WORKDIR /sd.cpp
COPY . .
RUN mkdir build && cd build && \
cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
-DCMAKE_C_FLAGS="${CMAKE_C_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
-DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
-DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build . --config Release
FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu22.04 as runtime
COPY --from=build /sd.cpp/build/bin/sd /sd
ENTRYPOINT [ "/sd" ]

README.md (346 changes)

@@ -1,31 +1,28 @@
<p align="center">
<img src="./assets/cat_with_sd_cpp_42.png" width="360x">
<img src="./assets/a%20lovely%20cat.png" width="256x">
</p>
# stable-diffusion.cpp
Inference of Stable Diffusion and Flux in pure C/C++
Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in pure C/C++
## Features
- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
- Super lightweight and without external dependencies
- SD1.x, SD2.x, SDXL and [SD3/SD3.5](./docs/sd3.md) support
- SD1.x, SD2.x and SDXL support
- Note: the VAE in SDXL encounters NaN issues under FP16, but unfortunately ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify a VAE with the FP16 NaN issue fixed. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
- [Flux-dev/Flux-schnell Support](./docs/flux.md)
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Chroma](./docs/chroma.md)
- [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo) and [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) support
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
- 16-bit, 32-bit float support
- 2-bit, 3-bit, 4-bit, 5-bit and 8-bit integer quantization support
- 4-bit, 5-bit and 8-bit integer quantization support
- Accelerated memory-efficient CPU inference
- Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image, enabling Flash Attention just requires ~1.8GB.
- AVX, AVX2 and AVX512 support for x86 architectures
- Full CUDA, Metal, Vulkan, OpenCL and SYCL backend for GPU acceleration.
- Full CUDA and Metal backend for GPU acceleration.
- Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs models
- No need to convert to `.ggml` or `.gguf` anymore!
- Flash Attention for memory usage optimization
- Flash Attention for memory usage optimization (only cpu for now)
- Original `txt2img` and `img2img` mode
- Negative prompt
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
@@ -34,7 +31,6 @@ Inference of Stable Diffusion and Flux in pure C/C++
- Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
- Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
- VAE tiling processing to reduce memory usage
- Control Net support with SD 1.5
- Sampling method
- `Euler A`
- `Euler`
@@ -50,21 +46,21 @@ Inference of Stable Diffusion and Flux in pure C/C++
- Linux
- Mac OS
- Windows
- Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
- Android (via Termux)
### TODO
- [ ] More sampling methods
- [ ] Make inference faster
- The current implementation of ggml_conv_2d is slow and has high memory usage
- Implement Winograd Convolution 2D for 3x3 kernel filtering
- [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
- [ ] Implement Textual Inversion (embeddings)
- [ ] Implement Inpainting support
- [ ] k-quants support
## Usage
For most users, you can download the built executable program from the latest [release](https://github.com/leejet/stable-diffusion.cpp/releases/latest).
If the built product does not meet your requirements, you can choose to build it manually.
### Get the Code
```
@@ -87,13 +83,11 @@ git submodule update
- Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
- Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
- Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
```shell
curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
# curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-nonema-pruned.safetensors
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors
```
### Build
@@ -114,31 +108,12 @@ cmake .. -DGGML_OPENBLAS=ON
cmake --build . --config Release
```
##### Using CUDA
##### Using CUBLAS
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
```
cmake .. -DSD_CUDA=ON
cmake --build . --config Release
```
##### Using HipBLAS
This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
Windows users: refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
```
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
cmake --build . --config Release
```
##### Using MUSA
This provides BLAS acceleration using the MUSA cores of your Moore Threads GPU. Make sure to have the MUSA toolkit installed.
```bash
cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
cmake .. -DSD_CUBLAS=ON
cmake --build . --config Release
```
@@ -151,130 +126,15 @@ cmake .. -DSD_METAL=ON
cmake --build . --config Release
```
##### Using Vulkan
### Using Flash Attention
Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
Enabling flash attention reduces memory usage by at least 400 MB. At the moment, it is not supported when CUBLAS is enabled because the kernel implementation is missing.
```
cmake .. -DSD_VULKAN=ON
cmake .. -DSD_FLASH_ATTN=ON
cmake --build . --config Release
```
##### Using OpenCL (for Adreno GPU)
Currently, it supports only Adreno GPUs and is primarily optimized for the Q4_0 type.
To build for Windows ARM, please refer to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64).
Building for Android:
Android NDK:
Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
Set up OpenCL dependencies for the NDK:
You need to provide the OpenCL headers and the ICD loader library to your NDK sysroot.
* OpenCL Headers:
```bash
# In a temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
cd ..
```
* OpenCL ICD Loader:
```bash
# In the same temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
mkdir build_ndk && cd build_ndk
# Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=24 \
-DANDROID_STL=c++_shared
ninja
# Replace <YOUR_NDK_PATH>
# e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
cd ../..
```
Build `stable-diffusion.cpp` for Android with OpenCL:
```bash
mkdir build-android && cd build-android
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
cmake .. -G Ninja \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DGGML_OPENMP=OFF \
-DSD_OPENCL=ON
ninja
```
*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
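Concretely, an on-device run following that note might look like this (paths are placeholders; the binary and model are assumed to already be on the device):

```sh
# e.g. inside adb shell or Termux, exposing the vendor OpenCL driver
LD_LIBRARY_PATH=/vendor/lib64 ./sd -m ./sd-v1-4.ckpt -p "a lovely cat"
```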
##### Using SYCL
Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and the [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before starting. For more details and steps, refer to the [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux).
```
# Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
cmake --build . --config Release
```
Example of text2img by using SYCL backend:
- download `stable-diffusion` model weight, refer to [download-weight](#download-weights).
- run `./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors --cfg-scale 5 --steps 30 --sampling-method euler -H 1024 -W 1024 --seed 42 -p "fantasy medieval village world inside a glass sphere , high detail, fantasy, realistic, light effect, hyper detail, volumetric lighting, cinematic, macro, depth of field, blur, red light and clouds from the back, highly detailed epic cinematic concept art cg render made in maya, blender and photoshop, octane render, excellent composition, dynamic dramatic cinematic lighting, aesthetic, very inspirational, world inside a glass sphere by james gurney by artgerm with james jean, joe fenton and tristan eaton by ross tran, fine details, 4k resolution"`
<p align="center">
<img src="./assets/sycl_sd3_output.png" width="360x">
</p>
##### Using Flash Attention
Enabling flash attention for the diffusion model reduces memory usage by a model-dependent amount, e.g.:
- Flux 768x768: ~600 MB
- SD2 768x768: ~1400 MB
For most backends it slows things down, but on CUDA it generally speeds things up as well.
At the moment, it is only supported for some models and some backends (CPU, CUDA/ROCm, Metal).
Run by adding `--diffusion-fa` to the arguments and watch for:
```
[INFO ] stable-diffusion.cpp:312 - Using flash attention in the diffusion model
```
and for the compute buffer shrinking in the debug log:
```
[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
```
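For example, a Flux generation with flash attention enabled could look like this (a sketch reusing the model paths from the txt2img examples below):

```sh
./bin/sd --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft \
  --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors \
  -p "a lovely cat" --cfg-scale 1.0 --sampling-method euler --diffusion-fa -v
```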
### Run
```
@@ -282,83 +142,53 @@ usage: ./bin/sd [arguments]
arguments:
-h, --help show this help message and exit
-M, --mode [MODE] run mode, one of: [img_gen, convert], default: img_gen
-t, --threads N number of threads to use during computation (default: -1)
-M, --mode [txt2img or img2img] generation mode (default: txt2img)
-t, --threads N number of threads to use during computation (default: -1).
If threads <= 0, then threads will be set to the number of CPU physical cores
-m, --model [MODEL] path to full model
--diffusion-model path to the standalone diffusion model
--clip_l path to the clip-l text encoder
--clip_g path to the clip-g text encoder
--t5xxl path to the t5xxl text encoder
-m, --model [MODEL] path to model
--vae [VAE] path to vae
--taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--control-net [CONTROL_PATH] path to control net model
--embd-dir [EMBEDDING_PATH] path to embeddings
--stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings
--input-id-images-dir [DIR] path to PHOTOMAKER input id images dir
--normalize-input normalize PHOTOMAKER input id images
--upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now
--upscale-repeats Run the ESRGAN upscaler this many times (default 1)
--type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
If not specified, the default is the type of the weight file
--tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.
--type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)
If not specified, the default is the type of the weight file.
--lora-model-dir [DIR] lora model directory
-i, --init-img [IMAGE] path to the input image, required by img2img
--mask [MASK] path to the mask image, required by img2img with mask
--control-image [IMAGE] path to image condition, control net
-r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
-o, --output OUTPUT path to write result image to (default: ./output.png)
-p, --prompt [PROMPT] the prompt to render
-n, --negative-prompt PROMPT the negative prompt (default: "")
--cfg-scale SCALE unconditional guidance scale: (default: 7.0)
--img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)
0 means disabled, a value of 2.5 is nice for sd3.5 medium
--eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)
--skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])
--skip-layer-start START SLG enabling point: (default: 0.01)
--skip-layer-end END SLG disabling point: (default: 0.2)
SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
--strength STRENGTH strength for noising/unnoising (default: 0.75)
--style-ratio STYLE-RATIO strength for keeping input identity (default: 20)
--control-strength STRENGTH strength to apply Control Net (default: 0.9)
1.0 corresponds to full destruction of information in init image
-H, --height H image height, in pixel space (default: 512)
-W, --width W image width, in pixel space (default: 512)
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}
sampling method (default: "euler_a")
--steps STEPS number of sample steps (default: 20)
--rng {std_default, cuda} RNG (default: cuda)
-s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
-b, --batch-count COUNT number of images to generate
--schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete)
--clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count COUNT number of images to generate.
--schedule {discrete, karras} Denoiser sigma schedule (default: discrete)
--clip-skip N number of layers to skip of clip model (default: 0)
--vae-tiling process vae in tiles to reduce memory usage
--vae-on-cpu keep vae in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--diffusion-fa use flash attention in the diffusion model (for low vram)
Might lower quality, since it implies converting k and v to f16.
This might crash if it is not supported by the backend.
--control-net-cpu keep controlnet in cpu (for low vram)
--canny apply canny preprocessor (edge detection)
--color colors the logging tags according to level
--chroma-disable-dit-mask disable dit mask for chroma
--chroma-enable-t5-mask enable t5 mask for chroma
--chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma
-v, --verbose print extra info
```
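As a quick illustration of combining the arguments above, a low-memory 768x768 generation might look like this (a sketch; adjust paths and sizes to your setup):

```sh
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat" \
  -H 768 -W 768 --steps 20 --seed 42 \
  --vae-tiling --vae-on-cpu
```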
#### Quantization
You can specify the model weight type using the `--type` parameter. The weights are automatically converted when loading the model.
- `f16` for 16-bit floating-point
- `f32` for 32-bit floating-point
- `q8_0` for 8-bit integer quantization
- `q5_0` or `q5_1` for 5-bit integer quantization
- `q4_0` or `q4_1` for 4-bit integer quantization
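For example, you can quantize at load time, or convert the model once and reuse the result (the convert invocation is a sketch that assumes `-M convert` writes the converted model to the `-o` path):

```sh
# Quantize on the fly while generating
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat" --type q8_0
# Or convert once up front and reuse the quantized file
./bin/sd -M convert -m ../models/sd-v1-4.ckpt -o ../models/sd-v1-4-q8_0.gguf --type q8_0
```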
#### txt2img example
```sh
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v
# ./bin/sd --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
# ./bin/sd -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
```
Using formats of different precisions will yield results of varying quality.
@@ -373,42 +203,93 @@ Using formats of different precisions will yield results of varying quality.
```
./bin/sd -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
./bin/sd --mode img2img -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
```
<p align="center">
<img src="./assets/img2img_output.png" width="256x">
</p>
## More Guides
#### with LoRA
- [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
- [Using ESRGAN to upscale results](./docs/esrgan.md)
- [Using TAESD for faster decoding](./docs/taesd.md)
- [Docker](./docs/docker.md)
- [Quantization and GGUF](./docs/quantization_and_gguf.md)
- You can specify the directory where the lora weights are stored via `--lora-model-dir`. If not specified, the default is the current working directory.
## Bindings
- LoRA is specified via prompt, just like [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora).
These projects wrap `stable-diffusion.cpp` for easier use in other languages/frameworks.
Here's a simple example:
* Golang (non-cgo): [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
* Golang (cgo): [Binozo/GoStableDiffusion](https://github.com/Binozo/GoStableDiffusion)
* C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET)
* Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python)
* Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs)
* Flutter/Dart: [rmatif/Local-Diffusion](https://github.com/rmatif/Local-Diffusion)
```
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
```
## UIs
`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
These projects use `stable-diffusion.cpp` as a backend for their image generation.
#### LCM/LCM-LoRA
- [Jellybox](https://jellybox.com)
- [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
- [Stable Diffusion CLI-GUI](https://github.com/piallai/stable-diffusion.cpp)
- [Local Diffusion](https://github.com/rmatif/Local-Diffusion)
- Download LCM-LoRA from https://huggingface.co/latent-consistency/lcm-lora-sdv1-5
- Specify LCM-LoRA by adding `<lora:lcm-lora-sdv1-5:1>` to prompt
- It's advisable to set `--cfg-scale` to `1.0` instead of the default `7.0`. For `--steps`, a range of `2-8` steps is recommended. For `--sampling-method`, `lcm`/`euler_a` is recommended.
Here's a simple example:
```
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
```
| without LCM-LoRA (--cfg-scale 7) | with LCM-LoRA (--cfg-scale 1) |
| ---- |---- |
| ![](./assets/without_lcm.png) |![](./assets/with_lcm.png) |
## Using TAESD for faster decoding
You can use TAESD to accelerate the decoding of latent images by following these steps:
- Download the model [weights](https://huggingface.co/madebyollin/taesd/blob/main/diffusion_pytorch_model.safetensors).
Or download it with curl:
```bash
curl -L -O https://huggingface.co/madebyollin/taesd/resolve/main/diffusion_pytorch_model.safetensors
```
- Specify the model path using the `--taesd PATH` parameter. Example:
```bash
sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
```
## Using ESRGAN to upscale results
You can use ESRGAN to upscale the generated images. At the moment, only the [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth) model is supported. Support for more models of this architecture will be added soon.
- Specify the model path using the `--upscale-model PATH` parameter. Example:
```bash
sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
```
### Docker
#### Building using Docker
```shell
docker build -t sd .
```
#### Run
```shell
docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
# For example
# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
```
## Memory Requirements
| precision | f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
| ---- | ---- |---- |---- |---- |---- |---- |---- |
| **Memory** (txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
| **Memory** (txt2img - 512 x 512) *with Flash Attention* | ~2.4G | ~1.9G | ~1.6G | ~1.5G | ~1.5G | ~1.5G | ~1.5G |
## Contributors
@@ -416,19 +297,12 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
[![Contributors](https://contrib.rocks/image?repo=leejet/stable-diffusion.cpp)](https://github.com/leejet/stable-diffusion.cpp/graphs/contributors)
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=leejet/stable-diffusion.cpp&type=Date)](https://star-history.com/#leejet/stable-diffusion.cpp&Date)
## References
- [ggml](https://github.com/ggerganov/ggml)
- [stable-diffusion](https://github.com/CompVis/stable-diffusion)
- [sd3-ref](https://github.com/Stability-AI/sd3-ref)
- [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui)
- [ComfyUI](https://github.com/comfyanonymous/ComfyUI)
- [k-diffusion](https://github.com/crowsonkb/k-diffusion)
- [latent-consistency-model](https://github.com/luosiallen/latent-consistency-model)
- [generative-models](https://github.com/Stability-AI/generative-models/)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker)

31 binary image files removed (contents not shown).

clip.hpp (1202 changes): diff suppressed because it is too large.


@@ -3,521 +3,84 @@
#include "ggml_extend.hpp"
class DownSampleBlock : public GGMLBlock {
protected:
struct DownSample {
// hparams
int channels;
int out_channels;
bool vae_downsample;
public:
DownSampleBlock(int channels,
int out_channels,
bool vae_downsample = false)
: channels(channels),
out_channels(out_channels),
vae_downsample(vae_downsample) {
// conv2d params
struct ggml_tensor* op_w; // [out_channels, channels, 3, 3]
struct ggml_tensor* op_b; // [out_channels,]
bool vae_downsample = false;
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // op_w
mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // op_b
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
op_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels);
op_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
if (vae_downsample) {
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}));
tensors[prefix + "conv.weight"] = op_w;
tensors[prefix + "conv.bias"] = op_b;
} else {
blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}));
tensors[prefix + "op.weight"] = op_w;
tensors[prefix + "op.bias"] = op_b;
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
struct ggml_tensor* c = NULL;
if (vae_downsample) {
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
x = ggml_pad(ctx, x, 1, 1, 0, 0);
x = conv->forward(ctx, x);
c = ggml_pad(ctx, x, 1, 1, 0, 0);
c = ggml_nn_conv_2d(ctx, c, op_w, op_b, 2, 2, 0, 0);
} else {
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
x = conv->forward(ctx, x);
c = ggml_nn_conv_2d(ctx, x, op_w, op_b, 2, 2, 1, 1);
}
return x; // [N, out_channels, h/2, w/2]
return c; // [N, out_channels, h/2, w/2]
}
};
class UpSampleBlock : public GGMLBlock {
protected:
struct UpSample {
// hparams
int channels;
int out_channels;
public:
UpSampleBlock(int channels,
int out_channels)
: channels(channels),
out_channels(out_channels) {
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
// conv2d params
struct ggml_tensor* conv_w; // [out_channels, channels, 3, 3]
struct ggml_tensor* conv_b; // [out_channels,]
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // op_w
mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // op_b
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
conv_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels);
conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "conv.weight"] = conv_w;
tensors[prefix + "conv.bias"] = conv_b;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
x = ggml_upscale(ctx, x, 2); // [N, channels, h*2, w*2]
x = ggml_nn_conv_2d(ctx, x, conv_w, conv_b, 1, 1, 1, 1); // [N, out_channels, h*2, w*2]
return x;
}
};
class ResBlock : public GGMLBlock {
protected:
// network hparams
int64_t channels; // model_channels * (1, 1, 1, 2, 2, 4, 4, 4)
int64_t emb_channels; // time_embed_dim
int64_t out_channels; // mult * model_channels
std::pair<int, int> kernel_size;
int dims;
bool skip_t_emb;
bool exchange_temb_dims;
std::shared_ptr<GGMLBlock> conv_nd(int dims,
int64_t in_channels,
int64_t out_channels,
std::pair<int, int> kernel_size,
std::pair<int, int> padding) {
GGML_ASSERT(dims == 2 || dims == 3);
if (dims == 3) {
return std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(in_channels, out_channels, kernel_size.first, 1, padding.first));
} else {
return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
}
}
public:
ResBlock(int64_t channels,
int64_t emb_channels,
int64_t out_channels,
std::pair<int, int> kernel_size = {3, 3},
int dims = 2,
bool exchange_temb_dims = false,
bool skip_t_emb = false)
: channels(channels),
emb_channels(emb_channels),
out_channels(out_channels),
kernel_size(kernel_size),
dims(dims),
skip_t_emb(skip_t_emb),
exchange_temb_dims(exchange_temb_dims) {
std::pair<int, int> padding = {kernel_size.first / 2, kernel_size.second / 2};
blocks["in_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(channels));
// in_layer_1 is nn.SILU()
blocks["in_layers.2"] = conv_nd(dims, channels, out_channels, kernel_size, padding);
if (!skip_t_emb) {
// emb_layer_0 is nn.SILU()
blocks["emb_layers.1"] = std::shared_ptr<GGMLBlock>(new Linear(emb_channels, out_channels));
}
blocks["out_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
// out_layer_1 is nn.SILU()
// out_layer_2 is nn.Dropout(), skip for inference
blocks["out_layers.3"] = conv_nd(dims, out_channels, out_channels, kernel_size, padding);
if (out_channels != channels) {
blocks["skip_connection"] = conv_nd(dims, channels, out_channels, {1, 1}, {0, 0});
}
}
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = NULL) {
// For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
// [N, c, t, h, w] => [N, c, t, h * w]
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
// emb: [N, emb_channels] if dims == 2 else [N, t, emb_channels]
auto in_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["in_layers.0"]);
auto in_layers_2 = std::dynamic_pointer_cast<UnaryBlock>(blocks["in_layers.2"]);
auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);
if (emb == NULL) {
GGML_ASSERT(skip_t_emb);
}
// in_layers
auto h = in_layers_0->forward(ctx, x);
h = ggml_silu_inplace(ctx, h);
h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
// emb_layers
if (!skip_t_emb) {
auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
auto emb_out = ggml_silu(ctx, emb);
emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels]
if (dims == 2) {
emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
} else {
emb_out = ggml_reshape_4d(ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
if (exchange_temb_dims) {
// emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
emb_out = ggml_cont(ctx, ggml_permute(ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
}
}
h = ggml_add(ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
// out_layers
h = out_layers_0->forward(ctx, h);
h = ggml_silu_inplace(ctx, h);
// dropout, skip for inference
h = out_layers_3->forward(ctx, h);
// skip connection
if (out_channels != channels) {
auto skip_connection = std::dynamic_pointer_cast<UnaryBlock>(blocks["skip_connection"]);
x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
h = ggml_add(ctx, h, x);
return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
};
class GEGLU : public GGMLBlock {
protected:
int64_t dim_in;
int64_t dim_out;
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
enum ggml_type wtype = (tensor_types.find(prefix + "proj.weight") != tensor_types.end()) ? tensor_types[prefix + "proj.weight"] : GGML_TYPE_F32;
enum ggml_type bias_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "proj.bias") != tensor_types.end()) ? tensor_types[prefix + "proj.bias"] : GGML_TYPE_F32;
params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
params["proj.bias"] = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
}
public:
GEGLU(int64_t dim_in, int64_t dim_out)
: dim_in(dim_in), dim_out(dim_out) {}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out]
struct ggml_tensor* w = params["proj.weight"];
struct ggml_tensor* b = params["proj.bias"];
auto x_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0); // [dim_out, dim_in]
auto x_b = ggml_view_1d(ctx, b, b->ne[0] / 2, 0); // [dim_out,]
auto gate_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2); // [dim_out, dim_in]
auto gate_b = ggml_view_1d(ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2); // [dim_out, ]
auto x_in = x;
x = ggml_nn_linear(ctx, x_in, x_w, x_b); // [ne3, ne2, ne1, dim_out]
auto gate = ggml_nn_linear(ctx, x_in, gate_w, gate_b); // [ne3, ne2, ne1, dim_out]
gate = ggml_gelu_inplace(ctx, gate);
x = ggml_mul(ctx, x, gate); // [ne3, ne2, ne1, dim_out]
return x;
}
};
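For reference, the split projection computed by this block is the standard GEGLU,

$$\mathrm{GEGLU}(x) = (xW + b) \odot \mathrm{GELU}(xV + c)$$

where $W, b$ and $V, c$ are the first and second halves of `proj.weight` and `proj.bias` taken by the views above.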
class FeedForward : public GGMLBlock {
public:
FeedForward(int64_t dim,
int64_t dim_out,
int64_t mult = 4) {
int64_t inner_dim = dim * mult;
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
// net_1 is nn.Dropout(), skip for inference
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim]
// return: [ne3, ne2, ne1, dim_out]
auto net_0 = std::dynamic_pointer_cast<GEGLU>(blocks["net.0"]);
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]
return x;
}
};
class CrossAttention : public GGMLBlock {
protected:
int64_t query_dim;
int64_t context_dim;
int64_t n_head;
int64_t d_head;
bool flash_attn;
public:
CrossAttention(int64_t query_dim,
int64_t context_dim,
int64_t n_head,
int64_t d_head,
bool flash_attn = false)
: n_head(n_head),
d_head(d_head),
query_dim(query_dim),
context_dim(context_dim),
flash_attn(flash_attn) {
int64_t inner_dim = d_head * n_head;
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, query_dim));
// to_out_1 is nn.Dropout(), skip for inference
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
// x: [N, n_token, query_dim]
// context: [N, n_context, context_dim]
// return: [N, n_token, query_dim]
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
auto to_k = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
auto to_v = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
int64_t n = x->ne[2];
int64_t n_token = x->ne[1];
int64_t n_context = context->ne[1];
int64_t inner_dim = d_head * n_head;
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
x = ggml_nn_attention_ext(ctx, q, k, v, n_head, NULL, false, false, flash_attn); // [N, n_token, inner_dim]
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
return x;
}
};
class BasicTransformerBlock : public GGMLBlock {
protected:
int64_t n_head;
int64_t d_head;
bool ff_in;
public:
BasicTransformerBlock(int64_t dim,
int64_t n_head,
int64_t d_head,
int64_t context_dim,
bool ff_in = false,
bool flash_attn = false)
: n_head(n_head), d_head(d_head), ff_in(ff_in) {
// disable_self_attn is always False
// disable_temporal_crossattention is always False
// switch_temporal_ca_to_sa is always False
// inner_dim is always None or equal to dim
// gated_ff is always True
blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head, flash_attn));
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head, flash_attn));
blocks["ff"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["norm3"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
if (ff_in) {
blocks["norm_in"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["ff_in"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
// x: [N, n_token, query_dim]
// context: [N, n_context, context_dim]
// return: [N, n_token, query_dim]
auto attn1 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn1"]);
auto attn2 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn2"]);
auto ff = std::dynamic_pointer_cast<FeedForward>(blocks["ff"]);
auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
auto norm3 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm3"]);
if (ff_in) {
auto norm_in = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_in"]);
auto ff_in = std::dynamic_pointer_cast<FeedForward>(blocks["ff_in"]);
auto x_skip = x;
x = norm_in->forward(ctx, x);
x = ff_in->forward(ctx, x);
// self.is_res is always True
x = ggml_add(ctx, x, x_skip);
}
auto r = x;
x = norm1->forward(ctx, x);
x = attn1->forward(ctx, x, x); // self-attention
x = ggml_add(ctx, x, r);
r = x;
x = norm2->forward(ctx, x);
x = attn2->forward(ctx, x, context); // cross-attention
x = ggml_add(ctx, x, r);
r = x;
x = norm3->forward(ctx, x);
x = ff->forward(ctx, x);
x = ggml_add(ctx, x, r);
return x;
}
};
class SpatialTransformer : public GGMLBlock {
protected:
int64_t in_channels; // mult * model_channels
int64_t n_head;
int64_t d_head;
int64_t depth = 1; // 1
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
public:
SpatialTransformer(int64_t in_channels,
int64_t n_head,
int64_t d_head,
int64_t depth,
int64_t context_dim,
bool flash_attn = false)
: in_channels(in_channels),
n_head(n_head),
d_head(d_head),
depth(depth),
context_dim(context_dim) {
// We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
// disable_self_attn is always False
int64_t inner_dim = n_head * d_head; // in_channels
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn));
}
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
}
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
// x: [N, in_channels, h, w]
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
auto proj_in = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
auto x_in = x;
int64_t n = x->ne[3];
int64_t h = x->ne[1];
int64_t w = x->ne[0];
int64_t inner_dim = n_head * d_head;
x = norm->forward(ctx, x);
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]);
x = transformer_block->forward(ctx, x, context);
}
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
// proj_out
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
x = ggml_add(ctx, x, x_in);
return x;
}
};
class AlphaBlender : public GGMLBlock {
protected:
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
enum ggml_type wtype = GGML_TYPE_F32; // (tensor_types.find(prefix + "mix_factor") != tensor_types.end()) ? tensor_types[prefix + "mix_factor"] : GGML_TYPE_F32;
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
}
float get_alpha() {
// image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
// so learned_with_images is same as learned
float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]);
return sigmoid(alpha);
}
public:
AlphaBlender() {
// merge_strategy is always learned_with_images
// for inference, we don't need to set alpha
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x_spatial,
struct ggml_tensor* x_temporal) {
// image_only_indicator is always tensor([0.])
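// learned blend: out = sigmoid(mix_factor) * x_spatial + (1 - sigmoid(mix_factor)) * x_temporal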
float alpha = get_alpha();
auto x = ggml_add(ctx,
ggml_scale(ctx, x_spatial, alpha),
ggml_scale(ctx, x_temporal, 1.0f - alpha));
return x;
}
};
class VideoResBlock : public ResBlock {
public:
VideoResBlock(int channels,
int emb_channels,
int out_channels,
std::pair<int, int> kernel_size = {3, 3},
int64_t video_kernel_size = 3,
int dims = 2) // always 2
: ResBlock(channels, emb_channels, out_channels, kernel_size, dims) {
blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, emb_channels, out_channels, kernel_size, 3, true));
blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* emb,
int num_video_frames) {
// x: [N, channels, h, w] aka [b*t, channels, h, w]
// emb: [N, emb_channels] aka [b*t, emb_channels]
// image_only_indicator is always tensor([0.])
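// the spatial ResBlock treats the b*t frames as a plain batch; time_stack then
// mixes features along the t axis and time_mixer blends the two results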
auto time_stack = std::dynamic_pointer_cast<ResBlock>(blocks["time_stack"]);
auto time_mixer = std::dynamic_pointer_cast<AlphaBlender>(blocks["time_mixer"]);
x = ResBlock::forward(ctx, x, emb);
int64_t T = num_video_frames;
int64_t B = x->ne[3] / T;
int64_t C = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
x = ggml_reshape_4d(ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
auto x_mix = x;
emb = ggml_reshape_4d(ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ...
x = time_stack->forward(ctx, x, emb); // b t c (h w)
x = time_mixer->forward(ctx, x_mix, x); // b t c (h w)
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
x = ggml_reshape_4d(ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
return x;
}
};
#endif // __COMMON_HPP__

File diff suppressed because it is too large

View File

@ -1,458 +0,0 @@
#ifndef __CONTROL_HPP__
#define __CONTROL_HPP__
#include "common.hpp"
#include "ggml_extend.hpp"
#include "model.h"
#define CONTROL_NET_GRAPH_SIZE 1536
/*
=================================== ControlNet ===================================
Reference: https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/cldm/cldm.py
*/
class ControlNetBlock : public GGMLBlock {
protected:
SDVersion version = VERSION_SD1;
// network hparams
int in_channels = 4;
int out_channels = 4;
int hint_channels = 3;
int num_res_blocks = 2;
std::vector<int> attention_resolutions = {4, 2, 1};
std::vector<int> channel_mult = {1, 2, 4, 4};
std::vector<int> transformer_depth = {1, 1, 1, 1};
int time_embed_dim = 1280; // model_channels*4
int num_heads = 8;
int num_head_channels = -1; // channels // num_heads
int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
public:
int model_channels = 320;
int adm_in_channels = 2816; // only for VERSION_SDXL
ControlNetBlock(SDVersion version = VERSION_SD1)
: version(version) {
if (sd_version_is_sd2(version)) {
context_dim = 1024;
num_head_channels = 64;
num_heads = -1;
} else if (sd_version_is_sdxl(version)) {
context_dim = 2048;
attention_resolutions = {4, 2};
channel_mult = {1, 2, 4};
transformer_depth = {1, 2, 10};
num_head_channels = 64;
num_heads = -1;
} else if (version == VERSION_SVD) {
in_channels = 8;
out_channels = 4;
context_dim = 1024;
adm_in_channels = 768;
num_head_channels = 64;
num_heads = -1;
}
blocks["time_embed.0"] = std::shared_ptr<GGMLBlock>(new Linear(model_channels, time_embed_dim));
// time_embed_1 is nn.SiLU()
blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
if (sd_version_is_sdxl(version) || version == VERSION_SVD) {
blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
// label_emb_1 is nn.SiLU()
blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
}
// input_blocks
blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}));
std::vector<int> input_block_chans;
input_block_chans.push_back(model_channels);
int ch = model_channels;
int input_block_idx = 0;
int ds = 1;
auto get_resblock = [&](int64_t channels, int64_t emb_channels, int64_t out_channels) -> ResBlock* {
return new ResBlock(channels, emb_channels, out_channels);
};
auto get_attention_layer = [&](int64_t in_channels,
int64_t n_head,
int64_t d_head,
int64_t depth,
int64_t context_dim) -> SpatialTransformer* {
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
};
auto make_zero_conv = [&](int64_t channels) {
return new Conv2d(channels, channels, {1, 1});
};
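// "zero convs" are 1x1 convs initialized to zero during ControlNet training;
// at inference they are ordinary learned 1x1 projections that produce the
// per-resolution control residuals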
blocks["zero_convs.0.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(model_channels));
blocks["input_hint_block.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(hint_channels, 16, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(16, 16, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.4"] = std::shared_ptr<GGMLBlock>(new Conv2d(16, 32, {3, 3}, {2, 2}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.6"] = std::shared_ptr<GGMLBlock>(new Conv2d(32, 32, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.8"] = std::shared_ptr<GGMLBlock>(new Conv2d(32, 96, {3, 3}, {2, 2}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.10"] = std::shared_ptr<GGMLBlock>(new Conv2d(96, 96, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.12"] = std::shared_ptr<GGMLBlock>(new Conv2d(96, 256, {3, 3}, {2, 2}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.14"] = std::shared_ptr<GGMLBlock>(new Conv2d(256, model_channels, {3, 3}, {1, 1}, {1, 1}));
size_t len_mults = channel_mult.size();
for (int i = 0; i < len_mults; i++) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
blocks[name] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
ch = mult * model_channels;
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
n_head,
d_head,
transformer_depth[i],
context_dim));
}
blocks["zero_convs." + std::to_string(input_block_idx) + ".0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
input_block_chans.push_back(ch);
}
if (i != len_mults - 1) {
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
blocks[name] = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));
blocks["zero_convs." + std::to_string(input_block_idx) + ".0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
input_block_chans.push_back(ch);
ds *= 2;
}
}
// middle blocks
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
n_head,
d_head,
transformer_depth[transformer_depth.size() - 1],
context_dim));
blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
// middle_block_out
blocks["middle_block_out.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
}
struct ggml_tensor* resblock_forward(std::string name,
struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* emb) {
auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
return block->forward(ctx, x, emb);
}
struct ggml_tensor* attention_layer_forward(std::string name,
struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context) {
auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
return block->forward(ctx, x, context);
}
struct ggml_tensor* input_hint_block_forward(struct ggml_context* ctx,
struct ggml_tensor* hint,
struct ggml_tensor* emb,
struct ggml_tensor* context) {
int num_input_blocks = 15;
auto h = hint;
for (int i = 0; i < num_input_blocks; i++) {
if (i % 2 == 0) {
auto block = std::dynamic_pointer_cast<Conv2d>(blocks["input_hint_block." + std::to_string(i)]);
h = block->forward(ctx, h);
} else {
h = ggml_silu_inplace(ctx, h);
}
}
return h;
}
std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* guided_hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y = NULL) {
// x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
// timesteps: [N,]
// context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
// y: [N, adm_in_channels] or [1, adm_in_channels]
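// broadcast context/y along the batch dimension when they are given with batch size 1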
if (context != NULL) {
if (context->ne[2] != x->ne[3]) {
context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
}
}
if (y != NULL) {
if (y->ne[1] != x->ne[3]) {
y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
}
}
auto time_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["time_embed.0"]);
auto time_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["time_embed.2"]);
auto input_blocks_0_0 = std::dynamic_pointer_cast<Conv2d>(blocks["input_blocks.0.0"]);
auto zero_convs_0 = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs.0.0"]);
auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);
auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels); // [N, model_channels]
auto emb = time_embed_0->forward(ctx, t_emb);
emb = ggml_silu_inplace(ctx, emb);
emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim]
// SDXL/SVD
if (y != NULL) {
auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);
auto label_emb = label_embed_0->forward(ctx, y);
label_emb = ggml_silu_inplace(ctx, label_emb);
label_emb = label_embed_2->forward(ctx, label_emb); // [N, time_embed_dim]
emb = ggml_add(ctx, emb, label_emb); // [N, time_embed_dim]
}
std::vector<struct ggml_tensor*> outs;
if (guided_hint == NULL) {
guided_hint = input_hint_block_forward(ctx, hint, emb, context);
}
outs.push_back(guided_hint);
// input_blocks
// input block 0
auto h = input_blocks_0_0->forward(ctx, x);
h = ggml_add(ctx, h, guided_hint);
outs.push_back(zero_convs_0->forward(ctx, h));
// input block 1-11
size_t len_mults = channel_mult.size();
int input_block_idx = 0;
int ds = 1;
for (int i = 0; i < len_mults; i++) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
h = resblock_forward(name, ctx, h, emb); // [N, mult*model_channels, h, w]
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
h = attention_layer_forward(name, ctx, h, context); // [N, mult*model_channels, h, w]
}
auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
outs.push_back(zero_conv->forward(ctx, h));
}
if (i != len_mults - 1) {
ds *= 2;
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
auto block = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);
h = block->forward(ctx, h); // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
outs.push_back(zero_conv->forward(ctx, h));
}
}
// [N, 4*model_channels, h/8, w/8]
// middle_block
h = resblock_forward("middle_block.0", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
h = attention_layer_forward("middle_block.1", ctx, h, context); // [N, 4*model_channels, h/8, w/8]
h = resblock_forward("middle_block.2", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
// out
outs.push_back(middle_block_out->forward(ctx, h));
return outs;
}
};
struct ControlNet : public GGMLRunner {
SDVersion version = VERSION_SD1;
ControlNetBlock control_net;
ggml_backend_buffer_t control_buffer = NULL; // keep control output tensors in backend memory
ggml_context* control_ctx = NULL;
std::vector<struct ggml_tensor*> controls; // (12 input block outputs, 1 middle block output) SD 1.5
struct ggml_tensor* guided_hint = NULL; // guided_hint cache, for faster inference
bool guided_hint_cached = false;
ControlNet(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
SDVersion version = VERSION_SD1)
: GGMLRunner(backend), control_net(version) {
control_net.init(params_ctx, tensor_types, "");
}
~ControlNet() {
free_control_ctx();
}
void alloc_control_ctx(std::vector<struct ggml_tensor*> outs) {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
params.mem_buffer = NULL;
params.no_alloc = true;
control_ctx = ggml_init(params);
controls.resize(outs.size() - 1);
size_t control_buffer_size = 0;
guided_hint = ggml_dup_tensor(control_ctx, outs[0]);
control_buffer_size += ggml_nbytes(guided_hint);
for (int i = 0; i < outs.size() - 1; i++) {
controls[i] = ggml_dup_tensor(control_ctx, outs[i + 1]);
control_buffer_size += ggml_nbytes(controls[i]);
}
control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, backend);
LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
}
void free_control_ctx() {
if (control_buffer != NULL) {
ggml_backend_buffer_free(control_buffer);
control_buffer = NULL;
}
if (control_ctx != NULL) {
ggml_free(control_ctx);
control_ctx = NULL;
}
guided_hint = NULL;
guided_hint_cached = false;
controls.clear();
}
std::string get_desc() {
return "control_net";
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
control_net.get_param_tensors(tensors, prefix);
}
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y = NULL) {
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);
x = to_backend(x);
if (guided_hint_cached) {
hint = NULL;
} else {
hint = to_backend(hint);
}
context = to_backend(context);
y = to_backend(y);
timesteps = to_backend(timesteps);
auto outs = control_net.forward(compute_ctx,
x,
hint,
guided_hint_cached ? guided_hint : NULL,
timesteps,
context,
y);
if (control_ctx == NULL) {
alloc_control_ctx(outs);
}
ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint));
for (int i = 0; i < outs.size() - 1; i++) {
ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], controls[i]));
}
return gf;
}
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) {
// x: [N, in_channels, h, w]
// timesteps: [N, ]
// context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
// y: [N, adm_in_channels] or [1, adm_in_channels]
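// guided_hint depends only on the hint image, so after the first call it is
// cached in backend memory and reused for the remaining sampling steps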
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, hint, timesteps, context, y);
};
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
guided_hint_cached = true;
}
bool load_from_file(const std::string& file_path) {
LOG_INFO("loading control net from '%s'", file_path.c_str());
alloc_params_buffer();
std::map<std::string, ggml_tensor*> tensors;
control_net.get_param_tensors(tensors);
std::set<std::string> ignore_tensors;
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
return false;
}
bool success = model_loader.load_tensors(tensors, backend, ignore_tensors);
if (!success) {
LOG_ERROR("load control net tensors from model loader failed");
return false;
}
LOG_INFO("control net model loaded");
return success;
}
};
#endif // __CONTROL_HPP__

File diff suppressed because it is too large

View File

@ -1,187 +0,0 @@
#ifndef __DIFFUSION_MODEL_H__
#define __DIFFUSION_MODEL_H__
#include "flux.hpp"
#include "mmdit.hpp"
#include "unet.hpp"
struct DiffusionModel {
virtual void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) = 0;
virtual void alloc_params_buffer() = 0;
virtual void free_params_buffer() = 0;
virtual void free_compute_buffer() = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
virtual size_t get_params_buffer_size() = 0;
virtual int64_t get_adm_in_channels() = 0;
};
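// Each wrapper below adapts this interface to a concrete runner; arguments an
// architecture does not use (e.g. guidance for UNet, c_concat/controls for MMDiT)
// are simply ignored by its compute().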
struct UNetModel : public DiffusionModel {
UNetModelRunner unet;
UNetModel(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
SDVersion version = VERSION_SD1,
bool flash_attn = false)
: unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
}
void alloc_params_buffer() {
unet.alloc_params_buffer();
}
void free_params_buffer() {
unet.free_params_buffer();
}
void free_compute_buffer() {
unet.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
unet.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() {
return unet.get_params_buffer_size();
}
int64_t get_adm_in_channels() {
return unet.unet.adm_in_channels;
}
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
(void)skip_layers; // SLG doesn't work with UNet models
return unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
}
};
struct MMDiTModel : public DiffusionModel {
MMDiTRunner mmdit;
MMDiTModel(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types)
: mmdit(backend, tensor_types, "model.diffusion_model") {
}
void alloc_params_buffer() {
mmdit.alloc_params_buffer();
}
void free_params_buffer() {
mmdit.free_params_buffer();
}
void free_compute_buffer() {
mmdit.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
mmdit.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() {
return mmdit.get_params_buffer_size();
}
int64_t get_adm_in_channels() {
return 768 + 1280;
}
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx, skip_layers);
}
};
struct FluxModel : public DiffusionModel {
Flux::FluxRunner flux;
FluxModel(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
SDVersion version = VERSION_FLUX,
bool flash_attn = false,
bool use_mask = false)
: flux(backend, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
}
void alloc_params_buffer() {
flux.alloc_params_buffer();
}
void free_params_buffer() {
flux.free_params_buffer();
}
void free_compute_buffer() {
flux.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
flux.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() {
return flux.get_params_buffer_size();
}
int64_t get_adm_in_channels() {
return 768;
}
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, output, output_ctx, skip_layers);
}
};
#endif

View File

@ -1,33 +0,0 @@
# How to Use
You can run Chroma using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
## Download weights
- Download Chroma
- If you don't want to do the conversion yourself, download the preconverted gguf model from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF)
- Otherwise, download chroma's safetensors from [lodestones/Chroma](https://huggingface.co/lodestones/Chroma)
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Convert Chroma weights
You can download the preconverted gguf weights from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF) so you don't have to do the conversion yourself. Otherwise, convert the safetensors as follows:
```
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\chroma-unlocked-v40.safetensors -o ..\models\chroma-unlocked-v40-q8_0.gguf -v --type q8_0
```
## Run
### Example
For example:
```
.\bin\Release\sd.exe --diffusion-model ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask
```
![](../assets/flux/chroma_v40.png)

View File

@ -1,15 +0,0 @@
## Docker
### Building using Docker
```shell
docker build -t sd .
```
### Run
```shell
docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
# For example
# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
```

View File

@ -1,9 +0,0 @@
## Using ESRGAN to upscale results
You can use ESRGAN to upscale the generated images. At the moment, only the [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth) model is supported. Support for more models of this architecture will be added soon.
- Specify the model path using the `--upscale-model PATH` parameter. Example:
```bash
sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
```

View File

@ -1,66 +0,0 @@
# How to Use
You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
## Download weights
- Download flux
- If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf)
- Otherwise, download flux-dev from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors or flux-schnell from https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/main/flux1-schnell.safetensors
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Convert flux weights
You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf) so you don't have to do the conversion yourself.
Using fp16 leads to overflow, and ggml's bf16 support is not yet fully developed, so we convert flux to the gguf format here, which also saves VRAM. For example:
```
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
```
## Run
- `--cfg-scale` is recommended to be set to 1.
### Flux-dev
For example:
```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
```
Using formats of different precisions will yield results of varying quality.
| Type | q8_0 | q4_0 | q4_k | q3_k | q2_k |
|---- | ---- |---- |---- |---- |---- |
| **Memory** | 12068.09 MB | 6394.53 MB | 6395.17 MB | 4888.16 MB | 3735.73 MB |
| **Result** | ![](../assets/flux/flux1-dev-q8_0.png) |![](../assets/flux/flux1-dev-q4_0.png) |![](../assets/flux/flux1-dev-q4_k.png) |![](../assets/flux/flux1-dev-q3_k.png) |![](../assets/flux/flux1-dev-q2_k.png)|
### Flux-schnell
```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4
```
| q8_0 |
| ---- |
|![](../assets/flux/flux1-schnell-q8_0.png) |
## Run with LoRA
Flux LoRA training libraries use a variety of LoRA naming formats, so not all of them may be supported. It is recommended to use LoRAs with naming formats compatible with ComfyUI.
### Flux-dev q8_0 with LoRA
- LoRA model from https://huggingface.co/XLabs-AI/flux-lora-collection/tree/main (using comfy converted version!!!)
```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models
```
![output](../assets/flux/flux1-dev-q8_0%20with%20lora.png)

View File

@ -1,85 +0,0 @@
# Using hipBLAS on Windows
To get hipBLAS in `stable-diffusion.cpp` working on Windows, go through this guide section by section.
## Build Tools for Visual Studio 2022
Skip this step if you already have Build Tools installed.
To install Build Tools, go to [Visual Studio Downloads](https://visualstudio.microsoft.com/vs/), download `Visual Studio 2022 and other Products` and run the installer.
## CMake
Skip this step if you already have CMake installed: running `cmake --version` should output `cmake version x.y.z`.
Download latest `Windows x64 Installer` from [Download | CMake](https://cmake.org/download/) and run it.
## ROCm
Skip this step if you already have ROCm installed.
The [validation tools](https://rocm.docs.amd.com/en/latest/reference/validation_tools.html) are not supported on Windows, so you have to confirm your ROCm version yourself.
Fortunately, `AMD` provides complete help documentation that you can follow to install [ROCm](https://rocm.docs.amd.com/en/latest/deploy/windows/quick_start.html).
>**If you encounter [AMD ROCm Windows Installation Error 215](https://github.com/RadeonOpenCompute/ROCm/issues/2363), don't worry about it: ROCm has been installed correctly, only the Visual Studio plugin installation failed, and that can be ignored.**
Then we must set the `ROCm` environment variables before running cmake.
If you installed according to the official tutorial and did not modify the ROCm path, it is most likely `C:\Program Files\AMD\ROCm\5.5\bin`.
This is what I use to set clang:
```Commandline
set CC=C:\Program Files\AMD\ROCm\5.5\bin\clang.exe
set CXX=C:\Program Files\AMD\ROCm\5.5\bin\clang++.exe
```
## Ninja
Skip this step if you already have Ninja installed: running `ninja --version` should output `1.11.1`.
Download latest `ninja-win.zip` from [GitHub Releases Page](https://github.com/ninja-build/ninja/releases/tag/v1.11.1) and unzip. Then set as environment variables. I unzipped it in `C:\Program Files\ninja`, so I set it like this:
```Commandline
set ninja=C:\Program Files\ninja\ninja.exe
```
## Building stable-diffusion.cpp
What differs from the regular CPU build are the flags `-DSD_HIPBLAS=ON`, `-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, and `-DAMDGPU_TARGETS=gfx1100`.
>**Notice**: check the `clang` and `clang++` information:
```Commandline
clang --version
clang++ --version
```
If you see output like this, we can continue:
```
clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be)
Target: x86_64-pc-windows-msvc
Thread model: posix
InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin
```
```
clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be)
Target: x86_64-pc-windows-msvc
Thread model: posix
InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin
```
>**Notice** that `gfx1100` is the architecture of my GPU; change it to match your own. Click here to see your architecture: [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus)
My GPU is AMD Radeon™ RX 7900 XTX Graphics, so I set it to `gfx1100`.
Then run:
```commandline
mkdir build
cd build
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
cmake --build . --config Release
```
If everything went OK, `build\bin\sd.exe` file should appear.

View File

@ -1,39 +0,0 @@
# How to Use
You can run Kontext using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
## Download weights
- Download Kontext
- If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF)
- Otherwise, download FLUX.1-Kontext-dev from https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev/blob/main/flux1-kontext-dev.safetensors
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Convert Kontext weights
You can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF) so you don't have to do the conversion yourself. Otherwise, convert the safetensors as follows:
```
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-kontext-dev.safetensors -o ..\models\flux1-kontext-dev-q8_0.gguf -v --type q8_0
```
## Run
- `--cfg-scale` is recommended to be set to 1.
### Example
For example:
```
.\bin\Release\sd.exe -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v
```
| ref_image | prompt | output |
| ---- | ---- |---- |
| ![](../assets/flux/flux1-dev-q8_0.png) | change 'flux.cpp' to 'kontext.cpp' |![](../assets/flux/kontext1_dev_output.png) |

View File

@ -1,15 +0,0 @@
## LCM/LCM-LoRA
- Download LCM-LoRA from https://huggingface.co/latent-consistency/lcm-lora-sdv1-5
- Specify LCM-LoRA by adding `<lora:lcm-lora-sdv1-5:1>` to prompt
- It's advisable to set `--cfg-scale` to `1.0` instead of the default `7.0`. For `--steps`, a range of `2-8` steps is recommended. For `--sampling-method`, `lcm`/`euler_a` is recommended.
Here's a simple example:
```
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
```
| without LCM-LoRA (--cfg-scale 7) | with LCM-LoRA (--cfg-scale 1) |
| ---- |---- |
| ![](../assets/without_lcm.png) |![](../assets/with_lcm.png) |

View File

@ -1,13 +0,0 @@
## LoRA
- You can specify the directory where the lora weights are stored via `--lora-model-dir`. If not specified, the default is the current working directory.
- LoRA is specified via prompt, just like [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora).
Here's a simple example:
```
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
```
`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model

View File

@ -1,54 +0,0 @@
## Using PhotoMaker to personalize image generation
You can use [PhotoMaker](https://github.com/TencentARC/PhotoMaker) to personalize generated images with your own ID.
**NOTE:** currently PhotoMaker **ONLY** works with **SDXL** (any SDXL model files will work).
Download the PhotoMaker model file (in safetensors format) [here](https://huggingface.co/bssrdf/PhotoMaker). The official release of the model file (in .bin format) does not work with ```stable-diffusion.cpp```.
- Specify the PhotoMaker model path using the `--stacked-id-embd-dir PATH` parameter.
- Specify the input images path using the `--input-id-images-dir PATH` parameter.
- input images **must** have the same width and height for preprocessing (to be improved)
In the prompt, make sure you have a class word followed by the trigger word ```"img"``` (hard-coded for now). The class word could be one of ```"man, woman, girl, boy"```. If the input ID images contain Asian faces, add ```Asian``` before the class word.
Another PhotoMaker-specific parameter:
- ```--style-ratio (0-100)%```: the default is 20, and 10-20 typically gives good results. A lower ratio means following the input ID more faithfully (though not necessarily better quality).
Other parameters recommended for running Photomaker:
- ```--cfg-scale 5.0```
- ```-H 1024```
- ```-W 1024```
If you are on a low-memory GPU (<= 8GB), it is recommended to run with the ```--vae-on-cpu``` option to get artifact-free images.
Example:
```bash
bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v1.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
```
## PhotoMaker Version 2
[PhotoMaker Version 2 (PMV2)](https://github.com/TencentARC/PhotoMaker/blob/main/README_pmv2.md) has some key improvements. Unfortunately it has a very heavy dependency which makes running it a bit involved in ```SD.cpp```.
Running PMV2 is now a two-step process:
- Run a python script ```face_detect.py``` to obtain **id_embeds** for the given input images
```
python face_detect.py input_image_dir
```
An ```id_embeds.safetensors``` file will be generated in ```input_image_dir```
**Note: this step only needs to be run once; the same ```id_embeds``` can be reused.**
- Run the same command as in version 1 but replacing ```photomaker-v1.safetensors``` with ```photomaker-v2.safetensors```.
You can download ```photomaker-v2.safetensors``` from [here](https://huggingface.co/bssrdf/PhotoMakerV2)
- All the command line parameters from Version 1 remain the same for Version 2

View File

@ -1,27 +0,0 @@
## Quantization
You can specify the model weight type using the `--type` parameter. The weights are automatically converted when loading the model.
- `f16` for 16-bit floating-point
- `f32` for 32-bit floating-point
- `q8_0` for 8-bit integer quantization
- `q5_0` or `q5_1` for 5-bit integer quantization
- `q4_0` or `q4_1` for 4-bit integer quantization
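For example, the following loads an f16/f32 checkpoint and quantizes the weights to `q8_0` on the fly (a minimal sketch reusing the model path from the conversion example below; adjust it to your setup):
```sh
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --type q8_0
```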
### Memory Requirements of Stable Diffusion 1.x
| precision | f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
| ---- | ---- |---- |---- |---- |---- |---- |---- |
| **Memory** (txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
| **Memory** (txt2img - 512 x 512) *with Flash Attention* | ~2.4G | ~1.9G | ~1.6G | ~1.5G | ~1.5G | ~1.5G | ~1.5G |
## Convert to GGUF
You can also convert weights in the formats `ckpt/safetensors/diffusers` to gguf and perform quantization in advance, avoiding the need for quantization every time you load them.
For example:
```sh
./bin/sd -M convert -m ../models/v1-5-pruned-emaonly.safetensors -o ../models/v1-5-pruned-emaonly.q8_0.gguf -v --type q8_0
```

View File

@ -1,20 +0,0 @@
# How to Use
## Download weights
- Download sd3.5_large from https://huggingface.co/stabilityai/stable-diffusion-3.5-large/blob/main/sd3.5_large.safetensors
- Download clip_g from https://huggingface.co/Comfy-Org/stable-diffusion-3.5-fp8/blob/main/text_encoders/clip_g.safetensors
- Download clip_l from https://huggingface.co/Comfy-Org/stable-diffusion-3.5-fp8/blob/main/text_encoders/clip_l.safetensors
- Download t5xxl from https://huggingface.co/Comfy-Org/stable-diffusion-3.5-fp8/blob/main/text_encoders/t5xxl_fp16.safetensors
## Run
### SD3.5 Large
For example:
```
.\bin\Release\sd.exe -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
```
![](../assets/sd3.5_large.png)

View File

@ -1,17 +0,0 @@
## Using TAESD for faster decoding
You can use TAESD to accelerate the decoding of latent images by following these steps:
- Download the model [weights](https://huggingface.co/madebyollin/taesd/blob/main/diffusion_pytorch_model.safetensors).
Or download it with curl:
```bash
curl -L -O https://huggingface.co/madebyollin/taesd/resolve/main/diffusion_pytorch_model.safetensors
```
```
- Specify the model path using the `--taesd PATH` parameter. Example:
```bash
sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
```

View File

@ -12,151 +12,279 @@
*/
class ResidualDenseBlock : public GGMLBlock {
protected:
int num_feat;
struct ResidualDenseBlock {
int num_features;
int num_grow_ch;
ggml_tensor* conv1_w; // [num_grow_ch, num_features, 3, 3]
ggml_tensor* conv1_b; // [num_grow_ch]
public:
ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32)
: num_feat(num_feat), num_grow_ch(num_grow_ch) {
blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
blocks["conv3"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
blocks["conv4"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
ggml_tensor* conv2_w; // [num_grow_ch, num_features + num_grow_ch, 3, 3]
ggml_tensor* conv2_b; // [num_grow_ch]
ggml_tensor* conv3_w; // [num_grow_ch, num_features + 2 * num_grow_ch, 3, 3]
ggml_tensor* conv3_b; // [num_grow_ch]
ggml_tensor* conv4_w; // [num_grow_ch, num_features + 3 * num_grow_ch, 3, 3]
ggml_tensor* conv4_b; // [num_grow_ch]
ggml_tensor* conv5_w; // [num_features, num_features + 4 * num_grow_ch, 3, 3]
ggml_tensor* conv5_b; // [num_features]
ResidualDenseBlock() {}
ResidualDenseBlock(int num_feat, int n_grow_ch) {
num_features = num_feat;
num_grow_ch = n_grow_ch;
}
struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
return ggml_leaky_relu(ctx, x, 0.2f, true);
size_t calculate_mem_size() {
size_t mem_size = num_features * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv1_w
mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32); // conv1_b
mem_size += (num_features + num_grow_ch) * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv2_w
mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32); // conv2_b
mem_size += (num_features + 2 * num_grow_ch) * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv3_w
mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32); // conv3_b
mem_size += (num_features + 3 * num_grow_ch) * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv4_w
mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32); // conv4_b
mem_size += (num_features + 4 * num_grow_ch) * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv5_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv5_b
return mem_size;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [n, num_feat, h, w]
// return: [n, num_feat, h, w]
int get_num_tensors() {
int num_tensors = 10;
return num_tensors;
}
auto conv1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv1"]);
auto conv2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv2"]);
auto conv3 = std::dynamic_pointer_cast<Conv2d>(blocks["conv3"]);
auto conv4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv4"]);
auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);
void init_params(ggml_context* ctx) {
conv1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features, num_grow_ch);
conv1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
conv2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + num_grow_ch, num_grow_ch);
conv2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
conv3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + 2 * num_grow_ch, num_grow_ch);
conv3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
conv4_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + 3 * num_grow_ch, num_grow_ch);
conv4_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
conv5_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + 4 * num_grow_ch, num_features);
conv5_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_features);
}
auto x1 = lrelu(ctx, conv1->forward(ctx, x));
auto x_cat = ggml_concat(ctx, x, x1, 2);
auto x2 = lrelu(ctx, conv2->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x2, 2);
auto x3 = lrelu(ctx, conv3->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x3, 2);
auto x4 = lrelu(ctx, conv4->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x4, 2);
auto x5 = conv5->forward(ctx, x_cat);
void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
tensors[prefix + "conv1.weight"] = conv1_w;
tensors[prefix + "conv1.bias"] = conv1_b;
x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);
tensors[prefix + "conv2.weight"] = conv2_w;
tensors[prefix + "conv2.bias"] = conv2_b;
tensors[prefix + "conv3.weight"] = conv3_w;
tensors[prefix + "conv3.bias"] = conv3_b;
tensors[prefix + "conv4.weight"] = conv4_w;
tensors[prefix + "conv4.bias"] = conv4_b;
tensors[prefix + "conv5.weight"] = conv5_w;
tensors[prefix + "conv5.bias"] = conv5_b;
}
ggml_tensor* forward(ggml_context* ctx, ggml_tensor* out_scale, ggml_tensor* x /* feat */) {
// x1 = self.lrelu(self.conv1(x))
ggml_tensor* x1 = ggml_nn_conv_2d(ctx, x, conv1_w, conv1_b, 1, 1, 1, 1);
x1 = ggml_leaky_relu(ctx, x1, 0.2f, true);
// x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
ggml_tensor* x_cat = ggml_concat(ctx, x, x1);
ggml_tensor* x2 = ggml_nn_conv_2d(ctx, x_cat, conv2_w, conv2_b, 1, 1, 1, 1);
x2 = ggml_leaky_relu(ctx, x2, 0.2f, true);
// x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
x_cat = ggml_concat(ctx, x_cat, x2);
ggml_tensor* x3 = ggml_nn_conv_2d(ctx, x_cat, conv3_w, conv3_b, 1, 1, 1, 1);
x3 = ggml_leaky_relu(ctx, x3, 0.2f, true);
// x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
x_cat = ggml_concat(ctx, x_cat, x3);
ggml_tensor* x4 = ggml_nn_conv_2d(ctx, x_cat, conv4_w, conv4_b, 1, 1, 1, 1);
x4 = ggml_leaky_relu(ctx, x4, 0.2f, true);
// self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
x_cat = ggml_concat(ctx, x_cat, x4);
ggml_tensor* x5 = ggml_nn_conv_2d(ctx, x_cat, conv5_w, conv5_b, 1, 1, 1, 1);
// return x5 * 0.2 + x
x5 = ggml_add(ctx, ggml_scale(ctx, x5, out_scale), x);
return x5;
}
};
class RRDB : public GGMLBlock {
public:
RRDB(int num_feat, int num_grow_ch = 32) {
blocks["rdb1"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
blocks["rdb2"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
blocks["rdb3"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
struct EsrganBlock {
ResidualDenseBlock rd_blocks[3];
int num_residual_blocks = 3;
EsrganBlock() {}
EsrganBlock(int num_feat, int num_grow_ch) {
for (int i = 0; i < num_residual_blocks; i++) {
rd_blocks[i] = ResidualDenseBlock(num_feat, num_grow_ch);
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [n, num_feat, h, w]
// return: [n, num_feat, h, w]
int get_num_tensors() {
int num_tensors = 0;
for (int i = 0; i < num_residual_blocks; i++) {
num_tensors += rd_blocks[i].get_num_tensors();
}
return num_tensors;
}
auto rdb1 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb1"]);
auto rdb2 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb2"]);
auto rdb3 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb3"]);
size_t calculate_mem_size() {
size_t mem_size = 0;
for (int i = 0; i < num_residual_blocks; i++) {
mem_size += rd_blocks[i].calculate_mem_size();
}
return mem_size;
}
auto out = rdb1->forward(ctx, x);
out = rdb2->forward(ctx, out);
out = rdb3->forward(ctx, out);
void init_params(ggml_context* ctx) {
for (int i = 0; i < num_residual_blocks; i++) {
rd_blocks[i].init_params(ctx);
}
}
out = ggml_add(ctx, ggml_scale(ctx, out, 0.2f), x);
void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
for (int i = 0; i < num_residual_blocks; i++) {
rd_blocks[i].map_by_name(tensors, prefix + "rdb" + std::to_string(i + 1) + ".");
}
}
ggml_tensor* forward(ggml_context* ctx, ggml_tensor* out_scale, ggml_tensor* x) {
ggml_tensor* out = x;
for (int i = 0; i < num_residual_blocks; i++) {
// out = self.rdb...(x)
out = rd_blocks[i].forward(ctx, out_scale, out);
}
// return out * 0.2 + x
out = ggml_add(ctx, ggml_scale(ctx, out, out_scale), x);
return out;
}
};
class RRDBNet : public GGMLBlock {
protected:
int scale = 4; // default RealESRGAN_x4plus_anime_6B
int num_block = 6; // default RealESRGAN_x4plus_anime_6B
int num_in_ch = 3;
int num_out_ch = 3;
int num_feat = 64; // default RealESRGAN_x4plus_anime_6B
int num_grow_ch = 32; // default RealESRGAN_x4plus_anime_6B
struct ESRGAN : public GGMLModule {
int scale = 4; // default RealESRGAN_x4plus_anime_6B
int num_blocks = 6; // default RealESRGAN_x4plus_anime_6B
int in_channels = 3;
int out_channels = 3;
int num_features = 64; // default RealESRGAN_x4plus_anime_6B
int num_grow_ch = 32; // default RealESRGAN_x4plus_anime_6B
int tile_size = 128; // avoid cuda OOM for 4gb VRAM
public:
RRDBNet() {
blocks["conv_first"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
for (int i = 0; i < num_block; i++) {
std::string name = "body." + std::to_string(i);
blocks[name] = std::shared_ptr<GGMLBlock>(new RRDB(num_feat, num_grow_ch));
ggml_tensor* conv_first_w; // [num_features, in_channels, 3, 3]
ggml_tensor* conv_first_b; // [num_features]
EsrganBlock body_blocks[6];
ggml_tensor* conv_body_w; // [num_features, num_features, 3, 3]
ggml_tensor* conv_body_b; // [num_features]
// upsample
ggml_tensor* conv_up1_w; // [num_features, num_features, 3, 3]
ggml_tensor* conv_up1_b; // [num_features]
ggml_tensor* conv_up2_w; // [num_features, num_features, 3, 3]
ggml_tensor* conv_up2_b; // [num_features]
ggml_tensor* conv_hr_w; // [num_features, num_features, 3, 3]
ggml_tensor* conv_hr_b; // [num_features]
ggml_tensor* conv_last_w; // [out_channels, num_features, 3, 3]
ggml_tensor* conv_last_b; // [out_channels]
bool decode_only = false;
ESRGAN() {
name = "esrgan";
for (int i = 0; i < num_blocks; i++) {
body_blocks[i] = EsrganBlock(num_features, num_grow_ch);
}
blocks["conv_body"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
// upsample
blocks["conv_up1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
blocks["conv_up2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
blocks["conv_hr"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
blocks["conv_last"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1}));
}
struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
return ggml_leaky_relu(ctx, x, 0.2f, true);
}
size_t calculate_mem_size() {
size_t mem_size = num_features * in_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_first_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_first_b
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [n, num_in_ch, h, w]
// return: [n, num_out_ch, h*4, w*4]
auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]);
auto conv_body = std::dynamic_pointer_cast<Conv2d>(blocks["conv_body"]);
auto conv_up1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
auto conv_up2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
auto conv_hr = std::dynamic_pointer_cast<Conv2d>(blocks["conv_hr"]);
auto conv_last = std::dynamic_pointer_cast<Conv2d>(blocks["conv_last"]);
auto feat = conv_first->forward(ctx, x);
auto body_feat = feat;
for (int i = 0; i < num_block; i++) {
std::string name = "body." + std::to_string(i);
auto block = std::dynamic_pointer_cast<RRDB>(blocks[name]);
body_feat = block->forward(ctx, body_feat);
for (int i = 0; i < num_blocks; i++) {
mem_size += body_blocks[i].calculate_mem_size();
}
body_feat = conv_body->forward(ctx, body_feat);
feat = ggml_add(ctx, feat, body_feat);
mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_body_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_body_b
// upsample
feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
return out;
}
};
mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_up1_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_up1_b
struct ESRGAN : public GGMLRunner {
RRDBNet rrdb_net;
int scale = 4;
int tile_size = 128; // avoid cuda OOM for 4gb VRAM
mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_up2_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_up2_b
ESRGAN(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
: GGMLRunner(backend) {
rrdb_net.init(params_ctx, tensor_types, "");
mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_hr_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_hr_b
mem_size += out_channels * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_last_w
mem_size += out_channels * ggml_type_size(GGML_TYPE_F32); // conv_last_b
return mem_size;
}
std::string get_desc() {
return "esrgan";
size_t get_num_tensors() {
size_t num_tensors = 12;
for (int i = 0; i < num_blocks; i++) {
num_tensors += body_blocks[i].get_num_tensors();
}
return num_tensors;
}
bool load_from_file(const std::string& file_path) {
void init_params() {
ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
conv_first_w = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, in_channels, num_features);
conv_first_b = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, num_features);
conv_body_w = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
conv_body_b = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, num_features);
conv_up1_w = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
conv_up1_b = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, num_features);
conv_up2_w = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
conv_up2_b = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, num_features);
conv_hr_w = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
conv_hr_b = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, num_features);
conv_last_w = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, num_features, out_channels);
conv_last_b = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, out_channels);
for (int i = 0; i < num_blocks; i++) {
body_blocks[i].init_params(params_ctx);
}
// alloc all tensors linked to this context
for (struct ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != NULL; t = ggml_get_next_tensor(params_ctx, t)) {
if (t->data == NULL) {
ggml_allocr_alloc(alloc, t);
}
}
ggml_allocr_free(alloc);
}
bool load_from_file(const std::string& file_path, ggml_backend_t backend) {
LOG_INFO("loading esrgan from '%s'", file_path.c_str());
alloc_params_buffer();
if (!alloc_params_buffer(backend)) {
return false;
}
std::map<std::string, ggml_tensor*> esrgan_tensors;
rrdb_net.get_param_tensors(esrgan_tensors);
// prepare memory for the weights
{
init_params();
map_by_name(esrgan_tensors);
}
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
@ -175,22 +303,120 @@ struct ESRGAN : public GGMLRunner {
return success;
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors) {
tensors["conv_first.weight"] = conv_first_w;
tensors["conv_first.bias"] = conv_first_b;
for (int i = 0; i < num_blocks; i++) {
body_blocks[i].map_by_name(tensors, "body." + std::to_string(i) + ".");
}
tensors["conv_body.weight"] = conv_body_w;
tensors["conv_body.bias"] = conv_body_b;
tensors["conv_up1.weight"] = conv_up1_w;
tensors["conv_up1.bias"] = conv_up1_b;
tensors["conv_up2.weight"] = conv_up2_w;
tensors["conv_up2.bias"] = conv_up2_b;
tensors["conv_hr.weight"] = conv_hr_w;
tensors["conv_hr.bias"] = conv_hr_b;
tensors["conv_last.weight"] = conv_last_w;
tensors["conv_last.bias"] = conv_last_b;
}
ggml_tensor* forward(ggml_context* ctx0, ggml_tensor* out_scale, ggml_tensor* x /* feat */) {
// feat = self.conv_first(feat)
auto h = ggml_nn_conv_2d(ctx0, x, conv_first_w, conv_first_b, 1, 1, 1, 1);
auto body_h = h;
// self.body(feat)
for (int i = 0; i < num_blocks; i++) {
body_h = body_blocks[i].forward(ctx0, out_scale, body_h);
}
// body_feat = self.conv_body(self.body(feat))
body_h = ggml_nn_conv_2d(ctx0, body_h, conv_body_w, conv_body_b, 1, 1, 1, 1);
// feat = feat + body_feat
h = ggml_add(ctx0, h, body_h);
// upsample
// feat = self.lrelu(self.conv_up1(F.interpolate(feat, scale_factor=2, mode='nearest')))
h = ggml_upscale(ctx0, h, 2);
h = ggml_nn_conv_2d(ctx0, h, conv_up1_w, conv_up1_b, 1, 1, 1, 1);
h = ggml_leaky_relu(ctx0, h, 0.2f, true);
// feat = self.lrelu(self.conv_up2(F.interpolate(feat, scale_factor=2, mode='nearest')))
h = ggml_upscale(ctx0, h, 2);
h = ggml_nn_conv_2d(ctx0, h, conv_up2_w, conv_up2_b, 1, 1, 1, 1);
h = ggml_leaky_relu(ctx0, h, 0.2f, true);
// out = self.conv_last(self.lrelu(self.conv_hr(feat)))
h = ggml_nn_conv_2d(ctx0, h, conv_hr_w, conv_hr_b, 1, 1, 1, 1);
h = ggml_leaky_relu(ctx0, h, 0.2f, true);
h = ggml_nn_conv_2d(ctx0, h, conv_last_w, conv_last_b, 1, 1, 1, 1);
return h;
}
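    // `out_scale` is the residual scaling tensor threaded through the RRDB blocks;
    // build_graph below fills it with 0.2f, the factor ESRGAN uses to scale each
    // block's residual before adding it back to the block input.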
    struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
        struct ggml_cgraph* gf  = ggml_new_graph(compute_ctx);
        x                       = to_backend(x);
        struct ggml_tensor* out = rrdb_net.forward(compute_ctx, x);
        ggml_build_forward_expand(gf, out);
        return gf;
    }

    struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
        // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
        static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
        static std::vector<uint8_t> buf(buf_size);

        struct ggml_init_params params = {
            /*.mem_size   =*/buf_size,
            /*.mem_buffer =*/buf.data(),
            /*.no_alloc   =*/true,  // the tensors will be allocated later by ggml_allocr_alloc_graph()
        };

        struct ggml_context* ctx0 = ggml_init(params);
        struct ggml_cgraph* gf    = ggml_new_graph(ctx0);

        struct ggml_tensor* x_ = NULL;
        struct ggml_tensor* os = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
        ggml_allocr_alloc(compute_allocr, os);
        if (!ggml_allocr_is_measure(compute_allocr)) {
            float scale = 0.2f;
            ggml_backend_tensor_set(os, &scale, 0, sizeof(scale));
        }

        // it's performing a compute, check if backend isn't cpu
        if (!ggml_backend_is_cpu(backend)) {
            // pass input tensors to gpu memory
            x_ = ggml_dup_tensor(ctx0, x);
            ggml_allocr_alloc(compute_allocr, x_);

            // pass data to device backend
            if (!ggml_allocr_is_measure(compute_allocr)) {
                ggml_backend_tensor_set(x_, x->data, 0, ggml_nbytes(x));
            }
        } else {
            x_ = x;
        }

        struct ggml_tensor* out = forward(ctx0, os, x_);

        ggml_build_forward_expand(gf, out);
        ggml_free(ctx0);

        return gf;
    }
    void compute(const int n_threads,
                 struct ggml_tensor* x,
                 ggml_tensor** output,
                 ggml_context* output_ctx = NULL) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x);
        };
        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }

    void alloc_compute_buffer(struct ggml_tensor* x) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x);
        };
        GGMLModule::alloc_compute_buffer(get_graph);
    }

    void compute(struct ggml_tensor* work_result, const int n_threads, struct ggml_tensor* x) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x);
        };
        GGMLModule::compute(get_graph, n_threads, work_result);
    }
};
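
For orientation, a minimal usage sketch of the GGMLRunner-style API above. This is an illustration, not code from the diff: the model path and the input tensor `in` are assumptions, and the real pipeline (upscaler.cpp) additionally splits the image into tiles of tile_size before calling compute.

    // sketch only: CPU backend; `in` is a pre-filled f32 image tensor (assumption)
    std::map<std::string, enum ggml_type> tensor_types;
    ESRGAN esrgan(ggml_backend_cpu_init(), tensor_types);
    if (esrgan.load_from_file("realesrgan-x4.safetensors")) {  // example path
        ggml_tensor* out = NULL;
        esrgan.compute(4 /* n_threads */, in, &out);  // `out` is `scale` (4x) the input resolution
    }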

File diff suppressed because it is too large

View File

@ -1,88 +0,0 @@
import os
import sys
import numpy as np
import torch
from diffusers.utils import load_image
# pip install insightface==0.7.3
from insightface.app import FaceAnalysis
from insightface.data import get_image as ins_get_image
from safetensors.torch import save_file
###
# https://github.com/cubiq/ComfyUI_IPAdapter_plus/issues/165#issue-2055829543
###
class FaceAnalysis2(FaceAnalysis):
# NOTE: allows setting det_size for each detection call.
# the model allows it but the wrapping code from insightface
# doesn't show it, and people end up loading duplicate models
# for different sizes where there is absolutely no need to
def get(self, img, max_num=0, det_size=(640, 640)):
if det_size is not None:
self.det_model.input_size = det_size
return super().get(img, max_num)
def analyze_faces(face_analysis: FaceAnalysis, img_data: np.ndarray, det_size=(640, 640)):
# NOTE: try detect faces, if no faces detected, lower det_size until it does
detection_sizes = [None] + [(size, size) for size in range(640, 256, -64)] + [(256, 256)]
for size in detection_sizes:
faces = face_analysis.get(img_data, det_size=size)
if len(faces) > 0:
return faces
return []
if __name__ == "__main__":
#face_detector = FaceAnalysis2(providers=['CUDAExecutionProvider'], allowed_modules=['detection', 'recognition'])
face_detector = FaceAnalysis2(providers=['CPUExecutionProvider'], allowed_modules=['detection', 'recognition'])
face_detector.prepare(ctx_id=0, det_size=(640, 640))
#input_folder_name = './scarletthead_woman'
input_folder_name = sys.argv[1]
image_basename_list = os.listdir(input_folder_name)
image_path_list = sorted([os.path.join(input_folder_name, basename) for basename in image_basename_list])
input_id_images = []
for image_path in image_path_list:
input_id_images.append(load_image(image_path))
id_embed_list = []
for img in input_id_images:
img = np.array(img)
        img = img[:, :, ::-1]  # RGB -> BGR, the channel order insightface expects
faces = analyze_faces(face_detector, img)
if len(faces) > 0:
id_embed_list.append(torch.from_numpy((faces[0]['embedding'])))
if len(id_embed_list) == 0:
        raise ValueError("No face detected in input image pool")
id_embeds = torch.stack(id_embed_list)
# for r in id_embeds:
# print(r)
# #torch.save(id_embeds, input_folder_name+'/id_embeds.pt');
# weights = dict()
# weights["id_embeds"] = id_embeds
# save_file(weights, input_folder_name+'/id_embeds.safetensors')
binary_data = id_embeds.numpy().tobytes()
two = 4
zero = 0
one = 1
tensor_name = "id_embeds"
# Write binary data to a file
with open(input_folder_name+'/id_embeds.bin', "wb") as f:
f.write(two.to_bytes(4, byteorder='little'))
f.write((len(tensor_name)).to_bytes(4, byteorder='little'))
f.write(zero.to_bytes(4, byteorder='little'))
f.write((id_embeds.shape[1]).to_bytes(4, byteorder='little'))
f.write((id_embeds.shape[0]).to_bytes(4, byteorder='little'))
f.write(one.to_bytes(4, byteorder='little'))
f.write(one.to_bytes(4, byteorder='little'))
f.write(tensor_name.encode('ascii'))
f.write(binary_data)
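
The script writes a small custom binary layout: seven little-endian uint32 header fields (the constant 4 stored in `two`, the name length, a 0, the two tensor dims in reversed order, and two 1s), then the ASCII tensor name, then the raw float32 embedding data. A matching reader, sketched under those assumptions (little-endian host; field meaning inferred from the write order above, not from the C++ loader):

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        FILE* f = fopen("id_embeds.bin", "rb");
        if (!f) {
            return 1;
        }
        uint32_t h[7];  // {4, name_len, 0, ne1, ne0, 1, 1} in write order
        fread(h, sizeof(uint32_t), 7, f);
        std::string name(h[1], '\0');
        fread(&name[0], 1, h[1], f);
        std::vector<float> data((size_t)h[3] * h[4]);  // ne1 * ne0 float32 values
        fread(data.data(), sizeof(float), data.size(), f);
        printf("%s: %u embeddings of dim %u\n", name.c_str(), h[4], h[3]);
        fclose(f);
        return 0;
    }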

1279  flux.hpp

File diff suppressed because it is too large

2  ggml

@ -1 +1 @@
Subproject commit 9e4bee1c5afc2d677a5b32ecb90cbdb483e81fff
Subproject commit e5d3412fa2ea3de8c4a696c03dce73c470442dc1

File diff suppressed because it is too large

View File

@ -1,349 +0,0 @@
#ifndef GITS_NOISE_INL
#define GITS_NOISE_INL
const std::vector<std::vector<float>> GITS_NOISE_0_80 = {
{ 14.61464119f, 7.49001646f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 6.77309084f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 3.07277966f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 2.05039096f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 2.05039096f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.19567990f, 1.98035145f, 0.86115354f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.19567990f, 1.98035145f, 0.86115354f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.88507891f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.07277966f, 1.84880662f, 0.83188516f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_0_85 = {
{ 14.61464119f, 7.49001646f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 1.84880662f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 6.77309084f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.11996698f, 3.07277966f, 1.24153244f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.09240818f, 2.84484982f, 0.95350921f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.09240818f, 2.84484982f, 0.95350921f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.58536053f, 3.19567990f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 8.75849152f, 7.49001646f, 5.58536053f, 3.19567990f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 8.75849152f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 8.75849152f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.60512662f, 2.63833880f, 1.56271636f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.88507891f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_0_90 = {
{ 14.61464119f, 6.77309084f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 3.07277966f, 0.95350921f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.54230714f, 0.89115214f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 2.54230714f, 0.89115214f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.09240818f, 3.07277966f, 1.61558151f, 0.69515091f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.11996698f, 4.86714602f, 3.07277966f, 1.61558151f, 0.69515091f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 2.95596409f, 1.61558151f, 0.69515091f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.24153244f, 0.57119018f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.24153244f, 0.57119018f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.24153244f, 0.57119018f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.75677586f, 2.84484982f, 1.84880662f, 1.08895338f, 0.52423614f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.75677586f, 2.84484982f, 1.84880662f, 1.08895338f, 0.52423614f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.45427561f, 3.32507086f, 2.45070267f, 1.61558151f, 0.95350921f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.45427561f, 3.32507086f, 2.45070267f, 1.61558151f, 0.95350921f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.86714602f, 3.91689563f, 3.07277966f, 2.27973175f, 1.56271636f, 0.95350921f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.86714602f, 3.91689563f, 3.07277966f, 2.27973175f, 1.56271636f, 0.95350921f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.86714602f, 3.91689563f, 3.07277966f, 2.27973175f, 1.56271636f, 0.95350921f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.60512662f, 2.95596409f, 2.19988537f, 1.51179266f, 0.89115214f, 0.43325692f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_0_95 = {
{ 14.61464119f, 6.77309084f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 2.84484982f, 0.89115214f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.36326075f, 0.803307f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.95596409f, 1.56271636f, 0.64427125f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 2.95596409f, 1.56271636f, 0.64427125f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 3.07277966f, 1.91321158f, 1.08895338f, 0.50118381f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.07277966f, 1.91321158f, 1.08895338f, 0.50118381f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.07277966f, 1.91321158f, 1.08895338f, 0.50118381f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.41535246f, 0.803307f, 0.38853383f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.46139455f, 2.63833880f, 1.84880662f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.46139455f, 2.63833880f, 1.84880662f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 6.14220476f, 4.86714602f, 3.75677586f, 2.95596409f, 2.19988537f, 1.56271636f, 1.05362725f, 0.64427125f, 0.32104823f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 6.44769001f, 5.58536053f, 4.65472794f, 3.60512662f, 2.95596409f, 2.19988537f, 1.56271636f, 1.05362725f, 0.64427125f, 0.32104823f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.65472794f, 3.60512662f, 2.95596409f, 2.19988537f, 1.56271636f, 1.05362725f, 0.64427125f, 0.32104823f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.65472794f, 3.75677586f, 3.07277966f, 2.45070267f, 1.78698075f, 1.24153244f, 0.83188516f, 0.50118381f, 0.22545385f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.60512662f, 2.95596409f, 2.36326075f, 1.72759056f, 1.24153244f, 0.83188516f, 0.50118381f, 0.22545385f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.60512662f, 2.95596409f, 2.36326075f, 1.72759056f, 1.24153244f, 0.83188516f, 0.50118381f, 0.22545385f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.75677586f, 3.07277966f, 2.45070267f, 1.91321158f, 1.46270394f, 1.05362725f, 0.72133851f, 0.43325692f, 0.19894916f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_00 = {
{ 14.61464119f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 0.95350921f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 2.36326075f, 0.803307f, 0.02916753f },
{ 14.61464119f, 7.11996698f, 3.07277966f, 1.56271636f, 0.59516323f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.41535246f, 0.57119018f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.86115354f, 0.38853383f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.86115354f, 0.38853383f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 3.07277966f, 1.98035145f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.07277966f, 1.98035145f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.27973175f, 1.51179266f, 0.95350921f, 0.54755926f, 0.25053367f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.36326075f, 1.61558151f, 1.08895338f, 0.72133851f, 0.41087446f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.36326075f, 1.61558151f, 1.08895338f, 0.72133851f, 0.41087446f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.12350607f, 1.56271636f, 1.08895338f, 0.72133851f, 0.41087446f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.19988537f, 1.61558151f, 1.162866f, 0.803307f, 0.50118381f, 0.27464288f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.75677586f, 3.07277966f, 2.45070267f, 1.84880662f, 1.36964464f, 1.01931262f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 6.14220476f, 5.09240818f, 4.26497746f, 3.46139455f, 2.84484982f, 2.19988537f, 1.67050016f, 1.24153244f, 0.92192322f, 0.64427125f, 0.43325692f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 6.14220476f, 5.09240818f, 4.26497746f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.12534678f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 5.09240818f, 4.26497746f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.12534678f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.26497746f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.12534678f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_05 = {
{ 14.61464119f, 0.95350921f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 0.89115214f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 2.05039096f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 2.84484982f, 1.28281462f, 0.52423614f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.07277966f, 1.61558151f, 0.803307f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.56271636f, 0.803307f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.95350921f, 0.52423614f, 0.22545385f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 1.98035145f, 1.24153244f, 0.74807048f, 0.41087446f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.27973175f, 1.51179266f, 0.95350921f, 0.59516323f, 0.34370604f, 0.13792117f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.09240818f, 3.46139455f, 2.45070267f, 1.61558151f, 1.08895338f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.09240818f, 3.46139455f, 2.45070267f, 1.61558151f, 1.08895338f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.36326075f, 1.61558151f, 1.08895338f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.45070267f, 1.72759056f, 1.24153244f, 0.86115354f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.19988537f, 1.61558151f, 1.162866f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.19988537f, 1.67050016f, 1.28281462f, 0.95350921f, 0.72133851f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.36326075f, 1.84880662f, 1.41535246f, 1.08895338f, 0.83188516f, 0.61951244f, 0.45573691f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.20157266f, 0.95350921f, 0.74807048f, 0.57119018f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.30717278f, 7.11996698f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.20157266f, 0.95350921f, 0.74807048f, 0.57119018f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.30717278f, 7.11996698f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.45070267f, 1.98035145f, 1.61558151f, 1.32549286f, 1.08895338f, 0.86115354f, 0.69515091f, 0.54755926f, 0.41087446f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_10 = {
{ 14.61464119f, 0.89115214f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 1.61558151f, 0.57119018f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 2.45070267f, 1.08895338f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 2.95596409f, 1.56271636f, 0.803307f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.07277966f, 1.61558151f, 0.89115214f, 0.4783645f, 0.19894916f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.07277966f, 1.84880662f, 1.08895338f, 0.64427125f, 0.34370604f, 0.13792117f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.95350921f, 0.54755926f, 0.27464288f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.95596409f, 1.91321158f, 1.24153244f, 0.803307f, 0.4783645f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.05039096f, 1.41535246f, 0.95350921f, 0.64427125f, 0.41087446f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.27973175f, 1.61558151f, 1.12534678f, 0.803307f, 0.54755926f, 0.36617002f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.32507086f, 2.45070267f, 1.72759056f, 1.24153244f, 0.89115214f, 0.64427125f, 0.45573691f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.09240818f, 3.60512662f, 2.84484982f, 2.05039096f, 1.51179266f, 1.08895338f, 0.803307f, 0.59516323f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.09240818f, 3.60512662f, 2.84484982f, 2.12350607f, 1.61558151f, 1.24153244f, 0.95350921f, 0.72133851f, 0.54755926f, 0.41087446f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.45070267f, 1.84880662f, 1.41535246f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.45070267f, 1.91321158f, 1.51179266f, 1.20157266f, 0.95350921f, 0.74807048f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.85520077f, 4.45427561f, 3.46139455f, 2.84484982f, 2.19988537f, 1.72759056f, 1.36964464f, 1.08895338f, 0.86115354f, 0.69515091f, 0.54755926f, 0.43325692f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.46139455f, 2.84484982f, 2.19988537f, 1.72759056f, 1.36964464f, 1.08895338f, 0.86115354f, 0.69515091f, 0.54755926f, 0.43325692f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.46139455f, 2.84484982f, 2.19988537f, 1.72759056f, 1.36964464f, 1.08895338f, 0.89115214f, 0.72133851f, 0.59516323f, 0.4783645f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_15 = {
{ 14.61464119f, 0.83188516f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.59516323f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 1.56271636f, 0.52423614f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 1.91321158f, 0.83188516f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.24153244f, 0.59516323f, 0.25053367f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.51179266f, 0.803307f, 0.41087446f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.56271636f, 0.89115214f, 0.50118381f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.07277966f, 1.84880662f, 1.12534678f, 0.72133851f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.07277966f, 1.91321158f, 1.24153244f, 0.803307f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.95596409f, 1.91321158f, 1.24153244f, 0.803307f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.05039096f, 1.36964464f, 0.95350921f, 0.69515091f, 0.4783645f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.803307f, 0.59516323f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.803307f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.19988537f, 1.61558151f, 1.24153244f, 0.95350921f, 0.74807048f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.78698075f, 1.32549286f, 1.01931262f, 0.803307f, 0.64427125f, 0.50118381f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.78698075f, 1.32549286f, 1.01931262f, 0.803307f, 0.64427125f, 0.52423614f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.84880662f, 1.41535246f, 1.12534678f, 0.89115214f, 0.72133851f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.84880662f, 1.41535246f, 1.12534678f, 0.89115214f, 0.72133851f, 0.59516323f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_20 = {
{ 14.61464119f, 0.803307f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.52423614f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 0.92192322f, 0.36617002f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.24153244f, 0.59516323f, 0.25053367f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.05039096f, 0.95350921f, 0.45573691f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.24153244f, 0.64427125f, 0.29807833f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.36964464f, 0.803307f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 0.95350921f, 0.59516323f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.67050016f, 1.08895338f, 0.74807048f, 0.50118381f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.84880662f, 1.24153244f, 0.83188516f, 0.59516323f, 0.41087446f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 3.07277966f, 1.98035145f, 1.36964464f, 0.95350921f, 0.69515091f, 0.50118381f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.46139455f, 2.36326075f, 1.56271636f, 1.08895338f, 0.803307f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.46139455f, 2.45070267f, 1.61558151f, 1.162866f, 0.86115354f, 0.64427125f, 0.50118381f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.41087446f, 0.34370604f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.19988537f, 1.61558151f, 1.20157266f, 0.92192322f, 0.72133851f, 0.57119018f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.19988537f, 1.61558151f, 1.24153244f, 0.95350921f, 0.74807048f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.19988537f, 1.61558151f, 1.24153244f, 0.95350921f, 0.74807048f, 0.59516323f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_25 = {
{ 14.61464119f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.50118381f, 0.02916753f },
{ 14.61464119f, 2.05039096f, 0.803307f, 0.32104823f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 0.95350921f, 0.43325692f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.24153244f, 0.59516323f, 0.27464288f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.51179266f, 0.803307f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.36326075f, 1.24153244f, 0.72133851f, 0.41087446f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.36964464f, 0.83188516f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 0.98595673f, 0.64427125f, 0.43325692f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.67050016f, 1.08895338f, 0.74807048f, 0.52423614f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.72759056f, 1.162866f, 0.803307f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.84880662f, 1.24153244f, 0.86115354f, 0.64427125f, 0.4783645f, 0.36617002f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.84880662f, 1.28281462f, 0.92192322f, 0.69515091f, 0.52423614f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.91321158f, 1.32549286f, 0.95350921f, 0.72133851f, 0.54755926f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.91321158f, 1.32549286f, 0.95350921f, 0.72133851f, 0.57119018f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.91321158f, 1.32549286f, 0.95350921f, 0.74807048f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 3.07277966f, 2.05039096f, 1.41535246f, 1.05362725f, 0.803307f, 0.61951244f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 3.07277966f, 2.05039096f, 1.41535246f, 1.05362725f, 0.803307f, 0.64427125f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 3.07277966f, 2.05039096f, 1.46270394f, 1.08895338f, 0.83188516f, 0.66947293f, 0.54755926f, 0.45573691f, 0.38853383f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_30 = {
{ 14.61464119f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 1.24153244f, 0.43325692f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.59516323f, 0.22545385f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.803307f, 0.36617002f, 0.13792117f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 1.01931262f, 0.52423614f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.36964464f, 0.74807048f, 0.41087446f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.56271636f, 0.89115214f, 0.54755926f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.61558151f, 0.95350921f, 0.61951244f, 0.41087446f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.36964464f, 0.83188516f, 0.54755926f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.41535246f, 0.92192322f, 0.64427125f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.6383388f, 1.56271636f, 1.01931262f, 0.72133851f, 0.50118381f, 0.36617002f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.05362725f, 0.74807048f, 0.54755926f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.08895338f, 0.77538133f, 0.57119018f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.59516323f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.72759056f, 1.162866f, 0.83188516f, 0.64427125f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.72759056f, 1.162866f, 0.83188516f, 0.64427125f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.78698075f, 1.24153244f, 0.92192322f, 0.72133851f, 0.57119018f, 0.45573691f, 0.38853383f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.78698075f, 1.24153244f, 0.92192322f, 0.72133851f, 0.57119018f, 0.4783645f, 0.41087446f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_35 = {
{ 14.61464119f, 0.69515091f, 0.02916753f },
{ 14.61464119f, 0.95350921f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.57119018f, 0.19894916f, 0.02916753f },
{ 14.61464119f, 1.61558151f, 0.69515091f, 0.29807833f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.83188516f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.162866f, 0.64427125f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.36964464f, 0.803307f, 0.50118381f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.41535246f, 0.83188516f, 0.54755926f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 0.95350921f, 0.64427125f, 0.45573691f, 0.32104823f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 0.95350921f, 0.64427125f, 0.45573691f, 0.34370604f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.61558151f, 1.01931262f, 0.72133851f, 0.52423614f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.61558151f, 1.01931262f, 0.72133851f, 0.52423614f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.61558151f, 1.05362725f, 0.74807048f, 0.54755926f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.72759056f, 1.12534678f, 0.803307f, 0.59516323f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.72759056f, 1.12534678f, 0.803307f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.51179266f, 1.01931262f, 0.74807048f, 0.57119018f, 0.45573691f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.6383388f, 1.61558151f, 1.08895338f, 0.803307f, 0.61951244f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.6383388f, 1.61558151f, 1.08895338f, 0.803307f, 0.64427125f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.6383388f, 1.61558151f, 1.08895338f, 0.803307f, 0.64427125f, 0.52423614f, 0.45573691f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_40 = {
{ 14.61464119f, 0.59516323f, 0.02916753f },
{ 14.61464119f, 0.95350921f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 1.08895338f, 0.43325692f, 0.13792117f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.64427125f, 0.27464288f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.61558151f, 0.803307f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.05039096f, 0.95350921f, 0.54755926f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.24153244f, 0.72133851f, 0.43325692f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.24153244f, 0.74807048f, 0.50118381f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.52423614f, 0.36617002f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.54755926f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.41535246f, 0.86115354f, 0.59516323f, 0.43325692f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.64427125f, 0.45573691f, 0.34370604f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.64427125f, 0.4783645f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 0.98595673f, 0.69515091f, 0.52423614f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 1.01931262f, 0.72133851f, 0.54755926f, 0.43325692f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.61558151f, 1.05362725f, 0.74807048f, 0.57119018f, 0.45573691f, 0.38853383f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.61951244f, 0.50118381f, 0.41087446f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.61951244f, 0.50118381f, 0.43325692f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.64427125f, 0.52423614f, 0.45573691f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_45 = {
{ 14.61464119f, 0.59516323f, 0.02916753f },
{ 14.61464119f, 0.803307f, 0.25053367f, 0.02916753f },
{ 14.61464119f, 0.95350921f, 0.34370604f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.24153244f, 0.54755926f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.72133851f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.61558151f, 0.803307f, 0.45573691f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.91321158f, 0.95350921f, 0.57119018f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.19988537f, 1.08895338f, 0.64427125f, 0.41087446f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.24153244f, 0.74807048f, 0.50118381f, 0.34370604f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.24153244f, 0.74807048f, 0.50118381f, 0.36617002f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.54755926f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.57119018f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.83188516f, 0.59516323f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.83188516f, 0.59516323f, 0.45573691f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.69515091f, 0.52423614f, 0.41087446f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.69515091f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 0.98595673f, 0.72133851f, 0.54755926f, 0.45573691f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 1.01931262f, 0.74807048f, 0.57119018f, 0.4783645f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 1.01931262f, 0.74807048f, 0.59516323f, 0.50118381f, 0.43325692f, 0.38853383f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_50 = {
{ 14.61464119f, 0.54755926f, 0.02916753f },
{ 14.61464119f, 0.803307f, 0.25053367f, 0.02916753f },
{ 14.61464119f, 0.86115354f, 0.32104823f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.24153244f, 0.54755926f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.72133851f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.61558151f, 0.803307f, 0.45573691f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.61558151f, 0.83188516f, 0.52423614f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.95350921f, 0.59516323f, 0.38853383f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.95350921f, 0.59516323f, 0.41087446f, 0.29807833f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.95350921f, 0.61951244f, 0.43325692f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.19988537f, 1.12534678f, 0.72133851f, 0.50118381f, 0.36617002f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.19988537f, 1.12534678f, 0.72133851f, 0.50118381f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.57119018f, 0.43325692f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.57119018f, 0.43325692f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.59516323f, 0.45573691f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.59516323f, 0.45573691f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.32549286f, 0.86115354f, 0.64427125f, 0.50118381f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.36964464f, 0.92192322f, 0.69515091f, 0.54755926f, 0.45573691f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.41535246f, 0.95350921f, 0.72133851f, 0.57119018f, 0.4783645f, 0.43325692f, 0.38853383f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<const std::vector<std::vector<float>>*> GITS_NOISE = {
&GITS_NOISE_0_80,
&GITS_NOISE_0_85,
&GITS_NOISE_0_90,
&GITS_NOISE_0_95,
&GITS_NOISE_1_00,
&GITS_NOISE_1_05,
&GITS_NOISE_1_10,
&GITS_NOISE_1_15,
&GITS_NOISE_1_20,
&GITS_NOISE_1_25,
&GITS_NOISE_1_30,
&GITS_NOISE_1_35,
&GITS_NOISE_1_40,
&GITS_NOISE_1_45,
&GITS_NOISE_1_50
};
#endif // GITS_NOISE_INL
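
The tables above are precomputed GITS sigma schedules: one table per coefficient from 0.80 to 1.50 in steps of 0.05 (the GITS_NOISE index), and within each table one row per step count, with the first row covering two steps and every row ending at the same final sigma. A lookup helper sketched on that layout (the row indexing is inferred from the tables themselves, not taken from the sampler code):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // sketch: select the precomputed sigma schedule for a coefficient/step count
    static std::vector<float> gits_sigmas(float coeff, int steps) {
        int table = (int)std::lround((coeff - 0.80f) / 0.05f);  // 0.80 -> 0, ..., 1.50 -> 14
        table     = std::min(std::max(table, 0), (int)GITS_NOISE.size() - 1);
        const std::vector<std::vector<float>>& t = *GITS_NOISE[table];
        int row = std::min(std::max(steps - 2, 0), (int)t.size() - 1);  // first row = 2 steps
        return t[row];
    }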

905  lora.hpp
View File

@ -3,117 +3,35 @@
#include "ggml_extend.hpp"
#define LORA_GRAPH_BASE_SIZE 10240
struct LoraModel : public GGMLRunner {
enum lora_t {
REGULAR = 0,
DIFFUSERS = 1,
DIFFUSERS_2 = 2,
DIFFUSERS_3 = 3,
TRANSFORMERS = 4,
LORA_TYPE_COUNT
};
const std::string lora_ups[LORA_TYPE_COUNT] = {
".lora_up",
"_lora.up",
".lora_B",
".lora.up",
".lora_linear_layer.up",
};
const std::string lora_downs[LORA_TYPE_COUNT] = {
".lora_down",
"_lora.down",
".lora_A",
".lora.down",
".lora_linear_layer.down",
};
const std::string lora_pre[LORA_TYPE_COUNT] = {
"lora.",
"",
"",
"",
"",
};
const std::map<std::string, std::string> alt_names = {
// mmdit
{"final_layer.adaLN_modulation.1", "norm_out.linear"},
{"pos_embed", "pos_embed.proj"},
{"final_layer.linear", "proj_out"},
{"y_embedder.mlp.0", "time_text_embed.text_embedder.linear_1"},
{"y_embedder.mlp.2", "time_text_embed.text_embedder.linear_2"},
{"t_embedder.mlp.0", "time_text_embed.timestep_embedder.linear_1"},
{"t_embedder.mlp.2", "time_text_embed.timestep_embedder.linear_2"},
{"x_block.mlp.fc1", "ff.net.0.proj"},
{"x_block.mlp.fc2", "ff.net.2"},
{"context_block.mlp.fc1", "ff_context.net.0.proj"},
{"context_block.mlp.fc2", "ff_context.net.2"},
{"x_block.adaLN_modulation.1", "norm1.linear"},
{"context_block.adaLN_modulation.1", "norm1_context.linear"},
{"context_block.attn.proj", "attn.to_add_out"},
{"x_block.attn.proj", "attn.to_out.0"},
{"x_block.attn2.proj", "attn2.to_out.0"},
// flux
// singlestream
{"linear2", "proj_out"},
{"modulation.lin", "norm.linear"},
// doublestream
{"txt_attn.proj", "attn.to_add_out"},
{"img_attn.proj", "attn.to_out.0"},
{"txt_mlp.0", "ff_context.net.0.proj"},
{"txt_mlp.2", "ff_context.net.2"},
{"img_mlp.0", "ff.net.0.proj"},
{"img_mlp.2", "ff.net.2"},
{"txt_mod.lin", "norm1_context.linear"},
{"img_mod.lin", "norm1.linear"},
};
const std::map<std::string, std::string> qkv_prefixes = {
// mmdit
{"context_block.attn.qkv", "attn.add_"}, // suffix "_proj"
{"x_block.attn.qkv", "attn.to_"},
{"x_block.attn2.qkv", "attn2.to_"},
// flux
// doublestream
{"txt_attn.qkv", "attn.add_"}, // suffix "_proj"
{"img_attn.qkv", "attn.to_"},
};
const std::map<std::string, std::string> qkvm_prefixes = {
// flux
// singlestream
{"linear1", ""},
};
const std::string* type_fingerprints = lora_ups;
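    // Illustration (hypothetical example derived from the tables above): for a
    // REGULAR checkpoint, the LoRA weights matching a model tensor
    // "model.diffusion_model.xxx.weight" would be named roughly
    //   lora_pre[REGULAR] + base + lora_ups[REGULAR] + ".weight"    -> "lora.model.diffusion_model.xxx.lora_up.weight"
    //   lora_pre[REGULAR] + base + lora_downs[REGULAR] + ".weight"  -> "lora.model.diffusion_model.xxx.lora_down.weight"
    // type_fingerprints is scanned against tensor names in load_from_file below
    // to decide which of these naming schemes a given file uses.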
#define LORA_GRAPH_SIZE 10240
struct LoraModel : public GGMLModule {
float multiplier = 1.0f;
std::map<std::string, struct ggml_tensor*> lora_tensors;
std::string file_path;
ModelLoader model_loader;
    bool load_failed = false;
    bool applied     = false;
    std::vector<int> zero_index_vec = {0};
    ggml_tensor* zero_index         = NULL;
    enum lora_t type                = REGULAR;
    LoraModel(ggml_backend_t backend,
              const std::string& file_path = "",
              const std::string prefix     = "")
        : file_path(file_path), GGMLRunner(backend) {
        if (!model_loader.init_from_file(file_path, prefix)) {
            load_failed = true;
        }
    }

    LoraModel(const std::string file_path = "")
        : file_path(file_path) {
        name = "lora";
        if (!model_loader.init_from_file(file_path)) {
            load_failed = true;
        }
    }
std::string get_desc() {
return "lora";
}
size_t get_num_tensors() {
return LORA_GRAPH_SIZE;
}
size_t calculate_mem_size() {
return model_loader.cal_mem_size(NULL);
}
bool load_from_file(bool filter_tensor = false) {
bool load_from_file(ggml_backend_t backend) {
if (!alloc_params_buffer(backend)) {
return false;
}
LOG_INFO("loading LoRA from '%s'", file_path.c_str());
if (load_failed) {
@ -121,726 +39,147 @@ struct LoraModel : public GGMLRunner {
return false;
}
bool dry_run = true;
ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
if (filter_tensor && !contains(name, "lora")) {
// LOG_INFO("skipping LoRA tesnor '%s'", name.c_str());
return true;
}
// LOG_INFO("%s", name.c_str());
for (int i = 0; i < LORA_TYPE_COUNT; i++) {
if (name.find(type_fingerprints[i]) != std::string::npos) {
type = (lora_t)i;
break;
}
}
struct ggml_tensor* real = ggml_new_tensor(params_ctx, tensor_storage.type, tensor_storage.n_dims, tensor_storage.ne);
ggml_allocr_alloc(alloc, real);
*dst_tensor = real;
lora_tensors[name] = real;
if (dry_run) {
struct ggml_tensor* real = ggml_new_tensor(params_ctx,
tensor_storage.type,
tensor_storage.n_dims,
tensor_storage.ne);
lora_tensors[name] = real;
} else {
auto real = lora_tensors[name];
*dst_tensor = real;
}
return true;
};
model_loader.load_tensors(on_new_tensor_cb, backend);
alloc_params_buffer();
// exit(0);
dry_run = false;
model_loader.load_tensors(on_new_tensor_cb, backend);
LOG_DEBUG("lora type: \"%s\"/\"%s\"", lora_downs[type].c_str(), lora_ups[type].c_str());
LOG_DEBUG("finished loaded lora");
ggml_allocr_free(alloc);
return true;
}
ggml_tensor* to_f32(ggml_context* ctx, ggml_tensor* a) {
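// ggml_get_rows materializes its output in f32, so gathering the whole
// tensor with a single zero row index acts as an in-graph cast to f32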
auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a));
out = ggml_get_rows(ctx, out, zero_index);
out = ggml_reshape(ctx, out, a);
return out;
}
struct ggml_cgraph* build_graph(std::map<std::string, struct ggml_tensor*> model_tensors) {
// make a graph to compute all lora, expected lora and models tensors are in the same backend
// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead() * LORA_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params = {
/*.mem_size =*/buf_size,
/*.mem_buffer =*/buf.data(),
/*.no_alloc =*/true,  // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
// LOG_DEBUG("mem_size %u ", params.mem_size);
std::vector<std::string> to_lora_keys(std::string blk_name, SDVersion version) {
std::vector<std::string> keys;
// if (!sd_version_is_sd3(version) || blk_name != "model.diffusion_model.pos_embed") {
size_t k_pos = blk_name.find(".weight");
if (k_pos == std::string::npos) {
return keys;
}
blk_name = blk_name.substr(0, k_pos);
// }
keys.push_back(blk_name);
keys.push_back("lora." + blk_name);
if (sd_version_is_dit(version)) {
if (blk_name.find("model.diffusion_model") != std::string::npos) {
blk_name.replace(blk_name.find("model.diffusion_model"), sizeof("model.diffusion_model") - 1, "transformer");
}
if (blk_name.find(".single_blocks") != std::string::npos) {
blk_name.replace(blk_name.find(".single_blocks"), sizeof(".single_blocks") - 1, ".single_transformer_blocks");
}
if (blk_name.find(".double_blocks") != std::string::npos) {
blk_name.replace(blk_name.find(".double_blocks"), sizeof(".double_blocks") - 1, ".transformer_blocks");
}
if (blk_name.find(".joint_blocks") != std::string::npos) {
blk_name.replace(blk_name.find(".joint_blocks"), sizeof(".joint_blocks") - 1, ".transformer_blocks");
}
if (blk_name.find("text_encoders.clip_l") != std::string::npos) {
blk_name.replace(blk_name.find("text_encoders.clip_l"), sizeof("text_encoders.clip_l") - 1, "cond_stage_model");
}
for (const auto& item : alt_names) {
size_t match = blk_name.find(item.first);
if (match != std::string::npos) {
blk_name = blk_name.substr(0, match) + item.second;
}
}
for (const auto& prefix : qkv_prefixes) {
size_t match = blk_name.find(prefix.first);
if (match != std::string::npos) {
std::string split_blk = "SPLIT|" + blk_name.substr(0, match) + prefix.second;
keys.push_back(split_blk);
}
}
for (const auto& prefix : qkvm_prefixes) {
size_t match = blk_name.find(prefix.first);
if (match != std::string::npos) {
std::string split_blk = "SPLIT_L|" + blk_name.substr(0, match) + prefix.second;
keys.push_back(split_blk);
}
}
keys.push_back(blk_name);
}
std::vector<std::string> ret;
for (std::string& key : keys) {
ret.push_back(key);
replace_all_chars(key, '.', '_');
// fix for some sdxl lora, like lcm-lora-xl
if (key == "model_diffusion_model_output_blocks_2_2_conv") {
ret.push_back("model_diffusion_model_output_blocks_2_1_conv");
}
ret.push_back(key);
}
return ret;
}
struct ggml_cgraph* build_lora_graph(std::map<std::string, struct ggml_tensor*> model_tensors, SDVersion version) {
size_t lora_graph_size = LORA_GRAPH_BASE_SIZE + lora_tensors.size() * 10;
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, lora_graph_size, false);
zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
set_backend_tensor_data(zero_index, zero_index_vec.data());
ggml_build_forward_expand(gf, zero_index);
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_cgraph* gf = ggml_new_graph_custom(ctx0, LORA_GRAPH_SIZE, false);
std::set<std::string> applied_lora_tensors;
for (auto it : model_tensors) {
std::string k_tensor = it.first;
struct ggml_tensor* weight = model_tensors[it.first];
std::vector<std::string> keys = to_lora_keys(k_tensor, version);
if (keys.size() == 0)
continue;
size_t k_pos = k_tensor.find(".weight");
if (k_pos == std::string::npos) {
continue;
}
for (auto& key : keys) {
bool is_qkv_split = starts_with(key, "SPLIT|");
if (is_qkv_split) {
key = key.substr(sizeof("SPLIT|") - 1);
}
bool is_qkvm_split = starts_with(key, "SPLIT_L|");
if (is_qkvm_split) {
key = key.substr(sizeof("SPLIT_L|") - 1);
}
struct ggml_tensor* updown = NULL;
float scale_value = 1.0f;
std::string fk = lora_pre[type] + key;
if (lora_tensors.find(fk + ".hada_w1_a") != lora_tensors.end()) {
// LoHa mode
// TODO: split qkv convention for LoHas (is it ever used?)
if (is_qkv_split || is_qkvm_split) {
LOG_ERROR("Split qkv isn't supported for LoHa models.");
break;
}
std::string alpha_name = "";
ggml_tensor* hada_1_mid = NULL; // tau for tucker decomposition
ggml_tensor* hada_1_up = NULL;
ggml_tensor* hada_1_down = NULL;
ggml_tensor* hada_2_mid = NULL; // tau for tucker decomposition
ggml_tensor* hada_2_up = NULL;
ggml_tensor* hada_2_down = NULL;
std::string hada_1_mid_name = "";
std::string hada_1_down_name = "";
std::string hada_1_up_name = "";
std::string hada_2_mid_name = "";
std::string hada_2_down_name = "";
std::string hada_2_up_name = "";
hada_1_down_name = fk + ".hada_w1_b";
hada_1_up_name = fk + ".hada_w1_a";
hada_1_mid_name = fk + ".hada_t1";
if (lora_tensors.find(hada_1_down_name) != lora_tensors.end()) {
hada_1_down = to_f32(compute_ctx, lora_tensors[hada_1_down_name]);
}
if (lora_tensors.find(hada_1_up_name) != lora_tensors.end()) {
hada_1_up = to_f32(compute_ctx, lora_tensors[hada_1_up_name]);
}
if (lora_tensors.find(hada_1_mid_name) != lora_tensors.end()) {
hada_1_mid = to_f32(compute_ctx, lora_tensors[hada_1_mid_name]);
applied_lora_tensors.insert(hada_1_mid_name);
hada_1_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_1_up));
}
hada_2_down_name = fk + ".hada_w2_b";
hada_2_up_name = fk + ".hada_w2_a";
hada_2_mid_name = fk + ".hada_t2";
if (lora_tensors.find(hada_2_down_name) != lora_tensors.end()) {
hada_2_down = to_f32(compute_ctx, lora_tensors[hada_2_down_name]);
}
if (lora_tensors.find(hada_2_up_name) != lora_tensors.end()) {
hada_2_up = to_f32(compute_ctx, lora_tensors[hada_2_up_name]);
}
if (lora_tensors.find(hada_2_mid_name) != lora_tensors.end()) {
hada_2_mid = to_f32(compute_ctx, lora_tensors[hada_2_mid_name]);
applied_lora_tensors.insert(hada_2_mid_name);
hada_2_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_2_up));
}
alpha_name = fk + ".alpha";
applied_lora_tensors.insert(hada_1_down_name);
applied_lora_tensors.insert(hada_1_up_name);
applied_lora_tensors.insert(hada_2_down_name);
applied_lora_tensors.insert(hada_2_up_name);
applied_lora_tensors.insert(alpha_name);
if (hada_1_up == NULL || hada_1_down == NULL || hada_2_up == NULL || hada_2_down == NULL) {
continue;
}
struct ggml_tensor* updown_1 = ggml_merge_lora(compute_ctx, hada_1_down, hada_1_up, hada_1_mid);
struct ggml_tensor* updown_2 = ggml_merge_lora(compute_ctx, hada_2_down, hada_2_up, hada_2_mid);
updown = ggml_mul_inplace(compute_ctx, updown_1, updown_2);
// calc_scale
// TODO: .dora_scale?
int64_t rank = hada_1_down->ne[ggml_n_dims(hada_1_down) - 1];
if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
} else if (lora_tensors.find(fk + ".lokr_w1") != lora_tensors.end() || lora_tensors.find(fk + ".lokr_w1_a") != lora_tensors.end()) {
// LoKr mode
// TODO: split qkv convention for LoKrs (is it ever used?)
if (is_qkv_split || is_qkvm_split) {
LOG_ERROR("Split qkv isn't supported for LoKr models.");
break;
}
std::string alpha_name = fk + ".alpha";
ggml_tensor* lokr_w1 = NULL;
ggml_tensor* lokr_w2 = NULL;
std::string lokr_w1_name = "";
std::string lokr_w2_name = "";
lokr_w1_name = fk + ".lokr_w1";
lokr_w2_name = fk + ".lokr_w2";
if (lora_tensors.find(lokr_w1_name) != lora_tensors.end()) {
lokr_w1 = to_f32(compute_ctx, lora_tensors[lokr_w1_name]);
applied_lora_tensors.insert(lokr_w1_name);
} else {
ggml_tensor* down = NULL;
ggml_tensor* up = NULL;
std::string down_name = lokr_w1_name + "_b";
std::string up_name = lokr_w1_name + "_a";
if (lora_tensors.find(down_name) != lora_tensors.end()) {
// w1 should not be low rank normally, sometimes w1 and w2 are swapped
down = to_f32(compute_ctx, lora_tensors[down_name]);
applied_lora_tensors.insert(down_name);
int64_t rank = down->ne[ggml_n_dims(down) - 1];
if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
}
if (lora_tensors.find(up_name) != lora_tensors.end()) {
up = to_f32(compute_ctx, lora_tensors[up_name]);
applied_lora_tensors.insert(up_name);
}
lokr_w1 = ggml_merge_lora(compute_ctx, down, up);
}
if (lora_tensors.find(lokr_w2_name) != lora_tensors.end()) {
lokr_w2 = to_f32(compute_ctx, lora_tensors[lokr_w2_name]);
applied_lora_tensors.insert(lokr_w2_name);
} else {
ggml_tensor* down = NULL;
ggml_tensor* up = NULL;
std::string down_name = lokr_w2_name + "_b";
std::string up_name = lokr_w2_name + "_a";
if (lora_tensors.find(down_name) != lora_tensors.end()) {
down = to_f32(compute_ctx, lora_tensors[down_name]);
applied_lora_tensors.insert(down_name);
int64_t rank = down->ne[ggml_n_dims(down) - 1];
if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
}
if (lora_tensors.find(up_name) != lora_tensors.end()) {
up = to_f32(compute_ctx, lora_tensors[up_name]);
applied_lora_tensors.insert(up_name);
}
lokr_w2 = ggml_merge_lora(compute_ctx, down, up);
}
// Technically it might be unused, but I believe it's the expected behavior
applied_lora_tensors.insert(alpha_name);
updown = ggml_kronecker(compute_ctx, lokr_w1, lokr_w2);
} else {
// LoRA mode
ggml_tensor* lora_mid = NULL; // tau for tucker decomposition
ggml_tensor* lora_up = NULL;
ggml_tensor* lora_down = NULL;
std::string alpha_name = "";
std::string scale_name = "";
std::string split_q_scale_name = "";
std::string lora_mid_name = "";
std::string lora_down_name = "";
std::string lora_up_name = "";
if (is_qkv_split) {
std::string suffix = "";
auto split_q_d_name = fk + "q" + suffix + lora_downs[type] + ".weight";
if (lora_tensors.find(split_q_d_name) == lora_tensors.end()) {
suffix = "_proj";
split_q_d_name = fk + "q" + suffix + lora_downs[type] + ".weight";
}
if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
// print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1]
// find qkv and mlp up parts in LoRA model
auto split_k_d_name = fk + "k" + suffix + lora_downs[type] + ".weight";
auto split_v_d_name = fk + "v" + suffix + lora_downs[type] + ".weight";
auto split_q_u_name = fk + "q" + suffix + lora_ups[type] + ".weight";
auto split_k_u_name = fk + "k" + suffix + lora_ups[type] + ".weight";
auto split_v_u_name = fk + "v" + suffix + lora_ups[type] + ".weight";
auto split_q_scale_name = fk + "q" + suffix + ".scale";
auto split_k_scale_name = fk + "k" + suffix + ".scale";
auto split_v_scale_name = fk + "v" + suffix + ".scale";
auto split_q_alpha_name = fk + "q" + suffix + ".alpha";
auto split_k_alpha_name = fk + "k" + suffix + ".alpha";
auto split_v_alpha_name = fk + "v" + suffix + ".alpha";
ggml_tensor* lora_q_down = NULL;
ggml_tensor* lora_q_up = NULL;
ggml_tensor* lora_k_down = NULL;
ggml_tensor* lora_k_up = NULL;
ggml_tensor* lora_v_down = NULL;
ggml_tensor* lora_v_up = NULL;
lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]);
if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) {
lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]);
}
if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) {
lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]);
}
if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) {
lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]);
}
if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) {
lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]);
}
if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) {
lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]);
}
float q_rank = lora_q_up->ne[0];
float k_rank = lora_k_up->ne[0];
float v_rank = lora_v_up->ne[0];
float lora_q_scale = 1;
float lora_k_scale = 1;
float lora_v_scale = 1;
if (lora_tensors.find(split_q_scale_name) != lora_tensors.end()) {
lora_q_scale = ggml_backend_tensor_get_f32(lora_tensors[split_q_scale_name]);
applied_lora_tensors.insert(split_q_scale_name);
}
if (lora_tensors.find(split_k_scale_name) != lora_tensors.end()) {
lora_k_scale = ggml_backend_tensor_get_f32(lora_tensors[split_k_scale_name]);
applied_lora_tensors.insert(split_k_scale_name);
}
if (lora_tensors.find(split_v_scale_name) != lora_tensors.end()) {
lora_v_scale = ggml_backend_tensor_get_f32(lora_tensors[split_v_scale_name]);
applied_lora_tensors.insert(split_v_scale_name);
}
if (lora_tensors.find(split_q_alpha_name) != lora_tensors.end()) {
float lora_q_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_q_alpha_name]);
applied_lora_tensors.insert(split_q_alpha_name);
lora_q_scale = lora_q_alpha / q_rank;
}
if (lora_tensors.find(split_k_alpha_name) != lora_tensors.end()) {
float lora_k_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_k_alpha_name]);
applied_lora_tensors.insert(split_k_alpha_name);
lora_k_scale = lora_k_alpha / k_rank;
}
if (lora_tensors.find(split_v_alpha_name) != lora_tensors.end()) {
float lora_v_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_v_alpha_name]);
applied_lora_tensors.insert(split_v_alpha_name);
lora_v_scale = lora_v_alpha / v_rank;
}
ggml_scale_inplace(compute_ctx, lora_q_down, lora_q_scale);
ggml_scale_inplace(compute_ctx, lora_k_down, lora_k_scale);
ggml_scale_inplace(compute_ctx, lora_v_down, lora_v_scale);
// print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_q_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_k_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_v_up, true); //[R, 3072, 1, 1]
// these need to be stitched together this way:
// |q_up,0 ,0 |
// |0 ,k_up,0 |
// |0 ,0 ,v_up|
// (q_down,k_down,v_down) . (q ,k ,v)
// up_concat will be [9216, R*3, 1, 1]
// down_concat will be [R*3, 3072, 1, 1]
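// padding each up matrix with zero blocks before concatenation builds a
// block-diagonal up matrix, so the q/k/v deltas stay independent while the
// whole update is applied to the fused qkv weight in a single matmul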
ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), lora_v_down, 1);
ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up);
z = ggml_scale(compute_ctx, z, 0);  // ggml_scale is not in-place: keep the returned node so z is actually zeroed in the graph
ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1);
ggml_tensor* q_up = ggml_concat(compute_ctx, lora_q_up, zz, 1);
ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), z, 1);
ggml_tensor* v_up = ggml_concat(compute_ctx, zz, lora_v_up, 1);
// print_ggml_tensor(q_up, true); //[R, 9216, 1, 1]
// print_ggml_tensor(k_up, true); //[R, 9216, 1, 1]
// print_ggml_tensor(v_up, true); //[R, 9216, 1, 1]
ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), v_up, 0);
// print_ggml_tensor(lora_up_concat, true); //[R*3, 9216, 1, 1]
lora_down = ggml_cont(compute_ctx, lora_down_concat);
lora_up = ggml_cont(compute_ctx, lora_up_concat);
applied_lora_tensors.insert(split_q_u_name);
applied_lora_tensors.insert(split_k_u_name);
applied_lora_tensors.insert(split_v_u_name);
applied_lora_tensors.insert(split_q_d_name);
applied_lora_tensors.insert(split_k_d_name);
applied_lora_tensors.insert(split_v_d_name);
}
} else if (is_qkvm_split) {
auto split_q_d_name = fk + "attn.to_q" + lora_downs[type] + ".weight";
if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
// print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1]
// find qkv and mlp up parts in LoRA model
auto split_k_d_name = fk + "attn.to_k" + lora_downs[type] + ".weight";
auto split_v_d_name = fk + "attn.to_v" + lora_downs[type] + ".weight";
auto split_q_u_name = fk + "attn.to_q" + lora_ups[type] + ".weight";
auto split_k_u_name = fk + "attn.to_k" + lora_ups[type] + ".weight";
auto split_v_u_name = fk + "attn.to_v" + lora_ups[type] + ".weight";
auto split_m_d_name = fk + "proj_mlp" + lora_downs[type] + ".weight";
auto split_m_u_name = fk + "proj_mlp" + lora_ups[type] + ".weight";
auto split_q_scale_name = fk + "attn.to_q" + ".scale";
auto split_k_scale_name = fk + "attn.to_k" + ".scale";
auto split_v_scale_name = fk + "attn.to_v" + ".scale";
auto split_m_scale_name = fk + "proj_mlp" + ".scale";
auto split_q_alpha_name = fk + "attn.to_q" + ".alpha";
auto split_k_alpha_name = fk + "attn.to_k" + ".alpha";
auto split_v_alpha_name = fk + "attn.to_v" + ".alpha";
auto split_m_alpha_name = fk + "proj_mlp" + ".alpha";
ggml_tensor* lora_q_down = NULL;
ggml_tensor* lora_q_up = NULL;
ggml_tensor* lora_k_down = NULL;
ggml_tensor* lora_k_up = NULL;
ggml_tensor* lora_v_down = NULL;
ggml_tensor* lora_v_up = NULL;
ggml_tensor* lora_m_down = NULL;
ggml_tensor* lora_m_up = NULL;
lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]);
if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]);
}
if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) {
lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]);
}
if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) {
lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]);
}
if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) {
lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]);
}
if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) {
lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]);
}
if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) {
lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]);
}
if (lora_tensors.find(split_m_d_name) != lora_tensors.end()) {
lora_m_down = to_f32(compute_ctx, lora_tensors[split_m_d_name]);
}
if (lora_tensors.find(split_m_u_name) != lora_tensors.end()) {
lora_m_up = to_f32(compute_ctx, lora_tensors[split_m_u_name]);
}
float q_rank = lora_q_up->ne[0];
float k_rank = lora_k_up->ne[0];
float v_rank = lora_v_up->ne[0];
float m_rank = lora_m_up->ne[0];
float lora_q_scale = 1;
float lora_k_scale = 1;
float lora_v_scale = 1;
float lora_m_scale = 1;
if (lora_tensors.find(split_q_scale_name) != lora_tensors.end()) {
lora_q_scale = ggml_backend_tensor_get_f32(lora_tensors[split_q_scale_name]);
applied_lora_tensors.insert(split_q_scale_name);
}
if (lora_tensors.find(split_k_scale_name) != lora_tensors.end()) {
lora_k_scale = ggml_backend_tensor_get_f32(lora_tensors[split_k_scale_name]);
applied_lora_tensors.insert(split_k_scale_name);
}
if (lora_tensors.find(split_v_scale_name) != lora_tensors.end()) {
lora_v_scale = ggml_backend_tensor_get_f32(lora_tensors[split_v_scale_name]);
applied_lora_tensors.insert(split_v_scale_name);
}
if (lora_tensors.find(split_m_scale_name) != lora_tensors.end()) {
lora_m_scale = ggml_backend_tensor_get_f32(lora_tensors[split_m_scale_name]);
applied_lora_tensors.insert(split_m_scale_name);
}
if (lora_tensors.find(split_q_alpha_name) != lora_tensors.end()) {
float lora_q_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_q_alpha_name]);
applied_lora_tensors.insert(split_q_alpha_name);
lora_q_scale = lora_q_alpha / q_rank;
}
if (lora_tensors.find(split_k_alpha_name) != lora_tensors.end()) {
float lora_k_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_k_alpha_name]);
applied_lora_tensors.insert(split_k_alpha_name);
lora_k_scale = lora_k_alpha / k_rank;
}
if (lora_tensors.find(split_v_alpha_name) != lora_tensors.end()) {
float lora_v_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_v_alpha_name]);
applied_lora_tensors.insert(split_v_alpha_name);
lora_v_scale = lora_v_alpha / v_rank;
}
if (lora_tensors.find(split_m_alpha_name) != lora_tensors.end()) {
float lora_m_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_m_alpha_name]);
applied_lora_tensors.insert(split_m_alpha_name);
lora_m_scale = lora_m_alpha / m_rank;
}
ggml_scale_inplace(compute_ctx, lora_q_down, lora_q_scale);
ggml_scale_inplace(compute_ctx, lora_k_down, lora_k_scale);
ggml_scale_inplace(compute_ctx, lora_v_down, lora_v_scale);
ggml_scale_inplace(compute_ctx, lora_m_down, lora_m_scale);
// print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_m_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_q_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_k_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_v_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_m_up, true); //[R, 12288, 1, 1]
// these need to be stitched together this way:
// |q_up,0 ,0 ,0 |
// |0 ,k_up,0 ,0 |
// |0 ,0 ,v_up,0 |
// |0 ,0 ,0 ,m_up|
// (q_down,k_down,v_down,m_down) . (q ,k ,v ,m)
// up_concat will be [21504, R*4, 1, 1]
// down_concat will be [R*4, 3072, 1, 1]
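// same block-diagonal trick as the qkv case, extended with the mlp branch so
// the attention and proj_mlp deltas merge into the fused linear1 weight at once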
ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), ggml_concat(compute_ctx, lora_v_down, lora_m_down, 1), 1);
// print_ggml_tensor(lora_down_concat, true); //[3072, R*4, 1, 1]
// this also means that if rank is bigger than 672, it is less memory efficient to do it this way (should be fine)
// print_ggml_tensor(lora_q_up, true); //[3072, R, 1, 1]
ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up);
ggml_tensor* mlp_z = ggml_dup_tensor(compute_ctx, lora_m_up);
z = ggml_scale(compute_ctx, z, 0);  // keep the returned nodes; ggml_scale is not in-place
mlp_z = ggml_scale(compute_ctx, mlp_z, 0);
ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1);
ggml_tensor* q_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_up, zz, 1), mlp_z, 1);
ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), ggml_concat(compute_ctx, z, mlp_z, 1), 1);
ggml_tensor* v_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, zz, lora_v_up, 1), mlp_z, 1);
ggml_tensor* m_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, zz, z, 1), lora_m_up, 1);
// print_ggml_tensor(q_up, true); //[R, 21504, 1, 1]
// print_ggml_tensor(k_up, true); //[R, 21504, 1, 1]
// print_ggml_tensor(v_up, true); //[R, 21504, 1, 1]
// print_ggml_tensor(m_up, true); //[R, 21504, 1, 1]
ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), ggml_concat(compute_ctx, v_up, m_up, 0), 0);
// print_ggml_tensor(lora_up_concat, true); //[R*4, 21504, 1, 1]
lora_down = ggml_cont(compute_ctx, lora_down_concat);
lora_up = ggml_cont(compute_ctx, lora_up_concat);
applied_lora_tensors.insert(split_q_u_name);
applied_lora_tensors.insert(split_k_u_name);
applied_lora_tensors.insert(split_v_u_name);
applied_lora_tensors.insert(split_m_u_name);
applied_lora_tensors.insert(split_q_d_name);
applied_lora_tensors.insert(split_k_d_name);
applied_lora_tensors.insert(split_v_d_name);
applied_lora_tensors.insert(split_m_d_name);
}
} else {
lora_up_name = fk + lora_ups[type] + ".weight";
lora_down_name = fk + lora_downs[type] + ".weight";
lora_mid_name = fk + ".lora_mid.weight";
alpha_name = fk + ".alpha";
scale_name = fk + ".scale";
if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
lora_up = to_f32(compute_ctx, lora_tensors[lora_up_name]);
}
if (lora_tensors.find(lora_down_name) != lora_tensors.end()) {
lora_down = to_f32(compute_ctx, lora_tensors[lora_down_name]);
}
if (lora_tensors.find(lora_mid_name) != lora_tensors.end()) {
lora_mid = to_f32(compute_ctx, lora_tensors[lora_mid_name]);
applied_lora_tensors.insert(lora_mid_name);
}
applied_lora_tensors.insert(lora_up_name);
applied_lora_tensors.insert(lora_down_name);
applied_lora_tensors.insert(alpha_name);
applied_lora_tensors.insert(scale_name);
}
if (lora_up == NULL || lora_down == NULL) {
continue;
}
// calc_scale
// TODO: .dora_scale?
int64_t rank = lora_down->ne[ggml_n_dims(lora_down) - 1];
if (lora_tensors.find(scale_name) != lora_tensors.end()) {
scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
} else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
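// net effect: W' = W + multiplier * scale_value * up · down (optionally with a tucker mid)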
updown = ggml_merge_lora(compute_ctx, lora_down, lora_up, lora_mid);
}
scale_value *= multiplier;
updown = ggml_reshape(compute_ctx, updown, weight);
GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
ggml_tensor* final_weight;
if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
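// quantized weights cannot be accumulated into directly: upcast to f32,
// add the LoRA delta, then ggml_cpy re-quantizes the sum back into weight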
// final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, ggml_n_dims(weight), weight->ne);
// final_weight = ggml_cpy(compute_ctx, weight, final_weight);
final_weight = to_f32(compute_ctx, weight);
final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
final_weight = ggml_cpy(compute_ctx, final_weight, weight);
} else {
final_weight = ggml_add_inplace(compute_ctx, weight, updown);
}
// final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly
ggml_build_forward_expand(gf, final_weight);
break;
}
k_tensor = k_tensor.substr(0, k_pos);
replace_all_chars(k_tensor, '.', '_');
std::string lora_up_name = "lora." + k_tensor + ".lora_up.weight";
std::string lora_down_name = "lora." + k_tensor + ".lora_down.weight";
std::string alpha_name = "lora." + k_tensor + ".alpha";
std::string scale_name = "lora." + k_tensor + ".scale";
ggml_tensor* lora_up = NULL;
ggml_tensor* lora_down = NULL;
if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
lora_up = lora_tensors[lora_up_name];
}
if (lora_tensors.find(lora_down_name) != lora_tensors.end()) {
lora_down = lora_tensors[lora_down_name];
}
if (lora_up == NULL || lora_down == NULL) {
continue;
}
applied_lora_tensors.insert(lora_up_name);
applied_lora_tensors.insert(lora_down_name);
applied_lora_tensors.insert(alpha_name);
applied_lora_tensors.insert(scale_name);
// calc_scale
int64_t dim = lora_down->ne[lora_down->n_dims - 1];
float scale_value = 1.0f;
if (lora_tensors.find(scale_name) != lora_tensors.end()) {
scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
} else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / dim;
}
scale_value *= multiplier;
ggml_tensor* lora_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
ggml_allocr_alloc(compute_allocr, lora_scale);
if (!ggml_allocr_is_measure(compute_allocr)) {
ggml_backend_tensor_set(lora_scale, &scale_value, 0, ggml_nbytes(lora_scale));
}
// flat lora tensors to multiply it
int64_t lora_up_rows = lora_up->ne[lora_up->n_dims - 1];
lora_up = ggml_reshape_2d(ctx0, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows);
int64_t lora_down_rows = lora_down->ne[lora_down->n_dims - 1];
lora_down = ggml_reshape_2d(ctx0, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows);
// ggml_mul_mat requires tensor b transposed
lora_down = ggml_cont(ctx0, ggml_transpose(ctx0, lora_down));
struct ggml_tensor* updown = ggml_mul_mat(ctx0, lora_up, lora_down);
updown = ggml_cont(ctx0, ggml_transpose(ctx0, updown));
updown = ggml_reshape(ctx0, updown, weight);
GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
updown = ggml_scale_inplace(ctx0, updown, lora_scale);
ggml_tensor* final_weight;
// if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
// final_weight = ggml_new_tensor(ctx0, GGML_TYPE_F32, weight->n_dims, weight->ne);
// final_weight = ggml_cpy_inplace(ctx0, weight, final_weight);
// final_weight = ggml_add_inplace(ctx0, final_weight, updown);
// final_weight = ggml_cpy_inplace(ctx0, final_weight, weight);
// } else {
// final_weight = ggml_add_inplace(ctx0, weight, updown);
// }
final_weight = ggml_add_inplace(ctx0, weight, updown); // apply directly
ggml_build_forward_expand(gf, final_weight);
}
size_t total_lora_tensors_count = 0;
size_t applied_lora_tensors_count = 0;
for (auto& kv : lora_tensors) {
total_lora_tensors_count++;
if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
LOG_WARN("unused lora tensor |%s|", kv.first.c_str());
print_ggml_tensor(kv.second, true);
// exit(0);
} else {
applied_lora_tensors_count++;
}
}
/* Don't worry if this message shows up twice in the logs per LoRA,
* this function is called once to calculate the required buffer size
* and then again to actually generate a graph to be used */
if (applied_lora_tensors_count != total_lora_tensors_count) {
LOG_WARN("Only (%lu / %lu) LoRA tensors have been applied",
applied_lora_tensors_count, total_lora_tensors_count);
} else {
LOG_DEBUG("(%lu / %lu) LoRA tensors applied successfully",
applied_lora_tensors_count, total_lora_tensors_count);
}
return gf;
}
void apply(std::map<std::string, struct ggml_tensor*> model_tensors, SDVersion version, int n_threads) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_lora_graph(model_tensors, version);
};
GGMLRunner::compute(get_graph, n_threads, true);
}
void alloc_compute_buffer(std::map<std::string, struct ggml_tensor*> model_tensors) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(model_tensors);
};
GGMLModule::alloc_compute_buffer(get_graph);
}
void apply(std::map<std::string, struct ggml_tensor*> model_tensors, int n_threads) {
alloc_compute_buffer(model_tensors);
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(model_tensors);
};
GGMLModule::compute(get_graph, n_threads);
}
};
#endif  // __LORA_HPP__
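Taken together, the refactor reduces applying a LoRA to a handful of calls on the new GGMLRunner-based class. A minimal sketch, assuming a CPU backend and the tensor map produced by the model loader (the file name is a placeholder):

    ggml_backend_t backend = ggml_backend_cpu_init();
    LoraModel lora(backend, "some_lora.safetensors");  // placeholder path
    lora.multiplier = 0.8f;                            // blend strength
    if (lora.load_from_file()) {
        // builds the merge graph and adds the scaled up·down deltas onto the weights
        lora.apply(model_tensors, version, /*n_threads=*/4);
    }
    ggml_backend_free(backend);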

1002
mmdit.hpp

File diff suppressed because it is too large

976
model.cpp

File diff suppressed because it is too large

152
model.h
View File

@ -4,106 +4,27 @@
#include <functional>
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>
#include "ggml-backend.h"
#include "ggml.h"
#include "gguf.h"
#include "ggml/ggml-backend.h"
#include "ggml/ggml.h"
#include "json.hpp"
#include "zip.h"
#define SD_MAX_DIMS 5
enum SDVersion {
VERSION_SD1,
VERSION_SD1_INPAINT,
VERSION_SD1_PIX2PIX,
VERSION_SD2,
VERSION_SD2_INPAINT,
VERSION_SDXL,
VERSION_SDXL_INPAINT,
VERSION_SDXL_PIX2PIX,
VERSION_SVD,
VERSION_SD3,
VERSION_FLUX,
VERSION_FLUX_FILL,
VERSION_1_x,
VERSION_2_x,
VERSION_XL,
VERSION_COUNT,
};
static inline bool sd_version_is_flux(SDVersion version) {
if (version == VERSION_FLUX || version == VERSION_FLUX_FILL) {
return true;
}
return false;
}
static inline bool sd_version_is_sd3(SDVersion version) {
if (version == VERSION_SD3) {
return true;
}
return false;
}
static inline bool sd_version_is_sd1(SDVersion version) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX) {
return true;
}
return false;
}
static inline bool sd_version_is_sd2(SDVersion version) {
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT) {
return true;
}
return false;
}
static inline bool sd_version_is_sdxl(SDVersion version) {
if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX) {
return true;
}
return false;
}
static inline bool sd_version_is_inpaint(SDVersion version) {
if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_FILL) {
return true;
}
return false;
}
static inline bool sd_version_is_dit(SDVersion version) {
if (sd_version_is_flux(version) || sd_version_is_sd3(version)) {
return true;
}
return false;
}
static inline bool sd_version_is_unet_edit(SDVersion version) {
return version == VERSION_SD1_PIX2PIX || version == VERSION_SDXL_PIX2PIX;
}
static bool sd_version_is_inpaint_or_unet_edit(SDVersion version) {
return sd_version_is_unet_edit(version) || sd_version_is_inpaint(version);
}
enum PMVersion {
PM_VERSION_1,
PM_VERSION_2,
};
struct TensorStorage {
std::string name;
ggml_type type = GGML_TYPE_F32;
bool is_bf16 = false;
bool is_f8_e4m3 = false;
bool is_f8_e5m2 = false;
int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
int n_dims = 0;
ggml_type type = GGML_TYPE_F32;
bool is_bf16 = false;
int64_t ne[4] = {1, 1, 1, 1};
int n_dims = 0;
size_t file_index = 0;
int index_in_zip = -1;  // >= 0 means stored in a zip file
@ -119,11 +40,7 @@ struct TensorStorage {
}
int64_t nelements() const {
int64_t n = 1;
for (int i = 0; i < SD_MAX_DIMS; i++) {
n *= ne[i];
}
return n;
return ne[0] * ne[1] * ne[2] * ne[3];
}
int64_t nbytes() const {
@ -131,7 +48,7 @@ struct TensorStorage {
}
int64_t nbytes_to_read() const {
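// bf16 and fp8 tensors are widened on load (bf16 -> f32, f8 -> f16), so the
// on-disk payload is half the in-memory size reported by nbytes()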
if (is_bf16 || is_f8_e4m3 || is_f8_e5m2) {
if (is_bf16) {
return nbytes() / 2;
} else {
return nbytes();
@ -151,7 +68,6 @@ struct TensorStorage {
std::vector<TensorStorage> chunk(size_t n) {
std::vector<TensorStorage> chunks;
size_t chunk_size = nbytes_to_read() / n;
// printf("%d/%d\n", chunk_size, nbytes_to_read());
reverse_ne();
for (int i = 0; i < n; i++) {
TensorStorage chunk_i = *this;
@ -165,7 +81,7 @@ struct TensorStorage {
}
void reverse_ne() {
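// model files list dimensions outermost-first, while ggml's ne[] is
// innermost-first, so the axis order is flipped once at load time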
int64_t new_ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
int64_t new_ne[4] = {1, 1, 1, 1};
for (int i = 0; i < n_dims; i++) {
new_ne[i] = ne[n_dims - 1 - i];
}
@ -173,31 +89,10 @@ struct TensorStorage {
ne[i] = new_ne[i];
}
}
std::string to_string() const {
std::stringstream ss;
const char* type_name = ggml_type_name(type);
if (is_bf16) {
type_name = "bf16";
} else if (is_f8_e4m3) {
type_name = "f8_e4m3";
} else if (is_f8_e5m2) {
type_name = "f8_e5m2";
}
ss << name << " | " << type_name << " | ";
ss << n_dims << " [";
for (int i = 0; i < SD_MAX_DIMS; i++) {
ss << ne[i];
if (i != SD_MAX_DIMS - 1) {
ss << ", ";
}
}
ss << "]";
return ss.str();
}
};
typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
typedef std::function<void(const std::string&, int32_t)> on_new_token_cb_t;
class ModelLoader {
protected:
@ -209,7 +104,7 @@ protected:
zip_t* zip,
std::string dir,
size_t file_index,
const std::string prefix);
const std::string& prefix);
bool init_from_gguf_file(const std::string& file_path, const std::string& prefix = "");
bool init_from_safetensors_file(const std::string& file_path, const std::string& prefix = "");
@ -217,28 +112,15 @@ protected:
bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = "");
public:
std::map<std::string, enum ggml_type> tensor_storages_types;
bool init_from_file(const std::string& file_path, const std::string& prefix = "");
bool model_is_unet();
SDVersion get_sd_version();
ggml_type get_sd_wtype();
ggml_type get_conditioner_wtype();
ggml_type get_diffusion_model_wtype();
ggml_type get_vae_wtype();
void set_wtype_override(ggml_type wtype, std::string prefix = "");
std::string load_merges();
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend);
bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
ggml_backend_t backend,
std::set<std::string> ignore_tensors = {});
bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
int64_t cal_mem_size(ggml_backend_t backend);
~ModelLoader() = default;
static std::string load_merges();
static std::string load_t5_tokenizer_json();
};
#endif  // __MODEL_H__
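The slimmed-down ModelLoader drives loading through a callback invoked once per tensor. A minimal sketch of that flow using only the API above (dry-run style, as lora.hpp does on its first pass, leaving the destination tensor unset; the file name is a placeholder):

    ModelLoader loader;
    if (loader.init_from_file("model.safetensors")) {  // placeholder path
        SDVersion version = loader.get_sd_version();
        auto cb = [&](const TensorStorage& ts, ggml_tensor** dst_tensor) -> bool {
            LOG_DEBUG("%s", ts.to_string().c_str());  // name | type | shape
            return true;                              // false aborts the load
        };
        loader.load_tensors(cb, ggml_backend_cpu_init());
    }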

845
pmid.hpp
View File

@ -1,845 +0,0 @@
#ifndef __PMI_HPP__
#define __PMI_HPP__
#include "ggml_extend.hpp"
#include "clip.hpp"
#include "lora.hpp"
struct FuseBlock : public GGMLBlock {
// network hparams
int in_dim;
int out_dim;
int hidden_dim;
bool use_residue;
public:
FuseBlock(int i_d, int o_d, int h_d, bool use_residue = true)
: in_dim(i_d), out_dim(o_d), hidden_dim(h_d), use_residue(use_residue) {
blocks["fc1"] = std::shared_ptr<GGMLBlock>(new Linear(in_dim, hidden_dim, true));
blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_dim, out_dim, true));
blocks["layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(in_dim));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layernorm"]);
struct ggml_tensor* r = x;
// x = ggml_nn_layer_norm(ctx, x, ln_w, ln_b);
x = layer_norm->forward(ctx, x);
// x = ggml_add(ctx, ggml_mul_mat(ctx, fc1_w, x), fc1_b);
x = fc1->forward(ctx, x);
x = ggml_gelu_inplace(ctx, x);
x = fc2->forward(ctx, x);
// x = ggml_add(ctx, ggml_mul_mat(ctx, fc2_w, x), fc2_b);
if (use_residue)
x = ggml_add(ctx, x, r);
return x;
}
};
/*
class QFormerPerceiver(nn.Module):
def __init__(self, id_embeddings_dim, cross_attention_dim, num_tokens, embedding_dim=1024, use_residual=True, ratio=4):
super().__init__()
self.num_tokens = num_tokens
self.cross_attention_dim = cross_attention_dim
self.use_residual = use_residual
print(cross_attention_dim*num_tokens)
self.token_proj = nn.Sequential(
nn.Linear(id_embeddings_dim, id_embeddings_dim*ratio),
nn.GELU(),
nn.Linear(id_embeddings_dim*ratio, cross_attention_dim*num_tokens),
)
self.token_norm = nn.LayerNorm(cross_attention_dim)
self.perceiver_resampler = FacePerceiverResampler(
dim=cross_attention_dim,
depth=4,
dim_head=128,
heads=cross_attention_dim // 128,
embedding_dim=embedding_dim,
output_dim=cross_attention_dim,
ff_mult=4,
)
def forward(self, x, last_hidden_state):
x = self.token_proj(x)
x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
x = self.token_norm(x) # cls token
out = self.perceiver_resampler(x, last_hidden_state) # retrieve from patch tokens
if self.use_residual: # TODO: if use_residual is not true
out = x + 1.0 * out
return out
*/
struct PMFeedForward : public GGMLBlock {
// network hparams
int dim;
public:
PMFeedForward(int d, int multi = 4)
: dim(d) {
int inner_dim = dim * multi;
blocks["0"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["1"] = std::shared_ptr<GGMLBlock>(new Mlp(dim, inner_dim, dim, false));
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x) {
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["0"]);
auto ff = std::dynamic_pointer_cast<Mlp>(blocks["1"]);
x = norm->forward(ctx, x);
x = ff->forward(ctx, x);
return x;
}
};
struct PerceiverAttention : public GGMLBlock {
// network hparams
float scale; // = dim_head**-0.5
int dim_head; // = dim_head
int heads; // = heads
public:
PerceiverAttention(int dim, int dim_h = 64, int h = 8)
: scale(powf(dim_h, -0.5)), dim_head(dim_h), heads(h) {
int inner_dim = dim_head * heads;
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(dim, inner_dim, false));
blocks["to_kv"] = std::shared_ptr<GGMLBlock>(new Linear(dim, inner_dim * 2, false));
blocks["to_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim, false));
}
struct ggml_tensor* reshape_tensor(struct ggml_context* ctx,
struct ggml_tensor* x,
int heads) {
int64_t ne[4];
for (int i = 0; i < 4; ++i)
ne[i] = x->ne[i];
// print_ggml_tensor(x, true, "PerceiverAttention reshape x 0: ");
// printf("heads = %d \n", heads);
// x = ggml_view_4d(ctx, x, x->ne[0], x->ne[1], heads, x->ne[2]/heads,
// x->nb[1], x->nb[2], x->nb[3], 0);
x = ggml_reshape_4d(ctx, x, x->ne[0] / heads, heads, x->ne[1], x->ne[2]);
// x = ggml_view_4d(ctx, x, x->ne[0]/heads, heads, x->ne[1], x->ne[2],
// x->nb[1], x->nb[2], x->nb[3], 0);
// x = ggml_cont(ctx, x);
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));
// print_ggml_tensor(x, true, "PerceiverAttention reshape x 1: ");
// x = ggml_reshape_4d(ctx, x, ne[0], heads, ne[1], ne[2]/heads);
return x;
}
std::vector<struct ggml_tensor*> chunk_half(struct ggml_context* ctx,
struct ggml_tensor* x) {
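// equivalent of torch's chunk(2, dim=-1): two half-width views over the same
// buffer along ne[0], made contiguous for the matmuls that follow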
auto tlo = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], 0);
auto tli = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], x->nb[0] * x->ne[0] / 2);
return {ggml_cont(ctx, tlo),
ggml_cont(ctx, tli)};
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* latents) {
// x (torch.Tensor): image features
// shape (b, n1, D)
// latent (torch.Tensor): latent features
// shape (b, n2, D)
int64_t ne[4];
for (int i = 0; i < 4; ++i)
ne[i] = latents->ne[i];
auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
x = norm1->forward(ctx, x);
latents = norm2->forward(ctx, latents);
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
auto q = to_q->forward(ctx, latents);
auto kv_input = ggml_concat(ctx, x, latents, 1);
auto to_kv = std::dynamic_pointer_cast<Linear>(blocks["to_kv"]);
auto kv = to_kv->forward(ctx, kv_input);
auto k = ggml_view_4d(ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, 0);
auto v = ggml_view_4d(ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, kv->nb[0] * (kv->ne[0] / 2));
k = ggml_cont(ctx, k);
v = ggml_cont(ctx, v);
q = reshape_tensor(ctx, q, heads);
k = reshape_tensor(ctx, k, heads);
v = reshape_tensor(ctx, v, heads);
scale = 1.f / sqrt(sqrt((float)dim_head));
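// scale q and k by dim_head^-0.25 each instead of scaling the q·k product by
// dim_head^-0.5 afterwards; more stable in f16 (mirrors the python reference)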
k = ggml_scale_inplace(ctx, k, scale);
q = ggml_scale_inplace(ctx, q, scale);
// auto weight = ggml_mul_mat(ctx, q, k);
auto weight = ggml_mul_mat(ctx, k, q); // NOTE order of mul is opposite to pytorch
// GGML's softmax() is equivalent to pytorch's softmax(x, dim=-1)
// in this case, dimension along which Softmax will be computed is the last dim
// in torch and the first dim in GGML, consistent with the convention that pytorch's
// last dimension (varying most rapidly) corresponds to GGML's first (varying most rapidly).
// weight = ggml_soft_max(ctx, weight);
weight = ggml_soft_max_inplace(ctx, weight);
v = ggml_cont(ctx, ggml_transpose(ctx, v));
// auto out = ggml_mul_mat(ctx, weight, v);
auto out = ggml_mul_mat(ctx, v, weight); // NOTE order of mul is opposite to pytorch
out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3));
out = ggml_reshape_3d(ctx, out, ne[0], ne[1], ggml_nelements(out) / (ne[0] * ne[1]));
auto to_out = std::dynamic_pointer_cast<Linear>(blocks["to_out"]);
out = to_out->forward(ctx, out);
return out;
}
};
struct FacePerceiverResampler : public GGMLBlock {
// network hparams
int depth;
public:
FacePerceiverResampler(int dim = 768,
int d = 4,
int dim_head = 64,
int heads = 16,
int embedding_dim = 1280,
int output_dim = 768,
int ff_mult = 4)
: depth(d) {
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Linear(embedding_dim, dim, true));
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(dim, output_dim, true));
blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new LayerNorm(output_dim));
for (int i = 0; i < depth; i++) {
std::string name = "layers." + std::to_string(i) + ".0";
blocks[name] = std::shared_ptr<GGMLBlock>(new PerceiverAttention(dim, dim_head, heads));
name = "layers." + std::to_string(i) + ".1";
blocks[name] = std::shared_ptr<GGMLBlock>(new PMFeedForward(dim, ff_mult));
}
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* latents,
struct ggml_tensor* x) {
// x: [N, channels, h, w]
auto proj_in = std::dynamic_pointer_cast<Linear>(blocks["proj_in"]);
auto proj_out = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
auto norm_out = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_out"]);
x = proj_in->forward(ctx, x);
for (int i = 0; i < depth; i++) {
std::string name = "layers." + std::to_string(i) + ".0";
auto attn = std::dynamic_pointer_cast<PerceiverAttention>(blocks[name]);
name = "layers." + std::to_string(i) + ".1";
auto ff = std::dynamic_pointer_cast<PMFeedForward>(blocks[name]);
auto t = attn->forward(ctx, x, latents);
latents = ggml_add(ctx, t, latents);
t = ff->forward(ctx, latents);
latents = ggml_add(ctx, t, latents);
}
latents = proj_out->forward(ctx, latents);
latents = norm_out->forward(ctx, latents);
return latents;
}
};
struct QFormerPerceiver : public GGMLBlock {
// network hparams
int num_tokens;
int cross_attention_dim;
bool use_residual;
public:
QFormerPerceiver(int id_embeddings_dim, int cross_attention_d, int num_t, int embedding_dim = 1024, bool use_r = true, int ratio = 4)
: cross_attention_dim(cross_attention_d), num_tokens(num_t), use_residual(use_r) {
blocks["token_proj"] = std::shared_ptr<GGMLBlock>(new Mlp(id_embeddings_dim,
id_embeddings_dim * ratio,
cross_attention_dim * num_tokens,
true));
blocks["token_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(cross_attention_d));
blocks["perceiver_resampler"] = std::shared_ptr<GGMLBlock>(new FacePerceiverResampler(
cross_attention_dim,
4,
128,
cross_attention_dim / 128,
embedding_dim,
cross_attention_dim,
4));
}
/*
def forward(self, x, last_hidden_state):
x = self.token_proj(x)
x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
x = self.token_norm(x) # cls token
out = self.perceiver_resampler(x, last_hidden_state) # retrieve from patch tokens
if self.use_residual: # TODO: if use_residual is not true
out = x + 1.0 * out
return out
*/
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* last_hidden_state) {
// x: [N, channels, h, w]
auto token_proj = std::dynamic_pointer_cast<Mlp>(blocks["token_proj"]);
auto token_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["token_norm"]);
auto perceiver_resampler = std::dynamic_pointer_cast<FacePerceiverResampler>(blocks["perceiver_resampler"]);
x = token_proj->forward(ctx, x);
int64_t nel = ggml_nelements(x);
x = ggml_reshape_3d(ctx, x, cross_attention_dim, num_tokens, nel / (cross_attention_dim * num_tokens));
x = token_norm->forward(ctx, x);
struct ggml_tensor* out = perceiver_resampler->forward(ctx, x, last_hidden_state);
if (use_residual)
out = ggml_add(ctx, x, out);
return out;
}
};
/*
class FacePerceiverResampler(torch.nn.Module):
def __init__(
self,
*,
dim=768,
depth=4,
dim_head=64,
heads=16,
embedding_dim=1280,
output_dim=768,
ff_mult=4,
):
super().__init__()
self.proj_in = torch.nn.Linear(embedding_dim, dim)
self.proj_out = torch.nn.Linear(dim, output_dim)
self.norm_out = torch.nn.LayerNorm(output_dim)
self.layers = torch.nn.ModuleList([])
for _ in range(depth):
self.layers.append(
torch.nn.ModuleList(
[
PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
FeedForward(dim=dim, mult=ff_mult),
]
)
)
def forward(self, latents, x):
x = self.proj_in(x)
for attn, ff in self.layers:
latents = attn(x, latents) + latents
latents = ff(latents) + latents
latents = self.proj_out(latents)
return self.norm_out(latents)
*/
/*
def FeedForward(dim, mult=4):
inner_dim = int(dim * mult)
return nn.Sequential(
nn.LayerNorm(dim),
nn.Linear(dim, inner_dim, bias=False),
nn.GELU(),
nn.Linear(inner_dim, dim, bias=False),
)
def reshape_tensor(x, heads):
bs, length, width = x.shape
# (bs, length, width) --> (bs, length, n_heads, dim_per_head)
x = x.view(bs, length, heads, -1)
# (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
x = x.transpose(1, 2)
# (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
x = x.reshape(bs, heads, length, -1)
return x
class PerceiverAttention(nn.Module):
def __init__(self, *, dim, dim_head=64, heads=8):
super().__init__()
self.scale = dim_head**-0.5
self.dim_head = dim_head
self.heads = heads
inner_dim = dim_head * heads
self.norm1 = nn.LayerNorm(dim)
self.norm2 = nn.LayerNorm(dim)
self.to_q = nn.Linear(dim, inner_dim, bias=False)
self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
self.to_out = nn.Linear(inner_dim, dim, bias=False)
def forward(self, x, latents):
"""
Args:
x (torch.Tensor): image features
shape (b, n1, D)
latent (torch.Tensor): latent features
shape (b, n2, D)
"""
x = self.norm1(x)
latents = self.norm2(latents)
b, l, _ = latents.shape
q = self.to_q(latents)
kv_input = torch.cat((x, latents), dim=-2)
k, v = self.to_kv(kv_input).chunk(2, dim=-1)
q = reshape_tensor(q, self.heads)
k = reshape_tensor(k, self.heads)
v = reshape_tensor(v, self.heads)
# attention
scale = 1 / math.sqrt(math.sqrt(self.dim_head))
weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
out = weight @ v
out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
return self.to_out(out)
*/
struct FuseModule : public GGMLBlock {
// network hparams
int embed_dim;
public:
FuseModule(int imb_d)
: embed_dim(imb_d) {
blocks["mlp1"] = std::shared_ptr<GGMLBlock>(new FuseBlock(imb_d * 2, imb_d, imb_d, false));
blocks["mlp2"] = std::shared_ptr<GGMLBlock>(new FuseBlock(imb_d, imb_d, imb_d, true));
blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(embed_dim));
}
struct ggml_tensor* fuse_fn(struct ggml_context* ctx,
struct ggml_tensor* prompt_embeds,
struct ggml_tensor* id_embeds) {
auto mlp1 = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp1"]);
auto mlp2 = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp2"]);
auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm"]);
// print_ggml_tensor(id_embeds, true, "Fuseblock id_embeds: ");
// print_ggml_tensor(prompt_embeds, true, "Fuseblock prompt_embeds: ");
// auto prompt_embeds0 = ggml_cont(ctx, ggml_permute(ctx, prompt_embeds, 2, 0, 1, 3));
// auto id_embeds0 = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
// print_ggml_tensor(id_embeds0, true, "Fuseblock id_embeds0: ");
// print_ggml_tensor(prompt_embeds0, true, "Fuseblock prompt_embeds0: ");
// concat is along dim 2
// auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0, 2);
auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds, id_embeds, 0);
// print_ggml_tensor(stacked_id_embeds, true, "Fuseblock stacked_id_embeds 0: ");
// stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 1, 2, 0, 3));
// print_ggml_tensor(stacked_id_embeds, true, "Fuseblock stacked_id_embeds 1: ");
// stacked_id_embeds = mlp1.forward(ctx, stacked_id_embeds);
// stacked_id_embeds = ggml_add(ctx, stacked_id_embeds, prompt_embeds);
// stacked_id_embeds = mlp2.forward(ctx, stacked_id_embeds);
// stacked_id_embeds = ggml_nn_layer_norm(ctx, stacked_id_embeds, ln_w, ln_b);
stacked_id_embeds = mlp1->forward(ctx, stacked_id_embeds);
stacked_id_embeds = ggml_add(ctx, stacked_id_embeds, prompt_embeds);
stacked_id_embeds = mlp2->forward(ctx, stacked_id_embeds);
stacked_id_embeds = layer_norm->forward(ctx, stacked_id_embeds);
// print_ggml_tensor(stacked_id_embeds, true, "Fuseblock stacked_id_embeds 1: ");
return stacked_id_embeds;
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* prompt_embeds,
struct ggml_tensor* id_embeds,
struct ggml_tensor* class_tokens_mask,
struct ggml_tensor* class_tokens_mask_pos,
struct ggml_tensor* left,
struct ggml_tensor* right) {
// x: [N, channels, h, w]
struct ggml_tensor* valid_id_embeds = id_embeds;
// # slice out the image token embeddings
// print_ggml_tensor(class_tokens_mask_pos, false);
ggml_set_name(class_tokens_mask_pos, "class_tokens_mask_pos");
ggml_set_name(prompt_embeds, "prompt_embeds");
// print_ggml_tensor(valid_id_embeds, true, "valid_id_embeds");
// print_ggml_tensor(class_tokens_mask_pos, true, "class_tokens_mask_pos");
struct ggml_tensor* image_token_embeds = ggml_get_rows(ctx, prompt_embeds, class_tokens_mask_pos);
ggml_set_name(image_token_embeds, "image_token_embeds");
valid_id_embeds = ggml_reshape_2d(ctx, valid_id_embeds, valid_id_embeds->ne[0],
ggml_nelements(valid_id_embeds) / valid_id_embeds->ne[0]);
struct ggml_tensor* stacked_id_embeds = fuse_fn(ctx, image_token_embeds, valid_id_embeds);
// stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
// print_ggml_tensor(stacked_id_embeds, true, "AA stacked_id_embeds");
// print_ggml_tensor(left, true, "AA left");
// print_ggml_tensor(right, true, "AA right");
if (left && right) {
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 1);
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 1);
} else if (left) {
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 1);
} else if (right) {
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 1);
}
// print_ggml_tensor(stacked_id_embeds, true, "BB stacked_id_embeds");
// stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
// print_ggml_tensor(stacked_id_embeds, true, "CC stacked_id_embeds");
class_tokens_mask = ggml_cont(ctx, ggml_transpose(ctx, class_tokens_mask));
class_tokens_mask = ggml_repeat(ctx, class_tokens_mask, prompt_embeds);
prompt_embeds = ggml_mul(ctx, prompt_embeds, class_tokens_mask);
struct ggml_tensor* updated_prompt_embeds = ggml_add(ctx, prompt_embeds, stacked_id_embeds);
ggml_set_name(updated_prompt_embeds, "updated_prompt_embeds");
// print_ggml_tensor(updated_prompt_embeds, true, "updated_prompt_embeds: ");
return updated_prompt_embeds;
}
};
struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
PhotoMakerIDEncoderBlock()
: CLIPVisionModelProjection(OPENAI_CLIP_VIT_L_14) {
blocks["visual_projection_2"] = std::shared_ptr<GGMLBlock>(new Linear(1024, 1280, false));
blocks["fuse_module"] = std::shared_ptr<GGMLBlock>(new FuseModule(2048));
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds,
struct ggml_tensor* class_tokens_mask,
struct ggml_tensor* class_tokens_mask_pos,
struct ggml_tensor* left,
struct ggml_tensor* right) {
// x: [N, channels, h, w]
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
auto visual_projection_2 = std::dynamic_pointer_cast<Linear>(blocks["visual_projection_2"]);
auto fuse_module = std::dynamic_pointer_cast<FuseModule>(blocks["fuse_module"]);
struct ggml_tensor* shared_id_embeds = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size]
struct ggml_tensor* id_embeds = visual_projection->forward(ctx, shared_id_embeds); // [N, proj_dim(768)]
struct ggml_tensor* id_embeds_2 = visual_projection_2->forward(ctx, shared_id_embeds); // [N, 1280]
id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
id_embeds_2 = ggml_cont(ctx, ggml_permute(ctx, id_embeds_2, 2, 0, 1, 3));
id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2, 2); // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 1, 2, 0, 3));
struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
prompt_embeds,
id_embeds,
class_tokens_mask,
class_tokens_mask_pos,
left, right);
return updated_prompt_embeds;
}
};
struct PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock : public CLIPVisionModelProjection {
int cross_attention_dim;
int num_tokens;
PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock(int id_embeddings_dim = 512)
: CLIPVisionModelProjection(OPENAI_CLIP_VIT_L_14),
cross_attention_dim(2048),
num_tokens(2) {
blocks["visual_projection_2"] = std::shared_ptr<GGMLBlock>(new Linear(1024, 1280, false));
blocks["fuse_module"] = std::shared_ptr<GGMLBlock>(new FuseModule(2048));
/*
cross_attention_dim = 2048
# projection
self.num_tokens = 2
self.cross_attention_dim = cross_attention_dim
self.qformer_perceiver = QFormerPerceiver(
id_embeddings_dim,
cross_attention_dim,
self.num_tokens,
)*/
blocks["qformer_perceiver"] = std::shared_ptr<GGMLBlock>(new QFormerPerceiver(id_embeddings_dim,
cross_attention_dim,
num_tokens));
}
/*
def forward(self, id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds):
b, num_inputs, c, h, w = id_pixel_values.shape
id_pixel_values = id_pixel_values.view(b * num_inputs, c, h, w)
last_hidden_state = self.vision_model(id_pixel_values)[0]
id_embeds = id_embeds.view(b * num_inputs, -1)
id_embeds = self.qformer_perceiver(id_embeds, last_hidden_state)
id_embeds = id_embeds.view(b, num_inputs, self.num_tokens, -1)
updated_prompt_embeds = self.fuse_module(prompt_embeds, id_embeds, class_tokens_mask)
*/
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds,
struct ggml_tensor* class_tokens_mask,
struct ggml_tensor* class_tokens_mask_pos,
struct ggml_tensor* id_embeds,
struct ggml_tensor* left,
struct ggml_tensor* right) {
// x: [N, channels, h, w]
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
auto fuse_module = std::dynamic_pointer_cast<FuseModule>(blocks["fuse_module"]);
auto qformer_perceiver = std::dynamic_pointer_cast<QFormerPerceiver>(blocks["qformer_perceiver"]);
// struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size]
struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values, false); // [N, hidden_size]
id_embeds = qformer_perceiver->forward(ctx, id_embeds, last_hidden_state);
struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
prompt_embeds,
id_embeds,
class_tokens_mask,
class_tokens_mask_pos,
left, right);
return updated_prompt_embeds;
}
};
struct PhotoMakerIDEncoder : public GGMLRunner {
public:
SDVersion version = VERSION_SDXL;
PMVersion pm_version = PM_VERSION_1;
PhotoMakerIDEncoderBlock id_encoder;
PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock id_encoder2;
float style_strength;
std::vector<float> ctm;
std::vector<ggml_fp16_t> ctmf16;
std::vector<int> ctmpos;
std::vector<ggml_fp16_t> zeros_left_16;
std::vector<float> zeros_left;
std::vector<ggml_fp16_t> zeros_right_16;
std::vector<float> zeros_right;
public:
PhotoMakerIDEncoder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix, SDVersion version = VERSION_SDXL, PMVersion pm_v = PM_VERSION_1, float sty = 20.f)
: GGMLRunner(backend),
version(version),
pm_version(pm_v),
style_strength(sty) {
if (pm_version == PM_VERSION_1) {
id_encoder.init(params_ctx, tensor_types, prefix);
} else if (pm_version == PM_VERSION_2) {
id_encoder2.init(params_ctx, tensor_types, prefix);
}
}
std::string get_desc() {
return "pmid";
}
PMVersion get_version() const {
return pm_version;
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
if (pm_version == PM_VERSION_1)
id_encoder.get_param_tensors(tensors, prefix);
else if (pm_version == PM_VERSION_2)
id_encoder2.get_param_tensors(tensors, prefix);
}
struct ggml_cgraph* build_graph( // struct ggml_allocr* allocr,
struct ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds,
std::vector<bool>& class_tokens_mask,
struct ggml_tensor* id_embeds) {
ctm.clear();
ctmf16.clear();
ctmpos.clear();
zeros_left.clear();
zeros_left_16.clear();
zeros_right.clear();
zeros_right_16.clear();
ggml_context* ctx0 = compute_ctx;
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
int64_t hidden_size = prompt_embeds->ne[0];
int64_t seq_length = prompt_embeds->ne[1];
ggml_type type = GGML_TYPE_F32;
struct ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(ctx0, type, class_tokens_mask.size());
struct ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values);
struct ggml_tensor* prompt_embeds_d = to_backend(prompt_embeds);
struct ggml_tensor* id_embeds_d = to_backend(id_embeds);
struct ggml_tensor* left = NULL;
struct ggml_tensor* right = NULL;
for (int i = 0; i < class_tokens_mask.size(); i++) {
if (class_tokens_mask[i]) {
// printf(" 1,");
ctm.push_back(0.f); // here use 0.f instead of 1.f to make a scale mask
ctmf16.push_back(ggml_fp32_to_fp16(0.f)); // here use 0.f instead of 1.f to make a scale mask
ctmpos.push_back(i);
} else {
// printf(" 0,");
ctm.push_back(1.f); // here use 1.f instead of 0.f to make a scale mask
ctmf16.push_back(ggml_fp32_to_fp16(1.f)); // here use 1.f instead of 0.f to make a scale mask
}
}
// printf("\n");
if (ctmpos[0] > 0) {
// left = ggml_new_tensor_3d(ctx0, type, hidden_size, 1, ctmpos[0]);
left = ggml_new_tensor_3d(ctx0, type, hidden_size, ctmpos[0], 1);
}
if (ctmpos[ctmpos.size() - 1] < seq_length - 1) {
// right = ggml_new_tensor_3d(ctx0, type,
// hidden_size, 1, seq_length - ctmpos[ctmpos.size() - 1] - 1);
right = ggml_new_tensor_3d(ctx0, type,
hidden_size, seq_length - ctmpos[ctmpos.size() - 1] - 1, 1);
}
struct ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ctmpos.size());
{
if (type == GGML_TYPE_F16)
set_backend_tensor_data(class_tokens_mask_d, ctmf16.data());
else
set_backend_tensor_data(class_tokens_mask_d, ctm.data());
set_backend_tensor_data(class_tokens_mask_pos, ctmpos.data());
if (left) {
if (type == GGML_TYPE_F16) {
for (int i = 0; i < ggml_nelements(left); ++i)
zeros_left_16.push_back(ggml_fp32_to_fp16(0.f));
set_backend_tensor_data(left, zeros_left_16.data());
} else {
for (int i = 0; i < ggml_nelements(left); ++i)
zeros_left.push_back(0.f);
set_backend_tensor_data(left, zeros_left.data());
}
}
if (right) {
if (type == GGML_TYPE_F16) {
for (int i = 0; i < ggml_nelements(right); ++i)
zeros_right_16.push_back(ggml_fp32_to_fp16(0.f));
set_backend_tensor_data(right, zeros_right_16.data());
} else {
for (int i = 0; i < ggml_nelements(right); ++i)
zeros_right.push_back(0.f);
set_backend_tensor_data(right, zeros_right.data());
}
}
}
struct ggml_tensor* updated_prompt_embeds = NULL;
if (pm_version == PM_VERSION_1)
updated_prompt_embeds = id_encoder.forward(ctx0,
id_pixel_values_d,
prompt_embeds_d,
class_tokens_mask_d,
class_tokens_mask_pos,
left, right);
else if (pm_version == PM_VERSION_2)
updated_prompt_embeds = id_encoder2.forward(ctx0,
id_pixel_values_d,
prompt_embeds_d,
class_tokens_mask_d,
class_tokens_mask_pos,
id_embeds_d,
left, right);
ggml_build_forward_expand(gf, updated_prompt_embeds);
return gf;
}
void compute(const int n_threads,
struct ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds,
struct ggml_tensor* id_embeds,
std::vector<bool>& class_tokens_mask,
struct ggml_tensor** updated_prompt_embeds,
ggml_context* output_ctx) {
auto get_graph = [&]() -> struct ggml_cgraph* {
// return build_graph(compute_allocr, id_pixel_values, prompt_embeds, class_tokens_mask);
return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds);
};
// GGMLRunner::compute(get_graph, n_threads, updated_prompt_embeds);
GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx);
}
};
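// A minimal usage sketch (the surrounding setup, tensor shapes and the "pmid"
// prefix are assumptions for illustration, not part of this file):
//
//   PhotoMakerIDEncoder id_enc(backend, tensor_types, "pmid", VERSION_SDXL, PM_VERSION_2);
//   struct ggml_tensor* updated_prompt_embeds = NULL;
//   id_enc.compute(n_threads, id_pixel_values, prompt_embeds, id_embeds,
//                  class_tokens_mask, &updated_prompt_embeds, work_ctx);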
struct PhotoMakerIDEmbed : public GGMLRunner {
std::map<std::string, struct ggml_tensor*> tensors;
std::string file_path;
ModelLoader* model_loader;
bool load_failed = false;
bool applied = false;
PhotoMakerIDEmbed(ggml_backend_t backend,
ModelLoader* ml,
const std::string& file_path = "",
const std::string& prefix = "")
: GGMLRunner(backend), file_path(file_path), model_loader(ml) {
if (!model_loader->init_from_file(file_path, prefix)) {
load_failed = true;
}
}
std::string get_desc() {
return "id_embeds";
}
bool load_from_file(bool filter_tensor = false) {
LOG_INFO("loading PhotoMaker ID Embeds from '%s'", file_path.c_str());
if (load_failed) {
LOG_ERROR("init photomaker id embed from file failed: '%s'", file_path.c_str());
return false;
}
bool dry_run = true;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
if (filter_tensor && !contains(name, "pmid.id_embeds")) {
// LOG_INFO("skipping LoRA tesnor '%s'", name.c_str());
return true;
}
if (dry_run) {
struct ggml_tensor* real = ggml_new_tensor(params_ctx,
tensor_storage.type,
tensor_storage.n_dims,
tensor_storage.ne);
tensors[name] = real;
} else {
auto real = tensors[name];
*dst_tensor = real;
}
return true;
};
model_loader->load_tensors(on_new_tensor_cb, backend);
alloc_params_buffer();
dry_run = false;
model_loader->load_tensors(on_new_tensor_cb, backend);
LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
return true;
}
struct ggml_tensor* get() {
std::map<std::string, struct ggml_tensor*>::iterator pos;
pos = tensors.find("pmid.id_embeds");
if (pos != tensors.end())
return pos->second;
return NULL;
}
};
#endif // __PMI_HPP__

View File

@ -1,227 +0,0 @@
#ifndef __PREPROCESSING_HPP__
#define __PREPROCESSING_HPP__
#include "ggml_extend.hpp"
#define M_PI_ 3.14159265358979323846
void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml_tensor* kernel, int padding) {
struct ggml_init_params params;
params.mem_size = 20 * 1024 * 1024; // 20 MB
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1);
ggml_fp32_to_fp16_row((float*)kernel->data, (ggml_fp16_t*)kernel_fp16->data, ggml_nelements(kernel));
ggml_tensor* h = ggml_conv_2d(ctx0, kernel_fp16, input, 1, 1, padding, padding, 1, 1);
ggml_cgraph* gf = ggml_new_graph(ctx0);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, h, output));
ggml_graph_compute_with_ctx(ctx0, gf, 1);
ggml_free(ctx0);
}
void gaussian_kernel(struct ggml_tensor* kernel) {
int ks_mid = kernel->ne[0] / 2;
float sigma = 1.4f;
float normal = 1.f / (2.0f * M_PI_ * powf(sigma, 2.0f));
for (int y = 0; y < kernel->ne[0]; y++) {
float gx = -ks_mid + y;
for (int x = 0; x < kernel->ne[1]; x++) {
float gy = -ks_mid + x;
float k_ = expf(-((gx * gx + gy * gy) / (2.0f * powf(sigma, 2.0f)))) * normal;
ggml_tensor_set_f32(kernel, k_, x, y);
}
}
}
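// The loop above samples the 2D Gaussian
//   G(x, y) = 1 / (2 * pi * sigma^2) * exp(-(x^2 + y^2) / (2 * sigma^2))
// with sigma = 1.4, centered on the kernel midpoint.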
void grayscale(struct ggml_tensor* rgb_img, struct ggml_tensor* grayscale) {
for (int iy = 0; iy < rgb_img->ne[1]; iy++) {
for (int ix = 0; ix < rgb_img->ne[0]; ix++) {
float r = ggml_tensor_get_f32(rgb_img, ix, iy);
float g = ggml_tensor_get_f32(rgb_img, ix, iy, 1);
float b = ggml_tensor_get_f32(rgb_img, ix, iy, 2);
float gray = 0.2989f * r + 0.5870f * g + 0.1140f * b;
ggml_tensor_set_f32(grayscale, gray, ix, iy);
}
}
}
void prop_hypot(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) {
int n_elements = ggml_nelements(h);
float* dx = (float*)x->data;
float* dy = (float*)y->data;
float* dh = (float*)h->data;
for (int i = 0; i < n_elements; i++) {
dh[i] = sqrtf(dx[i] * dx[i] + dy[i] * dy[i]);
}
}
void prop_arctan2(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) {
int n_elements = ggml_nelements(h);
float* dx = (float*)x->data;
float* dy = (float*)y->data;
float* dh = (float*)h->data;
for (int i = 0; i < n_elements; i++) {
dh[i] = atan2f(dy[i], dx[i]);
}
}
void normalize_tensor(struct ggml_tensor* g) {
int n_elements = ggml_nelements(g);
float* dg = (float*)g->data;
float max = -INFINITY;
for (int i = 0; i < n_elements; i++) {
max = dg[i] > max ? dg[i] : max;
}
max = 1.0f / max;
for (int i = 0; i < n_elements; i++) {
dg[i] *= max;
}
}
void non_max_supression(struct ggml_tensor* result, struct ggml_tensor* G, struct ggml_tensor* D) {
for (int iy = 1; iy < result->ne[1] - 1; iy++) {
for (int ix = 1; ix < result->ne[0] - 1; ix++) {
float angle = ggml_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_;
if (angle < 0.0f)
angle += 180.0f;
float q = 1.0f;
float r = 1.0f;
// angle 0
if ((0 <= angle && angle < 22.5f) || (157.5f <= angle && angle <= 180.0f)) {
q = ggml_tensor_get_f32(G, ix, iy + 1);
r = ggml_tensor_get_f32(G, ix, iy - 1);
}
// angle 45
else if (22.5f <= angle && angle < 67.5f) {
q = ggml_tensor_get_f32(G, ix + 1, iy - 1);
r = ggml_tensor_get_f32(G, ix - 1, iy + 1);
}
// angle 90
else if (67.5f <= angle && angle < 112.5f) {
q = ggml_tensor_get_f32(G, ix + 1, iy);
r = ggml_tensor_get_f32(G, ix - 1, iy);
}
// angle 135
else if (112.5f <= angle && angle < 157.5f) {
q = ggml_tensor_get_f32(G, ix - 1, iy - 1);
r = ggml_tensor_get_f32(G, ix + 1, iy + 1);
}
float cur = ggml_tensor_get_f32(G, ix, iy);
if ((cur >= q) && (cur >= r)) {
ggml_tensor_set_f32(result, cur, ix, iy);
} else {
ggml_tensor_set_f32(result, 0.0f, ix, iy);
}
}
}
}
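// The pass above quantizes each gradient direction to 0/45/90/135 degrees and
// keeps a pixel only if it is a local maximum along that direction (q and r
// are the two neighbors on the quantized gradient line).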
void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float low_threshold, float weak, float strong) {
int n_elements = ggml_nelements(img);
float* imd = (float*)img->data;
float max = -INFINITY;
for (int i = 0; i < n_elements; i++) {
max = imd[i] > max ? imd[i] : max;
}
float ht = max * high_threshold;
float lt = ht * low_threshold;
for (int i = 0; i < n_elements; i++) {
float img_v = imd[i];
if (img_v >= ht) { // strong pixel
imd[i] = strong;
} else if (img_v <= ht && img_v >= lt) { // weak pixel
imd[i] = weak;
}
}
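// zero out a ~3-pixel border; interior pixels are kept unchanged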
for (int iy = 0; iy < img->ne[1]; iy++) {
for (int ix = 0; ix < img->ne[0]; ix++) {
if (ix >= 3 && ix <= img->ne[0] - 3 && iy >= 3 && iy <= img->ne[1] - 3) {
ggml_tensor_set_f32(img, ggml_tensor_get_f32(img, ix, iy), ix, iy);
} else {
ggml_tensor_set_f32(img, 0.0f, ix, iy);
}
}
}
// hysteresis
for (int iy = 1; iy < img->ne[1] - 1; iy++) {
for (int ix = 1; ix < img->ne[0] - 1; ix++) {
float imd_v = ggml_tensor_get_f32(img, ix, iy);
if (imd_v == weak) {
if (ggml_tensor_get_f32(img, ix + 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix + 1, iy) == strong ||
ggml_tensor_get_f32(img, ix, iy - 1) == strong || ggml_tensor_get_f32(img, ix, iy + 1) == strong ||
ggml_tensor_get_f32(img, ix - 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix - 1, iy) == strong) {
ggml_tensor_set_f32(img, strong, ix, iy);
} else {
ggml_tensor_set_f32(img, 0.0f, ix, iy);
}
}
}
}
}
uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params);
if (!work_ctx) {
LOG_ERROR("ggml_init() failed");
return NULL;
}
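// Sobel kernels for the horizontal (kX) and vertical (kY) image gradients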
float kX[9] = {
-1, 0, 1,
-2, 0, 2,
-1, 0, 1};
float kY[9] = {
1, 2, 1,
0, 0, 0,
-1, -2, -1};
// generate kernel
int kernel_size = 5;
struct ggml_tensor* gkernel = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kernel_size, kernel_size, 1, 1);
struct ggml_tensor* sf_kx = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
memcpy(sf_kx->data, kX, ggml_nbytes(sf_kx));
struct ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky));
gaussian_kernel(gkernel);
struct ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
struct ggml_tensor* iX = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* tetha = ggml_dup_tensor(work_ctx, image_gray);
sd_image_to_tensor(img, image);
grayscale(image, image_gray);
convolve(image_gray, image_gray, gkernel, 2);
convolve(image_gray, iX, sf_kx, 1);
convolve(image_gray, iY, sf_ky, 1);
prop_hypot(iX, iY, G);
normalize_tensor(G);
prop_arctan2(iX, iY, tetha);
non_max_supression(image_gray, G, tetha);
threshold_hystersis(image_gray, high_threshold, low_threshold, weak, strong);
// to RGB channels
for (int iy = 0; iy < height; iy++) {
for (int ix = 0; ix < width; ix++) {
float gray = ggml_tensor_get_f32(image_gray, ix, iy);
gray = inverse ? 1.0f - gray : gray;
ggml_tensor_set_f32(image, gray, ix, iy);
ggml_tensor_set_f32(image, gray, ix, iy, 1);
ggml_tensor_set_f32(image, gray, ix, iy, 2);
}
}
free(img);
uint8_t* output = sd_tensor_to_image(image);
ggml_free(work_ctx);
return output;
}
#endif // __PREPROCESSING_HPP__
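// A minimal usage sketch for preprocess_canny (the stb_image loader and the
// threshold values are illustrative assumptions):
//
//   int w, h, c;
//   uint8_t* img = stbi_load("input.png", &w, &h, &c, 3);
//   uint8_t* edges = preprocess_canny(img, w, h,
//                                     0.08f /*high*/, 0.08f /*low*/,
//                                     0.8f /*weak*/, 1.0f /*strong*/,
//                                     false /*inverse*/);
//   // note: preprocess_canny frees `img` and returns a newly allocated RGB buffer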

File diff suppressed because it is too large

View File

@ -30,8 +30,7 @@ extern "C" {
enum rng_type_t {
STD_DEFAULT_RNG,
CUDA_RNG,
RNG_TYPE_COUNT
CUDA_RNG
};
enum sample_method_t {
@ -42,22 +41,15 @@ enum sample_method_t {
DPMPP2S_A,
DPMPP2M,
DPMPP2Mv2,
IPNDM,
IPNDM_V,
LCM,
DDIM_TRAILING,
TCD,
SAMPLE_METHOD_COUNT
N_SAMPLE_METHODS
};
enum schedule_t {
DEFAULT,
DISCRETE,
KARRAS,
EXPONENTIAL,
AYS,
GITS,
SCHEDULE_COUNT
N_SCHEDULES
};
// same as enum ggml_type
@ -67,43 +59,26 @@ enum sd_type_t {
SD_TYPE_Q4_0 = 2,
SD_TYPE_Q4_1 = 3,
// SD_TYPE_Q4_2 = 4, support has been removed
// SD_TYPE_Q4_3 = 5, support has been removed
SD_TYPE_Q5_0 = 6,
SD_TYPE_Q5_1 = 7,
SD_TYPE_Q8_0 = 8,
SD_TYPE_Q8_1 = 9,
SD_TYPE_Q2_K = 10,
SD_TYPE_Q3_K = 11,
SD_TYPE_Q4_K = 12,
SD_TYPE_Q5_K = 13,
SD_TYPE_Q6_K = 14,
SD_TYPE_Q8_K = 15,
SD_TYPE_IQ2_XXS = 16,
SD_TYPE_IQ2_XS = 17,
SD_TYPE_IQ3_XXS = 18,
SD_TYPE_IQ1_S = 19,
SD_TYPE_IQ4_NL = 20,
SD_TYPE_IQ3_S = 21,
SD_TYPE_IQ2_S = 22,
SD_TYPE_IQ4_XS = 23,
SD_TYPE_I8 = 24,
SD_TYPE_I16 = 25,
SD_TYPE_I32 = 26,
SD_TYPE_I64 = 27,
SD_TYPE_F64 = 28,
SD_TYPE_IQ1_M = 29,
SD_TYPE_BF16 = 30,
// SD_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
// SD_TYPE_Q4_0_4_8 = 32,
// SD_TYPE_Q4_0_8_8 = 33,
SD_TYPE_TQ1_0 = 34,
SD_TYPE_TQ2_0 = 35,
// SD_TYPE_IQ4_NL_4_4 = 36,
// SD_TYPE_IQ4_NL_4_8 = 37,
// SD_TYPE_IQ4_NL_8_8 = 38,
SD_TYPE_COUNT = 39,
// SD_TYPE_Q4_3 (5) support has been removed
SD_TYPE_Q5_0 = 6,
SD_TYPE_Q5_1 = 7,
SD_TYPE_Q8_0 = 8,
SD_TYPE_Q8_1 = 9,
// k-quantizations
SD_TYPE_Q2_K = 10,
SD_TYPE_Q3_K = 11,
SD_TYPE_Q4_K = 12,
SD_TYPE_Q5_K = 13,
SD_TYPE_Q6_K = 14,
SD_TYPE_Q8_K = 15,
SD_TYPE_I8,
SD_TYPE_I16,
SD_TYPE_I32,
SD_TYPE_COUNT,
};
SD_API const char* sd_type_name(enum sd_type_t type);
enum sd_log_level_t {
SD_LOG_DEBUG,
SD_LOG_INFO,
@ -111,33 +86,11 @@ enum sd_log_level_t {
SD_LOG_ERROR
};
typedef struct {
const char* model_path;
const char* clip_l_path;
const char* clip_g_path;
const char* t5xxl_path;
const char* diffusion_model_path;
const char* vae_path;
const char* taesd_path;
const char* control_net_path;
const char* lora_model_dir;
const char* embedding_dir;
const char* stacked_id_embed_dir;
bool vae_decode_only;
bool vae_tiling;
bool free_params_immediately;
int n_threads;
enum sd_type_t wtype;
enum rng_type_t rng_type;
enum schedule_t schedule;
bool keep_clip_on_cpu;
bool keep_control_net_on_cpu;
bool keep_vae_on_cpu;
bool diffusion_flash_attn;
bool chroma_use_dit_mask;
bool chroma_use_t5_mask;
int chroma_t5_mask_pad;
} sd_ctx_params_t;
typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
SD_API int32_t get_num_physical_cores();
SD_API const char* sd_get_system_info();
typedef struct {
uint32_t width;
@ -146,118 +99,59 @@ typedef struct {
uint8_t* data;
} sd_image_t;
typedef struct {
int* layers;
size_t layer_count;
float layer_start;
float layer_end;
float scale;
} sd_slg_params_t;
typedef struct {
float txt_cfg;
float img_cfg;
float min_cfg;
float distilled_guidance;
sd_slg_params_t slg;
} sd_guidance_params_t;
typedef struct {
const char* prompt;
const char* negative_prompt;
int clip_skip;
sd_guidance_params_t guidance;
sd_image_t init_image;
sd_image_t* ref_images;
int ref_images_count;
sd_image_t mask_image;
int width;
int height;
enum sample_method_t sample_method;
int sample_steps;
float eta;
float strength;
int64_t seed;
int batch_count;
const sd_image_t* control_cond;
float control_strength;
float style_strength;
bool normalize_input;
const char* input_id_images_path;
} sd_img_gen_params_t;
typedef struct {
sd_image_t init_image;
int width;
int height;
sd_guidance_params_t guidance;
enum sample_method_t sample_method;
int sample_steps;
float strength;
int64_t seed;
int video_frames;
int motion_bucket_id;
int fps;
float augmentation_level;
} sd_vid_gen_params_t;
typedef struct sd_ctx_t sd_ctx_t;
typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
const char* vae_path,
const char* taesd_path,
const char* lora_model_dir,
bool vae_decode_only,
bool vae_tiling,
bool free_params_immediately,
int n_threads,
enum sd_type_t wtype,
enum rng_type_t rng_type,
enum schedule_t s);
SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
SD_API int32_t get_num_physical_cores();
SD_API const char* sd_get_system_info();
SD_API const char* sd_type_name(enum sd_type_t type);
SD_API enum sd_type_t str_to_sd_type(const char* str);
SD_API const char* sd_rng_type_name(enum rng_type_t rng_type);
SD_API enum rng_type_t str_to_rng_type(const char* str);
SD_API const char* sd_sample_method_name(enum sample_method_t sample_method);
SD_API enum sample_method_t str_to_sample_method(const char* str);
SD_API const char* sd_schedule_name(enum schedule_t schedule);
SD_API enum schedule_t str_to_schedule(const char* str);
SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
const char* prompt,
const char* negative_prompt,
int clip_skip,
float cfg_scale,
int width,
int height,
enum sample_method_t sample_method,
int sample_steps,
int64_t seed,
int batch_count);
SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params); // broken
SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
sd_image_t init_image,
const char* prompt,
const char* negative_prompt,
int clip_skip,
float cfg_scale,
int width,
int height,
enum sample_method_t sample_method,
int sample_steps,
float strength,
int64_t seed,
int batch_count);
typedef struct upscaler_ctx_t upscaler_ctx_t;
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
int n_threads);
int n_threads,
enum sd_type_t wtype);
SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
SD_API bool convert(const char* input_path,
const char* vae_path,
const char* output_path,
enum sd_type_t output_type,
const char* tensor_type_rules);
SD_API uint8_t* preprocess_canny(uint8_t* img,
int width,
int height,
float high_threshold,
float low_threshold,
float weak,
float strong,
bool inverse);
SD_API sd_image_t upscale(upscaler_ctx_t*, sd_image_t input_image, uint32_t upscale_factor);
#ifdef __cplusplus
}
#endif
#endif // __STABLE_DIFFUSION_H__
#endif // __STABLE_DIFFUSION_H__
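// A minimal txt2img sketch against the params-struct variant of this API
// (all field values are illustrative assumptions):
//
//   sd_ctx_params_t ctx_params;
//   sd_ctx_params_init(&ctx_params);
//   ctx_params.model_path = "sd-v1-4.ckpt";
//   ctx_params.n_threads  = get_num_physical_cores();
//   sd_ctx_t* ctx = new_sd_ctx(&ctx_params);
//
//   sd_img_gen_params_t gen_params;
//   sd_img_gen_params_init(&gen_params);
//   gen_params.prompt = "a lovely cat";
//   gen_params.width  = 512;
//   gen_params.height = 512;
//   sd_image_t* result = generate_image(ctx, &gen_params);
//   free_sd_ctx(ctx);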

t5.hpp

File diff suppressed because it is too large

tae.hpp
View File

@ -8,45 +8,88 @@
/*
=================================== TinyAutoEncoder ===================================
References:
https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoders/vae.py
https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoder_tiny.py
https://github.com/madebyollin/taesd/blob/main/taesd.py
*/
struct TAEBlock {
int in_channels;
int out_channels;
class TAEBlock : public UnaryBlock {
protected:
int n_in;
int n_out;
// conv
ggml_tensor* conv_0_w; // [in_channels, out_channels, 3, 3]
ggml_tensor* conv_0_b; // [in_channels]
ggml_tensor* conv_1_w; // [out_channels, out_channels, 3, 3]
ggml_tensor* conv_1_b; // [out_channels]
ggml_tensor* conv_2_w; // [out_channels, out_channels, 3, 3]
ggml_tensor* conv_2_b; // [out_channels]
public:
TAEBlock(int n_in, int n_out)
: n_in(n_in), n_out(n_out) {
blocks["conv.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_in, n_out, {3, 3}, {1, 1}, {1, 1}));
blocks["conv.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}));
blocks["conv.4"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}));
if (n_in != n_out) {
blocks["skip"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_in, n_out, {1, 1}, {1, 1}, {1, 1}, {1, 1}, false));
// skip
ggml_tensor* conv_skip_w; // [in_channels, out_channels, 1, 1]
size_t calculate_mem_size() {
size_t mem_size = in_channels * out_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_0_w
mem_size += in_channels * ggml_type_size(GGML_TYPE_F32); // conv_0_b
mem_size += out_channels * out_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_1_w
mem_size += out_channels * ggml_type_size(GGML_TYPE_F32); // conv_1_b
mem_size += out_channels * out_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_1_w
mem_size += out_channels * ggml_type_size(GGML_TYPE_F32); // conv_1_b
mem_size += out_channels * out_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_2_w
mem_size += out_channels * ggml_type_size(GGML_TYPE_F32); // conv_2_b
if (in_channels != out_channels) {
mem_size += in_channels * out_channels * ggml_type_size(GGML_TYPE_F16); // conv_skip_w
}
return mem_size;
}
int get_num_tensors() {
return 6 + (in_channels != out_channels ? 1 : 0);
}
void init_params(ggml_context* ctx) {
conv_0_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, in_channels);
conv_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
conv_1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, out_channels);
conv_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
conv_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, out_channels);
conv_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
if (in_channels != out_channels) {
conv_skip_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, out_channels, in_channels);
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [n, n_in, h, w]
// return: [n, n_out, h, w]
void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
tensors[prefix + "conv.0.weight"] = conv_0_w;
tensors[prefix + "conv.0.bias"] = conv_0_b;
auto conv_0 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.0"]);
auto conv_2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.2"]);
auto conv_4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.4"]);
tensors[prefix + "conv.2.weight"] = conv_1_w;
tensors[prefix + "conv.2.bias"] = conv_1_b;
auto h = conv_0->forward(ctx, x);
h = ggml_relu_inplace(ctx, h);
h = conv_2->forward(ctx, h);
h = ggml_relu_inplace(ctx, h);
h = conv_4->forward(ctx, h);
tensors[prefix + "conv.4.weight"] = conv_2_w;
tensors[prefix + "conv.4.bias"] = conv_2_b;
if (n_in != n_out) {
auto skip = std::dynamic_pointer_cast<Conv2d>(blocks["skip"]);
LOG_DEBUG("skip");
x = skip->forward(ctx, x);
if (in_channels != out_channels) {
tensors[prefix + "skip.weight"] = conv_skip_w;
}
}
ggml_tensor* forward(ggml_context* ctx, ggml_tensor* x) {
// conv(n_in, n_out)
ggml_tensor* h;
h = ggml_nn_conv_2d(ctx, x, conv_0_w, conv_0_b, 1, 1, 1, 1);
h = ggml_relu_inplace(ctx, h);
h = ggml_nn_conv_2d(ctx, h, conv_1_w, conv_1_b, 1, 1, 1, 1);
h = ggml_relu_inplace(ctx, h);
h = ggml_nn_conv_2d(ctx, h, conv_2_w, conv_2_b, 1, 1, 1, 1);
// skip connection
if (in_channels != out_channels) {
// skip = nn.Conv2d(n_in, n_out, 1, bias=False) if n_in != n_out else nn.Identity()
x = ggml_nn_conv_2d(ctx, x, conv_skip_w, NULL, 1, 1, 1, 1);
}
h = ggml_add(ctx, h, x);
@ -55,169 +98,425 @@ public:
}
};
class TinyEncoder : public UnaryBlock {
struct TinyEncoder {
int in_channels = 3;
int channels = 64;
int z_channels = 4;
int channels = 64;
int num_blocks = 3;
public:
TinyEncoder(int z_channels = 4)
: z_channels(z_channels) {
int index = 0;
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, channels, {3, 3}, {1, 1}, {1, 1}));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
// input
ggml_tensor* conv_input_w; // [channels, in_channels, 3, 3]
ggml_tensor* conv_input_b; // [channels]
TAEBlock initial_block;
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
ggml_tensor* conv_1_w; // [channels, channels, 3, 3]
TAEBlock input_blocks[3];
// middle
ggml_tensor* conv_2_w; // [channels, channels, 3, 3]
TAEBlock middle_blocks[3];
// output
ggml_tensor* conv_3_w; // [channels, channels, 3, 3]
TAEBlock output_blocks[3];
// final
ggml_tensor* conv_final_w; // [z_channels, channels, 3, 3]
ggml_tensor* conv_final_b; // [z_channels]
TinyEncoder() {
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
input_blocks[i].in_channels = channels;
input_blocks[i].out_channels = channels;
middle_blocks[i].in_channels = channels;
middle_blocks[i].out_channels = channels;
output_blocks[i].in_channels = channels;
output_blocks[i].out_channels = channels;
}
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1}));
initial_block.in_channels = channels;
initial_block.out_channels = channels;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [n, in_channels, h, w]
// return: [n, z_channels, h/8, w/8]
size_t calculate_mem_size() {
size_t mem_size = channels * in_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_input_w
mem_size += channels * ggml_type_size(GGML_TYPE_F32); // conv_input_b
for (int i = 0; i < num_blocks * 3 + 6; i++) {
auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(i)]);
mem_size += initial_block.calculate_mem_size();
x = block->forward(ctx, x);
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_1_w
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_2_w
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_3_w
for (int i = 0; i < num_blocks; i++) {
mem_size += input_blocks[i].calculate_mem_size();
mem_size += middle_blocks[i].calculate_mem_size();
mem_size += output_blocks[i].calculate_mem_size();
}
mem_size += z_channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_final_w
mem_size += z_channels * ggml_type_size(GGML_TYPE_F32); // conv_final_b
return mem_size;
}
int get_num_tensors() {
int num_tensors = 7;
for (int i = 0; i < num_blocks; i++) {
num_tensors += input_blocks[i].get_num_tensors();
num_tensors += middle_blocks[i].get_num_tensors();
num_tensors += output_blocks[i].get_num_tensors();
}
num_tensors += initial_block.get_num_tensors();
return num_tensors;
}
void init_params(ggml_context* ctx) {
conv_input_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, channels);
conv_input_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, channels);
initial_block.init_params(ctx);
conv_1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_final_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, z_channels);
conv_final_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_channels);
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].init_params(ctx);
middle_blocks[i].init_params(ctx);
output_blocks[i].init_params(ctx);
}
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
tensors[prefix + "0.weight"] = conv_input_w;
tensors[prefix + "0.bias"] = conv_input_b;
initial_block.map_by_name(tensors, prefix + "1.");
tensors[prefix + "2.weight"] = conv_1_w;
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 3) + ".");
}
return x;
tensors[prefix + "6.weight"] = conv_2_w;
for (int i = 0; i < num_blocks; i++) {
middle_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 7) + ".");
}
tensors[prefix + "10.weight"] = conv_3_w;
for (int i = 0; i < num_blocks; i++) {
output_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 11) + ".");
}
tensors[prefix + "14.weight"] = conv_final_w;
tensors[prefix + "14.bias"] = conv_final_b;
}
ggml_tensor* forward(ggml_context* ctx, ggml_tensor* x) {
// conv(3, 64)
auto z = ggml_nn_conv_2d(ctx, x, conv_input_w, conv_input_b, 1, 1, 1, 1);
// Block(64, 64)
z = initial_block.forward(ctx, z);
// conv(64, 64, stride=2, bias=False)
z = ggml_nn_conv_2d(ctx, z, conv_1_w, NULL, 2, 2, 1, 1);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
z = input_blocks[i].forward(ctx, z);
}
// conv(64, 64, stride=2, bias=False)
z = ggml_nn_conv_2d(ctx, z, conv_2_w, NULL, 2, 2, 1, 1);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
z = middle_blocks[i].forward(ctx, z);
}
// conv(64, 64, stride=2, bias=False)
z = ggml_nn_conv_2d(ctx, z, conv_3_w, NULL, 2, 2, 1, 1);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
z = output_blocks[i].forward(ctx, z);
}
// conv(64, 4)
z = ggml_nn_conv_2d(ctx, z, conv_final_w, conv_final_b, 1, 1, 1, 1);
return z;
}
};
class TinyDecoder : public UnaryBlock {
int z_channels = 4;
int channels = 64;
int out_channels = 3;
int num_blocks = 3;
struct TinyDecoder {
int z_channels = 4;
int channels = 64;
int output_channels = 3;
int num_blocks = 3;
public:
TinyDecoder(int z_channels = 4)
: z_channels(z_channels) {
int index = 0;
// input
ggml_tensor* conv_input_w; // [channels, z_channels, 3, 3]
ggml_tensor* conv_input_b; // [channels]
TAEBlock input_blocks[3];
ggml_tensor* conv_1_w; // [channels, channels, 3, 3]
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, channels, {3, 3}, {1, 1}, {1, 1}));
index++; // nn.ReLU()
// middle
TAEBlock middle_blocks[3];
ggml_tensor* conv_2_w; // [channels, channels, 3, 3]
// output
TAEBlock output_blocks[3];
ggml_tensor* conv_3_w; // [channels, channels, 3, 3]
// final
TAEBlock final_block;
ggml_tensor* conv_final_w; // [output_channels, channels, 3, 3]
ggml_tensor* conv_final_b; // [output_channels]
ggml_tensor* in_scale_1d3; // [1]
ggml_tensor* in_scale_3; // [1]
TinyDecoder() {
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
index++; // nn.Upsample()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
input_blocks[i].in_channels = channels;
input_blocks[i].out_channels = channels;
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
index++; // nn.Upsample()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
middle_blocks[i].in_channels = channels;
middle_blocks[i].out_channels = channels;
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
output_blocks[i].in_channels = channels;
output_blocks[i].out_channels = channels;
}
index++; // nn.Upsample()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
final_block.in_channels = channels;
final_block.out_channels = channels;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
// z: [n, z_channels, h, w]
// return: [n, out_channels, h*8, w*8]
size_t calculate_mem_size() {
size_t mem_size = channels * z_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_input_w
mem_size += channels * ggml_type_size(GGML_TYPE_F32); // conv_input_b
auto h = ggml_scale(ctx, z, 1.0f / 3.0f);
h = ggml_tanh_inplace(ctx, h);
h = ggml_scale(ctx, h, 3.0f);
for (int i = 0; i < num_blocks; i++) {
mem_size += input_blocks[i].calculate_mem_size();
}
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_1_w
for (int i = 0; i < num_blocks * 3 + 10; i++) {
if (blocks.find(std::to_string(i)) == blocks.end()) {
if (i == 1) {
h = ggml_relu_inplace(ctx, h);
} else {
h = ggml_upscale(ctx, h, 2, GGML_SCALE_MODE_NEAREST);
}
continue;
}
auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(i)]);
for (int i = 0; i < num_blocks; i++) {
mem_size += middle_blocks[i].calculate_mem_size();
}
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_2_w
h = block->forward(ctx, h);
for (int i = 0; i < num_blocks; i++) {
mem_size += output_blocks[i].calculate_mem_size();
}
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_3_w
mem_size += final_block.calculate_mem_size();
mem_size += output_channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_final_w
mem_size += output_channels * ggml_type_size(GGML_TYPE_F32); // conv_final_b
return mem_size;
}
int get_num_tensors() {
int num_tensors = 9;
for (int i = 0; i < num_blocks; i++) {
num_tensors += input_blocks[i].get_num_tensors();
num_tensors += middle_blocks[i].get_num_tensors();
num_tensors += output_blocks[i].get_num_tensors();
}
num_tensors += final_block.get_num_tensors();
return num_tensors;
}
void init_params(ggml_allocr* alloc, ggml_context* ctx) {
conv_input_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, z_channels, channels);
conv_input_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, channels);
conv_1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_final_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, output_channels);
conv_final_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, output_channels);
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].init_params(ctx);
middle_blocks[i].init_params(ctx);
output_blocks[i].init_params(ctx);
}
final_block.init_params(ctx);
// initialize constant scale factors
in_scale_1d3 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
in_scale_3 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
ggml_allocr_alloc(alloc, in_scale_1d3);
float scale_1d3 = 1.0f / 3.0f;
ggml_backend_tensor_set(in_scale_1d3, &scale_1d3, 0, sizeof(scale_1d3));
ggml_allocr_alloc(alloc, in_scale_3);
float scale_3 = 3.0f;
ggml_backend_tensor_set(in_scale_3, &scale_3, 0, sizeof(scale_3));
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
tensors[prefix + "0.weight"] = conv_input_w;
tensors[prefix + "0.bias"] = conv_input_b;
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 2) + ".");
}
tensors[prefix + "6.weight"] = conv_1_w;
for (int i = 0; i < num_blocks; i++) {
middle_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 7) + ".");
}
tensors[prefix + "11.weight"] = conv_2_w;
for (int i = 0; i < num_blocks; i++) {
output_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 12) + ".");
}
tensors[prefix + "16.weight"] = conv_3_w;
final_block.map_by_name(tensors, prefix + "17.");
tensors[prefix + "18.weight"] = conv_final_w;
tensors[prefix + "18.bias"] = conv_final_b;
}
ggml_tensor* forward(ggml_context* ctx, ggml_tensor* z) {
// torch.tanh(x / 3) * 3
auto h = ggml_scale(ctx, z, in_scale_1d3);
h = ggml_tanh_inplace(ctx, h);
h = ggml_scale(ctx, h, in_scale_3);
// conv(4, 64)
h = ggml_nn_conv_2d(ctx, h, conv_input_w, conv_input_b, 1, 1, 1, 1);
// nn.ReLU()
h = ggml_relu_inplace(ctx, h);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
h = input_blocks[i].forward(ctx, h);
}
// nn.Upsample(scale_factor=2)
h = ggml_upscale(ctx, h, 2);
// conv(64, 64, bias=False)
h = ggml_nn_conv_2d(ctx, h, conv_1_w, NULL, 1, 1, 1, 1);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
h = middle_blocks[i].forward(ctx, h);
}
// nn.Upsample(scale_factor=2)
h = ggml_upscale(ctx, h, 2);
// conv(64, 64, bias=False)
h = ggml_nn_conv_2d(ctx, h, conv_2_w, NULL, 1, 1, 1, 1);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
h = output_blocks[i].forward(ctx, h);
}
// nn.Upsample(scale_factor=2)
h = ggml_upscale(ctx, h, 2);
// conv(64, 64, bias=False)
h = ggml_nn_conv_2d(ctx, h, conv_3_w, NULL, 1, 1, 1, 1);
// Block(64, 64)
h = final_block.forward(ctx, h);
// conv(64, 3)
h = ggml_nn_conv_2d(ctx, h, conv_final_w, conv_final_b, 1, 1, 1, 1);
return h;
}
};
class TAESD : public GGMLBlock {
protected:
bool decode_only;
public:
TAESD(bool decode_only = true, SDVersion version = VERSION_SD1)
: decode_only(decode_only) {
int z_channels = 4;
if (sd_version_is_dit(version)) {
z_channels = 16;
}
blocks["decoder.layers"] = std::shared_ptr<GGMLBlock>(new TinyDecoder(z_channels));
if (!decode_only) {
blocks["encoder.layers"] = std::shared_ptr<GGMLBlock>(new TinyEncoder(z_channels));
}
}
struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) {
auto decoder = std::dynamic_pointer_cast<TinyDecoder>(blocks["decoder.layers"]);
return decoder->forward(ctx, z);
}
struct ggml_tensor* encode(struct ggml_context* ctx, struct ggml_tensor* x) {
auto encoder = std::dynamic_pointer_cast<TinyEncoder>(blocks["encoder.layers"]);
return encoder->forward(ctx, x);
}
};
struct TinyAutoEncoder : public GGMLRunner {
TAESD taesd;
struct TinyAutoEncoder : public GGMLModule {
TinyEncoder encoder;
TinyDecoder decoder;
bool decode_only = false;
TinyAutoEncoder(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
const std::string prefix,
bool decoder_only = true,
SDVersion version = VERSION_SD1)
: decode_only(decoder_only),
taesd(decoder_only, version),
GGMLRunner(backend) {
taesd.init(params_ctx, tensor_types, prefix);
TinyAutoEncoder(bool decoder_only_ = true)
: decode_only(decoder_only_) {
name = "tae";
}
std::string get_desc() {
return "taesd";
size_t calculate_mem_size() {
size_t mem_size = decoder.calculate_mem_size();
if (!decode_only) {
mem_size += encoder.calculate_mem_size();
}
mem_size += 1024; // padding
return mem_size;
}
bool load_from_file(const std::string& file_path) {
LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false");
alloc_params_buffer();
size_t get_num_tensors() {
size_t num_tensors = decoder.get_num_tensors();
if (!decode_only) {
num_tensors += encoder.get_num_tensors();
}
return num_tensors;
}
void init_params() {
ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
decoder.init_params(alloc, params_ctx);
if (!decode_only) {
encoder.init_params(params_ctx);
}
// alloc all tensors linked to this context
for (struct ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != NULL; t = ggml_get_next_tensor(params_ctx, t)) {
if (t->data == NULL) {
ggml_allocr_alloc(alloc, t);
}
}
ggml_allocr_free(alloc);
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors) {
decoder.map_by_name(tensors, "decoder.layers.");
encoder.map_by_name(tensors, "encoder.layers.");
}
bool load_from_file(const std::string& file_path, ggml_backend_t backend) {
LOG_INFO("loading taesd from '%s'", file_path.c_str());
if (!alloc_params_buffer(backend)) {
return false;
}
std::map<std::string, ggml_tensor*> taesd_tensors;
taesd.get_param_tensors(taesd_tensors);
// prepare memory for the weights
{
init_params();
map_by_name(taesd_tensors);
}
std::map<std::string, struct ggml_tensor*> tensors_need_to_load;
std::set<std::string> ignore_tensors;
if (decode_only) {
ignore_tensors.insert("encoder.");
for (auto& pair : taesd_tensors) {
const std::string& name = pair.first;
if (decode_only && starts_with(name, "encoder")) {
ignore_tensors.insert(name);
continue;
}
tensors_need_to_load.insert(pair);
}
ModelLoader model_loader;
@ -226,7 +525,7 @@ struct TinyAutoEncoder : public GGMLRunner {
return false;
}
bool success = model_loader.load_tensors(taesd_tensors, backend, ignore_tensors);
bool success = model_loader.load_tensors(tensors_need_to_load, backend, ignore_tensors);
if (!success) {
LOG_ERROR("load tae tensors from model loader failed");
@ -238,23 +537,57 @@ struct TinyAutoEncoder : public GGMLRunner {
}
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
z = to_backend(z);
struct ggml_tensor* out = decode_graph ? taesd.decode(compute_ctx, z) : taesd.encode(compute_ctx, z);
// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params = {
/*.mem_size =*/buf_size,
/*.mem_buffer =*/buf.data(),
/*.no_alloc =*/true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
// LOG_DEBUG("mem_size %u ", params.mem_size);
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_cgraph* gf = ggml_new_graph(ctx0);
struct ggml_tensor* z_ = NULL;
// when computing on a non-CPU backend, the input tensor must be copied into device memory
if (!ggml_backend_is_cpu(backend)) {
// pass input tensors to gpu memory
z_ = ggml_dup_tensor(ctx0, z);
ggml_allocr_alloc(compute_allocr, z_);
// pass data to device backend
if (!ggml_allocr_is_measure(compute_allocr)) {
ggml_backend_tensor_set(z_, z->data, 0, ggml_nbytes(z));
}
} else {
z_ = z;
}
struct ggml_tensor* out = decode_graph ? decoder.forward(ctx0, z_) : encoder.forward(ctx0, z_);
ggml_build_forward_expand(gf, out);
ggml_free(ctx0);
return gf;
}
void compute(const int n_threads,
struct ggml_tensor* z,
bool decode_graph,
struct ggml_tensor** output,
struct ggml_context* output_ctx = NULL) {
void alloc_compute_buffer(struct ggml_tensor* x, bool decode) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, decode);
};
GGMLModule::alloc_compute_buffer(get_graph);
}
void compute(struct ggml_tensor* work_result, int n_threads, struct ggml_tensor* z, bool decode_graph) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(z, decode_graph);
};
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
GGMLModule::compute(get_graph, n_threads, work_result);
}
};
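// A minimal decode sketch against the GGMLRunner-based variant above (the
// backend/context setup and the empty prefix are assumptions for illustration):
//
//   TinyAutoEncoder tae(backend, tensor_types, "", true /*decode_only*/, VERSION_SD1);
//   tae.load_from_file("taesd.safetensors");
//   struct ggml_tensor* img = NULL;
//   tae.compute(n_threads, latent, true /*decode_graph*/, &img, work_ctx);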

View File

@ -1,2 +0,0 @@
DisableFormat: true
SortIncludes: Never

View File

@ -1,10 +0,0 @@
Copyright (c) 2008-2011, Susumu Yata
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name of the <ORGANIZATION> nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,3 +1,2 @@
- json.hpp library from: https://github.com/nlohmann/json
- ZIP Library from: https://github.com/kuba--/zip
- darts.h from: https://github.com/google/sentencepiece/tree/master/third_party/darts_clone
- ZIP Library from: https://github.com/kuba--/zip

thirdparty/darts.h vendored

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -177,7 +177,7 @@ STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const
STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality, const char* parameters = NULL);
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality);
#ifdef STBIW_WINDOWS_UTF8
STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
@ -1412,7 +1412,7 @@ static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt
return DU[0];
}
static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality, const char* parameters) {
static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
// Constants that don't pollute global namespace
static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
@ -1521,20 +1521,6 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, in
s->func(s->context, (void*)YTable, sizeof(YTable));
stbiw__putc(s, 1);
s->func(s->context, UVTable, sizeof(UVTable));
// comment block with parameters of generation
if(parameters != NULL) {
stbiw__putc(s, 0xFF /* marker prefix */ );
stbiw__putc(s, 0xFE /* COM (comment) marker */ );
size_t param_length = std::min(2 + strlen("parameters") + 1 + strlen(parameters) + 1, (size_t) 0xFFFF);
stbiw__putc(s, param_length >> 8); // no need to mask, length < 65536
stbiw__putc(s, param_length & 0xFF);
s->func(s->context, (void*)"parameters", strlen("parameters") + 1); // std::string is zero-terminated
s->func(s->context, (void*)parameters, std::min(param_length, (size_t) 65534) - 2 - strlen("parameters") - 1);
if(param_length > 65534) stbiw__putc(s, 0); // always zero-terminate for safety
if(param_length & 1) stbiw__putc(s, 0xFF); // pad to even length
}
s->func(s->context, (void*)head1, sizeof(head1));
s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1);
s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
@ -1639,16 +1625,16 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x,
{
stbi__write_context s = { 0 };
stbi__start_write_callbacks(&s, func, context);
return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality, NULL);
return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality);
}
#ifndef STBI_WRITE_NO_STDIO
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality, const char* parameters)
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
{
stbi__write_context s = { 0 };
if (stbi__start_write_file(&s,filename)) {
int r = stbi_write_jpg_core(&s, x, y, comp, data, quality, parameters);
int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
stbi__end_write_file(&s);
return r;
} else

thirdparty/zip.c vendored
View File

@ -36,7 +36,6 @@
#include <unistd.h>
#endif
#define USE_EXTERNAL_MZCRC
#include "miniz.h"
#include "zip.h"
@ -1835,234 +1834,3 @@ int zip_extract(const char *zipname, const char *dir,
return zip_archive_extract(&zip_archive, dir, on_extract, arg);
}
#if defined(__SSE4_2__) || defined(__AVX512F__)
#include <immintrin.h>
#endif
// Phil Katz 32-Bit Cyclic Redundancy Check Uber Alles
// Goes 73 GiB/s on an AMD Ryzen Threadripper PRO 7995WX
// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
// V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
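// For reference, the bitwise (non-SIMD) form of the same reflected CRC-32
// (polynomial 0xEDB88320) that the folding code below accelerates:
//
//   uint32_t crc = ~init;
//   while (len--) {
//       crc ^= *buf++;
//       for (int k = 0; k < 8; k++)
//           crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1u));
//   }
//   return ~crc;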
mz_ulong mz_crc32(mz_ulong init, const uint8_t *buf, size_t len) {
uint32_t crc = ~init;
#if defined(__AVX512F__) && defined(__VPCLMULQDQ__) && defined(__PCLMUL__)
if (len >= 256) {
_Alignas(__m512) static const uint64_t k1k2[] = {
0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
};
_Alignas(__m512) static const uint64_t k3k4[] = {
0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
};
_Alignas(__m512) static const uint64_t k5k6[] = {
0x01751997d0,
0x00ccaa009e,
};
_Alignas(__m512) static const uint64_t k7k8[] = {
0x0163cd6124,
0x0000000000,
};
_Alignas(__m512) static const uint64_t poly[] = {
0x01db710641,
0x01f7011641,
};
__m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
__m128i a0, a1, a2, a3;
x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
x0 = _mm512_load_si512((__m512i *)k1k2);
buf += 256;
len -= 256;
while (len >= 256) {
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, x5);
x2 = _mm512_xor_si512(x2, x6);
x3 = _mm512_xor_si512(x3, x7);
x4 = _mm512_xor_si512(x4, x8);
x1 = _mm512_xor_si512(x1, y5);
x2 = _mm512_xor_si512(x2, y6);
x3 = _mm512_xor_si512(x3, y7);
x4 = _mm512_xor_si512(x4, y8);
buf += 256;
len -= 256;
}
x0 = _mm512_load_si512((__m512i *)k3k4);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x3);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x4);
x1 = _mm512_xor_si512(x1, x5);
while (len >= 64) {
x2 = _mm512_loadu_si512((__m512i *)buf);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
buf += 64;
len -= 64;
}
a0 = _mm_load_si128((__m128i *)k5k6);
a1 = _mm512_extracti32x4_epi32(x1, 0);
a2 = _mm512_extracti32x4_epi32(x1, 1);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm512_extracti32x4_epi32(x1, 2);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm512_extracti32x4_epi32(x1, 3);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
a3 = _mm_setr_epi32(~0, 0, ~0, 0);
a1 = _mm_srli_si128(a1, 8);
a1 = _mm_xor_si128(a1, a2);
a0 = _mm_loadl_epi64((__m128i *)k7k8);
a2 = _mm_srli_si128(a1, 4);
a1 = _mm_and_si128(a1, a3);
a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
a0 = _mm_load_si128((__m128i *)poly);
a2 = _mm_and_si128(a1, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
a2 = _mm_and_si128(a2, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
crc = _mm_extract_epi32(a1, 1);
}
#endif
#if defined(__SSE4_2__) && defined(__PCLMUL__)
if (len >= 64) {
_Alignas(__m128) static const uint64_t k1k2[] = {
0x0154442bd4,
0x01c6e41596,
};
_Alignas(__m128) static const uint64_t k3k4[] = {
0x01751997d0,
0x00ccaa009e,
};
_Alignas(__m128) static const uint64_t k5k0[] = {
0x0163cd6124,
0x0000000000,
};
_Alignas(__m128) static const uint64_t poly[] = {
0x01db710641,
0x01f7011641,
};
__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
x0 = _mm_load_si128((__m128i *)k1k2);
buf += 64;
len -= 64;
while (len >= 64) {
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_xor_si128(x2, x6);
x3 = _mm_xor_si128(x3, x7);
x4 = _mm_xor_si128(x4, x8);
x1 = _mm_xor_si128(x1, y5);
x2 = _mm_xor_si128(x2, y6);
x3 = _mm_xor_si128(x3, y7);
x4 = _mm_xor_si128(x4, y8);
buf += 64;
len -= 64;
}
x0 = _mm_load_si128((__m128i *)k3k4);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x3);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x4);
x1 = _mm_xor_si128(x1, x5);
while (len >= 16) {
x2 = _mm_loadu_si128((__m128i *)buf);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
buf += 16;
len -= 16;
}
x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
x3 = _mm_setr_epi32(~0, 0, ~0, 0);
x1 = _mm_srli_si128(x1, 8);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_loadl_epi64((__m128i *)k5k0);
x2 = _mm_srli_si128(x1, 4);
x1 = _mm_and_si128(x1, x3);
x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_load_si128((__m128i *)poly);
x2 = _mm_and_si128(x1, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
x2 = _mm_and_si128(x2, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
crc = _mm_extract_epi32(x1, 1);
}
#endif
static uint32_t tab[256];
if (!tab[255]) {
// generates the lookup table for byte-wise CRC calculation on the polynomial
// x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1
uint32_t polynomial = 0xedb88320; // bit-reversed representation
for (int d = 0; d < 256; ++d) {
uint32_t r = d;
for (int i = 0; i < 8; ++i)
r = r >> 1 ^ (r & 1 ? polynomial : 0);
tab[d] = r;
}
}
for (size_t i = 0; i < len; ++i)
crc = crc >> 8 ^ tab[(crc & 255) ^ buf[i]];
return ~crc & 0xffffffff;
}
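A minimal usage sketch for the function above, assuming the zlib-style convention (which miniz follows) that 0 starts a fresh checksum and that one call's result can seed the next:

```cpp
// Incremental CRC-32 over two buffers; the result equals a single pass
// over their concatenation, regardless of which SIMD path is taken.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include "miniz.h"

int main() {
    const char part1[] = "hello ";
    const char part2[] = "world";
    mz_ulong crc = 0;  // zlib-style starting value
    crc = mz_crc32(crc, (const uint8_t *)part1, strlen(part1));
    crc = mz_crc32(crc, (const uint8_t *)part2, strlen(part2));
    printf("crc32 = %08lx\n", (unsigned long)crc);  // same as crc32("hello world")
    return 0;
}
```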

unet.hpp (1347 lines changed)

File diff suppressed because it is too large.

upscaler.cpp

@ -6,7 +6,7 @@
struct UpscalerGGML {
ggml_backend_t backend = NULL; // general backend
ggml_type model_data_type = GGML_TYPE_F16;
std::shared_ptr<ESRGAN> esrgan_upscaler;
ESRGAN esrgan_upscaler;
std::string esrgan_path;
int n_threads;
@ -15,39 +15,22 @@ struct UpscalerGGML {
}
bool load_from_file(const std::string& esrgan_path) {
#ifdef SD_USE_CUDA
#ifdef SD_USE_CUBLAS
LOG_DEBUG("Using CUDA backend");
backend = ggml_backend_cuda_init(0);
#endif
#ifdef SD_USE_METAL
LOG_DEBUG("Using Metal backend");
ggml_log_set(ggml_log_callback_default, nullptr);
ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
backend = ggml_backend_metal_init();
#endif
#ifdef SD_USE_VULKAN
LOG_DEBUG("Using Vulkan backend");
backend = ggml_backend_vk_init(0);
#endif
#ifdef SD_USE_OPENCL
LOG_DEBUG("Using OpenCL backend");
backend = ggml_backend_opencl_init();
#endif
#ifdef SD_USE_SYCL
LOG_DEBUG("Using SYCL backend");
backend = ggml_backend_sycl_init(0);
#endif
ModelLoader model_loader;
if (!model_loader.init_from_file(esrgan_path)) {
LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str());
}
model_loader.set_wtype_override(model_data_type);
if (!backend) {
LOG_DEBUG("Using CPU backend");
backend = ggml_backend_cpu_init();
}
LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
esrgan_upscaler = std::make_shared<ESRGAN>(backend, model_loader.tensor_storages_types);
if (!esrgan_upscaler->load_from_file(esrgan_path)) {
if (!esrgan_upscaler.load_from_file(esrgan_path, backend)) {
return false;
}
return true;
@ -56,8 +39,8 @@ struct UpscalerGGML {
sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor) {
// upscale_factor is unused for RealESRGAN_x4plus_anime_6B.pth
sd_image_t upscaled_image = {0, 0, 0, NULL};
int output_width = (int)input_image.width * esrgan_upscaler->scale;
int output_height = (int)input_image.height * esrgan_upscaler->scale;
int output_width = (int)input_image.width * esrgan_upscaler.scale;
int output_height = (int)input_image.height * esrgan_upscaler.scale;
LOG_INFO("upscaling from (%i x %i) to (%i x %i)",
input_image.width, input_image.height, output_width, output_height);
@ -79,11 +62,15 @@ struct UpscalerGGML {
ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1);
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
esrgan_upscaler->compute(n_threads, in, &out);
if (init) {
esrgan_upscaler.alloc_compute_buffer(in);
} else {
esrgan_upscaler.compute(out, n_threads, in);
}
};
int64_t t0 = ggml_time_ms();
sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, on_tiling);
esrgan_upscaler->free_compute_buffer();
sd_tiling(input_image_tensor, upscaled, esrgan_upscaler.scale, esrgan_upscaler.tile_size, 0.25f, on_tiling);
esrgan_upscaler.free_compute_buffer();
ggml_tensor_clamp(upscaled, 0.f, 1.f);
uint8_t* upscaled_data = sd_tensor_to_image(upscaled);
ggml_free(upscale_ctx);
@ -104,7 +91,8 @@ struct upscaler_ctx_t {
};
upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
int n_threads) {
int n_threads,
enum sd_type_t wtype) {
upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
if (upscaler_ctx == NULL) {
return NULL;
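Taken together with the tiling callback above, the refactored C API is driven roughly as follows. This is a sketch, not code from the repository: the `upscale()` and `free_upscaler_ctx()` entry points and the `SD_TYPE_F16` weight type are assumed to match the signatures visible in this diff and in stable-diffusion.h.

```cpp
// Hypothetical driver for the upscaler C API.
#include "stable-diffusion.h"

sd_image_t upscale_4x(sd_image_t input, const char *esrgan_path, int n_threads) {
    sd_image_t out = {0, 0, 0, NULL};
    upscaler_ctx_t *ctx = new_upscaler_ctx(esrgan_path, n_threads, SD_TYPE_F16);
    if (ctx == NULL) {
        return out;  // allocation or model load failed
    }
    out = upscale(ctx, input, 4);  // the factor is ignored by fixed-x4 ESRGAN models
    free_upscaler_ctx(ctx);
    return out;
}
```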

util.cpp (482 lines changed)

@ -1,7 +1,6 @@
#include "util.h"
#include <stdarg.h>
#include <algorithm>
#include <cmath>
#include <codecvt>
#include <fstream>
#include <locale>
@ -10,7 +9,6 @@
#include <thread>
#include <unordered_set>
#include <vector>
#include "preprocessing.hpp"
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/sysctl.h>
@ -22,13 +20,9 @@
#include <unistd.h>
#endif
#include "ggml-cpu.h"
#include "ggml.h"
#include "ggml/ggml.h"
#include "stable-diffusion.h"
#define STB_IMAGE_RESIZE_IMPLEMENTATION
#include "stb_image_resize.h"
bool ends_with(const std::string& str, const std::string& ending) {
if (str.length() >= ending.length()) {
return (str.compare(str.length() - ending.length(), ending.length(), ending) == 0);
@ -44,13 +38,6 @@ bool starts_with(const std::string& str, const std::string& start) {
return false;
}
bool contains(const std::string& str, const std::string& substr) {
if (str.find(substr) != std::string::npos) {
return true;
}
return false;
}
void replace_all_chars(std::string& str, char target, char replacement) {
for (size_t i = 0; i < str.length(); ++i) {
if (str[i] == target) {
@ -85,70 +72,6 @@ bool is_directory(const std::string& path) {
return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
}
std::string get_full_path(const std::string& dir, const std::string& filename) {
std::string full_path = dir + "\\" + filename;
WIN32_FIND_DATA find_file_data;
HANDLE hFind = FindFirstFile(full_path.c_str(), &find_file_data);
if (hFind != INVALID_HANDLE_VALUE) {
FindClose(hFind);
return full_path;
} else {
return "";
}
}
std::vector<std::string> get_files_from_dir(const std::string& dir) {
std::vector<std::string> files;
WIN32_FIND_DATA findFileData;
HANDLE hFind;
char currentDirectory[MAX_PATH];
GetCurrentDirectory(MAX_PATH, currentDirectory);
char directoryPath[MAX_PATH]; // this is absolute path
sprintf(directoryPath, "%s\\%s\\*", currentDirectory, dir.c_str());
// Find the first file in the directory
hFind = FindFirstFile(directoryPath, &findFileData);
bool isAbsolutePath = false;
// Check if the directory was found
if (hFind == INVALID_HANDLE_VALUE) {
printf("Unable to find directory. Try with original path \n");
char directoryPathAbsolute[MAX_PATH];
sprintf(directoryPathAbsolute, "%s*", dir.c_str());
hFind = FindFirstFile(directoryPathAbsolute, &findFileData);
isAbsolutePath = true;
if (hFind == INVALID_HANDLE_VALUE) {
printf("Absolute path was also wrong.\n");
return files;
}
}
// Loop through all files in the directory
do {
// Check if the found file is a regular file (not a directory)
if (!(findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) {
if (isAbsolutePath) {
files.push_back(dir + "\\" + std::string(findFileData.cFileName));
} else {
files.push_back(std::string(currentDirectory) + "\\" + dir + "\\" + std::string(findFileData.cFileName));
}
}
} while (FindNextFile(hFind, &findFileData) != 0);
// Close the handle
FindClose(hFind);
sort(files.begin(), files.end());
return files;
}
#else // Unix
#include <dirent.h>
#include <sys/stat.h>
@ -163,47 +86,6 @@ bool is_directory(const std::string& path) {
return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
}
// TODO: add windows version
std::string get_full_path(const std::string& dir, const std::string& filename) {
DIR* dp = opendir(dir.c_str());
if (dp != nullptr) {
struct dirent* entry;
while ((entry = readdir(dp)) != nullptr) {
if (strcasecmp(entry->d_name, filename.c_str()) == 0) {
closedir(dp);
return dir + "/" + entry->d_name;
}
}
closedir(dp);
}
return "";
}
std::vector<std::string> get_files_from_dir(const std::string& dir) {
std::vector<std::string> files;
DIR* dp = opendir(dir.c_str());
if (dp != nullptr) {
struct dirent* entry;
while ((entry = readdir(dp)) != nullptr) {
std::string fname = dir + "/" + entry->d_name;
if (!is_directory(fname))
files.push_back(fname);
}
closedir(dp);
}
sort(files.begin(), files.end());
return files;
}
#endif
// get_num_physical_cores is copied from
@ -244,9 +126,6 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
static sd_progress_cb_t sd_progress_cb = NULL;
void* sd_progress_cb_data = NULL;
std::u32string utf8_to_utf32(const std::string& utf8_str) {
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
return converter.from_bytes(utf8_str);
@ -262,7 +141,7 @@ std::u32string unicode_value_to_utf32(int unicode_value) {
return utf32_string;
}
static std::string sd_basename(const std::string& path) {
std::string sd_basename(const std::string& path) {
size_t pos = path.find_last_of('/');
if (pos != std::string::npos) {
return path.substr(pos + 1);
@ -290,64 +169,7 @@ std::string path_join(const std::string& p1, const std::string& p2) {
return p1 + "/" + p2;
}
std::vector<std::string> splitString(const std::string& str, char delimiter) {
std::vector<std::string> result;
size_t start = 0;
size_t end = str.find(delimiter);
while (end != std::string::npos) {
result.push_back(str.substr(start, end - start));
start = end + 1;
end = str.find(delimiter, start);
}
// Add the last segment after the last delimiter
result.push_back(str.substr(start));
return result;
}
sd_image_t* preprocess_id_image(sd_image_t* img) {
int shortest_edge = 224;
int size = shortest_edge;
sd_image_t* resized = NULL;
uint32_t w = img->width;
uint32_t h = img->height;
uint32_t c = img->channel;
// 1. do resize using stb_resize functions
unsigned char* buf = (unsigned char*)malloc(sizeof(unsigned char) * 3 * size * size);
if (!stbir_resize_uint8(img->data, w, h, 0,
buf, size, size, 0,
c)) {
fprintf(stderr, "%s: resize operation failed\n", __func__);
return resized;
}
// 2. do center crop (likely unnecessary due to step 1)
// 3. do rescale
// 4. do normalize
// 3 and 4 will need to be done in float format.
resized = new sd_image_t{(uint32_t)shortest_edge,
(uint32_t)shortest_edge,
3,
buf};
return resized;
}
void pretty_progress(int step, int steps, float time) {
if (sd_progress_cb) {
sd_progress_cb(step, steps, time, sd_progress_cb_data);
return;
}
if (step == 0) {
return;
}
std::string progress = " |";
int max_progress = 50;
int32_t current = (int32_t)(step * 1.f * max_progress / steps);
@ -361,7 +183,7 @@ void pretty_progress(int step, int steps, float time) {
}
}
progress += "|";
printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s\033[K",
printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s",
progress.c_str(), step, steps,
time > 1.0f || time == 0 ? time : (1.0f / time));
fflush(stdout); // for linux
@ -370,24 +192,6 @@ void pretty_progress(int step, int steps, float time) {
}
}
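The callback hook referenced here lets an embedder replace the built-in progress bar entirely. A minimal sketch, with the `sd_progress_cb_t` signature inferred from the `sd_progress_cb(step, steps, time, sd_progress_cb_data)` call site above:

```cpp
// Hypothetical progress reporter routed through the callback instead of
// the default terminal bar.
#include <cstdio>

static void my_progress(int step, int steps, float time, void *data) {
    (void)data;
    fprintf(stderr, "step %d/%d (%.2f s/it)\n", step, steps, time);
}

// During initialization:
//     sd_set_progress_callback(my_progress, NULL);
```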
std::string ltrim(const std::string& s) {
auto it = std::find_if(s.begin(), s.end(), [](int ch) {
return !std::isspace(ch);
});
return std::string(it, s.end());
}
std::string rtrim(const std::string& s) {
auto it = std::find_if(s.rbegin(), s.rend(), [](int ch) {
return !std::isspace(ch);
});
return std::string(s.begin(), it.base());
}
std::string trim(const std::string& s) {
return rtrim(ltrim(s));
}
static sd_log_cb_t sd_log_cb = NULL;
void* sd_log_cb_data = NULL;
@ -397,13 +201,23 @@ void log_printf(sd_log_level_t level, const char* file, int line, const char* fo
va_list args;
va_start(args, format);
static char log_buffer[LOG_BUFFER_SIZE + 1];
int written = snprintf(log_buffer, LOG_BUFFER_SIZE, "%s:%-4d - ", sd_basename(file).c_str(), line);
const char* level_str = "DEBUG";
if (level == SD_LOG_INFO) {
level_str = "INFO ";
} else if (level == SD_LOG_WARN) {
level_str = "WARN ";
} else if (level == SD_LOG_ERROR) {
level_str = "ERROR";
}
static char log_buffer[LOG_BUFFER_SIZE];
int written = snprintf(log_buffer, LOG_BUFFER_SIZE, "[%s] %s:%-4d - ", level_str, sd_basename(file).c_str(), line);
if (written >= 0 && written < LOG_BUFFER_SIZE) {
vsnprintf(log_buffer + written, LOG_BUFFER_SIZE - written, format, args);
strncat(log_buffer, "\n", LOG_BUFFER_SIZE - strlen(log_buffer) - 1);
}
strncat(log_buffer, "\n", LOG_BUFFER_SIZE - strlen(log_buffer));
if (sd_log_cb) {
sd_log_cb(level, log_buffer, sd_log_cb_data);
@ -416,14 +230,12 @@ void sd_set_log_callback(sd_log_cb_t cb, void* data) {
sd_log_cb = cb;
sd_log_cb_data = data;
}
void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
sd_progress_cb = cb;
sd_progress_cb_data = data;
}
const char* sd_get_system_info() {
static char buffer[1024];
std::stringstream ss;
ss << "System Info: \n";
ss << " BLAS = " << ggml_cpu_has_blas() << std::endl;
ss << " SSE3 = " << ggml_cpu_has_sse3() << std::endl;
ss << " AVX = " << ggml_cpu_has_avx() << std::endl;
ss << " AVX2 = " << ggml_cpu_has_avx2() << std::endl;
@ -441,258 +253,6 @@ const char* sd_get_system_info() {
return buffer;
}
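With the level string now baked into each message, a custom sink only needs to filter or forward whole lines. A sketch, assuming the `sd_log_cb_t` signature implied by the `sd_log_cb(level, log_buffer, sd_log_cb_data)` call above:

```cpp
// Hypothetical log sink: drop DEBUG noise, send everything else to stderr.
#include <cstdio>
#include "stable-diffusion.h"

static void my_log(enum sd_log_level_t level, const char *text, void *data) {
    (void)data;
    if (level == SD_LOG_DEBUG) {
        return;
    }
    fputs(text, stderr);  // text already ends with '\n'
}

// sd_set_log_callback(my_log, NULL);
```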
sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image) {
sd_image_f32_t converted_image;
converted_image.width = image.width;
converted_image.height = image.height;
converted_image.channel = image.channel;
// Allocate memory for float data
converted_image.data = (float*)malloc(image.width * image.height * image.channel * sizeof(float));
for (int i = 0; i < image.width * image.height * image.channel; i++) {
// Convert uint8_t to float
converted_image.data[i] = (float)image.data[i];
}
return converted_image;
}
const char* sd_type_name(enum sd_type_t type) {
return ggml_type_name((ggml_type)type);
}
// Performs bilinear interpolation between four neighboring samples
float interpolate(float v1, float v2, float v3, float v4, float x_ratio, float y_ratio) {
return v1 * (1 - x_ratio) * (1 - y_ratio) + v2 * x_ratio * (1 - y_ratio) + v3 * (1 - x_ratio) * y_ratio + v4 * x_ratio * y_ratio;
}
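With fractional offsets $a = x - x_1$ and $b = y - y_1$, `interpolate` computes the standard bilinear blend of the four neighboring samples:

$$f = v_1\,(1-a)(1-b) + v_2\,a(1-b) + v_3\,(1-a)\,b + v_4\,a\,b$$

Note that the callers below read the neighbors at $x_1 + 1$ and $y_1 + 1$ without clamping, so for the last row and column of the source image the reads can land one element past the edge; a robust port would clamp $x_2$ and $y_2$ to the image bounds.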
sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height) {
sd_image_f32_t resized_image;
resized_image.width = target_width;
resized_image.height = target_height;
resized_image.channel = image.channel;
// Allocate memory for resized float data
resized_image.data = (float*)malloc(target_width * target_height * image.channel * sizeof(float));
for (int y = 0; y < target_height; y++) {
for (int x = 0; x < target_width; x++) {
float original_x = (float)x * image.width / target_width;
float original_y = (float)y * image.height / target_height;
int x1 = (int)original_x;
int y1 = (int)original_y;
int x2 = x1 + 1;
int y2 = y1 + 1;
for (int k = 0; k < image.channel; k++) {
float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k);
float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k);
float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k);
float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k);
float x_ratio = original_x - x1;
float y_ratio = original_y - y1;
float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio);
*(resized_image.data + y * target_width * image.channel + x * image.channel + k) = value;
}
}
}
return resized_image;
}
void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]) {
for (int y = 0; y < image.height; y++) {
for (int x = 0; x < image.width; x++) {
for (int k = 0; k < image.channel; k++) {
int index = (y * image.width + x) * image.channel + k;
image.data[index] = (image.data[index] - means[k]) / stds[k];
}
}
}
}
// Normalization constants (CLIP per-channel means and standard deviations)
float means[3] = {0.48145466, 0.4578275, 0.40821073};
float stds[3] = {0.26862954, 0.26130258, 0.27577711};
// Function to clip and preprocess sd_image_f32_t
sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) {
float scale = (float)size / fmin(image.width, image.height);
// Interpolation
int new_width = (int)(scale * image.width);
int new_height = (int)(scale * image.height);
float* resized_data = (float*)malloc(new_width * new_height * image.channel * sizeof(float));
for (int y = 0; y < new_height; y++) {
for (int x = 0; x < new_width; x++) {
float original_x = (float)x * image.width / new_width;
float original_y = (float)y * image.height / new_height;
int x1 = (int)original_x;
int y1 = (int)original_y;
int x2 = x1 + 1;
int y2 = y1 + 1;
for (int k = 0; k < image.channel; k++) {
float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k);
float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k);
float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k);
float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k);
float x_ratio = original_x - x1;
float y_ratio = original_y - y1;
float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio);
*(resized_data + y * new_width * image.channel + x * image.channel + k) = value;
}
}
}
// Center crop to size x size and rescale to [0, 1]
int h = (new_height - size) / 2;
int w = (new_width - size) / 2;
sd_image_f32_t result;
result.width = size;
result.height = size;
result.channel = image.channel;
result.data = (float*)malloc(size * size * image.channel * sizeof(float));
for (int k = 0; k < image.channel; k++) {
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
*(result.data + i * size * image.channel + j * image.channel + k) =
fmin(fmax(*(resized_data + (i + h) * new_width * image.channel + (j + w) * image.channel + k), 0.0f), 255.0f) / 255.0f;
}
}
}
// Free allocated memory
free(resized_data);
// Normalize
for (int k = 0; k < image.channel; k++) {
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
// *(result.data + i * size * image.channel + j * image.channel + k) = 0.5f;
int offset = i * size * image.channel + j * image.channel + k;
float value = *(result.data + offset);
value = (value - means[k]) / stds[k];
// value = 0.5f;
*(result.data + offset) = value;
}
}
}
return result;
}
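End to end, the helpers above compose into the usual CLIP image pipeline. A sketch using only the functions defined in this file (buffer ownership follows the malloc/free conventions visible in the code):

```cpp
// Hypothetical CLIP input preparation from an 8-bit RGB image.
sd_image_f32_t prepare_clip_input(sd_image_t image) {
    sd_image_f32_t f32 = sd_image_t_to_sd_image_f32_t(image);  // uint8 -> float, HWC
    sd_image_f32_t clip_in = clip_preprocess(f32, 224);  // resize, center crop,
                                                         // rescale to [0,1], normalize
    free(f32.data);  // clip_preprocess allocated a fresh buffer for its result
    return clip_in;  // 224 x 224 x 3 floats; the caller frees .data
}
```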
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/prompt_parser.py#L345
//
// Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
// Accepted tokens are:
// (abc) - increases attention to abc by a multiplier of 1.1
// (abc:3.12) - increases attention to abc by a multiplier of 3.12
// [abc] - decreases attention to abc by a multiplier of 1.1
// \( - literal character '('
// \[ - literal character '['
// \) - literal character ')'
// \] - literal character ']'
// \\ - literal character '\'
// anything else - just text
//
// >>> parse_prompt_attention('normal text')
// [['normal text', 1.0]]
// >>> parse_prompt_attention('an (important) word')
// [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
// >>> parse_prompt_attention('(unbalanced')
// [['unbalanced', 1.1]]
// >>> parse_prompt_attention('\(literal\]')
// [['(literal]', 1.0]]
// >>> parse_prompt_attention('(unnecessary)(parens)')
// [['unnecessaryparens', 1.1]]
// >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
// [['a ', 1.0],
// ['house', 1.5730000000000004],
// [' ', 1.1],
// ['on', 1.0],
// [' a ', 1.1],
// ['hill', 0.55],
// [', sun, ', 1.1],
// ['sky', 1.4641000000000006],
// ['.', 1.1]]
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text) {
std::vector<std::pair<std::string, float>> res;
std::vector<int> round_brackets;
std::vector<int> square_brackets;
float round_bracket_multiplier = 1.1f;
float square_bracket_multiplier = 1 / 1.1f;
std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
std::regex re_break(R"(\s*\bBREAK\b\s*)");
auto multiply_range = [&](int start_position, float multiplier) {
for (int p = start_position; p < res.size(); ++p) {
res[p].second *= multiplier;
}
};
std::smatch m;
std::string remaining_text = text;
while (std::regex_search(remaining_text, m, re_attention)) {
std::string text = m[0];
std::string weight = m[1];
if (text == "(") {
round_brackets.push_back((int)res.size());
} else if (text == "[") {
square_brackets.push_back((int)res.size());
} else if (!weight.empty()) {
if (!round_brackets.empty()) {
multiply_range(round_brackets.back(), std::stof(weight));
round_brackets.pop_back();
}
} else if (text == ")" && !round_brackets.empty()) {
multiply_range(round_brackets.back(), round_bracket_multiplier);
round_brackets.pop_back();
} else if (text == "]" && !square_brackets.empty()) {
multiply_range(square_brackets.back(), square_bracket_multiplier);
square_brackets.pop_back();
} else if (text == "\\(") {
res.push_back({text.substr(1), 1.0f});
} else {
res.push_back({text, 1.0f});
}
remaining_text = m.suffix();
}
for (int pos : round_brackets) {
multiply_range(pos, round_bracket_multiplier);
}
for (int pos : square_brackets) {
multiply_range(pos, square_bracket_multiplier);
}
if (res.empty()) {
res.push_back({"", 1.0f});
}
int i = 0;
while (i + 1 < res.size()) {
if (res[i].second == res[i + 1].second) {
res[i].first += res[i + 1].first;
res.erase(res.begin() + i + 1);
} else {
++i;
}
}
return res;
}
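A quick way to sanity-check the parser is to dump the (text, weight) pairs it returns, mirroring the doctest-style examples in the comment above:

```cpp
// Prints each parsed segment with its attention weight; for
// "a (red:1.3) apple" this yields ["a " 1.00] ["red" 1.30] [" apple" 1.00].
#include <cstdio>
#include "util.h"

int main() {
    auto parsed = parse_prompt_attention("a (red:1.3) apple");
    for (const auto &piece : parsed) {
        printf("[\"%s\" %.2f] ", piece.first.c_str(), piece.second);
    }
    printf("\n");
    return 0;
}
```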

util.h (33 lines changed)

@ -3,16 +3,11 @@
#include <cstdint>
#include <string>
#include <vector>
#include "stable-diffusion.h"
#define SAFE_STR(s) ((s) ? (s) : "")
#define BOOL_STR(b) ((b) ? "true" : "false")
bool ends_with(const std::string& str, const std::string& ending);
bool starts_with(const std::string& str, const std::string& start);
bool contains(const std::string& str, const std::string& substr);
std::string format(const char* fmt, ...);
@ -20,43 +15,19 @@ void replace_all_chars(std::string& str, char target, char replacement);
bool file_exists(const std::string& filename);
bool is_directory(const std::string& path);
std::string get_full_path(const std::string& dir, const std::string& filename);
std::vector<std::string> get_files_from_dir(const std::string& dir);
std::u32string utf8_to_utf32(const std::string& utf8_str);
std::string utf32_to_utf8(const std::u32string& utf32_str);
std::u32string unicode_value_to_utf32(int unicode_value);
sd_image_t* preprocess_id_image(sd_image_t* img);
// std::string sd_basename(const std::string& path);
typedef struct {
uint32_t width;
uint32_t height;
uint32_t channel;
float* data;
} sd_image_f32_t;
void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]);
sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image);
sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height);
sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size);
std::string sd_basename(const std::string& path);
std::string path_join(const std::string& p1, const std::string& p2);
std::vector<std::string> splitString(const std::string& str, char delimiter);
void pretty_progress(int step, int steps, float time);
void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...);
std::string trim(const std::string& s);
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);
#define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)

vae.hpp (1072 lines changed)

File diff suppressed because it is too large.

vocab.hpp (2424069 lines changed)

File diff suppressed because it is too large.