mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-23 22:56:42 +00:00
Compare commits
4 Commits
bb90bfa00f
...
5a34bc7f6e
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5a34bc7f6e | ||
|
|
146b6cc49e | ||
|
|
93527fda74 | ||
|
|
6e66a1a4a4 |
196
docs/pulid.md
Normal file
196
docs/pulid.md
Normal file
@ -0,0 +1,196 @@
|
|||||||
|
# PuLID-Flux face-identity preservation
|
||||||
|
|
||||||
|
stable-diffusion.cpp supports the [PuLID-Flux](https://github.com/ToTheBeginning/PuLID)
|
||||||
|
identity-injection technique on top of Flux.1 (schnell or dev) models.
|
||||||
|
Given a single source portrait, PuLID-Flux produces new generations that
|
||||||
|
preserve the source person's face across arbitrary scenes, poses, and
|
||||||
|
prompts.
|
||||||
|
|
||||||
|
Unlike PhotoMaker (which extracts the identity inside the inference
|
||||||
|
process from a directory of images), PuLID-Flux's identity extractor is
|
||||||
|
a heavy stack (insightface ArcFace + EVA-CLIP-L + IDFormer encoder) that
|
||||||
|
is impractical to port to C++/ggml. To keep this implementation small and
|
||||||
|
cross-vendor, **stable-diffusion.cpp consumes a precomputed identity
|
||||||
|
embedding** produced by an external Python tool that runs once per source
|
||||||
|
portrait. Everything downstream of that one-shot extraction is C++ and
|
||||||
|
runs on any backend (Vulkan, CUDA, Metal, ROCm, CPU).
|
||||||
|
|
||||||
|
## Architecture summary
|
||||||
|
|
||||||
|
The PuLID-Flux contribution to the Flux denoise loop is a stack of 20
|
||||||
|
small cross-attention modules (`PerceiverAttentionCA`) inserted between
|
||||||
|
the Flux transformer blocks:
|
||||||
|
|
||||||
|
- After every 2nd of the 19 double-stream blocks (10 hook points)
|
||||||
|
- After every 4th of the 38 single-stream blocks (10 hook points)
|
||||||
|
|
||||||
|
Each cross-attention layer takes the current image tokens as query, the
|
||||||
|
32-token / 2048-dim identity embedding as key+value, and adds its output
|
||||||
|
(scaled by `id_weight`, typically 1.0) back to the image tokens.
|
||||||
|
|
||||||
|
## Required weights
|
||||||
|
|
||||||
|
Three inputs are required -- the standard Flux weight set plus two PuLID-specific files:
|
||||||
|
|
||||||
|
1. **Flux base** (transformer + VAE + clip_l + t5xxl) -- exactly as
|
||||||
|
[docs/flux.md](flux.md) describes.
|
||||||
|
2. **PuLID weights** -- download from
|
||||||
|
[guozinan/PuLID](https://huggingface.co/guozinan/PuLID):
|
||||||
|
- `pulid_flux_v0.9.0.safetensors` or `pulid_flux_v0.9.1.safetensors`
|
||||||
|
(recommended; this implementation is verified against v0.9.1)
|
||||||
|
- **v1.1 (`pulid_v1.1.safetensors`) is NOT yet supported** -- it uses
|
||||||
|
renamed keys (`id_adapter_attn_layers.*` instead of `pulid_ca.*`)
|
||||||
|
and possibly different module structure. Future PR.
|
||||||
|
3. **Identity embedding (.pulidembd)** -- produced by the precompute
|
||||||
|
tool below.
|
||||||
|
|
||||||
|
## Precompute the identity embedding
|
||||||
|
|
||||||
|
The precompute tool runs the PyTorch identity-extraction stack on a
|
||||||
|
single portrait image and writes the resulting `(32, 2048)` embedding
|
||||||
|
to a `.pulidembd` binary file (about 131 KB). Run it once per source
|
||||||
|
person; the same file is reused for any number of generations.
|
||||||
|
|
||||||
|
A reference Python script is provided in this repository at
|
||||||
|
[`script/pulid_extract_id.py`](../script/pulid_extract_id.py). It
|
||||||
|
requires:
|
||||||
|
- A working CUDA / CPU PyTorch stack
|
||||||
|
- `insightface`, `facexlib`, `eva-clip`, `torchvision`, `opencv-python`,
|
||||||
|
`huggingface_hub`, `gguf`
|
||||||
|
- The PuLID weights file (same one stable-diffusion.cpp will load below)
|
||||||
|
- The ToTheBeginning/PuLID repo's `pulid/` package (including
|
||||||
|
`pulid/pipeline_flux.py`) and `eva_clip/` package on `PYTHONPATH`; `flux/`
|
||||||
|
is not needed for embedding extraction
|
||||||
|
|
||||||
|
Run it as:
|
||||||
|
|
||||||
|
```
|
||||||
|
python pulid_extract_id.py \
|
||||||
|
--portrait /path/to/source-photo.jpg \
|
||||||
|
--pulid-weights /path/to/pulid_flux_v0.9.1.safetensors \
|
||||||
|
--out /path/to/source.pulidembd
|
||||||
|
```
|
||||||
|
|
||||||
|
## Format (gguf)
|
||||||
|
|
||||||
|
The embedding is a standard **gguf** container holding a single tensor:
|
||||||
|
|
||||||
|
```
|
||||||
|
tensor name : "pulid_id"
|
||||||
|
shape : [token_dim, num_tokens] (ggml order; typically [2048, 32])
|
||||||
|
type : F16 (also accepts F32 / BF16)
|
||||||
|
metadata : general.architecture = "pulid", pulid.version = 1
|
||||||
|
```
|
||||||
|
|
||||||
|
stable-diffusion.cpp loads it with the normal gguf reader
|
||||||
|
(`gguf_init_from_file`) and converts to fp32 at load time -- no bespoke
|
||||||
|
parser. Total file size for the typical (32, 2048, fp16) case is ~131 KB.
|
||||||
|
|
||||||
|
## Command-line usage
|
||||||
|
|
||||||
|
```
|
||||||
|
.\bin\Release\sd-cli.exe \
|
||||||
|
--diffusion-model models\flux1-schnell-Q4_K_S.gguf \
|
||||||
|
--vae models\ae.safetensors \
|
||||||
|
--clip_l models\clip_l.safetensors \
|
||||||
|
--t5xxl models\t5xxl_fp16.safetensors \
|
||||||
|
--pulid-weights models\pulid_flux_v0.9.1.safetensors \
|
||||||
|
--pulid-id-embedding source.pulidembd \
|
||||||
|
--pulid-id-weight 1.0 \
|
||||||
|
-p "candid photograph of a young woman on a beach at sunset" \
|
||||||
|
--cfg-scale 1.0 --sampling-method euler --steps 4 -W 512 -H 512 \
|
||||||
|
--seed 42 --clip-on-cpu \
|
||||||
|
-o out.png
|
||||||
|
```
|
||||||
|
|
||||||
|
For Flux Dev (instead of Schnell), add `--guidance 3.5` and `--steps 20`.
|
||||||
|
|
||||||
|
## Flags
|
||||||
|
|
||||||
|
| Flag | Purpose |
|
||||||
|
|----------------------------|-------------------------------------------------------------------|
|
||||||
|
| `--pulid-weights <path>` | Path to `pulid_flux_v0.9.x.safetensors`. Loaded with the model. |
|
||||||
|
| `--pulid-id-embedding <p>` | Path to a `.pulidembd` binary produced by the precompute tool. |
|
||||||
|
| `--pulid-id-weight <f>` | Identity-injection strength. Typical 0.7-1.2; default 1.0. |
|
||||||
|
|
||||||
|
Both `--pulid-weights` and `--pulid-id-embedding` must be set to activate PuLID; `--pulid-id-weight` is optional and defaults to 1.0. Setting only
|
||||||
|
`--pulid-weights` (no embedding) loads the weights but disables injection
|
||||||
|
at runtime. Setting `--pulid-id-weight 0` zeros out the contribution
|
||||||
|
(useful for falsification testing: outputs should be byte-identical to
|
||||||
|
a no-PuLID run with the same seed).
|
||||||
|
|
||||||
|
## Memory budget
|
||||||
|
|
||||||
|
At 512x512, 4 steps (Schnell), the 20 cross-attention layers add roughly
|
||||||
|
10% to denoise time and almost nothing to peak VRAM. Tested on a 12 GB
|
||||||
|
consumer card alongside Flux Schnell Q4 GGUF + CPU-offloaded clip_l and
|
||||||
|
t5xxl + GPU-resident VAE.
|
||||||
|
|
||||||
|
At 1024x1024 with Flux Dev Q4 + 20 steps + PuLID, the VAE decode compute
|
||||||
|
buffer doesn't fit on a 12 GB card even with `--vae-on-cpu`. Workaround:
|
||||||
|
explicitly route VAE to the CPU backend instead of the offload flag:
|
||||||
|
|
||||||
|
```
|
||||||
|
--backend "diffusion=vulkan0,vae=cpu"
|
||||||
|
```
|
||||||
|
|
||||||
|
The `--vae-on-cpu` flag offloads VAE weights but leaves the compute graph
|
||||||
|
on the default backend; this is existing stable-diffusion.cpp behavior,
|
||||||
|
not a PuLID-specific issue. Documented here because anyone running PuLID
|
||||||
|
at 1024 will hit it.
|
||||||
|
|
||||||
|
## Backend selection
|
||||||
|
|
||||||
|
The standard `--backend` flag works as documented. Common patterns:
|
||||||
|
|
||||||
|
```
|
||||||
|
# AMD Vulkan
|
||||||
|
--backend "diffusion=vulkan0,vae=cpu"
|
||||||
|
|
||||||
|
# NVIDIA Vulkan
|
||||||
|
--backend "diffusion=vulkan1,vae=cpu"
|
||||||
|
|
||||||
|
# CUDA
|
||||||
|
--backend "diffusion=cuda0,vae=cpu"
|
||||||
|
```
|
||||||
|
|
||||||
|
The PuLID cross-attention layers run on the same backend as the main
|
||||||
|
diffusion model. They have not yet been independently profiled on every
|
||||||
|
backend; only Vulkan and CPU have been tested by the original contributor.
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
A three-way SHA-256 check is the recommended sanity test when bringing up
|
||||||
|
a new combination of model + backend + hardware:
|
||||||
|
|
||||||
|
| Run | Expected hash relation |
|
||||||
|
|----------------------------------------------|------------------------------------|
|
||||||
|
| A: no `--pulid-*` flags | baseline |
|
||||||
|
| B: PuLID flags, `--pulid-id-weight 0.0` | **byte-identical to A** |
|
||||||
|
| C: PuLID flags, `--pulid-id-weight 1.0` | **different from A,B**, preserves source identity |
|
||||||
|
|
||||||
|
If A and B differ (even when C looks correct), the injection is allocating
|
||||||
|
or computing something even at zero weight -- likely a bug.
|
||||||
|
|
||||||
|
## Limitations / not yet supported
|
||||||
|
|
||||||
|
- **`--skip-layers` (skip-layer-guidance / SLG) combined with PuLID** is not
|
||||||
|
supported. The `pulid_ca` index advances per non-skipped block, so a
|
||||||
|
skipped block silently misaligns the cross-attention weight assignment
|
||||||
|
vs. the trained intervals. The reference PyTorch implementation does
|
||||||
|
not have SLG either, so there is no well-defined behavior to emulate.
|
||||||
|
Use either feature alone.
|
||||||
|
- **PuLID v1.1 weights** (`pulid_v1.1.safetensors`, renamed key layout).
|
||||||
|
- **Multiple ID images.** The reference PyTorch implementation can fuse
|
||||||
|
several portraits into one embedding for stronger identity. This
|
||||||
|
implementation accepts a single embedding produced from one or more
|
||||||
|
images by the external precompute tool.
|
||||||
|
- **Negative-prompt branch of CFG.** PuLID only injects on the positive
|
||||||
|
conditioning path in the published reference, and the implementation
|
||||||
|
here follows that. Flux's distilled guidance doesn't run a separate
|
||||||
|
uncond branch in normal use, so this matters only for `--true-cfg`
|
||||||
|
workflows that aren't standard for Flux.
|
||||||
|
- **Backends other than Vulkan and CPU** are untested by the original
|
||||||
|
contributor. The implementation is pure-ggml and should work on CUDA,
|
||||||
|
ROCm, and Metal, but verification by users on those backends is
|
||||||
|
welcomed.
|
||||||
@ -415,6 +415,10 @@ ArgOptions SDContextParams::get_options() {
|
|||||||
"--photo-maker",
|
"--photo-maker",
|
||||||
"path to PHOTOMAKER model",
|
"path to PHOTOMAKER model",
|
||||||
&photo_maker_path},
|
&photo_maker_path},
|
||||||
|
{"",
|
||||||
|
"--pulid-weights",
|
||||||
|
"path to PuLID Flux weights",
|
||||||
|
&pulid_weights_path},
|
||||||
{"",
|
{"",
|
||||||
"--upscale-model",
|
"--upscale-model",
|
||||||
"path to esrgan model.",
|
"path to esrgan model.",
|
||||||
@ -812,6 +816,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) {
|
|||||||
sd_ctx_params.embeddings = embedding_vec.data();
|
sd_ctx_params.embeddings = embedding_vec.data();
|
||||||
sd_ctx_params.embedding_count = static_cast<uint32_t>(embedding_vec.size());
|
sd_ctx_params.embedding_count = static_cast<uint32_t>(embedding_vec.size());
|
||||||
sd_ctx_params.photo_maker_path = photo_maker_path.c_str();
|
sd_ctx_params.photo_maker_path = photo_maker_path.c_str();
|
||||||
|
sd_ctx_params.pulid_weights_path = pulid_weights_path.c_str();
|
||||||
sd_ctx_params.tensor_type_rules = tensor_type_rules.c_str();
|
sd_ctx_params.tensor_type_rules = tensor_type_rules.c_str();
|
||||||
sd_ctx_params.n_threads = n_threads;
|
sd_ctx_params.n_threads = n_threads;
|
||||||
sd_ctx_params.wtype = wtype;
|
sd_ctx_params.wtype = wtype;
|
||||||
@ -887,6 +892,10 @@ ArgOptions SDGenerationParams::get_options() {
|
|||||||
"--pm-id-embed-path",
|
"--pm-id-embed-path",
|
||||||
"path to PHOTOMAKER v2 id embed",
|
"path to PHOTOMAKER v2 id embed",
|
||||||
&pm_id_embed_path},
|
&pm_id_embed_path},
|
||||||
|
{"",
|
||||||
|
"--pulid-id-embedding",
|
||||||
|
"path to PuLID id embedding",
|
||||||
|
&pulid_id_embedding_path},
|
||||||
{"",
|
{"",
|
||||||
"--hires-upscaler",
|
"--hires-upscaler",
|
||||||
"highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), "
|
"highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), "
|
||||||
@ -1037,6 +1046,10 @@ ArgOptions SDGenerationParams::get_options() {
|
|||||||
"--pm-style-strength",
|
"--pm-style-strength",
|
||||||
"",
|
"",
|
||||||
&pm_style_strength},
|
&pm_style_strength},
|
||||||
|
{"",
|
||||||
|
"--pulid-id-weight",
|
||||||
|
"strength of PuLID identity injection",
|
||||||
|
&pulid_id_weight},
|
||||||
{"",
|
{"",
|
||||||
"--control-strength",
|
"--control-strength",
|
||||||
"strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image",
|
"strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image",
|
||||||
@ -2269,6 +2282,11 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
|
|||||||
pm_style_strength,
|
pm_style_strength,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
sd_pulid_params_t pulid_params = {
|
||||||
|
pulid_id_embedding_path.empty() ? nullptr : pulid_id_embedding_path.c_str(),
|
||||||
|
pulid_id_weight,
|
||||||
|
};
|
||||||
|
|
||||||
params.loras = lora_vec.empty() ? nullptr : lora_vec.data();
|
params.loras = lora_vec.empty() ? nullptr : lora_vec.data();
|
||||||
params.lora_count = static_cast<uint32_t>(lora_vec.size());
|
params.lora_count = static_cast<uint32_t>(lora_vec.size());
|
||||||
params.prompt = prompt.c_str();
|
params.prompt = prompt.c_str();
|
||||||
@ -2289,6 +2307,7 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
|
|||||||
params.control_image = control_image.get();
|
params.control_image = control_image.get();
|
||||||
params.control_strength = control_strength;
|
params.control_strength = control_strength;
|
||||||
params.pm_params = pm_params;
|
params.pm_params = pm_params;
|
||||||
|
params.pulid_params = pulid_params;
|
||||||
params.vae_tiling_params = vae_tiling_params;
|
params.vae_tiling_params = vae_tiling_params;
|
||||||
params.cache = cache_params;
|
params.cache = cache_params;
|
||||||
|
|
||||||
|
|||||||
@ -133,6 +133,7 @@ struct SDContextParams {
|
|||||||
std::string control_net_path;
|
std::string control_net_path;
|
||||||
std::string embedding_dir;
|
std::string embedding_dir;
|
||||||
std::string photo_maker_path;
|
std::string photo_maker_path;
|
||||||
|
std::string pulid_weights_path;
|
||||||
sd_type_t wtype = SD_TYPE_COUNT;
|
sd_type_t wtype = SD_TYPE_COUNT;
|
||||||
std::string tensor_type_rules;
|
std::string tensor_type_rules;
|
||||||
std::string lora_model_dir = ".";
|
std::string lora_model_dir = ".";
|
||||||
@ -234,6 +235,9 @@ struct SDGenerationParams {
|
|||||||
std::string pm_id_embed_path;
|
std::string pm_id_embed_path;
|
||||||
float pm_style_strength = 20.f;
|
float pm_style_strength = 20.f;
|
||||||
|
|
||||||
|
std::string pulid_id_embedding_path;
|
||||||
|
float pulid_id_weight = 1.0f;
|
||||||
|
|
||||||
int upscale_repeats = 1;
|
int upscale_repeats = 1;
|
||||||
int upscale_tile_size = 128;
|
int upscale_tile_size = 128;
|
||||||
|
|
||||||
|
|||||||
@ -195,6 +195,7 @@ typedef struct {
|
|||||||
const sd_embedding_t* embeddings;
|
const sd_embedding_t* embeddings;
|
||||||
uint32_t embedding_count;
|
uint32_t embedding_count;
|
||||||
const char* photo_maker_path;
|
const char* photo_maker_path;
|
||||||
|
const char* pulid_weights_path;
|
||||||
const char* tensor_type_rules;
|
const char* tensor_type_rules;
|
||||||
int n_threads;
|
int n_threads;
|
||||||
enum sd_type_t wtype;
|
enum sd_type_t wtype;
|
||||||
@ -272,6 +273,11 @@ typedef struct {
|
|||||||
float style_strength;
|
float style_strength;
|
||||||
} sd_pm_params_t; // photo maker
|
} sd_pm_params_t; // photo maker
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
const char* id_embedding_path;
|
||||||
|
float id_weight;
|
||||||
|
} sd_pulid_params_t;
|
||||||
|
|
||||||
enum sd_cache_mode_t {
|
enum sd_cache_mode_t {
|
||||||
SD_CACHE_DISABLED = 0,
|
SD_CACHE_DISABLED = 0,
|
||||||
SD_CACHE_EASYCACHE,
|
SD_CACHE_EASYCACHE,
|
||||||
@ -364,6 +370,7 @@ typedef struct {
|
|||||||
sd_image_t control_image;
|
sd_image_t control_image;
|
||||||
float control_strength;
|
float control_strength;
|
||||||
sd_pm_params_t pm_params;
|
sd_pm_params_t pm_params;
|
||||||
|
sd_pulid_params_t pulid_params;
|
||||||
sd_tiling_params_t vae_tiling_params;
|
sd_tiling_params_t vae_tiling_params;
|
||||||
sd_cache_params_t cache;
|
sd_cache_params_t cache;
|
||||||
sd_hires_params_t hires;
|
sd_hires_params_t hires;
|
||||||
@ -445,6 +452,17 @@ SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
|
|||||||
SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
|
SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
|
||||||
SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
|
SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
|
||||||
|
|
||||||
|
enum sd_cancel_mode_t {
|
||||||
|
// Stop the current generation as soon as possible.
|
||||||
|
SD_CANCEL_ALL,
|
||||||
|
// Finish the current image sample, then skip additional batch latents and return completed images.
|
||||||
|
SD_CANCEL_NEW_LATENTS,
|
||||||
|
// Clear a pending cancellation request.
|
||||||
|
SD_CANCEL_RESET
|
||||||
|
};
|
||||||
|
|
||||||
|
SD_API void sd_cancel_generation(sd_ctx_t* sd_ctx, enum sd_cancel_mode_t mode);
|
||||||
|
|
||||||
SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
|
SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
|
||||||
SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
||||||
const sd_vid_gen_params_t* sd_vid_gen_params,
|
const sd_vid_gen_params_t* sd_vid_gen_params,
|
||||||
|
|||||||
134
script/pulid_extract_id.py
Normal file
134
script/pulid_extract_id.py
Normal file
@ -0,0 +1,134 @@
|
|||||||
|
"""
|
||||||
|
Precompute a PuLID-Flux identity embedding from a single source portrait.
|
||||||
|
|
||||||
|
Writes a gguf file (a single tensor `pulid_id`) that stable-diffusion.cpp's
|
||||||
|
`--pulid-id-embedding` flag consumes.
|
||||||
|
|
||||||
|
Dependencies (recommended: vendor rather than pip-install due to upstream
|
||||||
|
packaging quirks):
|
||||||
|
- torch + safetensors
|
||||||
|
- The ToTheBeginning/PuLID repository's `pulid/` package and `eva_clip/`.
|
||||||
|
Put them on PYTHONPATH or sys.path before running this script.
|
||||||
|
- insightface, facexlib, torchvision, opencv-python, huggingface_hub, gguf
|
||||||
|
- numpy, Pillow
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python script/pulid_extract_id.py \\
|
||||||
|
--portrait /path/to/source-photo.jpg \\
|
||||||
|
--pulid-weights /path/to/pulid_flux_v0.9.1.safetensors \\
|
||||||
|
--out /path/to/source.pulidembd
|
||||||
|
|
||||||
|
The portrait must contain a clearly visible face. insightface's antelopev2
|
||||||
|
detector will be auto-downloaded on first run.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
|
||||||
|
def extract(portrait_path: str, pulid_weights: str) -> "torch.Tensor":
    """Compute the PuLID identity embedding for a single portrait image.

    Args:
        portrait_path: Path to the source portrait. It is opened with Pillow
            and converted to RGB before being handed to the pipeline.
        pulid_weights: Path to the PuLID weights file, forwarded to
            ``PuLIDPipeline.load_pretrain`` as ``pretrain_path``.

    Returns:
        The first element returned by ``PuLIDPipeline.get_id_embedding``.
        When that tensor is 3-D with a leading axis of size 1, the leading
        axis is dropped; any other shape is returned unchanged.
    """
    # Third-party packages are imported inside the function, so importing this
    # module (e.g. for argument parsing) does not require them.
    import numpy as np
    import torch
    from PIL import Image
    from pulid.pipeline_flux import PuLIDPipeline

    if torch.cuda.is_available():
        device, onnx_provider = "cuda", "gpu"
    else:
        device, onnx_provider = "cpu", "cpu"

    print(f"device={device}", flush=True)

    # PuLIDPipeline only attaches pulid_ca attributes to `dit` during
    # construction; get_id_embedding() never runs Flux, so a dummy object is
    # enough and avoids importing/building a Flux skeleton.
    print("instantiating PuLIDPipeline with a dummy Flux object", flush=True)
    dit = SimpleNamespace()
    pulid = PuLIDPipeline(dit=dit,
                          device=device,
                          weight_dtype=torch.bfloat16,
                          onnx_provider=onnx_provider)

    print(f"loading PuLID weights from {pulid_weights}", flush=True)
    # NOTE(review): the version tag is fixed to "v0.9.1" regardless of which
    # weights file is passed -- confirm this is also correct for v0.9.0 files.
    pulid.load_pretrain(pretrain_path=pulid_weights, version="v0.9.1")

    print(f"extracting ID embedding from {portrait_path}", flush=True)
    # Read the pixels inside a context manager so the image file handle is
    # closed deterministically instead of whenever the object is collected.
    with Image.open(portrait_path) as portrait:
        face_img = np.array(portrait.convert("RGB"))
    id_embedding, _ = pulid.get_id_embedding(face_img)
    print(f"id embedding shape={tuple(id_embedding.shape)} dtype={id_embedding.dtype}",
          flush=True)

    # Drop a singleton batch axis: (1, T, D) -> (T, D).
    if id_embedding.ndim == 3 and id_embedding.shape[0] == 1:
        id_embedding = id_embedding[0]
    return id_embedding
|
||||||
|
|
||||||
|
|
||||||
|
def write_embd(tensor, out_path: str, dtype_choice: str) -> None:
    """Write an identity embedding to a gguf file as the tensor ``pulid_id``.

    Args:
        tensor: 2-D torch tensor laid out as ``(num_tokens, token_dim)``.
        out_path: Destination file. Its parent directory is created if needed.
        dtype_choice: Storage type, one of ``"fp16"``, ``"fp32"`` or ``"bf16"``.

    Raises:
        ValueError: If ``tensor`` is not 2-D or ``dtype_choice`` is unknown.
            Both are checked before anything is created on disk.
    """
    import gguf
    import torch

    if tensor.ndim != 2:
        raise ValueError(f"expected (num_tokens, token_dim); got {tuple(tensor.shape)}")
    # Reject a bad dtype up front so a failed call leaves no directory or
    # half-initialised writer behind.
    if dtype_choice not in ("fp16", "fp32", "bf16"):
        raise ValueError(f"unknown --dtype {dtype_choice}")
    num_tokens, token_dim = tensor.shape

    # .numpy() refuses tensors that require grad; detach() is a no-op otherwise.
    tensor = tensor.detach()

    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)

    writer = gguf.GGUFWriter(out_path, arch="pulid")
    try:
        writer.add_uint32("pulid.version", 1)

        if dtype_choice == "fp16":
            arr = tensor.to(torch.float16).contiguous().cpu().numpy()
            writer.add_tensor("pulid_id", arr)
        elif dtype_choice == "fp32":
            arr = tensor.to(torch.float32).contiguous().cpu().numpy()
            writer.add_tensor("pulid_id", arr)
        else:  # "bf16"
            # numpy has no bfloat16, so the raw 16-bit patterns are passed
            # through and the real type/shape are declared explicitly.
            raw = tensor.to(torch.bfloat16).contiguous().view(torch.uint16).cpu().numpy()
            writer.add_tensor("pulid_id", raw,
                              raw_shape=(int(num_tokens), int(token_dim)),
                              raw_dtype=gguf.GGMLQuantizationType.BF16)

        writer.write_header_to_file()
        writer.write_kv_data_to_file()
        writer.write_tensors_to_file()
    finally:
        # Close the writer even when a write step raises.
        writer.close()

    # Dimensions are printed fastest-varying first, i.e. reversed from torch.
    print(f"wrote {out_path}: gguf, tensor pulid_id [{token_dim}, {num_tokens}] {dtype_choice}",
          flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Command-line entry point.

    Parses the arguments, checks that both input paths exist, then extracts
    the embedding and writes it out.

    Returns:
        0 on success, 2 when the portrait path does not exist, 3 when the
        PuLID weights path does not exist.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--portrait",
        required=True,
        help="Path to the source portrait image (JPG/PNG).",
    )
    parser.add_argument(
        "--pulid-weights",
        required=True,
        help="Path to pulid_flux_v0.9.x.safetensors.",
    )
    parser.add_argument(
        "--out",
        required=True,
        help="Output path for the .pulidembd binary.",
    )
    parser.add_argument(
        "--dtype",
        default="fp16",
        choices=["fp16", "bf16", "fp32"],
        help="Storage dtype (default fp16; produces ~131 KB).",
    )
    opts = parser.parse_args()

    # Validate inputs before the expensive model stack is imported and loaded.
    portrait_found = os.path.exists(opts.portrait)
    if not portrait_found:
        print(f"ERROR: portrait not found at {opts.portrait}", file=sys.stderr)
        return 2

    weights_found = os.path.exists(opts.pulid_weights)
    if not weights_found:
        print(f"ERROR: PuLID weights not found at {opts.pulid_weights}", file=sys.stderr)
        return 3

    write_embd(extract(opts.portrait, opts.pulid_weights), opts.out, opts.dtype)
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
raise SystemExit(main())
|
||||||
@ -10,6 +10,7 @@
|
|||||||
|
|
||||||
#include "conditioning/conditioner.hpp"
|
#include "conditioning/conditioner.hpp"
|
||||||
#include "core/ggml_extend_backend.h"
|
#include "core/ggml_extend_backend.h"
|
||||||
|
#include "model/diffusion/model.hpp"
|
||||||
#include "model_loader.h"
|
#include "model_loader.h"
|
||||||
#include "model_manager.h"
|
#include "model_manager.h"
|
||||||
#include "stable-diffusion.h"
|
#include "stable-diffusion.h"
|
||||||
@ -30,6 +31,7 @@ struct GenerationExtensionConditionContext {
|
|||||||
Conditioner* conditioner;
|
Conditioner* conditioner;
|
||||||
ConditionerParams& condition_params;
|
ConditionerParams& condition_params;
|
||||||
const sd_pm_params_t& pm_params;
|
const sd_pm_params_t& pm_params;
|
||||||
|
const sd_pulid_params_t& pulid_params;
|
||||||
int n_threads;
|
int n_threads;
|
||||||
int total_steps;
|
int total_steps;
|
||||||
};
|
};
|
||||||
@ -56,8 +58,20 @@ struct GenerationExtension {
|
|||||||
const SDCondition& condition) const {
|
const SDCondition& condition) const {
|
||||||
return condition;
|
return condition;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Called in the denoise loop for each enabled extension, after the per-step
|
||||||
|
// DiffusionParams (including its version-specific `extra`) has been built,
|
||||||
|
// but before diffusion_model->compute(). Lets an extension feed data into
|
||||||
|
// the diffusion forward that the conditioning-side hooks can't reach -- it
|
||||||
|
// can set/override fields on `params` (typically the architecture-specific
|
||||||
|
// `params.extra`, e.g. a guidance tensor, control payload, or an identity
|
||||||
|
// embedding for an adapter that injects inside the model's blocks). The
|
||||||
|
// extension targets whichever `extra` variant matches the active model.
|
||||||
|
// Mutates `params` only, never the extension. Default no-op.
|
||||||
|
virtual void before_diffusion(DiffusionParams& /*params*/, int /*step*/) const {}
|
||||||
};
|
};
|
||||||
|
|
||||||
std::shared_ptr<GenerationExtension> create_photomaker_extension();
|
std::shared_ptr<GenerationExtension> create_photomaker_extension();
|
||||||
|
std::shared_ptr<GenerationExtension> create_pulid_extension();
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
123
src/extensions/pulid_extension.cpp
Normal file
123
src/extensions/pulid_extension.cpp
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
#include "extensions/generation_extension.h"
|
||||||
|
|
||||||
|
#include <cstring>
|
||||||
|
#include <variant>
|
||||||
|
|
||||||
|
#include "core/tensor_ggml.hpp"
|
||||||
|
#include "core/util.h"
|
||||||
|
#include "gguf.h"
|
||||||
|
|
||||||
|
// Reads a PuLID identity embedding from the gguf file at `path`.
//
// The file must hold a tensor named "pulid_id" whose first two ggml
// dimensions are [token_dim, num_tokens] (remaining dimensions equal to 1),
// stored as F32, F16 or BF16. Values are converted to fp32.
//
// Returns a tensor constructed with dimensions {token_dim, num_tokens, 1} on
// success. Returns a default-constructed tensor when `path` is null or empty,
// or -- after logging a warning -- when the file cannot be read, the tensor is
// missing, its shape is out of range, or its type is unsupported.
static sd::Tensor<float> load_pulid_id_embedding(const char* path) {
    if (path == nullptr || path[0] == '\0') {
        return sd::Tensor<float>();
    }

    struct ggml_context* tensor_ctx = nullptr;
    struct gguf_init_params init_params = {/*.no_alloc =*/false, /*.ctx =*/&tensor_ctx};
    struct gguf_context* file_ctx = gguf_init_from_file(path, init_params);

    // One cleanup routine shared by every exit path below.
    const auto release = [&file_ctx, &tensor_ctx]() {
        if (file_ctx != nullptr) {
            gguf_free(file_ctx);
        }
        if (tensor_ctx != nullptr) {
            ggml_free(tensor_ctx);
        }
    };
    const auto fail = [&release]() {
        release();
        return sd::Tensor<float>();
    };

    if (file_ctx == nullptr || tensor_ctx == nullptr) {
        LOG_WARN("PuLID id-embedding: cannot read gguf '%s'", path);
        return fail();
    }

    struct ggml_tensor* id_tensor = ggml_get_tensor(tensor_ctx, "pulid_id");
    if (id_tensor == nullptr) {
        LOG_WARN("PuLID id-embedding: no 'pulid_id' tensor in '%s'", path);
        return fail();
    }

    const int64_t token_dim  = id_tensor->ne[0];
    const int64_t num_tokens = id_tensor->ne[1];

    // Sanity limits guard against corrupt or unrelated files.
    const bool dims_in_range = token_dim > 0 && token_dim <= 65536 &&
                               num_tokens > 0 && num_tokens <= 1024;
    const bool is_matrix = id_tensor->ne[2] == 1 && id_tensor->ne[3] == 1;
    if (!(dims_in_range && is_matrix)) {
        LOG_WARN("PuLID id-embedding: implausible shape [%lld, %lld] in '%s'",
                 (long long)token_dim, (long long)num_tokens, path);
        return fail();
    }

    const size_t n_elem = (size_t)token_dim * (size_t)num_tokens;
    sd::Tensor<float> out({token_dim, num_tokens, 1});
    float* dst = out.data();

    switch (id_tensor->type) {
        case GGML_TYPE_F32:
            memcpy(dst, id_tensor->data, n_elem * sizeof(float));
            break;
        case GGML_TYPE_F16: {
            const ggml_fp16_t* half_src = reinterpret_cast<const ggml_fp16_t*>(id_tensor->data);
            for (size_t idx = 0; idx < n_elem; ++idx) {
                dst[idx] = ggml_fp16_to_fp32(half_src[idx]);
            }
            break;
        }
        case GGML_TYPE_BF16: {
            const ggml_bf16_t* bf_src = reinterpret_cast<const ggml_bf16_t*>(id_tensor->data);
            for (size_t idx = 0; idx < n_elem; ++idx) {
                dst[idx] = ggml_bf16_to_fp32(bf_src[idx]);
            }
            break;
        }
        default:
            LOG_WARN("PuLID id-embedding: unsupported tensor type %s in '%s'",
                     ggml_type_name(id_tensor->type), path);
            return fail();
    }

    // Log before releasing: `id_tensor` lives inside `tensor_ctx`.
    LOG_INFO("PuLID id-embedding: loaded [%lld, %lld] type=%s from '%s'",
             (long long)token_dim, (long long)num_tokens, ggml_type_name(id_tensor->type), path);
    release();
    return out;
}
|
||||||
|
|
||||||
|
struct PuLIDExtension : public GenerationExtension {
|
||||||
|
bool enabled = false;
|
||||||
|
sd::Tensor<float> id_embedding;
|
||||||
|
float id_weight = 1.0f;
|
||||||
|
|
||||||
|
const char* name() const override {
|
||||||
|
return "pulid";
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_enabled() const override {
|
||||||
|
return enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool init(const GenerationExtensionInitContext& ctx) override {
|
||||||
|
enabled = strlen(SAFE_STR(ctx.params->pulid_weights_path)) > 0;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void reset_runtime_condition() override {
|
||||||
|
id_embedding = {};
|
||||||
|
id_weight = 1.0f;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool prepare_condition(GenerationExtensionConditionContext& ctx) override {
|
||||||
|
reset_runtime_condition();
|
||||||
|
if (!enabled) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
id_embedding = load_pulid_id_embedding(ctx.pulid_params.id_embedding_path);
|
||||||
|
id_weight = ctx.pulid_params.id_weight;
|
||||||
|
return false; // PuLID does not modify the conditioning
|
||||||
|
}
|
||||||
|
|
||||||
|
void before_diffusion(DiffusionParams& params, int /*step*/) const override {
|
||||||
|
if (!enabled || id_embedding.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (auto* flux_extra = std::get_if<FluxDiffusionExtra>(¶ms.extra)) {
|
||||||
|
flux_extra->pulid_id = &id_embedding;
|
||||||
|
flux_extra->pulid_id_weight = id_weight;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::shared_ptr<GenerationExtension> create_pulid_extension() {
|
||||||
|
return std::make_shared<PuLIDExtension>();
|
||||||
|
}
|
||||||
76
src/model/adapter/pulid.hpp
Normal file
76
src/model/adapter/pulid.hpp
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
#ifndef __PULID_HPP__
|
||||||
|
#define __PULID_HPP__
|
||||||
|
|
||||||
|
#include "core/ggml_extend.hpp"
|
||||||
|
#include "model/common/block.hpp"
|
||||||
|
|
||||||
|
class PuLIDPerceiverAttentionCA : public GGMLBlock {
|
||||||
|
public:
|
||||||
|
static constexpr int64_t DEFAULT_DIM = 3072; // Flux hidden size
|
||||||
|
static constexpr int64_t DEFAULT_DIM_HEAD = 128;
|
||||||
|
static constexpr int64_t DEFAULT_HEADS = 16;
|
||||||
|
static constexpr int64_t DEFAULT_KV_DIM = 2048; // PuLID ID-embedding dim
|
||||||
|
|
||||||
|
protected:
|
||||||
|
int64_t dim;
|
||||||
|
int64_t dim_head;
|
||||||
|
int64_t heads;
|
||||||
|
int64_t kv_dim;
|
||||||
|
int64_t inner_dim;
|
||||||
|
|
||||||
|
public:
|
||||||
|
PuLIDPerceiverAttentionCA(int64_t dim = DEFAULT_DIM,
|
||||||
|
int64_t dim_head = DEFAULT_DIM_HEAD,
|
||||||
|
int64_t heads = DEFAULT_HEADS,
|
||||||
|
int64_t kv_dim = DEFAULT_KV_DIM)
|
||||||
|
: dim(dim),
|
||||||
|
dim_head(dim_head),
|
||||||
|
heads(heads),
|
||||||
|
kv_dim(kv_dim),
|
||||||
|
inner_dim(dim_head * heads) {
|
||||||
|
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(kv_dim));
|
||||||
|
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||||
|
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(dim, inner_dim, /*bias=*/false));
|
||||||
|
blocks["to_kv"] = std::shared_ptr<GGMLBlock>(new Linear(kv_dim, inner_dim * 2, /*bias=*/false));
|
||||||
|
blocks["to_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim, /*bias=*/false));
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
|
ggml_tensor* id_embedding,
|
||||||
|
ggml_tensor* image_tokens) {
|
||||||
|
auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
|
||||||
|
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
|
||||||
|
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
|
||||||
|
auto to_kv = std::dynamic_pointer_cast<Linear>(blocks["to_kv"]);
|
||||||
|
auto to_out = std::dynamic_pointer_cast<Linear>(blocks["to_out"]);
|
||||||
|
|
||||||
|
ggml_tensor* x_normed = norm1->forward(ctx, id_embedding);
|
||||||
|
ggml_tensor* lat_normed = norm2->forward(ctx, image_tokens);
|
||||||
|
|
||||||
|
ggml_tensor* q = to_q->forward(ctx, lat_normed); // [N, T_img, 2048]
|
||||||
|
ggml_tensor* kv = to_kv->forward(ctx, x_normed); // [N, T_img, 3072]
|
||||||
|
|
||||||
|
ggml_tensor* k = ggml_view_3d(ctx->ggml_ctx, kv,
|
||||||
|
inner_dim, kv->ne[1], kv->ne[2],
|
||||||
|
kv->nb[1], kv->nb[2],
|
||||||
|
/*offset=*/0);
|
||||||
|
ggml_tensor* v = ggml_view_3d(ctx->ggml_ctx, kv,
|
||||||
|
inner_dim, kv->ne[1], kv->ne[2],
|
||||||
|
kv->nb[1], kv->nb[2],
|
||||||
|
/*offset=*/inner_dim * ggml_element_size(kv));
|
||||||
|
k = ggml_cont(ctx->ggml_ctx, k);
|
||||||
|
v = ggml_cont(ctx->ggml_ctx, v);
|
||||||
|
|
||||||
|
ggml_tensor* attn_out = ggml_ext_attention_ext(
|
||||||
|
ctx->ggml_ctx, ctx->backend,
|
||||||
|
q, k, v,
|
||||||
|
heads,
|
||||||
|
/*mask=*/nullptr,
|
||||||
|
/*diag_mask_inf=*/false);
|
||||||
|
|
||||||
|
ggml_tensor* out = to_out->forward(ctx, attn_out);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // __PULID_HPP__
|
||||||
@ -4,6 +4,7 @@
|
|||||||
#include <memory>
|
#include <memory>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "model/adapter/pulid.hpp"
|
||||||
#include "model/common/rope.hpp"
|
#include "model/common/rope.hpp"
|
||||||
#include "model/diffusion/dit.hpp"
|
#include "model/diffusion/dit.hpp"
|
||||||
#include "model/diffusion/model.hpp"
|
#include "model/diffusion/model.hpp"
|
||||||
@ -49,6 +50,10 @@ namespace Flux {
|
|||||||
float ref_index_scale = 1.f;
|
float ref_index_scale = 1.f;
|
||||||
ChromaRadianceConfig chroma_radiance_params;
|
ChromaRadianceConfig chroma_radiance_params;
|
||||||
|
|
||||||
|
bool pulid_enabled = false;
|
||||||
|
int pulid_double_interval = 2;
|
||||||
|
int pulid_single_interval = 4;
|
||||||
|
|
||||||
static FluxConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
|
static FluxConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
|
||||||
const std::string& prefix,
|
const std::string& prefix,
|
||||||
SDVersion version = VERSION_FLUX) {
|
SDVersion version = VERSION_FLUX) {
|
||||||
@ -138,6 +143,9 @@ namespace Flux {
|
|||||||
if (ends_with(name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
|
if (ends_with(name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
|
||||||
head_dim = tensor_storage.ne[0];
|
head_dim = tensor_storage.ne[0];
|
||||||
}
|
}
|
||||||
|
if (name.find("pulid_ca.") != std::string::npos) {
|
||||||
|
config.pulid_enabled = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (actual_radiance_patch_size > 0 && actual_radiance_patch_size != config.patch_size) {
|
if (actual_radiance_patch_size > 0 && actual_radiance_patch_size != config.patch_size) {
|
||||||
GGML_ASSERT(config.patch_size == 2 * actual_radiance_patch_size);
|
GGML_ASSERT(config.patch_size == 2 * actual_radiance_patch_size);
|
||||||
@ -957,6 +965,20 @@ namespace Flux {
|
|||||||
blocks["double_stream_modulation_txt"] = std::make_shared<Modulation>(config.hidden_size, true, !config.disable_bias);
|
blocks["double_stream_modulation_txt"] = std::make_shared<Modulation>(config.hidden_size, true, !config.disable_bias);
|
||||||
blocks["single_stream_modulation"] = std::make_shared<Modulation>(config.hidden_size, false, !config.disable_bias);
|
blocks["single_stream_modulation"] = std::make_shared<Modulation>(config.hidden_size, false, !config.disable_bias);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (config.pulid_enabled) {
|
||||||
|
int num_double_ca = (config.depth + config.pulid_double_interval - 1) / config.pulid_double_interval;
|
||||||
|
int num_single_ca = (config.depth_single_blocks + config.pulid_single_interval - 1) / config.pulid_single_interval;
|
||||||
|
int num_ca = num_double_ca + num_single_ca;
|
||||||
|
for (int i = 0; i < num_ca; i++) {
|
||||||
|
blocks["pulid_ca." + std::to_string(i)] =
|
||||||
|
std::shared_ptr<GGMLBlock>(new PuLIDPerceiverAttentionCA(
|
||||||
|
/*dim=*/config.hidden_size,
|
||||||
|
/*dim_head=*/PuLIDPerceiverAttentionCA::DEFAULT_DIM_HEAD,
|
||||||
|
/*heads=*/PuLIDPerceiverAttentionCA::DEFAULT_HEADS,
|
||||||
|
/*kv_dim=*/PuLIDPerceiverAttentionCA::DEFAULT_KV_DIM));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
|
ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
|
||||||
@ -967,7 +989,9 @@ namespace Flux {
|
|||||||
ggml_tensor* guidance,
|
ggml_tensor* guidance,
|
||||||
ggml_tensor* pe,
|
ggml_tensor* pe,
|
||||||
ggml_tensor* mod_index_arange = nullptr,
|
ggml_tensor* mod_index_arange = nullptr,
|
||||||
std::vector<int> skip_layers = {}) {
|
std::vector<int> skip_layers = {},
|
||||||
|
ggml_tensor* pulid_id = nullptr,
|
||||||
|
float pulid_id_weight = 1.0f) {
|
||||||
auto img_in = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
|
auto img_in = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
|
||||||
auto txt_in = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
|
auto txt_in = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
|
||||||
auto final_layer = std::dynamic_pointer_cast<LastLayer>(blocks["final_layer"]);
|
auto final_layer = std::dynamic_pointer_cast<LastLayer>(blocks["final_layer"]);
|
||||||
@ -1044,6 +1068,13 @@ namespace Flux {
|
|||||||
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt");
|
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt");
|
||||||
sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec");
|
sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec");
|
||||||
|
|
||||||
|
const bool pulid_active = config.pulid_enabled && pulid_id != nullptr;
|
||||||
|
if (pulid_active && !skip_layers.empty()) {
|
||||||
|
LOG_WARN("PuLID + skip_layers is not supported; disabling PuLID for this generation.");
|
||||||
|
}
|
||||||
|
const bool pulid_run = pulid_active && skip_layers.empty();
|
||||||
|
int ca_idx = 0;
|
||||||
|
|
||||||
for (int i = 0; i < config.depth; i++) {
|
for (int i = 0; i < config.depth; i++) {
|
||||||
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
|
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
|
||||||
continue;
|
continue;
|
||||||
@ -1056,9 +1087,19 @@ namespace Flux {
|
|||||||
txt = img_txt.second; // [N, n_txt_token, hidden_size]
|
txt = img_txt.second; // [N, n_txt_token, hidden_size]
|
||||||
sd::ggml_graph_cut::mark_graph_cut(img, "flux.double_blocks." + std::to_string(i), "img");
|
sd::ggml_graph_cut::mark_graph_cut(img, "flux.double_blocks." + std::to_string(i), "img");
|
||||||
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.double_blocks." + std::to_string(i), "txt");
|
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.double_blocks." + std::to_string(i), "txt");
|
||||||
|
|
||||||
|
if (pulid_run && (i % config.pulid_double_interval == 0)) {
|
||||||
|
auto pulid_ca = std::dynamic_pointer_cast<PuLIDPerceiverAttentionCA>(
|
||||||
|
blocks["pulid_ca." + std::to_string(ca_idx)]);
|
||||||
|
ggml_tensor* ca_out = pulid_ca->forward(ctx, pulid_id, img); // [N, n_img_token, hidden_size]
|
||||||
|
img = ggml_add(ctx->ggml_ctx, img, ggml_scale(ctx->ggml_ctx, ca_out, pulid_id_weight));
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(img, "flux.pulid_ca." + std::to_string(ca_idx), "img");
|
||||||
|
ca_idx++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_img_token, hidden_size]
|
auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_img_token, hidden_size]
|
||||||
|
const int64_t n_txt_tok = txt->ne[1];
|
||||||
for (int i = 0; i < config.depth_single_blocks; i++) {
|
for (int i = 0; i < config.depth_single_blocks; i++) {
|
||||||
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + config.depth) != skip_layers.end()) {
|
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + config.depth) != skip_layers.end()) {
|
||||||
continue;
|
continue;
|
||||||
@ -1067,6 +1108,29 @@ namespace Flux {
|
|||||||
|
|
||||||
txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask, ss_mods);
|
txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask, ss_mods);
|
||||||
sd::ggml_graph_cut::mark_graph_cut(txt_img, "flux.single_blocks." + std::to_string(i), "txt_img");
|
sd::ggml_graph_cut::mark_graph_cut(txt_img, "flux.single_blocks." + std::to_string(i), "txt_img");
|
||||||
|
|
||||||
|
if (pulid_run && (i % config.pulid_single_interval == 0)) {
|
||||||
|
auto pulid_ca = std::dynamic_pointer_cast<PuLIDPerceiverAttentionCA>(
|
||||||
|
blocks["pulid_ca." + std::to_string(ca_idx)]);
|
||||||
|
ggml_tensor* txt_part = ggml_view_3d(ctx->ggml_ctx, txt_img,
|
||||||
|
txt_img->ne[0], n_txt_tok, txt_img->ne[2],
|
||||||
|
txt_img->nb[1], txt_img->nb[2],
|
||||||
|
0);
|
||||||
|
ggml_tensor* img_part = ggml_view_3d(ctx->ggml_ctx, txt_img,
|
||||||
|
txt_img->ne[0],
|
||||||
|
txt_img->ne[1] - n_txt_tok,
|
||||||
|
txt_img->ne[2],
|
||||||
|
txt_img->nb[1],
|
||||||
|
txt_img->nb[2],
|
||||||
|
n_txt_tok * txt_img->nb[1]);
|
||||||
|
txt_part = ggml_cont(ctx->ggml_ctx, txt_part);
|
||||||
|
img_part = ggml_cont(ctx->ggml_ctx, img_part);
|
||||||
|
ggml_tensor* ca_out = pulid_ca->forward(ctx, pulid_id, img_part);
|
||||||
|
img_part = ggml_add(ctx->ggml_ctx, img_part, ggml_scale(ctx->ggml_ctx, ca_out, pulid_id_weight));
|
||||||
|
txt_img = ggml_concat(ctx->ggml_ctx, txt_part, img_part, 1);
|
||||||
|
sd::ggml_graph_cut::mark_graph_cut(txt_img, "flux.pulid_ca." + std::to_string(ca_idx), "txt_img");
|
||||||
|
ca_idx++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
img = ggml_view_3d(ctx->ggml_ctx,
|
img = ggml_view_3d(ctx->ggml_ctx,
|
||||||
@ -1105,7 +1169,9 @@ namespace Flux {
|
|||||||
ggml_tensor* mod_index_arange = nullptr,
|
ggml_tensor* mod_index_arange = nullptr,
|
||||||
ggml_tensor* dct = nullptr,
|
ggml_tensor* dct = nullptr,
|
||||||
std::vector<ggml_tensor*> ref_latents = {},
|
std::vector<ggml_tensor*> ref_latents = {},
|
||||||
std::vector<int> skip_layers = {}) {
|
std::vector<int> skip_layers = {},
|
||||||
|
ggml_tensor* pulid_id = nullptr,
|
||||||
|
float pulid_id_weight = 1.0f) {
|
||||||
GGML_ASSERT(x->ne[3] == 1);
|
GGML_ASSERT(x->ne[3] == 1);
|
||||||
|
|
||||||
int64_t W = x->ne[0];
|
int64_t W = x->ne[0];
|
||||||
@ -1131,7 +1197,8 @@ namespace Flux {
|
|||||||
img = ggml_reshape_3d(ctx->ggml_ctx, img, img->ne[0] * img->ne[1], img->ne[2], img->ne[3]); // [N, hidden_size, H/patch_size*W/patch_size]
|
img = ggml_reshape_3d(ctx->ggml_ctx, img, img->ne[0] * img->ne[1], img->ne[2], img->ne[3]); // [N, hidden_size, H/patch_size*W/patch_size]
|
||||||
img = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3)); // [N, H/patch_size*W/patch_size, hidden_size]
|
img = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3)); // [N, H/patch_size*W/patch_size, hidden_size]
|
||||||
|
|
||||||
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers); // [N, n_img_token, hidden_size]
|
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers,
|
||||||
|
pulid_id, pulid_id_weight); // [N, n_img_token, hidden_size]
|
||||||
|
|
||||||
// nerf decode
|
// nerf decode
|
||||||
auto nerf_image_embedder = std::dynamic_pointer_cast<NerfEmbedder>(blocks["nerf_image_embedder"]);
|
auto nerf_image_embedder = std::dynamic_pointer_cast<NerfEmbedder>(blocks["nerf_image_embedder"]);
|
||||||
@ -1179,7 +1246,9 @@ namespace Flux {
|
|||||||
ggml_tensor* mod_index_arange = nullptr,
|
ggml_tensor* mod_index_arange = nullptr,
|
||||||
ggml_tensor* dct = nullptr,
|
ggml_tensor* dct = nullptr,
|
||||||
std::vector<ggml_tensor*> ref_latents = {},
|
std::vector<ggml_tensor*> ref_latents = {},
|
||||||
std::vector<int> skip_layers = {}) {
|
std::vector<int> skip_layers = {},
|
||||||
|
ggml_tensor* pulid_id = nullptr,
|
||||||
|
float pulid_id_weight = 1.0f) {
|
||||||
GGML_ASSERT(x->ne[3] == 1);
|
GGML_ASSERT(x->ne[3] == 1);
|
||||||
|
|
||||||
int64_t W = x->ne[0];
|
int64_t W = x->ne[0];
|
||||||
@ -1226,7 +1295,8 @@ namespace Flux {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers); // [N, num_tokens, C * patch_size * patch_size]
|
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers,
|
||||||
|
pulid_id, pulid_id_weight); // [N, num_tokens, C * patch_size * patch_size]
|
||||||
|
|
||||||
if (out->ne[1] > img_tokens) {
|
if (out->ne[1] > img_tokens) {
|
||||||
out = ggml_view_3d(ctx->ggml_ctx, out, out->ne[0], img_tokens, out->ne[2], out->nb[1], out->nb[2], 0);
|
out = ggml_view_3d(ctx->ggml_ctx, out, out->ne[0], img_tokens, out->ne[2], out->nb[1], out->nb[2], 0);
|
||||||
@ -1248,7 +1318,9 @@ namespace Flux {
|
|||||||
ggml_tensor* mod_index_arange = nullptr,
|
ggml_tensor* mod_index_arange = nullptr,
|
||||||
ggml_tensor* dct = nullptr,
|
ggml_tensor* dct = nullptr,
|
||||||
std::vector<ggml_tensor*> ref_latents = {},
|
std::vector<ggml_tensor*> ref_latents = {},
|
||||||
std::vector<int> skip_layers = {}) {
|
std::vector<int> skip_layers = {},
|
||||||
|
ggml_tensor* pulid_id = nullptr,
|
||||||
|
float pulid_id_weight = 1.0f) {
|
||||||
// Forward pass of DiT.
|
// Forward pass of DiT.
|
||||||
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
|
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
|
||||||
// timestep: (N,) tensor of diffusion timesteps
|
// timestep: (N,) tensor of diffusion timesteps
|
||||||
@ -1271,7 +1343,9 @@ namespace Flux {
|
|||||||
mod_index_arange,
|
mod_index_arange,
|
||||||
dct,
|
dct,
|
||||||
ref_latents,
|
ref_latents,
|
||||||
skip_layers);
|
skip_layers,
|
||||||
|
pulid_id,
|
||||||
|
pulid_id_weight);
|
||||||
} else {
|
} else {
|
||||||
return forward_flux_chroma(ctx,
|
return forward_flux_chroma(ctx,
|
||||||
x,
|
x,
|
||||||
@ -1284,7 +1358,9 @@ namespace Flux {
|
|||||||
mod_index_arange,
|
mod_index_arange,
|
||||||
dct,
|
dct,
|
||||||
ref_latents,
|
ref_latents,
|
||||||
skip_layers);
|
skip_layers,
|
||||||
|
pulid_id,
|
||||||
|
pulid_id_weight);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -1384,7 +1460,9 @@ namespace Flux {
|
|||||||
const sd::Tensor<float>& guidance_tensor = {},
|
const sd::Tensor<float>& guidance_tensor = {},
|
||||||
const std::vector<sd::Tensor<float>>& ref_latents_tensor = {},
|
const std::vector<sd::Tensor<float>>& ref_latents_tensor = {},
|
||||||
bool increase_ref_index = false,
|
bool increase_ref_index = false,
|
||||||
std::vector<int> skip_layers = {}) {
|
std::vector<int> skip_layers = {},
|
||||||
|
const sd::Tensor<float>& pulid_id_tensor = {},
|
||||||
|
float pulid_id_weight = 1.0f) {
|
||||||
ggml_tensor* x = make_input(x_tensor);
|
ggml_tensor* x = make_input(x_tensor);
|
||||||
ggml_tensor* timesteps = make_input(timesteps_tensor);
|
ggml_tensor* timesteps = make_input(timesteps_tensor);
|
||||||
ggml_tensor* context = make_optional_input(context_tensor);
|
ggml_tensor* context = make_optional_input(context_tensor);
|
||||||
@ -1461,6 +1539,10 @@ namespace Flux {
|
|||||||
set_backend_tensor_data(dct, dct_vec.data());
|
set_backend_tensor_data(dct, dct_vec.data());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ggml_tensor* pulid_id = pulid_id_tensor.empty()
|
||||||
|
? nullptr
|
||||||
|
: make_input(pulid_id_tensor);
|
||||||
|
|
||||||
auto runner_ctx = get_context();
|
auto runner_ctx = get_context();
|
||||||
|
|
||||||
ggml_tensor* out = flux.forward(&runner_ctx,
|
ggml_tensor* out = flux.forward(&runner_ctx,
|
||||||
@ -1474,7 +1556,9 @@ namespace Flux {
|
|||||||
mod_index_arange,
|
mod_index_arange,
|
||||||
dct,
|
dct,
|
||||||
ref_latents,
|
ref_latents,
|
||||||
skip_layers);
|
skip_layers,
|
||||||
|
pulid_id,
|
||||||
|
pulid_id_weight);
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, out);
|
ggml_build_forward_expand(gf, out);
|
||||||
|
|
||||||
@ -1490,14 +1574,17 @@ namespace Flux {
|
|||||||
const sd::Tensor<float>& guidance = {},
|
const sd::Tensor<float>& guidance = {},
|
||||||
const std::vector<sd::Tensor<float>>& ref_latents = {},
|
const std::vector<sd::Tensor<float>>& ref_latents = {},
|
||||||
bool increase_ref_index = false,
|
bool increase_ref_index = false,
|
||||||
std::vector<int> skip_layers = std::vector<int>()) {
|
std::vector<int> skip_layers = std::vector<int>(),
|
||||||
|
const sd::Tensor<float>& pulid_id = {},
|
||||||
|
float pulid_id_weight = 1.0f) {
|
||||||
// x: [N, in_channels, h, w]
|
// x: [N, in_channels, h, w]
|
||||||
// timesteps: [N, ]
|
// timesteps: [N, ]
|
||||||
// context: [N, max_position, hidden_size]
|
// context: [N, max_position, hidden_size]
|
||||||
// y: [N, adm_in_channels] or [1, adm_in_channels]
|
// y: [N, adm_in_channels] or [1, adm_in_channels]
|
||||||
// guidance: [N, ]
|
// guidance: [N, ]
|
||||||
|
// pulid_id: empty (no injection) or [N, num_id_tokens=32, kv_dim=2048]
|
||||||
auto get_graph = [&]() -> ggml_cgraph* {
|
auto get_graph = [&]() -> ggml_cgraph* {
|
||||||
return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers);
|
return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers, pulid_id, pulid_id_weight);
|
||||||
};
|
};
|
||||||
|
|
||||||
auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
|
auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
|
||||||
@ -1520,7 +1607,9 @@ namespace Flux {
|
|||||||
tensor_or_empty(extra->guidance),
|
tensor_or_empty(extra->guidance),
|
||||||
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
|
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
|
||||||
diffusion_params.increase_ref_index,
|
diffusion_params.increase_ref_index,
|
||||||
extra->skip_layers ? *extra->skip_layers : empty_skip_layers);
|
extra->skip_layers ? *extra->skip_layers : empty_skip_layers,
|
||||||
|
tensor_or_empty(extra->pulid_id),
|
||||||
|
extra->pulid_id_weight);
|
||||||
}
|
}
|
||||||
|
|
||||||
void test() {
|
void test() {
|
||||||
|
|||||||
@ -22,6 +22,8 @@ struct SkipLayerDiffusionExtra {
|
|||||||
struct FluxDiffusionExtra {
|
struct FluxDiffusionExtra {
|
||||||
const sd::Tensor<float>* guidance = nullptr;
|
const sd::Tensor<float>* guidance = nullptr;
|
||||||
const std::vector<int>* skip_layers = nullptr;
|
const std::vector<int>* skip_layers = nullptr;
|
||||||
|
const sd::Tensor<float>* pulid_id = nullptr;
|
||||||
|
float pulid_id_weight = 1.0f;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct AnimaDiffusionExtra {
|
struct AnimaDiffusionExtra {
|
||||||
|
|||||||
@ -577,13 +577,8 @@ bool ModelManager::alloc_params_buffers(const std::vector<TensorState*>& states,
|
|||||||
for (TensorState* state : states) {
|
for (TensorState* state : states) {
|
||||||
ggml_tensor* tensor = state->tensor;
|
ggml_tensor* tensor = state->tensor;
|
||||||
size_t tensor_size = GGML_PAD(ggml_backend_buft_get_alloc_size(params_buft, tensor), alignment);
|
size_t tensor_size = GGML_PAD(ggml_backend_buft_get_alloc_size(params_buft, tensor), alignment);
|
||||||
if (max_size > 0 && tensor_size > max_size) {
|
// Some backends, e.g. Vulkan, report a preferred chunk size here rather than a
|
||||||
LOG_ERROR("model manager tensor '%s' is too large for params buffer: %zu > %zu",
|
// hard per-tensor allocation limit. Oversized tensors are allocated alone.
|
||||||
ggml_get_name(tensor),
|
|
||||||
tensor_size,
|
|
||||||
max_size);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (!chunk.empty() && max_size > 0 && chunk_size + tensor_size > max_size) {
|
if (!chunk.empty() && max_size > 0 && chunk_size + tensor_size > max_size) {
|
||||||
if (!alloc_chunk(chunk, chunk_size)) {
|
if (!alloc_chunk(chunk, chunk_size)) {
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
@ -53,6 +53,8 @@
|
|||||||
const char* sd_vae_format_name(enum sd_vae_format_t format);
|
const char* sd_vae_format_name(enum sd_vae_format_t format);
|
||||||
static SDVersion sd_vae_format_to_version(enum sd_vae_format_t format, SDVersion fallback);
|
static SDVersion sd_vae_format_to_version(enum sd_vae_format_t format, SDVersion fallback);
|
||||||
|
|
||||||
|
#include <atomic>
|
||||||
|
|
||||||
const char* model_version_to_str[] = {
|
const char* model_version_to_str[] = {
|
||||||
"SD 1.x",
|
"SD 1.x",
|
||||||
"SD 1.x Inpaint",
|
"SD 1.x Inpaint",
|
||||||
@ -159,6 +161,9 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) {
|
|||||||
|
|
||||||
/*=============================================== StableDiffusionGGML ================================================*/
|
/*=============================================== StableDiffusionGGML ================================================*/
|
||||||
|
|
||||||
|
static_assert(std::atomic<sd_cancel_mode_t>::is_always_lock_free,
|
||||||
|
"sd_cancel_mode_t must be lock-free");
|
||||||
|
|
||||||
class StableDiffusionGGML {
|
class StableDiffusionGGML {
|
||||||
public:
|
public:
|
||||||
SDBackendManager backend_manager;
|
SDBackendManager backend_manager;
|
||||||
@ -222,6 +227,20 @@ public:
|
|||||||
return module_backend;
|
return module_backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::atomic<sd_cancel_mode_t> cancellation_flag = SD_CANCEL_RESET;
|
||||||
|
|
||||||
|
// Publishes a new cancel mode with release ordering.
void set_cancel_flag(enum sd_cancel_mode_t flag) {
    constexpr std::memory_order order = std::memory_order_release;
    cancellation_flag.store(flag, order);
}
|
||||||
|
|
||||||
|
void reset_cancel_flag() {
|
||||||
|
set_cancel_flag(SD_CANCEL_RESET);
|
||||||
|
}
|
||||||
|
|
||||||
|
enum sd_cancel_mode_t get_cancel_flag() {
|
||||||
|
return cancellation_flag.load(std::memory_order_acquire);
|
||||||
|
}
|
||||||
|
|
||||||
size_t max_graph_vram_bytes_for_module(SDBackendModule module) {
|
size_t max_graph_vram_bytes_for_module(SDBackendModule module) {
|
||||||
return max_vram_assignment.bytes_for_backend(backend_for(module));
|
return max_vram_assignment.bytes_for_backend(backend_for(module));
|
||||||
}
|
}
|
||||||
@ -428,6 +447,14 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (strlen(SAFE_STR(sd_ctx_params->pulid_weights_path)) > 0) {
|
||||||
|
LOG_INFO("loading PuLID weights from '%s'", sd_ctx_params->pulid_weights_path);
|
||||||
|
if (!model_loader.init_from_file(sd_ctx_params->pulid_weights_path,
|
||||||
|
"model.diffusion_model.")) {
|
||||||
|
LOG_WARN("loading PuLID weights from '%s' failed", sd_ctx_params->pulid_weights_path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (strlen(SAFE_STR(sd_ctx_params->llm_path)) > 0) {
|
if (strlen(SAFE_STR(sd_ctx_params->llm_path)) > 0) {
|
||||||
LOG_INFO("loading llm from '%s'", sd_ctx_params->llm_path);
|
LOG_INFO("loading llm from '%s'", sd_ctx_params->llm_path);
|
||||||
if (!model_loader.init_from_file(sd_ctx_params->llm_path, "text_encoders.llm.")) {
|
if (!model_loader.init_from_file(sd_ctx_params->llm_path, "text_encoders.llm.")) {
|
||||||
@ -1012,6 +1039,14 @@ public:
|
|||||||
if (photomaker_extension->is_enabled()) {
|
if (photomaker_extension->is_enabled()) {
|
||||||
generation_extensions.push_back(photomaker_extension);
|
generation_extensions.push_back(photomaker_extension);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto pulid_extension = create_pulid_extension();
|
||||||
|
if (!pulid_extension->init(extension_ctx)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (pulid_extension->is_enabled()) {
|
||||||
|
generation_extensions.push_back(pulid_extension);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for (auto& extension : generation_extensions) {
|
for (auto& extension : generation_extensions) {
|
||||||
if (!register_runner_params(extension->name(),
|
if (!register_runner_params(extension->name(),
|
||||||
@ -1522,6 +1557,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
void prepare_generation_extensions(const sd_pm_params_t& pm_params,
|
void prepare_generation_extensions(const sd_pm_params_t& pm_params,
|
||||||
|
const sd_pulid_params_t& pulid_params,
|
||||||
ConditionerParams& condition_params,
|
ConditionerParams& condition_params,
|
||||||
int total_steps) {
|
int total_steps) {
|
||||||
reset_generation_extensions();
|
reset_generation_extensions();
|
||||||
@ -1529,6 +1565,7 @@ public:
|
|||||||
cond_stage_model.get(),
|
cond_stage_model.get(),
|
||||||
condition_params,
|
condition_params,
|
||||||
pm_params,
|
pm_params,
|
||||||
|
pulid_params,
|
||||||
n_threads,
|
n_threads,
|
||||||
total_steps,
|
total_steps,
|
||||||
};
|
};
|
||||||
@ -1923,6 +1960,11 @@ public:
|
|||||||
SamplePreviewContext preview = prepare_sample_preview_context();
|
SamplePreviewContext preview = prepare_sample_preview_context();
|
||||||
|
|
||||||
auto denoise = [&](const sd::Tensor<float>& x, float sigma, int step) -> sd::guidance::GuiderOutput {
|
auto denoise = [&](const sd::Tensor<float>& x, float sigma, int step) -> sd::guidance::GuiderOutput {
|
||||||
|
if (get_cancel_flag() == SD_CANCEL_ALL) {
|
||||||
|
LOG_DEBUG("cancelling generation");
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
if (step == 1 || step == -1) {
|
if (step == 1 || step == -1) {
|
||||||
pretty_progress(0, (int)steps, 0);
|
pretty_progress(0, (int)steps, 0);
|
||||||
last_progress_us = ggml_time_us();
|
last_progress_us = ggml_time_us();
|
||||||
@ -2043,6 +2085,10 @@ public:
|
|||||||
return std::move(cached_output);
|
return std::move(cached_output);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (const auto& extension : generation_extensions) {
|
||||||
|
extension->before_diffusion(diffusion_params, step);
|
||||||
|
}
|
||||||
|
|
||||||
auto output_opt = work_diffusion_model->compute(n_threads, diffusion_params);
|
auto output_opt = work_diffusion_model->compute(n_threads, diffusion_params);
|
||||||
if (output_opt.empty()) {
|
if (output_opt.empty()) {
|
||||||
LOG_ERROR("diffusion model compute failed");
|
LOG_ERROR("diffusion model compute failed");
|
||||||
@ -2642,6 +2688,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
|
|||||||
sd_ctx_params->backend = nullptr;
|
sd_ctx_params->backend = nullptr;
|
||||||
sd_ctx_params->params_backend = nullptr;
|
sd_ctx_params->params_backend = nullptr;
|
||||||
sd_ctx_params->rpc_servers = nullptr;
|
sd_ctx_params->rpc_servers = nullptr;
|
||||||
|
sd_ctx_params->pulid_weights_path = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
||||||
@ -2667,6 +2714,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
|||||||
"taesd_path: %s\n"
|
"taesd_path: %s\n"
|
||||||
"control_net_path: %s\n"
|
"control_net_path: %s\n"
|
||||||
"photo_maker_path: %s\n"
|
"photo_maker_path: %s\n"
|
||||||
|
"pulid_weights_path: %s\n"
|
||||||
"tensor_type_rules: %s\n"
|
"tensor_type_rules: %s\n"
|
||||||
"n_threads: %d\n"
|
"n_threads: %d\n"
|
||||||
"wtype: %s\n"
|
"wtype: %s\n"
|
||||||
@ -2701,6 +2749,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
|||||||
SAFE_STR(sd_ctx_params->taesd_path),
|
SAFE_STR(sd_ctx_params->taesd_path),
|
||||||
SAFE_STR(sd_ctx_params->control_net_path),
|
SAFE_STR(sd_ctx_params->control_net_path),
|
||||||
SAFE_STR(sd_ctx_params->photo_maker_path),
|
SAFE_STR(sd_ctx_params->photo_maker_path),
|
||||||
|
SAFE_STR(sd_ctx_params->pulid_weights_path),
|
||||||
SAFE_STR(sd_ctx_params->tensor_type_rules),
|
SAFE_STR(sd_ctx_params->tensor_type_rules),
|
||||||
sd_ctx_params->n_threads,
|
sd_ctx_params->n_threads,
|
||||||
sd_type_name(sd_ctx_params->wtype),
|
sd_type_name(sd_ctx_params->wtype),
|
||||||
@ -2795,6 +2844,7 @@ void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) {
|
|||||||
sd_img_gen_params->batch_count = 1;
|
sd_img_gen_params->batch_count = 1;
|
||||||
sd_img_gen_params->control_strength = 0.9f;
|
sd_img_gen_params->control_strength = 0.9f;
|
||||||
sd_img_gen_params->pm_params = {nullptr, 0, nullptr, 20.f};
|
sd_img_gen_params->pm_params = {nullptr, 0, nullptr, 20.f};
|
||||||
|
sd_img_gen_params->pulid_params = {nullptr, 1.0f};
|
||||||
sd_img_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
|
sd_img_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
|
||||||
sd_cache_params_init(&sd_img_gen_params->cache);
|
sd_cache_params_init(&sd_img_gen_params->cache);
|
||||||
sd_hires_params_init(&sd_img_gen_params->hires);
|
sd_hires_params_init(&sd_img_gen_params->hires);
|
||||||
@ -2937,6 +2987,15 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) {
|
|||||||
free(sd_ctx);
|
free(sd_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// C API entry point: stores the requested cancel mode on the context.
// A null context (or one without an `sd` instance) is ignored. A mode outside
// [SD_CANCEL_ALL, SD_CANCEL_RESET] is replaced by SD_CANCEL_ALL.
SD_API void sd_cancel_generation(sd_ctx_t* sd_ctx, enum sd_cancel_mode_t mode) {
    if (!sd_ctx || !sd_ctx->sd) {
        return;
    }
    const bool mode_in_range = mode >= SD_CANCEL_ALL && mode <= SD_CANCEL_RESET;
    if (!mode_in_range) {
        mode = SD_CANCEL_ALL;
    }
    sd_ctx->sd->set_cancel_flag(mode);
}
|
||||||
|
|
||||||
static sd_audio_t* waveform_to_sd_audio(const StableDiffusionGGML* sd,
|
static sd_audio_t* waveform_to_sd_audio(const StableDiffusionGGML* sd,
|
||||||
const sd::Tensor<float>& waveform) {
|
const sd::Tensor<float>& waveform) {
|
||||||
if (sd == nullptr || waveform.empty()) {
|
if (sd == nullptr || waveform.empty()) {
|
||||||
@ -3096,6 +3155,7 @@ struct GenerationRequest {
|
|||||||
sd_guidance_params_t guidance = {};
|
sd_guidance_params_t guidance = {};
|
||||||
sd_guidance_params_t high_noise_guidance = {};
|
sd_guidance_params_t high_noise_guidance = {};
|
||||||
sd_pm_params_t pm_params = {};
|
sd_pm_params_t pm_params = {};
|
||||||
|
sd_pulid_params_t pulid_params = {};
|
||||||
sd_hires_params_t hires = {};
|
sd_hires_params_t hires = {};
|
||||||
int frames = -1;
|
int frames = -1;
|
||||||
int requested_frames = -1;
|
int requested_frames = -1;
|
||||||
@ -3121,6 +3181,7 @@ struct GenerationRequest {
|
|||||||
has_ref_images = sd_img_gen_params->ref_images_count > 0;
|
has_ref_images = sd_img_gen_params->ref_images_count > 0;
|
||||||
guidance = sd_img_gen_params->sample_params.guidance;
|
guidance = sd_img_gen_params->sample_params.guidance;
|
||||||
pm_params = sd_img_gen_params->pm_params;
|
pm_params = sd_img_gen_params->pm_params;
|
||||||
|
pulid_params = sd_img_gen_params->pulid_params;
|
||||||
hires = sd_img_gen_params->hires;
|
hires = sd_img_gen_params->hires;
|
||||||
cache_params = &sd_img_gen_params->cache;
|
cache_params = &sd_img_gen_params->cache;
|
||||||
resolve(sd_ctx);
|
resolve(sd_ctx);
|
||||||
@ -4047,6 +4108,7 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
|
|||||||
condition_params.ref_images = &latents->ref_images;
|
condition_params.ref_images = &latents->ref_images;
|
||||||
|
|
||||||
sd_ctx->sd->prepare_generation_extensions(request->pm_params,
|
sd_ctx->sd->prepare_generation_extensions(request->pm_params,
|
||||||
|
request->pulid_params,
|
||||||
condition_params,
|
condition_params,
|
||||||
plan->total_steps);
|
plan->total_steps);
|
||||||
int64_t prepare_start_ms = ggml_time_ms();
|
int64_t prepare_start_ms = ggml_time_ms();
|
||||||
@ -4121,15 +4183,29 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
|
|||||||
static sd_image_t* decode_image_outputs(sd_ctx_t* sd_ctx,
|
static sd_image_t* decode_image_outputs(sd_ctx_t* sd_ctx,
|
||||||
const GenerationRequest& request,
|
const GenerationRequest& request,
|
||||||
const std::vector<sd::Tensor<float>>& final_latents) {
|
const std::vector<sd::Tensor<float>>& final_latents) {
|
||||||
if (final_latents.size() != static_cast<size_t>(request.batch_count)) {
|
if (final_latents.empty()) {
|
||||||
LOG_ERROR("expected %d latents, got %zu", request.batch_count, final_latents.size());
|
LOG_ERROR("no latent images to decode");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
if (final_latents.size() > static_cast<size_t>(request.batch_count)) {
|
||||||
|
LOG_ERROR("expected at most %d latents, got %zu", request.batch_count, final_latents.size());
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
if (final_latents.size() < static_cast<size_t>(request.batch_count)) {
|
||||||
|
LOG_INFO("decoding %zu/%d latents", final_latents.size(), request.batch_count);
|
||||||
|
} else {
|
||||||
LOG_INFO("decoding %zu latents", final_latents.size());
|
LOG_INFO("decoding %zu latents", final_latents.size());
|
||||||
|
}
|
||||||
std::vector<sd::Tensor<float>> decoded_images;
|
std::vector<sd::Tensor<float>> decoded_images;
|
||||||
int64_t t0 = ggml_time_ms();
|
int64_t t0 = ggml_time_ms();
|
||||||
|
bool cancelled = false;
|
||||||
|
|
||||||
for (size_t i = 0; i < final_latents.size(); i++) {
|
for (size_t i = 0; i < final_latents.size(); i++) {
|
||||||
|
if (sd_ctx->sd->get_cancel_flag() == SD_CANCEL_ALL) {
|
||||||
|
LOG_ERROR("cancelling latent decodings");
|
||||||
|
cancelled = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
int64_t t1 = ggml_time_ms();
|
int64_t t1 = ggml_time_ms();
|
||||||
sd::Tensor<float> image = sd_ctx->sd->decode_first_stage(final_latents[i]);
|
sd::Tensor<float> image = sd_ctx->sd->decode_first_stage(final_latents[i]);
|
||||||
if (image.empty()) {
|
if (image.empty()) {
|
||||||
@ -4143,6 +4219,10 @@ static sd_image_t* decode_image_outputs(sd_ctx_t* sd_ctx,
|
|||||||
|
|
||||||
int64_t t4 = ggml_time_ms();
|
int64_t t4 = ggml_time_ms();
|
||||||
LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t0) * 1.0f / 1000);
|
LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t0) * 1.0f / 1000);
|
||||||
|
if (decoded_images.empty()) {
|
||||||
|
LOG_ERROR(cancelled ? "cancelled before any latent images were decoded" : "no decoded images");
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
sd_image_t* result_images = (sd_image_t*)calloc(request.batch_count, sizeof(sd_image_t));
|
sd_image_t* result_images = (sd_image_t*)calloc(request.batch_count, sizeof(sd_image_t));
|
||||||
if (result_images == nullptr) {
|
if (result_images == nullptr) {
|
||||||
@ -4161,6 +4241,11 @@ static sd::Tensor<float> upscale_hires_latent(sd_ctx_t* sd_ctx,
|
|||||||
const sd::Tensor<float>& latent,
|
const sd::Tensor<float>& latent,
|
||||||
const GenerationRequest& request,
|
const GenerationRequest& request,
|
||||||
UpscalerGGML* upscaler) {
|
UpscalerGGML* upscaler) {
|
||||||
|
if (sd_ctx->sd->get_cancel_flag() == SD_CANCEL_ALL) {
|
||||||
|
LOG_ERROR("cancelling hires latent upscale");
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
auto get_hires_latent_target_shape = [&]() {
|
auto get_hires_latent_target_shape = [&]() {
|
||||||
std::vector<int64_t> target_shape = latent.shape();
|
std::vector<int64_t> target_shape = latent.shape();
|
||||||
if (target_shape.size() < 2) {
|
if (target_shape.size() < 2) {
|
||||||
@ -4233,6 +4318,10 @@ static sd::Tensor<float> upscale_hires_latent(sd_ctx_t* sd_ctx,
|
|||||||
sd_hires_upscaler_name(request.hires.upscaler));
|
sd_hires_upscaler_name(request.hires.upscaler));
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
if (sd_ctx->sd->get_cancel_flag() == SD_CANCEL_ALL) {
|
||||||
|
LOG_ERROR("cancelling hires image upscale");
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
sd::Tensor<float> upscaled_tensor;
|
sd::Tensor<float> upscaled_tensor;
|
||||||
if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) {
|
if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) {
|
||||||
@ -4269,6 +4358,10 @@ static sd::Tensor<float> upscale_hires_latent(sd_ctx_t* sd_ctx,
|
|||||||
upscaled_tensor = sd::ops::clamp(upscaled_tensor, 0.0f, 1.0f);
|
upscaled_tensor = sd::ops::clamp(upscaled_tensor, 0.0f, 1.0f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sd_ctx->sd->get_cancel_flag() == SD_CANCEL_ALL) {
|
||||||
|
LOG_ERROR("cancelling hires latent encode");
|
||||||
|
return {};
|
||||||
|
}
|
||||||
sd::Tensor<float> upscaled_latent = sd_ctx->sd->encode_first_stage(upscaled_tensor);
|
sd::Tensor<float> upscaled_latent = sd_ctx->sd->encode_first_stage(upscaled_tensor);
|
||||||
if (upscaled_latent.empty()) {
|
if (upscaled_latent.empty()) {
|
||||||
LOG_ERROR("encode_first_stage failed after hires %s upscale",
|
LOG_ERROR("encode_first_stage failed after hires %s upscale",
|
||||||
@ -4333,6 +4426,8 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sd_ctx->sd->reset_cancel_flag();
|
||||||
|
|
||||||
int64_t t0 = ggml_time_ms();
|
int64_t t0 = ggml_time_ms();
|
||||||
sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params;
|
sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params;
|
||||||
GenerationRequest request(sd_ctx, sd_img_gen_params);
|
GenerationRequest request(sd_ctx, sd_img_gen_params);
|
||||||
@ -4368,6 +4463,18 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
|
|||||||
std::vector<sd::Tensor<float>> final_latents;
|
std::vector<sd::Tensor<float>> final_latents;
|
||||||
int64_t denoise_start = ggml_time_ms();
|
int64_t denoise_start = ggml_time_ms();
|
||||||
for (int b = 0; b < request.batch_count; b++) {
|
for (int b = 0; b < request.batch_count; b++) {
|
||||||
|
sd_cancel_mode_t cancel = sd_ctx->sd->get_cancel_flag();
|
||||||
|
if (cancel == SD_CANCEL_ALL) {
|
||||||
|
LOG_ERROR("cancelling generation");
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
if (cancel == SD_CANCEL_NEW_LATENTS) {
|
||||||
|
LOG_INFO("cancelling new latent generation, returning %zu/%d completed latents",
|
||||||
|
final_latents.size(),
|
||||||
|
request.batch_count);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
int64_t sampling_start = ggml_time_ms();
|
int64_t sampling_start = ggml_time_ms();
|
||||||
int64_t cur_seed = request.seed + b;
|
int64_t cur_seed = request.seed + b;
|
||||||
LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, request.batch_count, cur_seed);
|
LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, request.batch_count, cur_seed);
|
||||||
@ -4417,12 +4524,24 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
|
|||||||
LOG_INFO("generating %zu latent images completed, taking %.2fs",
|
LOG_INFO("generating %zu latent images completed, taking %.2fs",
|
||||||
final_latents.size(),
|
final_latents.size(),
|
||||||
(denoise_end - denoise_start) * 1.0f / 1000);
|
(denoise_end - denoise_start) * 1.0f / 1000);
|
||||||
|
if (final_latents.empty()) {
|
||||||
|
LOG_ERROR("no latent images generated");
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
if (request.hires.enabled && request.hires.target_width > 0) {
|
if (request.hires.enabled && request.hires.target_width > 0) {
|
||||||
|
if (sd_ctx->sd->get_cancel_flag() == SD_CANCEL_ALL) {
|
||||||
|
LOG_ERROR("cancelling generation before hires fix");
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
LOG_INFO("hires fix: upscaling to %dx%d", request.hires.target_width, request.hires.target_height);
|
LOG_INFO("hires fix: upscaling to %dx%d", request.hires.target_width, request.hires.target_height);
|
||||||
|
|
||||||
std::unique_ptr<UpscalerGGML> hires_upscaler;
|
std::unique_ptr<UpscalerGGML> hires_upscaler;
|
||||||
if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) {
|
if (request.hires.upscaler == SD_HIRES_UPSCALER_MODEL) {
|
||||||
|
if (sd_ctx->sd->get_cancel_flag() == SD_CANCEL_ALL) {
|
||||||
|
LOG_ERROR("cancelling generation before hires model load");
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
LOG_INFO("hires fix: loading model upscaler from '%s'", request.hires.model_path);
|
LOG_INFO("hires fix: loading model upscaler from '%s'", request.hires.model_path);
|
||||||
hires_upscaler = std::make_unique<UpscalerGGML>(sd_ctx->sd->n_threads,
|
hires_upscaler = std::make_unique<UpscalerGGML>(sd_ctx->sd->n_threads,
|
||||||
false,
|
false,
|
||||||
@ -4456,6 +4575,10 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
|
|||||||
std::vector<sd::Tensor<float>> hires_final_latents;
|
std::vector<sd::Tensor<float>> hires_final_latents;
|
||||||
int64_t hires_denoise_start = ggml_time_ms();
|
int64_t hires_denoise_start = ggml_time_ms();
|
||||||
for (int b = 0; b < (int)final_latents.size(); b++) {
|
for (int b = 0; b < (int)final_latents.size(); b++) {
|
||||||
|
if (sd_ctx->sd->get_cancel_flag() == SD_CANCEL_ALL) {
|
||||||
|
LOG_ERROR("cancelling generation during hires fix");
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
int64_t cur_seed = request.seed + b;
|
int64_t cur_seed = request.seed + b;
|
||||||
sd_ctx->sd->rng->manual_seed(cur_seed);
|
sd_ctx->sd->rng->manual_seed(cur_seed);
|
||||||
sd_ctx->sd->sampler_rng->manual_seed(cur_seed);
|
sd_ctx->sd->sampler_rng->manual_seed(cur_seed);
|
||||||
@ -4886,6 +5009,10 @@ static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx,
|
|||||||
LOG_ERROR("no latent video to decode");
|
LOG_ERROR("no latent video to decode");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
if (sd_ctx->sd->get_cancel_flag() == SD_CANCEL_ALL) {
|
||||||
|
LOG_ERROR("cancelling video decode");
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
sd::Tensor<float> video_latent = final_latent;
|
sd::Tensor<float> video_latent = final_latent;
|
||||||
if (sd_version_is_ltxav(sd_ctx->sd->version) &&
|
if (sd_version_is_ltxav(sd_ctx->sd->version) &&
|
||||||
video_latent.shape()[3] > sd_ctx->sd->get_latent_channel()) {
|
video_latent.shape()[3] > sd_ctx->sd->get_latent_channel()) {
|
||||||
@ -5131,6 +5258,9 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
|||||||
if (audio_out != nullptr) {
|
if (audio_out != nullptr) {
|
||||||
*audio_out = nullptr;
|
*audio_out = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sd_ctx->sd->reset_cancel_flag();
|
||||||
|
|
||||||
if (num_frames_out != nullptr) {
|
if (num_frames_out != nullptr) {
|
||||||
*num_frames_out = 0;
|
*num_frames_out = 0;
|
||||||
}
|
}
|
||||||
@ -5192,6 +5322,10 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
|||||||
sd::Tensor<float> noise = sd::Tensor<float>::randn_like(x_t, sd_ctx->sd->rng);
|
sd::Tensor<float> noise = sd::Tensor<float>::randn_like(x_t, sd_ctx->sd->rng);
|
||||||
|
|
||||||
if (plan.high_noise_sample_steps > 0) {
|
if (plan.high_noise_sample_steps > 0) {
|
||||||
|
if (sd_ctx->sd->get_cancel_flag() == SD_CANCEL_ALL) {
|
||||||
|
LOG_ERROR("cancelling generation before high-noise sampling");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
LOG_DEBUG("sample(high noise) %dx%dx%d", W, H, T);
|
LOG_DEBUG("sample(high noise) %dx%dx%d", W, H, T);
|
||||||
|
|
||||||
int64_t sampling_start = ggml_time_ms();
|
int64_t sampling_start = ggml_time_ms();
|
||||||
@ -5234,6 +5368,10 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
|||||||
LOG_INFO("sampling(high noise) completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
|
LOG_INFO("sampling(high noise) completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sd_ctx->sd->get_cancel_flag() == SD_CANCEL_ALL) {
|
||||||
|
LOG_ERROR("cancelling generation before sampling");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
LOG_DEBUG("sample %dx%dx%d", W, H, T);
|
LOG_DEBUG("sample %dx%dx%d", W, H, T);
|
||||||
int64_t sampling_start = ggml_time_ms();
|
int64_t sampling_start = ggml_time_ms();
|
||||||
sd::Tensor<float> final_latent = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model,
|
sd::Tensor<float> final_latent = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model,
|
||||||
@ -5270,6 +5408,10 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
|||||||
LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
|
LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
|
||||||
|
|
||||||
if (latent_upscale_enabled) {
|
if (latent_upscale_enabled) {
|
||||||
|
if (sd_ctx->sd->get_cancel_flag() == SD_CANCEL_ALL) {
|
||||||
|
LOG_ERROR("cancelling generation before latent upscale");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
int64_t upscale_start = ggml_time_ms();
|
int64_t upscale_start = ggml_time_ms();
|
||||||
sd::Tensor<float> upscaled_latent = upscale_ltx_spatial_video_latent(sd_ctx,
|
sd::Tensor<float> upscaled_latent = upscale_ltx_spatial_video_latent(sd_ctx,
|
||||||
request.hires.model_path,
|
request.hires.model_path,
|
||||||
@ -5329,6 +5471,10 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
|||||||
}
|
}
|
||||||
sd::Tensor<float> hires_denoise_mask;
|
sd::Tensor<float> hires_denoise_mask;
|
||||||
sd::Tensor<float> hires_video_positions;
|
sd::Tensor<float> hires_video_positions;
|
||||||
|
if (sd_ctx->sd->get_cancel_flag() == SD_CANCEL_ALL) {
|
||||||
|
LOG_ERROR("cancelling generation before latent upscale refine");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
if (!apply_ltxv_refine_image_conditioning(sd_ctx,
|
if (!apply_ltxv_refine_image_conditioning(sd_ctx,
|
||||||
sd_vid_gen_params,
|
sd_vid_gen_params,
|
||||||
hires_request,
|
hires_request,
|
||||||
@ -5408,6 +5554,10 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
|||||||
if (sd_version_is_ltxav(sd_ctx->sd->version) &&
|
if (sd_version_is_ltxav(sd_ctx->sd->version) &&
|
||||||
latents.audio_length > 0 &&
|
latents.audio_length > 0 &&
|
||||||
sd_ctx->sd->audio_vae_model != nullptr) {
|
sd_ctx->sd->audio_vae_model != nullptr) {
|
||||||
|
if (sd_ctx->sd->get_cancel_flag() == SD_CANCEL_ALL) {
|
||||||
|
LOG_ERROR("cancelling generation before audio decode");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
int64_t audio_latent_decode_start = ggml_time_ms();
|
int64_t audio_latent_decode_start = ggml_time_ms();
|
||||||
|
|
||||||
auto audio_latent = unpack_ltxav_audio_latent(final_latent,
|
auto audio_latent = unpack_ltxav_audio_latent(final_latent,
|
||||||
@ -5440,6 +5590,11 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
|
|||||||
final_latent = sd::ops::slice(final_latent, 2, latents.ref_image_num, final_latent.shape()[2]);
|
final_latent = sd::ops::slice(final_latent, 2, latents.ref_image_num, final_latent.shape()[2]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sd_ctx->sd->get_cancel_flag() == SD_CANCEL_ALL) {
|
||||||
|
LOG_ERROR("cancelling generation before video decode");
|
||||||
|
free_sd_audio(generated_audio);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
auto result = decode_video_outputs(sd_ctx, latent_upscale_enabled ? hires_request : request, final_latent, num_frames_out);
|
auto result = decode_video_outputs(sd_ctx, latent_upscale_enabled ? hires_request : request, final_latent, num_frames_out);
|
||||||
if (result == nullptr) {
|
if (result == nullptr) {
|
||||||
free_sd_audio(generated_audio);
|
free_sd_audio(generated_audio);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user