mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-17 11:46:38 +00:00
fix: simplify PuLID ID extraction setup (#1664)
This commit is contained in:
parent
93527fda74
commit
146b6cc49e
@ -52,14 +52,15 @@ to a `.pulidembd` binary file (about 131 KB). Run it once per source
|
|||||||
person; the same file is reused for any number of generations.
|
person; the same file is reused for any number of generations.
|
||||||
|
|
||||||
A reference Python script is provided alongside this docs file at
|
A reference Python script is provided alongside this docs file at
|
||||||
[`scripts/pulid_extract_id.py`](../scripts/pulid_extract_id.py). It
|
[`script/pulid_extract_id.py`](../script/pulid_extract_id.py). It
|
||||||
requires:
|
requires:
|
||||||
- A working CUDA / CPU PyTorch + diffusers stack
|
- A working CUDA / CPU PyTorch stack
|
||||||
- `insightface`, `facexlib`, `eva-clip`, `torchvision`
|
- `insightface`, `facexlib`, `eva-clip`, `torchvision`, `opencv-python`,
|
||||||
|
`huggingface_hub`, `gguf`
|
||||||
- The PuLID weights file (same one stable-diffusion.cpp will load below)
|
- The PuLID weights file (same one stable-diffusion.cpp will load below)
|
||||||
- The ToTheBeginning/PuLID repo's `pulid/pipeline_flux.py` (and its
|
- The ToTheBeginning/PuLID repo's `pulid/` package (including
|
||||||
dependencies under `pulid/` and `flux/`) -- recommended to vendor
|
`pulid/pipeline_flux.py`) and `eva_clip/` package on `PYTHONPATH`; `flux/`
|
||||||
rather than pip-install due to upstream packaging quirks
|
is not needed for embedding extraction
|
||||||
|
|
||||||
Run it as:
|
Run it as:
|
||||||
|
|
||||||
|
|||||||
@ -417,7 +417,7 @@ ArgOptions SDContextParams::get_options() {
|
|||||||
&photo_maker_path},
|
&photo_maker_path},
|
||||||
{"",
|
{"",
|
||||||
"--pulid-weights",
|
"--pulid-weights",
|
||||||
"path to PuLID flux weights (e.g. pulid_flux_v0.9.1.safetensors). Identity is injected during the denoise loop when paired with --pulid-id-embedding.",
|
"path to PuLID Flux weights",
|
||||||
&pulid_weights_path},
|
&pulid_weights_path},
|
||||||
{"",
|
{"",
|
||||||
"--upscale-model",
|
"--upscale-model",
|
||||||
@ -894,7 +894,7 @@ ArgOptions SDGenerationParams::get_options() {
|
|||||||
&pm_id_embed_path},
|
&pm_id_embed_path},
|
||||||
{"",
|
{"",
|
||||||
"--pulid-id-embedding",
|
"--pulid-id-embedding",
|
||||||
"path to a .pulidembd binary produced by pulid_extract_id.py. Carries a (32, 2048) identity embedding extracted from a source portrait. Pair with --pulid-weights on the context.",
|
"path to PuLID id embedding",
|
||||||
&pulid_id_embedding_path},
|
&pulid_id_embedding_path},
|
||||||
{"",
|
{"",
|
||||||
"--hires-upscaler",
|
"--hires-upscaler",
|
||||||
@ -1048,7 +1048,7 @@ ArgOptions SDGenerationParams::get_options() {
|
|||||||
&pm_style_strength},
|
&pm_style_strength},
|
||||||
{"",
|
{"",
|
||||||
"--pulid-id-weight",
|
"--pulid-id-weight",
|
||||||
"strength of PuLID identity injection (default: 1.0). 0.7-1.2 are typical; lower lets the prompt override the face more, higher tightens identity match.",
|
"strength of PuLID identity injection",
|
||||||
&pulid_id_weight},
|
&pulid_id_weight},
|
||||||
{"",
|
{"",
|
||||||
"--control-strength",
|
"--control-strength",
|
||||||
|
|||||||
@ -133,10 +133,6 @@ struct SDContextParams {
|
|||||||
std::string control_net_path;
|
std::string control_net_path;
|
||||||
std::string embedding_dir;
|
std::string embedding_dir;
|
||||||
std::string photo_maker_path;
|
std::string photo_maker_path;
|
||||||
// PuLID-Flux identity-preservation context path: the safetensors blob
|
|
||||||
// carrying the PerceiverAttentionCA cross-attention weights. Loaded
|
|
||||||
// once with the model. Per-generation pulid_id_embedding_path lives in
|
|
||||||
// SDGenerationParams below.
|
|
||||||
std::string pulid_weights_path;
|
std::string pulid_weights_path;
|
||||||
sd_type_t wtype = SD_TYPE_COUNT;
|
sd_type_t wtype = SD_TYPE_COUNT;
|
||||||
std::string tensor_type_rules;
|
std::string tensor_type_rules;
|
||||||
@ -239,9 +235,6 @@ struct SDGenerationParams {
|
|||||||
std::string pm_id_embed_path;
|
std::string pm_id_embed_path;
|
||||||
float pm_style_strength = 20.f;
|
float pm_style_strength = 20.f;
|
||||||
|
|
||||||
// PuLID-Flux: per-generation identity embedding (binary file produced by
|
|
||||||
// runtime-scripts/pulid_extract_id.py). Format documented in
|
|
||||||
// include/stable-diffusion.h sd_pulid_params_t.
|
|
||||||
std::string pulid_id_embedding_path;
|
std::string pulid_id_embedding_path;
|
||||||
float pulid_id_weight = 1.0f;
|
float pulid_id_weight = 1.0f;
|
||||||
|
|
||||||
|
|||||||
@ -195,15 +195,6 @@ typedef struct {
|
|||||||
const sd_embedding_t* embeddings;
|
const sd_embedding_t* embeddings;
|
||||||
uint32_t embedding_count;
|
uint32_t embedding_count;
|
||||||
const char* photo_maker_path;
|
const char* photo_maker_path;
|
||||||
/**
|
|
||||||
* Path to pulid_flux_v0.9.1.safetensors (the PuLID identity-injection
|
|
||||||
* cross-attention weights). When set together with sd_img_gen_params_t.
|
|
||||||
* pulid_params.id_embedding_path, the Flux diffusion model performs PuLID
|
|
||||||
* cross-attention injection during the denoise loop. Loaded once with
|
|
||||||
* the model; the embedding is per-generation. Currently only meaningful
|
|
||||||
* for Flux (depth=19 double, 38 single blocks); silently ignored for
|
|
||||||
* other model versions.
|
|
||||||
*/
|
|
||||||
const char* pulid_weights_path;
|
const char* pulid_weights_path;
|
||||||
const char* tensor_type_rules;
|
const char* tensor_type_rules;
|
||||||
int n_threads;
|
int n_threads;
|
||||||
@ -282,23 +273,9 @@ typedef struct {
|
|||||||
float style_strength;
|
float style_strength;
|
||||||
} sd_pm_params_t; // photo maker
|
} sd_pm_params_t; // photo maker
|
||||||
|
|
||||||
/**
|
|
||||||
* PuLID-Flux identity preservation params.
|
|
||||||
*
|
|
||||||
* Unlike PhotoMaker (which extracts the ID embedding inside the inference
|
|
||||||
* process from a directory of images), PuLID's ID extraction is a heavy
|
|
||||||
* Python-only stack (insightface ArcFace + EVA-CLIP-L + IDFormer). To stay
|
|
||||||
* cross-vendor in C++/Vulkan, sd.cpp consumes a precomputed binary file
|
|
||||||
* produced by an external tool (runtime-scripts/pulid_extract_id.py in the
|
|
||||||
* Cloudhands client tree).
|
|
||||||
*
|
|
||||||
* Format: a gguf container with a single tensor "pulid_id" of shape
|
|
||||||
* [token_dim, num_tokens] (ggml order; typically [2048, 32]) in F16/F32/BF16.
|
|
||||||
* Loaded with the standard gguf reader; see docs/pulid.md.
|
|
||||||
*/
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
const char* id_embedding_path; // path to .pulidembd file produced by pulid_extract_id.py
|
const char* id_embedding_path;
|
||||||
float id_weight; // strength of the ID injection; typical 0.7-1.2, default 1.0
|
float id_weight;
|
||||||
} sd_pulid_params_t;
|
} sd_pulid_params_t;
|
||||||
|
|
||||||
enum sd_cache_mode_t {
|
enum sd_cache_mode_t {
|
||||||
|
|||||||
@ -2,26 +2,18 @@
|
|||||||
Precompute a PuLID-Flux identity embedding from a single source portrait.
|
Precompute a PuLID-Flux identity embedding from a single source portrait.
|
||||||
|
|
||||||
Writes a gguf file (a single tensor `pulid_id`) that stable-diffusion.cpp's
|
Writes a gguf file (a single tensor `pulid_id`) that stable-diffusion.cpp's
|
||||||
`--pulid-id-embedding` flag consumes. See docs/pulid.md for the format and
|
`--pulid-id-embedding` flag consumes.
|
||||||
overall PuLID-Flux flow.
|
|
||||||
|
|
||||||
This script intentionally lives outside the C++ build: identity extraction
|
|
||||||
needs insightface + EVA-CLIP-L + IDFormer, which are PyTorch-only stacks
|
|
||||||
that would be impractical to reimplement in ggml just to run once per
|
|
||||||
source person. The C++ side downstream of this file is cross-vendor and
|
|
||||||
backend-agnostic.
|
|
||||||
|
|
||||||
Dependencies (recommended: vendor rather than pip-install due to upstream
|
Dependencies (recommended: vendor rather than pip-install due to upstream
|
||||||
packaging quirks):
|
packaging quirks):
|
||||||
- torch + safetensors
|
- torch + safetensors
|
||||||
- The ToTheBeginning/PuLID repository's `pulid/pipeline_flux.py` and
|
- The ToTheBeginning/PuLID repository's `pulid/` package and `eva_clip/`.
|
||||||
its sibling packages (`flux/`, `eva_clip/`, `models/`). Put them on
|
Put them on PYTHONPATH or sys.path before running this script.
|
||||||
PYTHONPATH or sys.path before running this script.
|
- insightface, facexlib, torchvision, opencv-python, huggingface_hub, gguf
|
||||||
- insightface, facexlib (PuLID pipeline pulls these in)
|
|
||||||
- numpy, Pillow
|
- numpy, Pillow
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python pulid_extract_id.py \\
|
python script/pulid_extract_id.py \\
|
||||||
--portrait /path/to/source-photo.jpg \\
|
--portrait /path/to/source-photo.jpg \\
|
||||||
--pulid-weights /path/to/pulid_flux_v0.9.1.safetensors \\
|
--pulid-weights /path/to/pulid_flux_v0.9.1.safetensors \\
|
||||||
--out /path/to/source.pulidembd
|
--out /path/to/source.pulidembd
|
||||||
@ -35,21 +27,7 @@ from __future__ import annotations
|
|||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
def _make_minimal_flux_skeleton(device):
|
|
||||||
"""PuLIDPipeline expects a `dit` (Flux transformer) to attach its
|
|
||||||
PerceiverAttentionCA modules to during construction. We never run a
|
|
||||||
forward pass on it -- the encoders alone (which is what we actually
|
|
||||||
need) live on the pipeline object, not the dit. So we instantiate a
|
|
||||||
real Flux skeleton with default params and never load its weights."""
|
|
||||||
import torch
|
|
||||||
from flux.model import Flux
|
|
||||||
from flux.util import configs
|
|
||||||
|
|
||||||
with torch.device("cpu"):
|
|
||||||
model = Flux(configs["flux-dev"].params).to(torch.bfloat16)
|
|
||||||
return model
|
|
||||||
|
|
||||||
|
|
||||||
def extract(portrait_path: str, pulid_weights: str) -> "torch.Tensor":
|
def extract(portrait_path: str, pulid_weights: str) -> "torch.Tensor":
|
||||||
@ -65,18 +43,17 @@ def extract(portrait_path: str, pulid_weights: str) -> "torch.Tensor":
|
|||||||
|
|
||||||
print(f"device={device}", flush=True)
|
print(f"device={device}", flush=True)
|
||||||
|
|
||||||
print("constructing minimal Flux skeleton (no weights loaded)", flush=True)
|
# PuLIDPipeline only attaches pulid_ca attributes to `dit` during
|
||||||
dit = _make_minimal_flux_skeleton(device)
|
# construction; get_id_embedding() never runs Flux, so a dummy object is
|
||||||
|
# enough and avoids importing/building a Flux skeleton.
|
||||||
print("instantiating PuLIDPipeline", flush=True)
|
print("instantiating PuLIDPipeline with a dummy Flux object", flush=True)
|
||||||
pulid = PuLIDPipeline(dit=dit, device=device,
|
dit = SimpleNamespace()
|
||||||
|
pulid = PuLIDPipeline(dit=dit,
|
||||||
|
device=device,
|
||||||
weight_dtype=torch.bfloat16,
|
weight_dtype=torch.bfloat16,
|
||||||
onnx_provider=onnx_provider)
|
onnx_provider=onnx_provider)
|
||||||
|
|
||||||
print(f"loading PuLID weights from {pulid_weights}", flush=True)
|
print(f"loading PuLID weights from {pulid_weights}", flush=True)
|
||||||
# PuLIDPipeline.load_pretrain expects a "version" string used to construct
|
|
||||||
# the default filename when pretrain_path is None. We pass the file
|
|
||||||
# directly so the version string is informational only.
|
|
||||||
pulid.load_pretrain(pretrain_path=pulid_weights, version="v0.9.1")
|
pulid.load_pretrain(pretrain_path=pulid_weights, version="v0.9.1")
|
||||||
|
|
||||||
print(f"extracting ID embedding from {portrait_path}", flush=True)
|
print(f"extracting ID embedding from {portrait_path}", flush=True)
|
||||||
@ -100,10 +77,6 @@ def write_embd(tensor, out_path: str, dtype_choice: str) -> None:
|
|||||||
|
|
||||||
os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
|
os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
|
||||||
|
|
||||||
# The embedding ships as a standard gguf container holding a single tensor
|
|
||||||
# named "pulid_id". numpy is row-major (num_tokens, token_dim); gguf stores
|
|
||||||
# dims reversed, so stable-diffusion.cpp reads it back as
|
|
||||||
# ne[0]=token_dim, ne[1]=num_tokens (see load_pulid_id_embedding).
|
|
||||||
writer = gguf.GGUFWriter(out_path, arch="pulid")
|
writer = gguf.GGUFWriter(out_path, arch="pulid")
|
||||||
writer.add_uint32("pulid.version", 1)
|
writer.add_uint32("pulid.version", 1)
|
||||||
|
|
||||||
@ -7,15 +7,6 @@
|
|||||||
#include "core/util.h"
|
#include "core/util.h"
|
||||||
#include "gguf.h"
|
#include "gguf.h"
|
||||||
|
|
||||||
// Load the precomputed PuLID identity embedding produced by
|
|
||||||
// scripts/pulid_extract_id.py into a sd::Tensor<float> (always materialized as
|
|
||||||
// fp32 for the diffusion path). Returns an empty tensor on any failure (the
|
|
||||||
// caller treats empty as "PuLID off").
|
|
||||||
//
|
|
||||||
// The file is a standard gguf container holding a single tensor named
|
|
||||||
// "pulid_id" with shape [token_dim, num_tokens] (ggml order; typically
|
|
||||||
// [2048, 32]) in f16 / bf16 / f32. Using gguf rather than a bespoke header
|
|
||||||
// means the shape + dtype are self-describing and we reuse ggml's reader.
|
|
||||||
static sd::Tensor<float> load_pulid_id_embedding(const char* path) {
|
static sd::Tensor<float> load_pulid_id_embedding(const char* path) {
|
||||||
sd::Tensor<float> empty;
|
sd::Tensor<float> empty;
|
||||||
if (path == nullptr || strlen(path) == 0) {
|
if (path == nullptr || strlen(path) == 0) {
|
||||||
@ -83,20 +74,9 @@ static sd::Tensor<float> load_pulid_id_embedding(const char* path) {
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
// PuLID-Flux identity injection as a generation extension.
|
|
||||||
//
|
|
||||||
// Unlike PhotoMaker, PuLID does NOT modify the conditioning -- it injects an
|
|
||||||
// identity embedding via cross-attention *inside* the Flux denoise forward (the
|
|
||||||
// pulid_ca.* blocks). Those cross-attention weights are part of the Flux
|
|
||||||
// diffusion model and are loaded into the model tensor map before the model is
|
|
||||||
// constructed (see SDImpl ctor, gated on sd_ctx_params.pulid_weights_path), so
|
|
||||||
// this extension does not own a separate model. Its job is purely runtime:
|
|
||||||
// - prepare_condition: load the per-generation id-embedding file.
|
|
||||||
// - before_diffusion: hand that embedding (+ weight) to FluxDiffusionExtra,
|
|
||||||
// which flux.hpp reads to drive the pulid_ca injection.
|
|
||||||
struct PuLIDExtension : public GenerationExtension {
|
struct PuLIDExtension : public GenerationExtension {
|
||||||
bool enabled = false;
|
bool enabled = false;
|
||||||
sd::Tensor<float> id_embedding; // per-generation; empty when PuLID is off for this request
|
sd::Tensor<float> id_embedding;
|
||||||
float id_weight = 1.0f;
|
float id_weight = 1.0f;
|
||||||
|
|
||||||
const char* name() const override {
|
const char* name() const override {
|
||||||
|
|||||||
@ -4,31 +4,6 @@
|
|||||||
#include "core/ggml_extend.hpp"
|
#include "core/ggml_extend.hpp"
|
||||||
#include "model/common/block.hpp"
|
#include "model/common/block.hpp"
|
||||||
|
|
||||||
/**
|
|
||||||
* PuLID-Flux identity injection for stable-diffusion.cpp.
|
|
||||||
*
|
|
||||||
* Mirrors the PerceiverAttentionCA module from
|
|
||||||
* https://github.com/ToTheBeginning/PuLID/blob/main/pulid/encoders_transformer.py
|
|
||||||
*
|
|
||||||
* Each instance is a cross-attention layer where:
|
|
||||||
* Q comes from image tokens (dim = 3072 = Flux hidden_size)
|
|
||||||
* K, V come from a precomputed ID embedding (kv_dim = 2048, num_tokens = 32)
|
|
||||||
*
|
|
||||||
* 20 instances are inserted into the Flux denoise loop at fixed intervals:
|
|
||||||
* - Every 2nd of the 19 double_blocks (10 hook points)
|
|
||||||
* - Every 4th of the 38 single_blocks (10 hook points), giving
|
|
||||||
* ceil(19/2) + ceil(38/4) = 10 + 10 = 20 in total
|
|
||||||
*
|
|
||||||
* Weight key prefix in pulid_flux_v0.9.1.safetensors:
|
|
||||||
* pulid_ca.<i>.norm1.{weight,bias}
|
|
||||||
* pulid_ca.<i>.norm2.{weight,bias}
|
|
||||||
* pulid_ca.<i>.to_q.weight
|
|
||||||
* pulid_ca.<i>.to_kv.weight
|
|
||||||
* pulid_ca.<i>.to_out.weight
|
|
||||||
*
|
|
||||||
* Pure-ggml implementation: all ops have Vulkan / CUDA / Metal kernels in
|
|
||||||
* the upstream ggml backends, so this works cross-vendor by construction.
|
|
||||||
*/
|
|
||||||
class PuLIDPerceiverAttentionCA : public GGMLBlock {
|
class PuLIDPerceiverAttentionCA : public GGMLBlock {
|
||||||
public:
|
public:
|
||||||
static constexpr int64_t DEFAULT_DIM = 3072; // Flux hidden size
|
static constexpr int64_t DEFAULT_DIM = 3072; // Flux hidden size
|
||||||
@ -41,7 +16,7 @@ protected:
|
|||||||
int64_t dim_head;
|
int64_t dim_head;
|
||||||
int64_t heads;
|
int64_t heads;
|
||||||
int64_t kv_dim;
|
int64_t kv_dim;
|
||||||
int64_t inner_dim; // dim_head * heads = 2048
|
int64_t inner_dim;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
PuLIDPerceiverAttentionCA(int64_t dim = DEFAULT_DIM,
|
PuLIDPerceiverAttentionCA(int64_t dim = DEFAULT_DIM,
|
||||||
@ -53,12 +28,6 @@ public:
|
|||||||
heads(heads),
|
heads(heads),
|
||||||
kv_dim(kv_dim),
|
kv_dim(kv_dim),
|
||||||
inner_dim(dim_head * heads) {
|
inner_dim(dim_head * heads) {
|
||||||
// Note the PyTorch reference's surprising signature:
|
|
||||||
// norm1 operates on x (the id_embedding side, kv_dim wide)
|
|
||||||
// norm2 operates on latents (the image tokens, dim wide)
|
|
||||||
// to_q consumes latents (dim -> inner_dim)
|
|
||||||
// to_kv consumes x (kv_dim -> 2*inner_dim)
|
|
||||||
// to_out projects (inner_dim -> dim)
|
|
||||||
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(kv_dim));
|
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(kv_dim));
|
||||||
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||||
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(dim, inner_dim, /*bias=*/false));
|
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(dim, inner_dim, /*bias=*/false));
|
||||||
@ -66,17 +35,6 @@ public:
|
|||||||
blocks["to_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim, /*bias=*/false));
|
blocks["to_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim, /*bias=*/false));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Compute: residual_to_image = PerceiverAttentionCA(id_embedding, image_tokens)
|
|
||||||
*
|
|
||||||
* Inputs:
|
|
||||||
* id_embedding [N, n_id_tokens=32, kv_dim=2048]
|
|
||||||
* image_tokens [N, n_img_tokens, dim=3072]
|
|
||||||
*
|
|
||||||
* Returns:
|
|
||||||
* [N, n_img_tokens, dim=3072] -- to be added to image_tokens by the caller,
|
|
||||||
* scaled by id_weight.
|
|
||||||
*/
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||||
ggml_tensor* id_embedding,
|
ggml_tensor* id_embedding,
|
||||||
ggml_tensor* image_tokens) {
|
ggml_tensor* image_tokens) {
|
||||||
@ -86,43 +44,31 @@ public:
|
|||||||
auto to_kv = std::dynamic_pointer_cast<Linear>(blocks["to_kv"]);
|
auto to_kv = std::dynamic_pointer_cast<Linear>(blocks["to_kv"]);
|
||||||
auto to_out = std::dynamic_pointer_cast<Linear>(blocks["to_out"]);
|
auto to_out = std::dynamic_pointer_cast<Linear>(blocks["to_out"]);
|
||||||
|
|
||||||
// Normalize each input on its own dim. The PyTorch reference normalizes
|
ggml_tensor* x_normed = norm1->forward(ctx, id_embedding);
|
||||||
// x (id_embedding) and `latents` (image_tokens) separately, then uses
|
ggml_tensor* lat_normed = norm2->forward(ctx, image_tokens);
|
||||||
// latents for Q and x for K/V -- mind the unusual cross-attention shape.
|
|
||||||
ggml_tensor* x_normed = norm1->forward(ctx, id_embedding); // [N, 32, 2048]
|
|
||||||
ggml_tensor* lat_normed = norm2->forward(ctx, image_tokens); // [N, T_img, 3072]
|
|
||||||
|
|
||||||
// Projections. to_q : 3072 -> 2048 ; to_kv : 2048 -> 4096 (k concat v).
|
|
||||||
ggml_tensor* q = to_q->forward(ctx, lat_normed); // [N, T_img, 2048]
|
ggml_tensor* q = to_q->forward(ctx, lat_normed); // [N, T_img, 2048]
|
||||||
ggml_tensor* kv = to_kv->forward(ctx, x_normed); // [N, 32, 4096]
|
ggml_tensor* kv = to_kv->forward(ctx, x_normed); // [N, 32, 4096]
|
||||||
|
|
||||||
// Split KV into K (first inner_dim of last axis) and V (second
|
|
||||||
// inner_dim). ggml_view_3d gives strided views without copying;
|
|
||||||
// ggml_cont materializes them so ggml_ext_attention_ext sees
|
|
||||||
// contiguous tensors.
|
|
||||||
ggml_tensor* k = ggml_view_3d(ctx->ggml_ctx, kv,
|
ggml_tensor* k = ggml_view_3d(ctx->ggml_ctx, kv,
|
||||||
inner_dim, kv->ne[1], kv->ne[2],
|
inner_dim, kv->ne[1], kv->ne[2],
|
||||||
kv->nb[1], kv->nb[2],
|
kv->nb[1], kv->nb[2],
|
||||||
/*offset=*/0); // [N, 32, 2048]
|
/*offset=*/0);
|
||||||
ggml_tensor* v = ggml_view_3d(ctx->ggml_ctx, kv,
|
ggml_tensor* v = ggml_view_3d(ctx->ggml_ctx, kv,
|
||||||
inner_dim, kv->ne[1], kv->ne[2],
|
inner_dim, kv->ne[1], kv->ne[2],
|
||||||
kv->nb[1], kv->nb[2],
|
kv->nb[1], kv->nb[2],
|
||||||
/*offset=*/inner_dim * ggml_element_size(kv)); // [N, 32, 2048]
|
/*offset=*/inner_dim * ggml_element_size(kv));
|
||||||
k = ggml_cont(ctx->ggml_ctx, k);
|
k = ggml_cont(ctx->ggml_ctx, k);
|
||||||
v = ggml_cont(ctx->ggml_ctx, v);
|
v = ggml_cont(ctx->ggml_ctx, v);
|
||||||
|
|
||||||
// Standard multi-head attention. ggml_ext_attention_ext expects
|
|
||||||
// [N, n_token, embed_dim] and reshapes into heads internally.
|
|
||||||
// n_head = heads (=16), per-head dim = inner_dim / heads (=128).
|
|
||||||
ggml_tensor* attn_out = ggml_ext_attention_ext(
|
ggml_tensor* attn_out = ggml_ext_attention_ext(
|
||||||
ctx->ggml_ctx, ctx->backend,
|
ctx->ggml_ctx, ctx->backend,
|
||||||
q, k, v,
|
q, k, v,
|
||||||
heads,
|
heads,
|
||||||
/*mask=*/nullptr,
|
/*mask=*/nullptr,
|
||||||
/*diag_mask_inf=*/false); // [N, T_img, inner_dim=2048]
|
/*diag_mask_inf=*/false);
|
||||||
|
|
||||||
// Project back to image-token width (3072).
|
ggml_tensor* out = to_out->forward(ctx, attn_out);
|
||||||
ggml_tensor* out = to_out->forward(ctx, attn_out); // [N, T_img, 3072]
|
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
@ -50,9 +50,6 @@ namespace Flux {
|
|||||||
float ref_index_scale = 1.f;
|
float ref_index_scale = 1.f;
|
||||||
ChromaRadianceConfig chroma_radiance_params;
|
ChromaRadianceConfig chroma_radiance_params;
|
||||||
|
|
||||||
// PuLID-Flux identity injection. Turned on by the runner when a
|
|
||||||
// --pulid-weights path is provided. The intervals are fixed by the
|
|
||||||
// PuLID v0.9.1 architecture (every 2nd double, every 4th single).
|
|
||||||
bool pulid_enabled = false;
|
bool pulid_enabled = false;
|
||||||
int pulid_double_interval = 2;
|
int pulid_double_interval = 2;
|
||||||
int pulid_single_interval = 4;
|
int pulid_single_interval = 4;
|
||||||
@ -146,10 +143,6 @@ namespace Flux {
|
|||||||
if (ends_with(name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
|
if (ends_with(name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
|
||||||
head_dim = tensor_storage.ne[0];
|
head_dim = tensor_storage.ne[0];
|
||||||
}
|
}
|
||||||
// PuLID weights live alongside the diffusion model under the same
|
|
||||||
// prefix (pulid_ca.<i>.<sub>) when the pulid loader merges them in
|
|
||||||
// (see stable-diffusion.cpp). Spotting any pulid_ca.* key flips the
|
|
||||||
// flag so the Flux ctor builds the pulid_ca.<i> child blocks.
|
|
||||||
if (name.find("pulid_ca.") != std::string::npos) {
|
if (name.find("pulid_ca.") != std::string::npos) {
|
||||||
config.pulid_enabled = true;
|
config.pulid_enabled = true;
|
||||||
}
|
}
|
||||||
@ -973,15 +966,6 @@ namespace Flux {
|
|||||||
blocks["single_stream_modulation"] = std::make_shared<Modulation>(config.hidden_size, false, !config.disable_bias);
|
blocks["single_stream_modulation"] = std::make_shared<Modulation>(config.hidden_size, false, !config.disable_bias);
|
||||||
}
|
}
|
||||||
|
|
||||||
// PuLID-Flux identity-injection cross-attention modules. Only constructed
|
|
||||||
// when config.pulid_enabled is set (turned on by the runner after seeing a
|
|
||||||
// --pulid-weights path during model load). Counts come straight from PuLID
|
|
||||||
// v0.9.1's pipeline_flux.py: every `pulid_double_interval` double block
|
|
||||||
// (=2) and every `pulid_single_interval` single block (=4). For a stock
|
|
||||||
// Flux Dev (depth=19, depth_single_blocks=38), this means 10 + 10 = 20
|
|
||||||
// hook points... but the reference uses ceil-rounding so the actual count
|
|
||||||
// is `ceil(depth/2) + ceil(depth_single_blocks/4)` = 10 + 10 = 20. PuLID
|
|
||||||
// v0.9.1 trained weights have 20 entries.
|
|
||||||
if (config.pulid_enabled) {
|
if (config.pulid_enabled) {
|
||||||
int num_double_ca = (config.depth + config.pulid_double_interval - 1) / config.pulid_double_interval;
|
int num_double_ca = (config.depth + config.pulid_double_interval - 1) / config.pulid_double_interval;
|
||||||
int num_single_ca = (config.depth_single_blocks + config.pulid_single_interval - 1) / config.pulid_single_interval;
|
int num_single_ca = (config.depth_single_blocks + config.pulid_single_interval - 1) / config.pulid_single_interval;
|
||||||
@ -1084,16 +1068,6 @@ namespace Flux {
|
|||||||
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt");
|
sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt");
|
||||||
sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec");
|
sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec");
|
||||||
|
|
||||||
// PuLID identity injection: mirrors ToTheBeginning/PuLID
|
|
||||||
// pulid/encoders_transformer.py + flux/model.py. The CA layers
|
|
||||||
// run *between* transformer blocks, with their output added to
|
|
||||||
// img (scaled by id_weight) at every `pulid_double_interval`-th
|
|
||||||
// double_block and every `pulid_single_interval`-th single_block.
|
|
||||||
//
|
|
||||||
// skip_layers + PuLID is NOT a supported combination -- skipping
|
|
||||||
// a block at a PuLID-aligned index would either misalign the
|
|
||||||
// ca_idx assignment (silent quality regression) or require us
|
|
||||||
// to invent a non-reference index policy. Refuse early instead.
|
|
||||||
const bool pulid_active = config.pulid_enabled && pulid_id != nullptr;
|
const bool pulid_active = config.pulid_enabled && pulid_id != nullptr;
|
||||||
if (pulid_active && !skip_layers.empty()) {
|
if (pulid_active && !skip_layers.empty()) {
|
||||||
LOG_WARN("PuLID + skip_layers is not supported; disabling PuLID for this generation.");
|
LOG_WARN("PuLID + skip_layers is not supported; disabling PuLID for this generation.");
|
||||||
@ -1125,7 +1099,7 @@ namespace Flux {
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_img_token, hidden_size]
|
auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1); // [N, n_txt_token + n_img_token, hidden_size]
|
||||||
const int64_t n_txt_tok = txt->ne[1]; // for splitting back into img portion below
|
const int64_t n_txt_tok = txt->ne[1];
|
||||||
for (int i = 0; i < config.depth_single_blocks; i++) {
|
for (int i = 0; i < config.depth_single_blocks; i++) {
|
||||||
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + config.depth) != skip_layers.end()) {
|
if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + config.depth) != skip_layers.end()) {
|
||||||
continue;
|
continue;
|
||||||
@ -1138,8 +1112,6 @@ namespace Flux {
|
|||||||
if (pulid_run && (i % config.pulid_single_interval == 0)) {
|
if (pulid_run && (i % config.pulid_single_interval == 0)) {
|
||||||
auto pulid_ca = std::dynamic_pointer_cast<PuLIDPerceiverAttentionCA>(
|
auto pulid_ca = std::dynamic_pointer_cast<PuLIDPerceiverAttentionCA>(
|
||||||
blocks["pulid_ca." + std::to_string(ca_idx)]);
|
blocks["pulid_ca." + std::to_string(ca_idx)]);
|
||||||
// Split txt_img into [txt | img], inject ID into the img portion
|
|
||||||
// only, then concatenate back. Matches the PyTorch reference.
|
|
||||||
ggml_tensor* txt_part = ggml_view_3d(ctx->ggml_ctx, txt_img,
|
ggml_tensor* txt_part = ggml_view_3d(ctx->ggml_ctx, txt_img,
|
||||||
txt_img->ne[0], n_txt_tok, txt_img->ne[2],
|
txt_img->ne[0], n_txt_tok, txt_img->ne[2],
|
||||||
txt_img->nb[1], txt_img->nb[2],
|
txt_img->nb[1], txt_img->nb[2],
|
||||||
@ -1567,9 +1539,6 @@ namespace Flux {
|
|||||||
set_backend_tensor_data(dct, dct_vec.data());
|
set_backend_tensor_data(dct, dct_vec.data());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Materialize the PuLID id embedding into the compute graph when
|
|
||||||
// pulid_id_tensor is non-empty. forward() accepts nullptr for the
|
|
||||||
// no-injection case.
|
|
||||||
ggml_tensor* pulid_id = pulid_id_tensor.empty()
|
ggml_tensor* pulid_id = pulid_id_tensor.empty()
|
||||||
? nullptr
|
? nullptr
|
||||||
: make_input(pulid_id_tensor);
|
: make_input(pulid_id_tensor);
|
||||||
|
|||||||
@ -22,9 +22,6 @@ struct SkipLayerDiffusionExtra {
|
|||||||
struct FluxDiffusionExtra {
|
struct FluxDiffusionExtra {
|
||||||
const sd::Tensor<float>* guidance = nullptr;
|
const sd::Tensor<float>* guidance = nullptr;
|
||||||
const std::vector<int>* skip_layers = nullptr;
|
const std::vector<int>* skip_layers = nullptr;
|
||||||
// PuLID-Flux: precomputed (N=1, num_tokens=32, kv_dim=2048) identity embedding
|
|
||||||
// produced by runtime-scripts/pulid_extract_id.py. nullptr when PuLID is
|
|
||||||
// disabled. id_weight is per-job (typical 0.7-1.2; default 1.0).
|
|
||||||
const sd::Tensor<float>* pulid_id = nullptr;
|
const sd::Tensor<float>* pulid_id = nullptr;
|
||||||
float pulid_id_weight = 1.0f;
|
float pulid_id_weight = 1.0f;
|
||||||
};
|
};
|
||||||
|
|||||||
@ -430,14 +430,6 @@ public:
|
|||||||
|
|
||||||
if (strlen(SAFE_STR(sd_ctx_params->pulid_weights_path)) > 0) {
|
if (strlen(SAFE_STR(sd_ctx_params->pulid_weights_path)) > 0) {
|
||||||
LOG_INFO("loading PuLID weights from '%s'", sd_ctx_params->pulid_weights_path);
|
LOG_INFO("loading PuLID weights from '%s'", sd_ctx_params->pulid_weights_path);
|
||||||
// PuLID's cross-attention (pulid_ca.*) weights are part of the Flux
|
|
||||||
// diffusion model -- its blocks are constructed inside FluxModel when
|
|
||||||
// the tensor map contains pulid_ca.* keys. So they must be merged into
|
|
||||||
// the model loader here, BEFORE the diffusion model is built; that is
|
|
||||||
// why this stays in the ctor rather than in the pulid generation
|
|
||||||
// extension (whose init runs after model construction). The runtime
|
|
||||||
// side -- per-generation id-embedding + per-step injection -- lives in
|
|
||||||
// src/extensions/pulid_extension.cpp.
|
|
||||||
if (!model_loader.init_from_file(sd_ctx_params->pulid_weights_path,
|
if (!model_loader.init_from_file(sd_ctx_params->pulid_weights_path,
|
||||||
"model.diffusion_model.")) {
|
"model.diffusion_model.")) {
|
||||||
LOG_WARN("loading PuLID weights from '%s' failed", sd_ctx_params->pulid_weights_path);
|
LOG_WARN("loading PuLID weights from '%s' failed", sd_ctx_params->pulid_weights_path);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user