fix: simplify PuLID ID extraction setup (#1664)

2026-06-17 11:46:38 +00:00 · 2026-06-15 23:55:38 +08:00 · 2026-06-15 23:55:38 +08:00 · 146b6cc49e
commit 146b6cc49e
parent 93527fda74
10 changed files with 85 additions and 257 deletions
--- a/docs/pulid.md
+++ b/docs/pulid.md
@ -52,14 +52,15 @@ to a `.pulidembd` binary file (about 131 KB). Run it once per source
 person; the same file is reused for any number of generations.
 A reference Python script is provided alongside this docs file at
-[`scripts/pulid_extract_id.py`](../scripts/pulid_extract_id.py). It
+[`script/pulid_extract_id.py`](../script/pulid_extract_id.py). It
 requires:
- A working CUDA / CPU PyTorch + diffusers stack
+- A working CUDA / CPU PyTorch stack
- `insightface`, `facexlib`, `eva-clip`, `torchvision`
+- `insightface`, `facexlib`, `eva-clip`, `torchvision`, `opencv-python`,
  `huggingface_hub`, `gguf`
 - The PuLID weights file (same one stable-diffusion.cpp will load below)
- The ToTheBeginning/PuLID repo's `pulid/pipeline_flux.py` (and its
+- The ToTheBeginning/PuLID repo's `pulid/` package (including
-  dependencies under `pulid/` and `flux/`) -- recommended to vendor
+  `pulid/pipeline_flux.py`) and `eva_clip/` package on `PYTHONPATH`; `flux/`
-  rather than pip-install due to upstream packaging quirks
+  is not needed for embedding extraction
 Run it as:
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@ -417,7 +417,7 @@ ArgOptions SDContextParams::get_options() {
         &photo_maker_path},
        {"",
         "--pulid-weights",
-         "path to PuLID flux weights (e.g. pulid_flux_v0.9.1.safetensors). Identity is injected during the denoise loop when paired with --pulid-id-embedding.",
+         "path to PuLID Flux weights",
         &pulid_weights_path},
        {"",
         "--upscale-model",
@ -894,7 +894,7 @@ ArgOptions SDGenerationParams::get_options() {
         &pm_id_embed_path},
        {"",
         "--pulid-id-embedding",
-         "path to a .pulidembd binary produced by pulid_extract_id.py. Carries a (32, 2048) identity embedding extracted from a source portrait. Pair with --pulid-weights on the context.",
+         "path to PuLID id embedding",
         &pulid_id_embedding_path},
        {"",
         "--hires-upscaler",
@ -1048,7 +1048,7 @@ ArgOptions SDGenerationParams::get_options() {
         &pm_style_strength},
        {"",
         "--pulid-id-weight",
-         "strength of PuLID identity injection (default: 1.0). 0.7-1.2 are typical; lower lets the prompt override the face more, higher tightens identity match.",
+         "strength of PuLID identity injection",
         &pulid_id_weight},
        {"",
         "--control-strength",
--- a/examples/common/common.h
+++ b/examples/common/common.h
@ -133,10 +133,6 @@ struct SDContextParams {
    std::string control_net_path;
    std::string embedding_dir;
    std::string photo_maker_path;
    // PuLID-Flux identity-preservation context path: the safetensors blob
    // carrying the PerceiverAttentionCA cross-attention weights. Loaded
    // once with the model. Per-generation pulid_id_embedding_path lives in
    // SDGenerationParams below.
    std::string pulid_weights_path;
    sd_type_t wtype = SD_TYPE_COUNT;
    std::string tensor_type_rules;
@ -239,9 +235,6 @@ struct SDGenerationParams {
    std::string pm_id_embed_path;
    float pm_style_strength = 20.f;
    // PuLID-Flux: per-generation identity embedding (binary file produced by
    // runtime-scripts/pulid_extract_id.py). Format documented in
    // include/stable-diffusion.h sd_pulid_params_t.
    std::string pulid_id_embedding_path;
    float pulid_id_weight = 1.0f;
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@ -195,15 +195,6 @@ typedef struct {
    const sd_embedding_t* embeddings;
    uint32_t embedding_count;
    const char* photo_maker_path;
    /**
     * Path to pulid_flux_v0.9.1.safetensors (the PuLID identity-injection
     * cross-attention weights). When set together with sd_img_gen_params_t.
     * pulid_params.id_embedding_path, the Flux diffusion model performs PuLID
     * cross-attention injection during the denoise loop. Loaded once with
     * the model; the embedding is per-generation. Currently only meaningful
     * for Flux (depth=19 double, 38 single blocks); silently ignored for
     * other model versions.
     */
    const char* pulid_weights_path;
    const char* tensor_type_rules;
    int n_threads;
@ -282,23 +273,9 @@ typedef struct {
    float style_strength;
 } sd_pm_params_t;  // photo maker
 /**
 * PuLID-Flux identity preservation params.
 *
 * Unlike PhotoMaker (which extracts the ID embedding inside the inference
 * process from a directory of images), PuLID's ID extraction is a heavy
 * Python-only stack (insightface ArcFace + EVA-CLIP-L + IDFormer). To stay
 * cross-vendor in C++/Vulkan, sd.cpp consumes a precomputed binary file
 * produced by an external tool (runtime-scripts/pulid_extract_id.py in the
 * Cloudhands client tree).
 *
 * Format: a gguf container with a single tensor "pulid_id" of shape
 * [token_dim, num_tokens] (ggml order; typically [2048, 32]) in F16/F32/BF16.
 * Loaded with the standard gguf reader; see docs/pulid.md.
 */
 typedef struct {
-    const char* id_embedding_path;  // path to .pulidembd file produced by pulid_extract_id.py
+    const char* id_embedding_path;
-    float id_weight;                // strength of the ID injection; typical 0.7-1.2, default 1.0
+    float id_weight;
 } sd_pulid_params_t;
 enum sd_cache_mode_t {
--- a/scripts/pulid_extract_id.py
+++ b/scripts/pulid_extract_id.py
@ -2,26 +2,18 @@
 Precompute a PuLID-Flux identity embedding from a single source portrait.
 Writes a gguf file (a single tensor `pulid_id`) that stable-diffusion.cpp's
-`--pulid-id-embedding` flag consumes. See docs/pulid.md for the format and
+`--pulid-id-embedding` flag consumes. See docs/pulid.md for the
 overall PuLID-Flux flow.
 This script intentionally lives outside the C++ build: identity extraction
 needs insightface + EVA-CLIP-L + IDFormer, which are PyTorch-only stacks
 that would be impractical to reimplement in ggml just to run once per
 source person. The C++ side downstream of this file is cross-vendor and
 backend-agnostic.
 Dependencies (recommended: vendor rather than pip-install due to upstream
 packaging quirks):
  - torch + safetensors
-  - The ToTheBeginning/PuLID repository's `pulid/pipeline_flux.py` and
+  - The ToTheBeginning/PuLID repository's `pulid/` package and `eva_clip/`.
-    its sibling packages (`flux/`, `eva_clip/`, `models/`). Put them on
+    Put them on PYTHONPATH or sys.path before running this script.
-    PYTHONPATH or sys.path before running this script.
+  - torchvision, opencv-python, huggingface_hub, gguf
  - insightface, facexlib (PuLID pipeline pulls these in)
  - numpy, Pillow
 Usage:
-  python pulid_extract_id.py \\
+  python script/pulid_extract_id.py \\
    --portrait /path/to/source-photo.jpg \\
    --pulid-weights /path/to/pulid_flux_v0.9.1.safetensors \\
    --out /path/to/source.pulidembd
@ -35,21 +27,7 @@ from __future__ import annotations
 import argparse
 import os
 import sys
-
+from types import SimpleNamespace
 def _make_minimal_flux_skeleton(device):
    """PuLIDPipeline expects a `dit` (Flux transformer) to attach its
    PerceiverAttentionCA modules to during construction. We never run a
    forward pass on it -- the encoders alone (which is what we actually
    need) live on the pipeline object, not the dit. So we instantiate a
    real Flux skeleton with default params and never load its weights."""
    import torch
    from flux.model import Flux
    from flux.util import configs
    with torch.device("cpu"):
        model = Flux(configs["flux-dev"].params).to(torch.bfloat16)
    return model
 def extract(portrait_path: str, pulid_weights: str) -> "torch.Tensor":
@ -65,18 +43,17 @@ def extract(portrait_path: str, pulid_weights: str) -> "torch.Tensor":
    print(f"device={device}", flush=True)
-    print("constructing minimal Flux skeleton (no weights loaded)", flush=True)
+    # PuLIDPipeline only attaches pulid_ca attributes to `dit` during
-    dit = _make_minimal_flux_skeleton(device)
+    # construction; get_id_embedding() never runs Flux, so a dummy object is
-
+    # enough and avoids importing/building a Flux skeleton.
-    print("instantiating PuLIDPipeline", flush=True)
+    print("instantiating PuLIDPipeline with a dummy Flux object", flush=True)
-    pulid = PuLIDPipeline(dit=dit, device=device,
+    dit = SimpleNamespace()
    pulid = PuLIDPipeline(dit=dit,
                          device=device,
                          weight_dtype=torch.bfloat16,
                          onnx_provider=onnx_provider)
    print(f"loading PuLID weights from {pulid_weights}", flush=True)
    # PuLIDPipeline.load_pretrain expects a "version" string used to construct
    # the default filename when pretrain_path is None. We pass the file
    # directly so the version string is informational only.
    pulid.load_pretrain(pretrain_path=pulid_weights, version="v0.9.1")
    print(f"extracting ID embedding from {portrait_path}", flush=True)
@ -100,10 +77,6 @@ def write_embd(tensor, out_path: str, dtype_choice: str) -> None:
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
    # The embedding ships as a standard gguf container holding a single tensor
    # named "pulid_id". numpy is row-major (num_tokens, token_dim); gguf stores
    # dims reversed, so stable-diffusion.cpp reads it back as
    # ne[0]=token_dim, ne[1]=num_tokens (see load_pulid_id_embedding).
    writer = gguf.GGUFWriter(out_path, arch="pulid")
    writer.add_uint32("pulid.version", 1)
--- a/src/extensions/pulid_extension.cpp
+++ b/src/extensions/pulid_extension.cpp
@ -7,15 +7,6 @@
 #include "core/util.h"
 #include "gguf.h"
 // Load the precomputed PuLID identity embedding produced by
 // scripts/pulid_extract_id.py into a sd::Tensor<float> (always materialized as
 // fp32 for the diffusion path). Returns an empty tensor on any failure (the
 // caller treats empty as "PuLID off").
 //
 // The file is a standard gguf container holding a single tensor named
 // "pulid_id" with shape [token_dim, num_tokens] (ggml order; typically
 // [2048, 32]) in f16 / bf16 / f32. Using gguf rather than a bespoke header
 // means the shape + dtype are self-describing and we reuse ggml's reader.
 static sd::Tensor<float> load_pulid_id_embedding(const char* path) {
    sd::Tensor<float> empty;
    if (path == nullptr || strlen(path) == 0) {
@ -83,20 +74,9 @@ static sd::Tensor<float> load_pulid_id_embedding(const char* path) {
    return out;
 }
 // PuLID-Flux identity injection as a generation extension.
 //
 // Unlike PhotoMaker, PuLID does NOT modify the conditioning -- it injects an
 // identity embedding via cross-attention *inside* the Flux denoise forward (the
 // pulid_ca.* blocks). Those cross-attention weights are part of the Flux
 // diffusion model and are loaded into the model tensor map before the model is
 // constructed (see SDImpl ctor, gated on sd_ctx_params.pulid_weights_path), so
 // this extension does not own a separate model. Its job is purely runtime:
 //   - prepare_condition: load the per-generation id-embedding file.
 //   - before_diffusion:  hand that embedding (+ weight) to FluxDiffusionExtra,
 //                        which flux.hpp reads to drive the pulid_ca injection.
 struct PuLIDExtension : public GenerationExtension {
    bool enabled = false;
-    sd::Tensor<float> id_embedding;  // per-generation; empty when PuLID is off for this request
+    sd::Tensor<float> id_embedding;
    float id_weight = 1.0f;
    const char* name() const override {
--- a/src/model/adapter/pulid.hpp
+++ b/src/model/adapter/pulid.hpp
@ -4,31 +4,6 @@
 #include "core/ggml_extend.hpp"
 #include "model/common/block.hpp"
 /**
 * PuLID-Flux identity injection for stable-diffusion.cpp.
 *
 * Mirrors the PerceiverAttentionCA module from
 * https://github.com/ToTheBeginning/PuLID/blob/main/pulid/encoders_transformer.py
 *
 * Each instance is a cross-attention layer where:
 *   Q comes from image tokens             (dim = 3072 = Flux hidden_size)
 *   K, V come from a precomputed ID embedding (kv_dim = 2048, num_tokens = 32)
 *
 * 20 instances are inserted into the Flux denoise loop at fixed intervals:
 *   - Every 2nd of the 19 double_blocks  (ceil(19 / 2) = 10 hook points)
 *   - Every 4th of the 38 single_blocks  (ceil(38 / 4) = 10 hook points)
 *
 * Weight key prefix in pulid_flux_v0.9.1.safetensors:
 *   pulid_ca.<i>.norm1.{weight,bias}
 *   pulid_ca.<i>.norm2.{weight,bias}
 *   pulid_ca.<i>.to_q.weight
 *   pulid_ca.<i>.to_kv.weight
 *   pulid_ca.<i>.to_out.weight
 *
 * Pure-ggml implementation: all ops have Vulkan / CUDA / Metal kernels in
 * the upstream ggml backends, so this works cross-vendor by construction.
 */
 class PuLIDPerceiverAttentionCA : public GGMLBlock {
 public:
    static constexpr int64_t DEFAULT_DIM      = 3072;  // Flux hidden size
@ -41,7 +16,7 @@ protected:
    int64_t dim_head;
    int64_t heads;
    int64_t kv_dim;
-    int64_t inner_dim;  // dim_head * heads = 2048
+    int64_t inner_dim;
 public:
    PuLIDPerceiverAttentionCA(int64_t dim      = DEFAULT_DIM,
@ -53,12 +28,6 @@ public:
          heads(heads),
          kv_dim(kv_dim),
          inner_dim(dim_head * heads) {
        // Note the PyTorch reference's surprising signature:
        // norm1 operates on x (the id_embedding side, kv_dim wide)
        // norm2 operates on latents (the image tokens, dim wide)
        // to_q  consumes latents (dim -> inner_dim)
        // to_kv consumes x       (kv_dim -> 2*inner_dim)
        // to_out projects        (inner_dim -> dim)
        blocks["norm1"]  = std::shared_ptr<GGMLBlock>(new LayerNorm(kv_dim));
        blocks["norm2"]  = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
        blocks["to_q"]   = std::shared_ptr<GGMLBlock>(new Linear(dim, inner_dim, /*bias=*/false));
@ -66,17 +35,6 @@ public:
        blocks["to_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim, /*bias=*/false));
    }
    /**
     * Compute: residual_to_image = PerceiverAttentionCA(id_embedding, image_tokens)
     *
     * Inputs:
     *   id_embedding  [N, n_id_tokens=32, kv_dim=2048]
     *   image_tokens  [N, n_img_tokens,  dim=3072]
     *
     * Returns:
     *   [N, n_img_tokens, dim=3072]  -- to be added to image_tokens by the caller,
     *                                  scaled by id_weight.
     */
    ggml_tensor* forward(GGMLRunnerContext* ctx,
                         ggml_tensor* id_embedding,
                         ggml_tensor* image_tokens) {
@ -86,43 +44,31 @@ public:
        auto to_kv  = std::dynamic_pointer_cast<Linear>(blocks["to_kv"]);
        auto to_out = std::dynamic_pointer_cast<Linear>(blocks["to_out"]);
-        // Normalize each input on its own dim. The PyTorch reference normalizes
+        ggml_tensor* x_normed   = norm1->forward(ctx, id_embedding);
-        // x (id_embedding) and `latents` (image_tokens) separately, then uses
+        ggml_tensor* lat_normed = norm2->forward(ctx, image_tokens);
        // latents for Q and x for K/V -- mind the unusual cross-attention shape.
        ggml_tensor* x_normed   = norm1->forward(ctx, id_embedding);    // [N, 32, 2048]
        ggml_tensor* lat_normed = norm2->forward(ctx, image_tokens);    // [N, T_img, 3072]
        // Projections. to_q : 3072 -> 2048 ; to_kv : 2048 -> 4096 (k concat v).
        ggml_tensor* q  = to_q->forward(ctx, lat_normed);  // [N, T_img, 2048]
-        ggml_tensor* kv = to_kv->forward(ctx, x_normed);    // [N, 32,    4096]
+        ggml_tensor* kv = to_kv->forward(ctx, x_normed);   // [N, 32, 4096]
        // Split KV into K (first inner_dim of last axis) and V (second
        // inner_dim). ggml_view_3d gives strided views without copying;
        // ggml_cont materializes them so ggml_ext_attention_ext sees
        // contiguous tensors.
        ggml_tensor* k = ggml_view_3d(ctx->ggml_ctx, kv,
                                      inner_dim, kv->ne[1], kv->ne[2],
                                      kv->nb[1], kv->nb[2],
-                                       /*offset=*/0);                              // [N, 32, 2048]
+                                      /*offset=*/0);
        ggml_tensor* v = ggml_view_3d(ctx->ggml_ctx, kv,
                                      inner_dim, kv->ne[1], kv->ne[2],
                                      kv->nb[1], kv->nb[2],
-                                       /*offset=*/inner_dim * ggml_element_size(kv)); // [N, 32, 2048]
+                                      /*offset=*/inner_dim * ggml_element_size(kv));
        k              = ggml_cont(ctx->ggml_ctx, k);
        v              = ggml_cont(ctx->ggml_ctx, v);
        // Standard multi-head attention. ggml_ext_attention_ext expects
        // [N, n_token, embed_dim] and reshapes into heads internally.
        // n_head = heads (=16), per-head dim = inner_dim / heads (=128).
        ggml_tensor* attn_out = ggml_ext_attention_ext(
            ctx->ggml_ctx, ctx->backend,
            q, k, v,
            heads,
            /*mask=*/nullptr,
-            /*diag_mask_inf=*/false);  // [N, T_img, inner_dim=2048]
+            /*diag_mask_inf=*/false);
-        // Project back to image-token width (3072).
+        ggml_tensor* out = to_out->forward(ctx, attn_out);
        ggml_tensor* out = to_out->forward(ctx, attn_out);  // [N, T_img, 3072]
        return out;
    }
 };
--- a/src/model/diffusion/flux.hpp
+++ b/src/model/diffusion/flux.hpp
@ -50,9 +50,6 @@ namespace Flux {
        float ref_index_scale     = 1.f;
        ChromaRadianceConfig chroma_radiance_params;
        // PuLID-Flux identity injection. Turned on by the runner when a
        // --pulid-weights path is provided. The intervals are fixed by the
        // PuLID v0.9.1 architecture (every 2nd double, every 4th single).
        bool pulid_enabled        = false;
        int pulid_double_interval = 2;
        int pulid_single_interval = 4;
@ -146,10 +143,6 @@ namespace Flux {
                if (ends_with(name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
                    head_dim = tensor_storage.ne[0];
                }
                // PuLID weights live alongside the diffusion model under the same
                // prefix (pulid_ca.<i>.<sub>) when the pulid loader merges them in
                // (see stable-diffusion.cpp). Spotting any pulid_ca.* key flips the
                // flag so the Flux ctor builds the pulid_ca.<i> child blocks.
                if (name.find("pulid_ca.") != std::string::npos) {
                    config.pulid_enabled = true;
                }
@ -973,15 +966,6 @@ namespace Flux {
                blocks["single_stream_modulation"]     = std::make_shared<Modulation>(config.hidden_size, false, !config.disable_bias);
            }
            // PuLID-Flux identity-injection cross-attention modules. Only constructed
            // when config.pulid_enabled is set (turned on by the runner after seeing a
            // --pulid-weights path during model load). Counts come straight from PuLID
            // v0.9.1's pipeline_flux.py: every `pulid_double_interval` double block
            // (=2) and every `pulid_single_interval` single block (=4). The reference
            // uses ceil-rounding, so for a stock Flux Dev (depth=19,
            // depth_single_blocks=38) the count is
            // `ceil(depth/2) + ceil(depth_single_blocks/4)` = 10 + 10 = 20 hook points,
            // matching the 20 entries in the PuLID v0.9.1 trained weights.
            if (config.pulid_enabled) {
                int num_double_ca = (config.depth + config.pulid_double_interval - 1) / config.pulid_double_interval;
                int num_single_ca = (config.depth_single_blocks + config.pulid_single_interval - 1) / config.pulid_single_interval;
@ -989,10 +973,10 @@ namespace Flux {
                for (int i = 0; i < num_ca; i++) {
                    blocks["pulid_ca." + std::to_string(i)] =
                        std::shared_ptr<GGMLBlock>(new PuLIDPerceiverAttentionCA(
-                            /*dim=*/    config.hidden_size,
+                            /*dim=*/config.hidden_size,
                            /*dim_head=*/PuLIDPerceiverAttentionCA::DEFAULT_DIM_HEAD,
-                            /*heads=*/   PuLIDPerceiverAttentionCA::DEFAULT_HEADS,
+                            /*heads=*/PuLIDPerceiverAttentionCA::DEFAULT_HEADS,
-                            /*kv_dim=*/  PuLIDPerceiverAttentionCA::DEFAULT_KV_DIM));
+                            /*kv_dim=*/PuLIDPerceiverAttentionCA::DEFAULT_KV_DIM));
                }
            }
        }
@ -1084,16 +1068,6 @@ namespace Flux {
            sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt");
            sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec");
            // PuLID identity injection: mirrors ToTheBeginning/PuLID
            // pulid/encoders_transformer.py + flux/model.py. The CA layers
            // run *between* transformer blocks, with their output added to
            // img (scaled by id_weight) at every `pulid_double_interval`-th
            // double_block and every `pulid_single_interval`-th single_block.
            //
            // skip_layers + PuLID is NOT a supported combination -- skipping
            // a block at a PuLID-aligned index would either misalign the
            // ca_idx assignment (silent quality regression) or require us
            // to invent a non-reference index policy. Refuse early instead.
            const bool pulid_active = config.pulid_enabled && pulid_id != nullptr;
            if (pulid_active && !skip_layers.empty()) {
                LOG_WARN("PuLID + skip_layers is not supported; disabling PuLID for this generation.");
@ -1125,7 +1099,7 @@ namespace Flux {
            }
            auto txt_img            = ggml_concat(ctx->ggml_ctx, txt, img, 1);  // [N, n_txt_token + n_img_token, hidden_size]
-            const int64_t n_txt_tok = txt->ne[1];                     // for splitting back into img portion below
+            const int64_t n_txt_tok = txt->ne[1];
            for (int i = 0; i < config.depth_single_blocks; i++) {
                if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + config.depth) != skip_layers.end()) {
                    continue;
@ -1138,8 +1112,6 @@ namespace Flux {
                if (pulid_run && (i % config.pulid_single_interval == 0)) {
                    auto pulid_ca = std::dynamic_pointer_cast<PuLIDPerceiverAttentionCA>(
                        blocks["pulid_ca." + std::to_string(ca_idx)]);
                    // Split txt_img into [txt | img], inject ID into the img portion
                    // only, then concatenate back. Matches the PyTorch reference.
                    ggml_tensor* txt_part = ggml_view_3d(ctx->ggml_ctx, txt_img,
                                                         txt_img->ne[0], n_txt_tok, txt_img->ne[2],
                                                         txt_img->nb[1], txt_img->nb[2],
@ -1567,9 +1539,6 @@ namespace Flux {
                set_backend_tensor_data(dct, dct_vec.data());
            }
            // Materialize the PuLID id embedding into the compute graph when
            // pulid_id_tensor is non-empty. forward() accepts nullptr for the
            // no-injection case.
            ggml_tensor* pulid_id = pulid_id_tensor.empty()
                                        ? nullptr
                                        : make_input(pulid_id_tensor);
--- a/src/model/diffusion/model.hpp
+++ b/src/model/diffusion/model.hpp
@ -22,9 +22,6 @@ struct SkipLayerDiffusionExtra {
 struct FluxDiffusionExtra {
    const sd::Tensor<float>* guidance   = nullptr;
    const std::vector<int>* skip_layers = nullptr;
    // PuLID-Flux: precomputed (N=1, num_tokens=32, kv_dim=2048) identity embedding
    // produced by runtime-scripts/pulid_extract_id.py. nullptr when PuLID is
    // disabled. id_weight is per-job (typical 0.7-1.2; default 1.0).
    const sd::Tensor<float>* pulid_id   = nullptr;
    float pulid_id_weight               = 1.0f;
 };
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -430,14 +430,6 @@ public:
        if (strlen(SAFE_STR(sd_ctx_params->pulid_weights_path)) > 0) {
            LOG_INFO("loading PuLID weights from '%s'", sd_ctx_params->pulid_weights_path);
            // PuLID's cross-attention (pulid_ca.*) weights are part of the Flux
            // diffusion model -- its blocks are constructed inside FluxModel when
            // the tensor map contains pulid_ca.* keys. So they must be merged into
            // the model loader here, BEFORE the diffusion model is built; that is
            // why this stays in the ctor rather than in the pulid generation
            // extension (whose init runs after model construction). The runtime
            // side -- per-generation id-embedding + per-step injection -- lives in
            // src/extensions/pulid_extension.cpp.
            if (!model_loader.init_from_file(sd_ctx_params->pulid_weights_path,
                                             "model.diffusion_model.")) {
                LOG_WARN("loading PuLID weights from '%s' failed", sd_ctx_params->pulid_weights_path);