feat: add SeFi-Image support (#1707)

This commit is contained in:
fszontagh 2026-06-28 16:49:24 +02:00 committed by GitHub
parent f54e45e81c
commit 03e9a22f4d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 736 additions and 17 deletions

View File

@ -53,6 +53,7 @@ API and command-line option may change frequently.***
- [ERNIE-Image](./docs/ernie_image.md) - [ERNIE-Image](./docs/ernie_image.md)
- [Boogu Image](./docs/boogu_image.md) - [Boogu Image](./docs/boogu_image.md)
- [Krea2](./docs/krea2.md) - [Krea2](./docs/krea2.md)
- [SeFi-Image](./docs/sefi_image.md)
- [HiDream-O1-Image](./docs/hidream_o1_image.md) - [HiDream-O1-Image](./docs/hidream_o1_image.md)
- [Ideogram4](./docs/ideogram4.md) - [Ideogram4](./docs/ideogram4.md)
- Image Edit Models - Image Edit Models

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.7 MiB

50
docs/sefi_image.md Normal file
View File

@ -0,0 +1,50 @@
# How to Use
SeFi-Image uses a Flux2-style dual-time transformer (semantic + texture streams), the standard Flux2 VAE, and Qwen3-VL as the LLM text encoder. Tech report: [arXiv:2606.22568](https://arxiv.org/abs/2606.22568).
## Download weights
The SeFi-Image family ships in three scales (1B / 2B / 5B) and three families (Base / RL / turbo), all gated on Hugging Face under https://huggingface.co/SeFi-Image.
- 1B and 2B variants pair with Qwen3-VL-2B-Instruct.
- 5B variants pair with Qwen3-VL-4B-Instruct.
- All variants use the standard Flux2 VAE (`flux2_ae.safetensors` from https://huggingface.co/black-forest-labs/FLUX.2-dev).
Convert the transformer and text encoder to sd.cpp safetensors:
```bash
python3 script/convert_sefi.py <hf_repo_dir> <out_dir>/sefi_<scale>_<family>.safetensors
python3 script/convert_qwen3_vl.py <hf_repo_dir>/Qwen3-VL-<X>B-Instruct <out_dir>/qwen3_vl_<X>b.safetensors
```
## Variant defaults
| Family | timestep_shift_alpha | steps | cfg-scale |
|---|---|---|---|
| Base | 0.3 | 50 | 4.0 |
| RL | 0.3 | 50 | 4.0 |
| turbo | 1.0 | 4 | 1.0 |
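The dispatcher picks `timestep_shift_alpha` from the filename (`turbo` substring => 1.0, otherwise 0.3). To override it, pass `--extra-sample-args sefi_alpha=<value>`; the offset between the semantic and texture schedules, `delta_t` (default 0.1), can be set the same way with `sefi_delta_t=<value>`.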
## Examples
### 1B / 2B turbo
```
./build/bin/sd-cli --diffusion-model /path/to/sefi_1b_turbo.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_2b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 1.0 --steps 4 -W 1024 -H 1024 -s 42 --diffusion-fa --offload-to-cpu -o out.png
```
### 1B / 2B base
```
./build/bin/sd-cli --diffusion-model /path/to/sefi_1b_base.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_2b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 4.0 --steps 50 -W 1024 -H 1024 -s 42 --diffusion-fa --offload-to-cpu -o out.png
```
### 5B (needs streaming on 12 GiB VRAM)
```
./build/bin/sd-cli --diffusion-model /path/to/sefi_5b_turbo.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_4b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 1.0 --steps 4 -W 1024 -H 1024 -s 42 --diffusion-fa --max-vram 8 --stream-layers --offload-to-cpu -o out.png
```
<img alt="SeFi-Image 5B turbo example" src="../assets/sefi_image/example.png" />

View File

@ -81,6 +81,7 @@ enum prediction_t {
FLOW_PRED, FLOW_PRED,
FLUX_FLOW_PRED, FLUX_FLOW_PRED,
FLUX2_FLOW_PRED, FLUX2_FLOW_PRED,
SEFI_FLOW_PRED,
PREDICTION_COUNT PREDICTION_COUNT
}; };

112
script/convert_qwen3_vl.py Normal file
View File

@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""Convert a Qwen3-VL HF safetensors checkpoint into a sd.cpp-loadable form.
The HF dump prefixes text-tower keys with ``model.language_model.`` and
vision-tower keys with ``model.visual.``. sd.cpp expects ``model.<rest>`` for
the text side; the vision side is converted by sd.cpp's own
``convert_qwen3_vl_vision_name`` and is left as-is here.
Operates on raw safetensors bytes so any dtype (BF16/F16/F32) is preserved.
Usage:
python3 script/convert_qwen3_vl.py <hf_qwen3_vl_dir_or_safetensors> <output.safetensors>
"""
import argparse
import json
import os
import struct
import sys
def rewrite_key(key: str) -> str:
    """Translate one HF Qwen3-VL tensor name into the name written to the output.

    Names under ``model.language_model.`` are re-rooted directly under
    ``model.``; every other name is returned unchanged.
    """
    text_prefix = "model.language_model."
    if not key.startswith(text_prefix):
        return key
    return "model." + key[len(text_prefix):]
def read_safetensors_header(path: str):
    """Read and decode the JSON header at the start of a safetensors file.

    The file is read as an 8-byte little-endian unsigned length followed by
    that many bytes of JSON.

    Args:
        path: Path of the file to inspect.

    Returns:
        A ``(header, data_start)`` tuple: the decoded JSON header and the byte
        offset (``8 + header length``) immediately after the header.

    Raises:
        ValueError: If the file is too short to hold the length prefix or the
            header length it announces, or if the header is not valid JSON.
    """
    with open(path, "rb") as f:
        prefix = f.read(8)
        if len(prefix) != 8:
            raise ValueError(f"{path}: too short to contain a safetensors header length")
        (hdr_len,) = struct.unpack("<Q", prefix)
        # Compare the announced length with the real file size before reading,
        # so a corrupt prefix cannot trigger an enormous allocation.
        available = os.fstat(f.fileno()).st_size - 8
        if hdr_len > available:
            raise ValueError(
                f"{path}: header claims {hdr_len} bytes but only {available} bytes follow the length prefix")
        hdr_bytes = f.read(hdr_len)
    return json.loads(hdr_bytes), 8 + hdr_len
def collect_shard_paths(path: str):
    """Resolve *path* to the safetensors file(s) that make up the checkpoint.

    Args:
        path: Either a single file, or a directory containing
            ``model.safetensors.index.json`` and/or ``model.safetensors``.

    Returns:
        A list of file paths. For an indexed directory this is the sorted,
        de-duplicated set of shard files named in the index's ``weight_map``;
        otherwise it is a one-element list.

    Raises:
        FileNotFoundError: If *path* does not exist, or is a directory with
            neither an index file nor ``model.safetensors``.
    """
    if os.path.isfile(path):
        return [path]
    if not os.path.isdir(path):
        raise FileNotFoundError(path)

    index_path = os.path.join(path, "model.safetensors.index.json")
    if os.path.isfile(index_path):
        # Decode the index as UTF-8 explicitly instead of relying on the
        # platform's locale default encoding.
        with open(index_path, encoding="utf-8") as f:
            idx = json.load(f)
        return sorted({os.path.join(path, n) for n in idx["weight_map"].values()})

    single = os.path.join(path, "model.safetensors")
    if os.path.isfile(single):
        return [single]
    raise FileNotFoundError(f"No Qwen3-VL safetensors in {path}")
def stage_tensors(input_path: str):
    """List every tensor of the checkpoint together with where its bytes live.

    Returns a list of ``(new_key, shard_path, data_start, info)`` tuples, one
    per tensor in every shard, where ``new_key`` is the renamed tensor name,
    ``data_start`` is the offset returned by ``read_safetensors_header`` and
    ``info`` is the tensor's header entry. The ``__metadata__`` header entry is
    skipped.
    """
    staged = []
    for shard in collect_shard_paths(input_path):
        header, payload_start = read_safetensors_header(shard)
        staged.extend(
            (rewrite_key(name), shard, payload_start, meta)
            for name, meta in header.items()
            if name != "__metadata__"
        )
    return staged
def write_consolidated(out_path: str, entries):
    """Write all staged tensors into one safetensors file at *out_path*.

    Tensors are emitted in sorted key order. Their bytes are streamed
    unchanged from the source shards, and the header records fresh contiguous
    ``data_offsets`` for each of them.

    Args:
        out_path: Destination file; created or truncated.
        entries: Iterable of ``(new_key, shard_path, data_start, info)`` tuples
            where ``info`` carries ``dtype``, ``shape`` and ``data_offsets``.

    Raises:
        ValueError: If *out_path* is also one of the source shards, or if two
            entries share the same output key.
        IOError: If a source shard ends before a tensor's bytes are complete.
    """
    entries = sorted(entries, key=lambda e: e[0])

    # Opening the output truncates it, so refuse to write over a file that
    # still has to be read from.
    out_real = os.path.realpath(out_path)
    source_paths = {shard_path for _, shard_path, _, _ in entries}
    if out_real in {os.path.realpath(p) for p in source_paths}:
        raise ValueError(f"Output path {out_path} is also an input shard; choose a different output")

    new_header = {}
    cur_offset = 0
    for new_key, shard_path, data_off, info in entries:
        # A repeated key would overwrite the header entry while both payloads
        # are still written, leaving bytes no header entry points to.
        if new_key in new_header:
            raise ValueError(f"Duplicate output tensor name: {new_key}")
        start, end = info["data_offsets"]
        size = end - start
        new_header[new_key] = {
            "dtype": info["dtype"],
            "shape": info["shape"],
            "data_offsets": [cur_offset, cur_offset + size],
        }
        cur_offset += size

    header_json = json.dumps(new_header, separators=(",", ":")).encode("utf-8")
    # Pad the header with spaces to a multiple of 8 bytes.
    pad = (-len(header_json)) % 8
    header_json = header_json + (b" " * pad)

    with open(out_path, "wb") as out:
        out.write(struct.pack("<Q", len(header_json)))
        out.write(header_json)
        for new_key, shard_path, data_off, info in entries:
            start, end = info["data_offsets"]
            with open(shard_path, "rb") as src:
                src.seek(data_off + start)
                remaining = end - start
                # Copy in bounded chunks so large tensors are never held in
                # memory whole.
                while remaining > 0:
                    chunk = src.read(min(8 * 1024 * 1024, remaining))
                    if not chunk:
                        raise IOError(f"Truncated tensor in {shard_path}")
                    out.write(chunk)
                    remaining -= len(chunk)
def main():
    """Command-line entry point: consolidate a Qwen3-VL checkpoint into one file."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("input", help="HF Qwen3-VL directory or single safetensors file")
    cli.add_argument("output", help="Output single safetensors path")
    opts = cli.parse_args()

    staged = stage_tensors(opts.input)
    print(f"Tensors: {len(staged)}")
    print(f"Writing -> {opts.output}")

    # A bare filename has no directory component; fall back to the current directory.
    target_dir = os.path.dirname(opts.output) or "."
    os.makedirs(target_dir, exist_ok=True)
    write_consolidated(opts.output, staged)

    size_gb = os.path.getsize(opts.output) / 1e9
    print(f"Done. Output size: {size_gb:.2f} GB")


if __name__ == "__main__":
    main()

279
script/convert_sefi.py Normal file
View File

@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""Convert a SeFi-Image diffusers checkpoint into a single sd.cpp-compatible safetensors.
Operates on raw safetensors bytes so any dtype (BF16, F32, ...) is preserved exactly.
No numpy or torch dependency required.
Usage:
python3 script/convert_sefi.py <sefi_diffusers_dir> <output.safetensors>
"""
import argparse
import json
import os
import re
import struct
import sys
_LINEAR_TO_LIN = re.compile(r"\.linear\.")
_SHARED_MOD_PREFIXES = (
    "double_stream_modulation_img",
    "double_stream_modulation_txt",
    "single_stream_modulation",
)
# Exact-name renames for tensors that live outside the per-block lists.
_TOP_LEVEL_RENAMES = {
    "context_embedder.weight": "txt_in.weight",
    "context_embedder.bias": "txt_in.bias",
    "x_embedder.weight": "img_in.weight",
    "x_embedder.bias": "img_in.bias",
    "proj_out.weight": "final_layer.linear.weight",
    "proj_out.bias": "final_layer.linear.bias",
    "norm_out.linear.weight": "final_layer.adaLN_modulation.1.weight",
    "norm_out.linear.bias": "final_layer.adaLN_modulation.1.bias",
}


def rewrite_transformer_key(key: str) -> str:
    """Translate one source transformer tensor name into its output name.

    Steps, in order:
      * ``dual_time_embed.*`` names are returned untouched;
      * a leading ``backbone.`` is dropped;
      * for the shared modulation tensors the first ``.linear.`` becomes ``.lin.``;
      * exact top-level names are renamed via a lookup table;
      * ``transformer_blocks.N.*`` / ``single_transformer_blocks.N.*`` are
        delegated to the per-block rewriters;
      * anything else is returned as it stands after the steps above.
    """
    if key.startswith("dual_time_embed."):
        return key
    if key.startswith("backbone."):
        key = key[len("backbone."):]

    shared_prefixes = tuple(prefix + "." for prefix in _SHARED_MOD_PREFIXES)
    if key.startswith(shared_prefixes):
        key = _LINEAR_TO_LIN.sub(".lin.", key, count=1)

    renamed = _TOP_LEVEL_RENAMES.get(key)
    if renamed is not None:
        return renamed

    double_hit = re.match(r"transformer_blocks\.(\d+)\.(.*)$", key)
    if double_hit is not None:
        return _rewrite_double_stream(*double_hit.groups())

    single_hit = re.match(r"single_transformer_blocks\.(\d+)\.(.*)$", key)
    if single_hit is not None:
        return _rewrite_single_stream(*single_hit.groups())

    return key
# Source tail -> output tail for tensors inside one double-stream block.
# Both spellings of the feed-forward layers (``net.0.proj`` / ``net.2`` and
# ``linear_in`` / ``linear_out``) map to the same output names.
_DOUBLE_STREAM_TAIL_RENAMES = {
    "norm1.linear.weight": "img_mod.lin.weight",
    "norm1_context.linear.weight": "txt_mod.lin.weight",
    "attn.norm_q.weight": "img_attn.norm.query_norm.scale",
    "attn.norm_k.weight": "img_attn.norm.key_norm.scale",
    "attn.norm_added_q.weight": "txt_attn.norm.query_norm.scale",
    "attn.norm_added_k.weight": "txt_attn.norm.key_norm.scale",
    "attn.to_out.0.weight": "img_attn.proj.weight",
    "attn.to_add_out.weight": "txt_attn.proj.weight",
    "ff.net.0.proj.weight": "img_mlp.0.weight",
    "ff.net.2.weight": "img_mlp.2.weight",
    "ff_context.net.0.proj.weight": "txt_mlp.0.weight",
    "ff_context.net.2.weight": "txt_mlp.2.weight",
    "ff.linear_in.weight": "img_mlp.0.weight",
    "ff.linear_out.weight": "img_mlp.2.weight",
    "ff_context.linear_in.weight": "txt_mlp.0.weight",
    "ff_context.linear_out.weight": "txt_mlp.2.weight",
}


def _rewrite_double_stream(idx: str, tail: str) -> str:
    """Return the output name for tensor *tail* of double-stream block *idx*.

    Tails missing from the rename table are kept verbatim under the
    ``double_blocks.<idx>.`` prefix.
    """
    return f"double_blocks.{idx}." + _DOUBLE_STREAM_TAIL_RENAMES.get(tail, tail)
# Split q/k/v projections of a double-stream block that are fused into one
# output tensor. Each tuple is (q_tail, k_tail, v_tail, fused_target_tail):
# the first three are source tails under ``backbone.transformer_blocks.<N>.``,
# the last is the tail of the fused tensor under ``double_blocks.<N>.``.
# The first row is the image stream, the second the text stream.
QKV_DOUBLE_TRIPLETS = [
    ("attn.to_q.weight", "attn.to_k.weight", "attn.to_v.weight", "img_attn.qkv.weight"),
    ("attn.add_q_proj.weight", "attn.add_k_proj.weight", "attn.add_v_proj.weight", "txt_attn.qkv.weight"),
]
# Source tail -> output tail for tensors inside one single-stream block.
_SINGLE_STREAM_TAIL_RENAMES = {
    "norm.linear.weight": "modulation.lin.weight",
    "attn.norm_q.weight": "norm.query_norm.scale",
    "attn.norm_k.weight": "norm.key_norm.scale",
    "attn.to_qkv_mlp_proj.weight": "linear1.weight",
    "attn.to_out.weight": "linear2.weight",
}


def _rewrite_single_stream(idx: str, tail: str) -> str:
    """Return the output name for tensor *tail* of single-stream block *idx*.

    Tails missing from the rename table are kept verbatim under the
    ``single_blocks.<idx>.`` prefix.
    """
    return f"single_blocks.{idx}." + _SINGLE_STREAM_TAIL_RENAMES.get(tail, tail)
def read_safetensors_header(path: str):
    """Return (header dict, data start byte offset) for a safetensors file.

    The file is read as an 8-byte little-endian unsigned length followed by
    that many bytes of JSON; the returned offset is ``8 + header length``.

    Raises:
        ValueError: If the file is too short to hold the length prefix or the
            header length it announces, or if the header is not valid JSON.
    """
    with open(path, "rb") as f:
        prefix = f.read(8)
        if len(prefix) != 8:
            raise ValueError(f"{path}: too short to contain a safetensors header length")
        (hdr_len,) = struct.unpack("<Q", prefix)
        # Compare the announced length with the real file size before reading,
        # so a corrupt prefix cannot trigger an enormous allocation.
        available = os.fstat(f.fileno()).st_size - 8
        if hdr_len > available:
            raise ValueError(
                f"{path}: header claims {hdr_len} bytes but only {available} bytes follow the length prefix")
        hdr_bytes = f.read(hdr_len)
    return json.loads(hdr_bytes), 8 + hdr_len
def collect_shard_paths(directory: str, weight_pattern: str):
    """List the safetensors files that hold one checkpoint section.

    Args:
        directory: Directory to search.
        weight_pattern: Base file name without extension; the function looks
            for ``<weight_pattern>.safetensors.index.json`` first and then for
            ``<weight_pattern>.safetensors``.

    Returns:
        The sorted, de-duplicated shard paths named in the index's
        ``weight_map`` when an index exists, otherwise a one-element list with
        the single checkpoint file.

    Raises:
        FileNotFoundError: If neither the index nor the single file exists.
    """
    index_path = os.path.join(directory, f"{weight_pattern}.safetensors.index.json")
    if os.path.isfile(index_path):
        # Decode the index as UTF-8 explicitly instead of relying on the
        # platform's locale default encoding.
        with open(index_path, encoding="utf-8") as f:
            idx = json.load(f)
        return sorted({os.path.join(directory, n) for n in idx["weight_map"].values()})

    single = os.path.join(directory, f"{weight_pattern}.safetensors")
    if not os.path.isfile(single):
        raise FileNotFoundError(f"No checkpoint at {directory}: missing {weight_pattern}")
    return [single]
def stage_tensors_for_section(section_dir: str, rewrite_fn):
    """Return a list of (new_key, shard_path, data_start_offset, info_dict) entries.

    Every tensor found in the ``diffusion_pytorch_model`` shards of
    *section_dir* yields one entry whose key is ``rewrite_fn(original_key)``,
    with two special cases:

    * When all three members of a ``QKV_DOUBLE_TRIPLETS`` row are present for
      a ``backbone.transformer_blocks.<N>`` block, a single "qkv_fuse"
      pseudo-entry is emitted instead. Its ``shard_path`` and offset are
      ``None`` and its info carries ``_qkv_sources`` (the q, k, v source
      descriptors) so the writer can concatenate them into one tensor.
    * Entries renamed to ``final_layer.adaLN_modulation.1.{weight,bias}`` get
      ``_chunk_swap_halves`` set on a copy of their info, asking the writer to
      emit the second half of the bytes before the first.

    Raises:
        ValueError: If the q/k/v tensors of a triplet differ in shape or dtype.
    """
    fusable_tails = {tail for trip in QKV_DOUBLE_TRIPLETS for tail in trip[:3]}

    entries = []
    # First, index all raw keys per shard so we can detect qkv triplets.
    raw_by_block = {}  # block_idx -> {tail: (key, shard_path, data_off, info)}
    raw_others = []
    for shard_path in collect_shard_paths(section_dir, "diffusion_pytorch_model"):
        hdr, data_off = read_safetensors_header(shard_path)
        for key, info in hdr.items():
            if key == "__metadata__":
                continue
            m = re.match(r"backbone\.transformer_blocks\.(\d+)\.(.*)$", key)
            if m and m.group(2) in fusable_tails:
                raw_by_block.setdefault(m.group(1), {})[m.group(2)] = (key, shard_path, data_off, info)
            else:
                raw_others.append((key, shard_path, data_off, info))

    for key, shard_path, data_off, info in raw_others:
        new_key = rewrite_fn(key)
        # Swap the (scale, shift) halves to (shift, scale) at conversion time so
        # the on-disk weight matches BFL flux ordering and the runtime stays
        # version-agnostic. norm_out.linear weight shape is [2*dim, dim] and bias
        # is [2*dim]; both split along axis 0 (outermost == row-major outer).
        if new_key in ("final_layer.adaLN_modulation.1.weight",
                       "final_layer.adaLN_modulation.1.bias"):
            info = dict(info)  # copy so the parsed header entry is not mutated
            info["_chunk_swap_halves"] = True
        entries.append((new_key, shard_path, data_off, info))

    for block_idx, tails in raw_by_block.items():
        for q_tail, k_tail, v_tail, fused_tail in QKV_DOUBLE_TRIPLETS:
            if not (q_tail in tails and k_tail in tails and v_tail in tails):
                continue
            q, k, v = tails.pop(q_tail), tails.pop(k_tail), tails.pop(v_tail)
            # Validate shapes match.
            q_shape = q[3]["shape"]
            k_shape = k[3]["shape"]
            v_shape = v[3]["shape"]
            if q_shape != k_shape or q_shape != v_shape:
                raise ValueError(f"qkv shape mismatch at block {block_idx} {q_tail}: q={q_shape} k={k_shape} v={v_shape}")
            # The writer fuses by raw byte concatenation and labels the result
            # with q's dtype, so all three must share one dtype.
            q_dtype = q[3]["dtype"]
            k_dtype = k[3]["dtype"]
            v_dtype = v[3]["dtype"]
            if q_dtype != k_dtype or q_dtype != v_dtype:
                raise ValueError(f"qkv dtype mismatch at block {block_idx} {q_tail}: q={q_dtype} k={k_dtype} v={v_dtype}")
            fused_info = {
                "dtype": q_dtype,
                "shape": [q_shape[0] * 3] + list(q_shape[1:]),
                "_qkv_sources": [q, k, v],  # pseudo field consumed by writer
            }
            entries.append((f"double_blocks.{block_idx}.{fused_tail}",
                            None, None, fused_info))
        # Anything left in tails was an unmatched single - pass through.
        for tail, payload in tails.items():
            entries.append((rewrite_fn(payload[0]),) + payload[1:])
    return entries
# Element size in bytes for the dtypes whose fused size can be cross-checked.
_DTYPE_BYTES = {
    "BF16": 2, "F16": 2, "F32": 4, "F64": 8,
    "U8": 1, "I8": 1, "I16": 2, "I32": 4, "I64": 8,
    "BOOL": 1,
}


def _total_bytes(info: dict) -> int:
    """Return how many bytes the tensor described by *info* occupies on output.

    For an ordinary entry this is the length of its ``data_offsets`` range.
    For a fused entry (one carrying ``_qkv_sources``) it is the sum of the
    source ranges, i.e. exactly the bytes that get concatenated, so it works
    for any dtype. When the dtype is listed in ``_DTYPE_BYTES`` the sum is
    additionally checked against ``shape`` x element size.

    Raises:
        ValueError: If a fused entry's declared shape and dtype disagree with
            the combined size of its sources.
    """
    if "_qkv_sources" not in info:
        start, end = info["data_offsets"]
        return end - start

    total = 0
    for _, _, _, src_info in info["_qkv_sources"]:
        start, end = src_info["data_offsets"]
        total += end - start

    elem_size = _DTYPE_BYTES.get(info["dtype"])
    if elem_size is not None:
        expected = elem_size
        for d in info["shape"]:
            expected *= d
        if expected != total:
            raise ValueError(
                f"fused tensor of shape {info['shape']} ({info['dtype']}) needs {expected} bytes "
                f"but its sources provide {total}")
    return total
def write_consolidated(out_path: str, entries):
    """Write a single safetensors file by streaming raw bytes from each shard.

    Entries are written in sorted key order with fresh contiguous
    ``data_offsets``. Three kinds of entry are handled:

    * qkv-fused entries (``_qkv_sources`` in info): q/k/v are concatenated
      along axis 0 (row-major), so a simple byte-level concatenation produces
      the correct fused layout for any standard dtype;
    * entries flagged ``_chunk_swap_halves``: the second half of the tensor's
      bytes is written before the first half;
    * everything else: the bytes are copied unchanged.

    Args:
        out_path: Destination file; created or truncated.
        entries: Iterable of ``(new_key, shard_path, data_start, info)`` tuples.

    Raises:
        ValueError: If *out_path* is also one of the source files, if two
            entries share an output key, or if a half-swapped tensor has an
            odd byte size.
        IOError: If a source file ends before a tensor's bytes are complete.
    """
    entries = sorted(entries, key=lambda e: e[0])

    # Opening the output truncates it, so refuse to write over a file that
    # still has to be read from. Fused entries keep their paths in the sources.
    source_paths = set()
    for _, shard_path, _, info in entries:
        if "_qkv_sources" in info:
            source_paths.update(src[1] for src in info["_qkv_sources"])
        else:
            source_paths.add(shard_path)
    if os.path.realpath(out_path) in {os.path.realpath(p) for p in source_paths}:
        raise ValueError(f"Output path {out_path} is also an input shard; choose a different output")

    new_header = {}
    cur_offset = 0
    for new_key, shard_path, data_off, info in entries:
        # A repeated key would overwrite the header entry while both payloads
        # are still written, leaving bytes no header entry points to.
        if new_key in new_header:
            raise ValueError(f"Duplicate output tensor name: {new_key}")
        size = _total_bytes(info)
        new_header[new_key] = {
            "dtype": info["dtype"],
            "shape": info["shape"],
            "data_offsets": [cur_offset, cur_offset + size],
        }
        cur_offset += size

    header_json = json.dumps(new_header, separators=(",", ":")).encode("utf-8")
    # Pad the header with spaces to a multiple of 8 bytes.
    pad = (-len(header_json)) % 8
    header_json = header_json + (b" " * pad)

    def copy_range(src_path, src_data_off, src_info, out, byte_range=None):
        """Copy one tensor (or the sub-range *byte_range* of it) into *out*."""
        start, end = src_info["data_offsets"]
        if byte_range is not None:
            # byte_range is relative to the start of the tensor's own bytes.
            sub_start, sub_end = byte_range
            start, end = start + sub_start, start + sub_end
        with open(src_path, "rb") as src:
            src.seek(src_data_off + start)
            remaining = end - start
            # Copy in bounded chunks so large tensors are never held in
            # memory whole.
            while remaining > 0:
                chunk = src.read(min(8 * 1024 * 1024, remaining))
                if not chunk:
                    raise IOError(f"Truncated tensor in {src_path}")
                out.write(chunk)
                remaining -= len(chunk)

    with open(out_path, "wb") as out:
        out.write(struct.pack("<Q", len(header_json)))
        out.write(header_json)
        for new_key, shard_path, data_off, info in entries:
            if "_qkv_sources" in info:
                for q_entry in info["_qkv_sources"]:
                    _, src_path, src_data_off, src_info = q_entry
                    copy_range(src_path, src_data_off, src_info, out)
            elif info.get("_chunk_swap_halves"):
                size = _total_bytes(info)
                half = size // 2
                if size != half * 2:
                    raise ValueError(f"{new_key}: odd byte size {size} cannot be split into halves")
                copy_range(shard_path, data_off, info, out, byte_range=(half, size))
                copy_range(shard_path, data_off, info, out, byte_range=(0, half))
            else:
                copy_range(shard_path, data_off, info, out)
def main():
    """Command-line entry point: convert the ``transformer`` section of a SeFi checkpoint."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("input_dir", help="SeFi diffusers checkpoint directory")
    cli.add_argument("output", help="Output transformer safetensors path (load via --diffusion-model)")
    opts = cli.parse_args()

    section_dir = os.path.join(opts.input_dir, "transformer")
    staged = stage_tensors_for_section(section_dir, rewrite_transformer_key)
    print(f"Transformer tensors: {len(staged)}")
    print(f"Writing {len(staged)} tensors -> {opts.output}")

    # A bare filename has no directory component; fall back to the current directory.
    target_dir = os.path.dirname(opts.output) or "."
    os.makedirs(target_dir, exist_ok=True)
    write_consolidated(opts.output, staged)

    size_gb = os.path.getsize(opts.output) / 1e9
    print(f"Done. Output size: {size_gb:.2f} GB")


if __name__ == "__main__":
    main()

View File

@ -1518,7 +1518,7 @@ struct LLMEmbedder : public Conditioner {
arch = LLM::LLMArch::GPT_OSS_20B; arch = LLM::LLMArch::GPT_OSS_20B;
} else if (sd_version_is_pid(version)) { } else if (sd_version_is_pid(version)) {
arch = LLM::LLMArch::GEMMA2_2B; arch = LLM::LLMArch::GEMMA2_2B;
} else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version) || sd_version_is_krea2(version)) { } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version) || sd_version_is_sefi_image(version) || sd_version_is_krea2(version)) {
arch = LLM::LLMArch::QWEN3_VL; arch = LLM::LLMArch::QWEN3_VL;
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) { } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
arch = LLM::LLMArch::QWEN3; arch = LLM::LLMArch::QWEN3;
@ -1997,6 +1997,18 @@ struct LLMEmbedder : public Conditioner {
prompt_attn_range.second = static_cast<int>(prompt.size()); prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"; prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
} else if (sd_version_is_sefi_image(version)) {
prompt_template_encode_start_idx = 0;
min_length = 1024;
out_layers = {9, 18, 27};
prompt = "<|im_start|>user\n";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n";
} else if (version == VERSION_OVIS_IMAGE) { } else if (version == VERSION_OVIS_IMAGE) {
prompt_template_encode_start_idx = 28; prompt_template_encode_start_idx = 28;
min_length = prompt_template_encode_start_idx + 256; min_length = prompt_template_encode_start_idx + 256;

View File

@ -49,6 +49,7 @@ enum SDVersion {
VERSION_LONGCAT, VERSION_LONGCAT,
VERSION_PID, VERSION_PID,
VERSION_IDEOGRAM4, VERSION_IDEOGRAM4,
VERSION_SEFI_IMAGE,
VERSION_KREA2, VERSION_KREA2,
VERSION_ESRGAN, VERSION_ESRGAN,
VERSION_COUNT, VERSION_COUNT,
@ -187,6 +188,13 @@ static inline bool sd_version_is_ideogram4(SDVersion version) {
return false; return false;
} }
// True only for the SeFi-Image model version.
static inline bool sd_version_is_sefi_image(SDVersion version) {
    return version == VERSION_SEFI_IMAGE;
}
static inline bool sd_version_is_krea2(SDVersion version) { static inline bool sd_version_is_krea2(SDVersion version) {
if (version == VERSION_KREA2) { if (version == VERSION_KREA2) {
return true; return true;
@ -202,7 +210,7 @@ static inline bool sd_version_uses_flux_vae(SDVersion version) {
} }
static inline bool sd_version_uses_flux2_vae(SDVersion version) { static inline bool sd_version_uses_flux2_vae(SDVersion version) {
if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version)) { if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version) || sd_version_is_sefi_image(version)) {
return true; return true;
} }
return false; return false;
@ -242,6 +250,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
sd_version_is_longcat(version) || sd_version_is_longcat(version) ||
sd_version_is_pid(version) || sd_version_is_pid(version) ||
sd_version_is_ideogram4(version) || sd_version_is_ideogram4(version) ||
sd_version_is_sefi_image(version) ||
sd_version_is_krea2(version)) { sd_version_is_krea2(version)) {
return true; return true;
} }

View File

@ -8,6 +8,7 @@
#include "model/common/rope.hpp" #include "model/common/rope.hpp"
#include "model/diffusion/dit.hpp" #include "model/diffusion/dit.hpp"
#include "model/diffusion/model.hpp" #include "model/diffusion/model.hpp"
#include "model/diffusion/sefi_image.hpp"
#include "model_loader.h" #include "model_loader.h"
#define FLUX_GRAPH_SIZE 10240 #define FLUX_GRAPH_SIZE 10240
@ -26,6 +27,9 @@ namespace Flux {
struct FluxConfig { struct FluxConfig {
SDVersion version = VERSION_FLUX; SDVersion version = VERSION_FLUX;
bool is_chroma = false; bool is_chroma = false;
bool is_sefi = false;
int64_t semantic_channels = 0;
float sefi_delta_t = 0.1f;
int patch_size = 2; int patch_size = 2;
int64_t in_channels = 64; int64_t in_channels = 64;
int64_t out_channels = 64; int64_t out_channels = 64;
@ -88,6 +92,21 @@ namespace Flux {
config.share_modulation = true; config.share_modulation = true;
config.ref_index_scale = 10.f; config.ref_index_scale = 10.f;
config.use_mlp_silu_act = true; config.use_mlp_silu_act = true;
} else if (sd_version_is_sefi_image(version)) {
config.is_sefi = true;
config.semantic_channels = 16;
config.in_channels = 128 + config.semantic_channels;
config.patch_size = 1;
config.out_channels = 128 + config.semantic_channels;
config.mlp_ratio = 3.f;
config.theta = 2000;
config.axes_dim = {32, 32, 32, 32};
config.vec_in_dim = 0;
config.qkv_bias = false;
config.disable_bias = true;
config.share_modulation = true;
config.ref_index_scale = 10.f;
config.use_mlp_silu_act = true;
} else if (sd_version_is_longcat(version)) { } else if (sd_version_is_longcat(version)) {
config.context_in_dim = 3584; config.context_in_dim = 3584;
config.vec_in_dim = 0; config.vec_in_dim = 0;
@ -723,8 +742,8 @@ namespace Flux {
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, 2 * hidden_size] auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, 2 * hidden_size]
auto m_vec = ggml_ext_chunk(ctx->ggml_ctx, m, 2, 0); auto m_vec = ggml_ext_chunk(ctx->ggml_ctx, m, 2, 0);
shift = m_vec[0]; // [N, hidden_size] shift = m_vec[0];
scale = m_vec[1]; // [N, hidden_size] scale = m_vec[1];
} }
x = Flux::modulate(ctx->ggml_ctx, norm_final->forward(ctx, x), shift, scale); x = Flux::modulate(ctx->ggml_ctx, norm_final->forward(ctx, x), shift, scale);
@ -902,6 +921,8 @@ namespace Flux {
} }
if (config.is_chroma) { if (config.is_chroma) {
blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(config.in_dim, config.hidden_size); blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(config.in_dim, config.hidden_size);
} else if (config.is_sefi) {
blocks["dual_time_embed"] = std::make_shared<SefiImage::SefiDualTimestepEmbeddings>(256, config.hidden_size);
} else { } else {
blocks["time_in"] = std::make_shared<MLPEmbedder>(256, config.hidden_size, !config.disable_bias); blocks["time_in"] = std::make_shared<MLPEmbedder>(256, config.hidden_size, !config.disable_bias);
if (config.vec_in_dim > 0) { if (config.vec_in_dim > 0) {
@ -1027,6 +1048,11 @@ namespace Flux {
if (y != nullptr) { if (y != nullptr) {
txt_img_mask = ggml_pad(ctx->ggml_ctx, y, static_cast<int>(img->ne[1]), 0, 0, 0); txt_img_mask = ggml_pad(ctx->ggml_ctx, y, static_cast<int>(img->ne[1]), 0, 0, 0);
} }
} else if (config.is_sefi) {
auto dual_time_embed = std::dynamic_pointer_cast<SefiImage::SefiDualTimestepEmbeddings>(blocks["dual_time_embed"]);
auto timestep_sem = ggml_view_1d(ctx->ggml_ctx, timesteps, 1, 0);
auto timestep_tex = ggml_view_1d(ctx->ggml_ctx, timesteps, 1, ggml_element_size(timesteps));
vec = dual_time_embed->forward(ctx, timestep_sem, timestep_tex);
} else { } else {
auto time_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]); auto time_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
vec = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f)); vec = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f));
@ -1500,7 +1526,7 @@ namespace Flux {
set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data()); set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data());
} }
std::set<int> txt_arange_dims; std::set<int> txt_arange_dims;
if (sd_version_is_flux2(version)) { if (sd_version_is_flux2(version) || sd_version_is_sefi_image(version)) {
txt_arange_dims = {3}; txt_arange_dims = {3};
increase_ref_index = true; increase_ref_index = true;
} else if (version == VERSION_OVIS_IMAGE) { } else if (version == VERSION_OVIS_IMAGE) {

View File

@ -0,0 +1,91 @@
#ifndef __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__
#define __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__
#include <memory>
#include "model/common/block.hpp"
namespace SefiImage {
// SeFi-Image hyper-parameters, with defaults that apply when nothing is
// detected from the checkpoint.
struct SefiImageConfig {
    int64_t semantic_channels        = 16;
    int64_t texture_latent_channels  = 32;
    int64_t timestep_guidance_in_dim = 256;
    int64_t hidden_size              = 3072;
    float timestep_shift_alpha       = 0.3f;
    float delta_t                    = 0.1f;

    // Texture channel count after multiplying by patch_size squared.
    int64_t packed_texture_channels(int patch_size) const {
        return texture_latent_channels * patch_size * patch_size;
    }

    // Semantic channels plus the packed texture channels.
    int64_t packed_input_channels(int patch_size) const {
        return semantic_channels + packed_texture_channels(patch_size);
    }

    // Builds a config from the tensors whose names start with `prefix`.
    // Only the 2-D weight ending in
    // "dual_time_embed.semantic_embedder.linear_1.weight" is inspected: its
    // ne[0] becomes timestep_guidance_in_dim and twice its ne[1] becomes
    // hidden_size. All other fields keep their defaults.
    static SefiImageConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
                                               const std::string& prefix) {
        SefiImageConfig detected;
        for (const auto& entry : tensor_storage_map) {
            const auto& tensor_name = entry.first;
            const auto& storage     = entry.second;
            const bool is_probe     = starts_with(tensor_name, prefix) &&
                                  ends_with(tensor_name, "dual_time_embed.semantic_embedder.linear_1.weight") &&
                                  storage.n_dims == 2;
            if (!is_probe) {
                continue;
            }
            detected.timestep_guidance_in_dim = storage.ne[0];
            detected.hidden_size              = storage.ne[1] * 2;
        }
        LOG_DEBUG("sefi_image: semantic_channels = %" PRId64 ", texture_latent_channels = %" PRId64 ", hidden_size = %" PRId64,
                  detected.semantic_channels,
                  detected.texture_latent_channels,
                  detected.hidden_size);
        return detected;
    }
};
struct SefiTimestepEmbedding : public GGMLBlock {
public:
SefiTimestepEmbedding(int64_t in_channels, int64_t time_embed_dim) {
blocks["linear_1"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, time_embed_dim, false));
blocks["linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim, false));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* sample) {
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
sample = linear_1->forward(ctx, sample);
sample = ggml_silu_inplace(ctx->ggml_ctx, sample);
sample = linear_2->forward(ctx, sample);
return sample;
}
};
struct SefiDualTimestepEmbeddings : public GGMLBlock {
public:
SefiDualTimestepEmbeddings(int64_t in_channels, int64_t embedding_dim) {
GGML_ASSERT(embedding_dim % 2 == 0);
int64_t half_dim = embedding_dim / 2;
blocks["semantic_embedder"] = std::make_shared<SefiTimestepEmbedding>(in_channels, half_dim);
blocks["texture_embedder"] = std::make_shared<SefiTimestepEmbedding>(in_channels, half_dim);
timestep_guidance_in_dim = in_channels;
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* timestep_sem,
ggml_tensor* timestep_tex) {
auto semantic_embedder = std::dynamic_pointer_cast<SefiTimestepEmbedding>(blocks["semantic_embedder"]);
auto texture_embedder = std::dynamic_pointer_cast<SefiTimestepEmbedding>(blocks["texture_embedder"]);
auto sem_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep_sem, timestep_guidance_in_dim, 10000, 1.f);
auto tex_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep_tex, timestep_guidance_in_dim, 10000, 1.f);
auto sem_emb = semantic_embedder->forward(ctx, sem_proj);
auto tex_emb = texture_embedder->forward(ctx, tex_proj);
return ggml_concat(ctx->ggml_ctx, sem_emb, tex_emb, 0);
}
private:
int64_t timestep_guidance_in_dim = 256;
};
} // namespace SefiImage
#endif // __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__

View File

@ -250,7 +250,7 @@ namespace LLM {
config.intermediate_size = tensor_storage.ne[1]; config.intermediate_size = tensor_storage.ne[1];
} }
} }
if (arch == LLMArch::QWEN3 && config.num_layers == 28) { if ((arch == LLMArch::QWEN3 || arch == LLMArch::QWEN3_VL) && config.num_layers == 28) {
config.num_heads = 16; config.num_heads = 16;
} }
if (detected_vision_layers > 0) { if (detected_vision_layers > 0) {

View File

@ -816,12 +816,13 @@ struct AutoEncoderKL : public VAE {
} }
sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents) override { sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents) override {
auto latents_ = sd_version_is_sefi_image(version) ? sd::ops::slice(latents, 2, 16, 144) : latents;
if (sd_version_uses_flux2_vae(version)) { if (sd_version_uses_flux2_vae(version)) {
int channel_dim = 2; int channel_dim = 2;
auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim); auto [mean_tensor, std_tensor] = get_latents_mean_std(latents_, channel_dim);
return (latents * std_tensor) / scale_factor + mean_tensor; return (latents_ * std_tensor) / scale_factor + mean_tensor;
} }
return (latents / scale_factor) + shift_factor; return (latents_ / scale_factor) + shift_factor;
} }
sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents) override { sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents) override {

View File

@ -66,7 +66,6 @@ const char* unused_tensors[] = {
// "v_pred", // Used to detect SDXL vpred models // "v_pred", // Used to detect SDXL vpred models
"text_encoders.llm.output.weight", "text_encoders.llm.output.weight",
"text_encoders.llm.lm_head.", "text_encoders.llm.lm_head.",
"first_stage_model.bn.",
}; };
bool is_unused_tensor(const std::string& name) { bool is_unused_tensor(const std::string& name) {
@ -480,6 +479,9 @@ SDVersion ModelLoader::get_sd_version() {
if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) { if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) {
is_flux2 = true; is_flux2 = true;
} }
if (tensor_storage.name.find("dual_time_embed.semantic_embedder.linear_1.weight") != std::string::npos) {
return VERSION_SEFI_IMAGE;
}
if (tensor_storage.name.find("single_blocks.47.linear1.weight") != std::string::npos) { if (tensor_storage.name.find("single_blocks.47.linear1.weight") != std::string::npos) {
has_single_block_47 = true; has_single_block_47 = true;
} }

View File

@ -743,7 +743,7 @@ std::string convert_diffusion_model_name(std::string name, std::string prefix, S
name = convert_diffusers_unet_to_original_sdxl(name); name = convert_diffusers_unet_to_original_sdxl(name);
} else if (sd_version_is_sd3(version)) { } else if (sd_version_is_sd3(version)) {
name = convert_diffusers_dit_to_original_sd3(name); name = convert_diffusers_dit_to_original_sd3(name);
} else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version)) { } else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version) || sd_version_is_sefi_image(version)) {
name = convert_diffusers_dit_to_original_flux(name); name = convert_diffusers_dit_to_original_flux(name);
} else if (sd_version_is_z_image(version)) { } else if (sd_version_is_z_image(version)) {
name = convert_diffusers_dit_to_original_lumina2(name); name = convert_diffusers_dit_to_original_lumina2(name);

View File

@ -1005,6 +1005,8 @@ struct FluxFlowDenoiser : public DiscreteFlowDenoiser {
} }
}; };
struct SefiFlowDenoiser;
struct Flux2FlowDenoiser : public FluxFlowDenoiser { struct Flux2FlowDenoiser : public FluxFlowDenoiser {
Flux2FlowDenoiser() = default; Flux2FlowDenoiser() = default;
@ -1037,6 +1039,80 @@ struct Flux2FlowDenoiser : public FluxFlowDenoiser {
} }
}; };
// Flow denoiser for SeFi-Image that keeps two parallel schedules: one for the
// "semantic" part of the latent and one for the "texture" part. The semantic
// schedule runs ahead of the texture schedule by `delta_t` (in unit time), and
// both are discretised onto a kNumTrainTimesteps-step training grid.
//
// get_sigmas() rebuilds all four schedule vectors and returns the texture
// sigmas; the semantic values are exposed through the public members so the
// sampler and timestep code can read them by step index.
struct SefiFlowDenoiser : public Flux2FlowDenoiser {
    // Size of the discrete training timestep grid the schedules are snapped to.
    static constexpr int kNumTrainTimesteps = 1000;
    // Channel split used by the SeFi sampler: [0, kSemChannels) follows the
    // semantic schedule, [kSemChannels, kTotalChannels) the texture schedule.
    static constexpr int kSemChannels   = 16;
    static constexpr int kTotalChannels = 144;

    // Unit-time lead of the semantic schedule over the texture schedule.
    // Overridable per call with the `sefi_delta_t` extra sample arg.
    float delta_t = 0.1f;
    // Alpha of the timestep shift (1.0 = no shift). Overridable per call with
    // the `sefi_alpha` extra sample arg.
    float timestep_shift_alpha = 1.0f;

    // Schedules produced by the last get_sigmas() call; all four always have
    // the same length (n + 1 entries).
    std::vector<float> sem_sigmas;
    std::vector<float> tex_sigmas;
    std::vector<float> sem_timesteps;
    std::vector<float> tex_timesteps;

    SefiFlowDenoiser() = default;

    // Applies the shift u -> alpha * u / (1 + (alpha - 1) * u).
    // alpha == 1 is the identity and is short-circuited.
    static float apply_alpha_shift(float u_unit, float alpha) {
        if (alpha == 1.0f) {
            return u_unit;
        }
        float denom = 1.0f + (alpha - 1.0f) * u_unit;
        return (alpha * u_unit) / denom;
    }

    // True for ordinary finite values; NaN and +/-inf both fail because
    // (v - v) is NaN for them. Avoids pulling in <cmath> for this one check.
    static bool is_finite_value(float v) {
        return v - v == 0.0f;
    }

    // Snaps a unit-time position to the training grid and returns the matching
    // timestep in [1, kNumTrainTimesteps] (u = 0 -> kNumTrainTimesteps).
    static float unit_to_train_timestep(float u) {
        int idx = static_cast<int>(u * (kNumTrainTimesteps - 1));
        idx     = std::min(kNumTrainTimesteps - 1, std::max(0, idx));
        return static_cast<float>(kNumTrainTimesteps - idx);
    }

    // Builds the dual schedule for `n` sampling steps (n + 1 entries each).
    //
    // extra_sample_args may carry `sefi_alpha=<float>` and
    // `sefi_delta_t=<float>`; accepted values are stored in the members and
    // therefore persist for later calls. Values that fail to parse, are not
    // finite, or are out of range (alpha <= 0, delta_t < 0) are ignored with a
    // warning and leave the current setting untouched.
    //
    // image_seq_len, scheduler_type and version are part of the overridden
    // interface and are not used by this schedule.
    //
    // Returns a copy of the texture sigmas.
    std::vector<float> get_sigmas(uint32_t n,
                                  int image_seq_len,
                                  scheduler_t scheduler_type,
                                  SDVersion version,
                                  const char* extra_sample_args = nullptr) override {
        sem_sigmas.clear();
        tex_sigmas.clear();
        sem_timesteps.clear();
        tex_timesteps.clear();

        for (const auto& [key, value] : parse_key_value_args(extra_sample_args, "sefi scheduler arg")) {
            if (key == "sefi_alpha") {
                // Parse into a temporary so a rejected value cannot clobber the
                // current setting. alpha <= 0 would drive the shift denominator
                // to zero somewhere in [0, 1].
                float parsed = 0.0f;
                if (parse_strict_float(value, parsed) && is_finite_value(parsed) && parsed > 0.0f) {
                    timestep_shift_alpha = parsed;
                } else {
                    LOG_WARN("ignoring invalid sefi scheduler arg '%s=%s'", key.c_str(), value.c_str());
                }
            } else if (key == "sefi_delta_t") {
                // A negative lead would put the texture schedule ahead of the
                // semantic one and stop the semantic schedule short of the end.
                float parsed = 0.0f;
                if (parse_strict_float(value, parsed) && is_finite_value(parsed) && parsed >= 0.0f) {
                    delta_t = parsed;
                } else {
                    LOG_WARN("ignoring invalid sefi scheduler arg '%s=%s'", key.c_str(), value.c_str());
                }
            }
        }

        const size_t entries = static_cast<size_t>(n) + 1;
        sem_sigmas.reserve(entries);
        tex_sigmas.reserve(entries);
        sem_timesteps.reserve(entries);
        tex_timesteps.reserve(entries);

        for (uint32_t i = 0; i <= n; ++i) {
            // n == 0 would otherwise evaluate 0/0 and feed NaN into the
            // float -> int conversion below, which is undefined behaviour.
            float u_base    = n > 0 ? static_cast<float>(i) / static_cast<float>(n) : 0.0f;
            float u_shifted = apply_alpha_shift(u_base, timestep_shift_alpha);

            // The semantic position is stretched by (1 + delta_t) and clamped
            // at 1; the texture position trails it by delta_t, clamped to [0, 1].
            float u_sem_raw = u_shifted * (1.0f + delta_t);
            float u_sem     = std::min(u_sem_raw, 1.0f);
            float u_tex     = std::max(0.0f, std::min(u_sem_raw - delta_t, 1.0f));

            float t_sem = unit_to_train_timestep(u_sem);
            float t_tex = unit_to_train_timestep(u_tex);

            sem_timesteps.push_back(t_sem);
            tex_timesteps.push_back(t_tex);
            sem_sigmas.push_back(t_sem / static_cast<float>(kNumTrainTimesteps));
            tex_sigmas.push_back(t_tex / static_cast<float>(kNumTrainTimesteps));
        }

        LOG_DEBUG("SefiFlowDenoiser: built %u-step dual schedule (alpha=%.2f delta_t=%.2f)",
                  n, timestep_shift_alpha, delta_t);
        return tex_sigmas;
    }
};
typedef std::function<sd::guidance::GuiderOutput(const sd::Tensor<float>&, float, int)> denoise_cb_t; typedef std::function<sd::guidance::GuiderOutput(const sd::Tensor<float>&, float, int)> denoise_cb_t;
static std::pair<float, float> get_ancestral_step(float sigma_from, static std::pair<float, float> get_ancestral_step(float sigma_from,
@ -1140,6 +1216,40 @@ static sd::Tensor<float> sample_euler_ancestral(denoise_cb_t model,
return x; return x;
} }
// Euler sampler for SeFi-Image's dual schedule.
//
// Walks the schedules stored on `sefi` (not a caller-supplied sigma list). On
// each step the model is evaluated once at the current texture sigma, the
// velocity is recovered as (x - denoised) / sigma_tex, and the two channel
// groups along axis 2 are advanced with their own sigma deltas:
//   [0, kSemChannels)              -> semantic schedule
//   [kSemChannels, kTotalChannels) -> texture schedule
//
// Returns the final latent, or an empty tensor as soon as the model callback
// yields an empty prediction.
static sd::Tensor<float> sample_sefi_euler(SefiFlowDenoiser* sefi,
                                           denoise_cb_t model,
                                           sd::Tensor<float> x) {
    constexpr int channel_axis = 2;
    constexpr int sem_end      = SefiFlowDenoiser::kSemChannels;
    constexpr int tex_end      = SefiFlowDenoiser::kTotalChannels;

    const std::vector<float>& tex_schedule = sefi->tex_sigmas;
    const std::vector<float>& sem_schedule = sefi->sem_sigmas;
    const int step_count                   = static_cast<int>(tex_schedule.size()) - 1;

    for (int step = 0; step < step_count; ++step) {
        const float tex_now  = tex_schedule[step];
        const float tex_then = tex_schedule[step + 1];
        const float sem_now  = sem_schedule[step];
        const float sem_then = sem_schedule[step + 1];

        // Nothing left to divide by: skip the step without calling the model.
        if (tex_now <= 1e-9f) {
            continue;
        }

        // The callback receives a 1-based step number.
        auto result = model(x, tex_now, step + 1);
        if (result.pred.empty()) {
            return {};
        }
        sd::Tensor<float> denoised = std::move(result.pred);
        sd::Tensor<float> velocity = (x - denoised) / tex_now;

        // Read both halves of x before writing either one back.
        auto sem_state    = sd::ops::slice(x, channel_axis, 0, sem_end);
        auto tex_state    = sd::ops::slice(x, channel_axis, sem_end, tex_end);
        auto sem_velocity = sd::ops::slice(velocity, channel_axis, 0, sem_end);
        auto tex_velocity = sd::ops::slice(velocity, channel_axis, sem_end, tex_end);

        auto sem_updated = sem_state + sem_velocity * (sem_then - sem_now);
        auto tex_updated = tex_state + tex_velocity * (tex_then - tex_now);

        sd::ops::slice_assign(&x, channel_axis, 0, sem_end, sem_updated);
        sd::ops::slice_assign(&x, channel_axis, sem_end, tex_end, tex_updated);
    }
    return x;
}
static sd::Tensor<float> sample_euler(denoise_cb_t model, static sd::Tensor<float> sample_euler(denoise_cb_t model,
sd::Tensor<float> x, sd::Tensor<float> x,
const std::vector<float>& sigmas) { const std::vector<float>& sigmas) {
@ -2055,7 +2165,13 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
std::shared_ptr<RNG> rng, std::shared_ptr<RNG> rng,
float eta, float eta,
bool is_flow_denoiser, bool is_flow_denoiser,
const char* extra_sample_args) { const char* extra_sample_args,
std::shared_ptr<Denoiser> denoiser_for_dispatch = nullptr) {
if (denoiser_for_dispatch) {
if (auto sefi = std::dynamic_pointer_cast<SefiFlowDenoiser>(denoiser_for_dispatch)) {
return sample_sefi_euler(sefi.get(), model, std::move(x));
}
}
SamplerExtraArgs extra_args = parse_key_value_args(extra_sample_args, "extra sample arg"); SamplerExtraArgs extra_args = parse_key_value_args(extra_sample_args, "extra sample arg");
switch (method) { switch (method) {
case EULER_A_SAMPLE_METHOD: case EULER_A_SAMPLE_METHOD:

View File

@ -96,6 +96,7 @@ const char* model_version_to_str[] = {
"Longcat-Image", "Longcat-Image",
"PiD", "PiD",
"Ideogram 4", "Ideogram 4",
"SeFi-Image",
"Krea2", "Krea2",
"ESRGAN", "ESRGAN",
}; };
@ -691,7 +692,7 @@ public:
version, version,
sd_ctx_params->chroma_use_dit_mask, sd_ctx_params->chroma_use_dit_mask,
model_manager); model_manager);
} else if (sd_version_is_flux2(version)) { } else if (sd_version_is_flux2(version) || sd_version_is_sefi_image(version)) {
bool is_chroma = false; bool is_chroma = false;
cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE), cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
tensor_storage_map, tensor_storage_map,
@ -1295,6 +1296,8 @@ public:
} else if (sd_version_is_krea2(version)) { } else if (sd_version_is_krea2(version)) {
default_flow_shift = 1.15f; default_flow_shift = 1.15f;
} }
} else if (sd_version_is_sefi_image(version)) {
pred_type = SEFI_FLOW_PRED;
} else if (sd_version_is_flux2(version)) { } else if (sd_version_is_flux2(version)) {
pred_type = FLUX2_FLOW_PRED; pred_type = FLUX2_FLOW_PRED;
} else { } else {
@ -1334,6 +1337,11 @@ public:
denoiser = std::make_shared<Flux2FlowDenoiser>(); denoiser = std::make_shared<Flux2FlowDenoiser>();
break; break;
} }
case SEFI_FLOW_PRED: {
LOG_INFO("running in SeFi-Image dual-time FLOW mode");
denoiser = std::make_shared<SefiFlowDenoiser>();
break;
}
default: { default: {
LOG_ERROR("Unknown predition type %i", pred_type); LOG_ERROR("Unknown predition type %i", pred_type);
return false; return false;
@ -1639,7 +1647,16 @@ public:
std::vector<float> process_timesteps(const std::vector<float>& timesteps, std::vector<float> process_timesteps(const std::vector<float>& timesteps,
const sd::Tensor<float>& init_latent, const sd::Tensor<float>& init_latent,
const sd::Tensor<float>& denoise_mask) { const sd::Tensor<float>& denoise_mask,
int step) {
if (auto sefi_denoiser = std::dynamic_pointer_cast<SefiFlowDenoiser>(denoiser)) {
int sched_idx = step > 0 ? step - 1 : 0;
if (sched_idx >= static_cast<int>(sefi_denoiser->tex_timesteps.size())) {
sched_idx = static_cast<int>(sefi_denoiser->tex_timesteps.size()) - 1;
}
return {sefi_denoiser->sem_timesteps[sched_idx],
sefi_denoiser->tex_timesteps[sched_idx]};
}
if (diffusion_model->get_desc() == "Wan2.2-TI2V-5B") { if (diffusion_model->get_desc() == "Wan2.2-TI2V-5B") {
int64_t frame_count = init_latent.shape()[2]; int64_t frame_count = init_latent.shape()[2];
auto new_timesteps = std::vector<float>(static_cast<size_t>(frame_count), timesteps[0]); auto new_timesteps = std::vector<float>(static_cast<size_t>(frame_count), timesteps[0]);
@ -2051,7 +2068,7 @@ public:
timesteps_vec = process_ltxav_video_timesteps(base_timesteps_vec, init_latent, denoise_mask); timesteps_vec = process_ltxav_video_timesteps(base_timesteps_vec, init_latent, denoise_mask);
audio_timesteps_tensor = sd::Tensor<float>({static_cast<int64_t>(base_timesteps_vec.size())}, base_timesteps_vec); audio_timesteps_tensor = sd::Tensor<float>({static_cast<int64_t>(base_timesteps_vec.size())}, base_timesteps_vec);
} else { } else {
timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask); timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask, step);
} }
const std::vector<float>& scaling_timesteps_vec = (sd_version_is_ltxav(version) && !denoise_mask.empty()) const std::vector<float>& scaling_timesteps_vec = (sd_version_is_ltxav(version) && !denoise_mask.empty())
? base_timesteps_vec ? base_timesteps_vec
@ -2121,7 +2138,7 @@ public:
diffusion_params.extra = UNetDiffusionExtra{-1, &controls, control_strength}; diffusion_params.extra = UNetDiffusionExtra{-1, &controls, control_strength};
} else if (sd_version_is_sd3(version)) { } else if (sd_version_is_sd3(version)) {
diffusion_params.extra = SkipLayerDiffusionExtra{local_skip_layers}; diffusion_params.extra = SkipLayerDiffusionExtra{local_skip_layers};
} else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version)) { } else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version) || sd_version_is_sefi_image(version)) {
diffusion_params.extra = FluxDiffusionExtra{&guidance_tensor, diffusion_params.extra = FluxDiffusionExtra{&guidance_tensor,
local_skip_layers}; local_skip_layers};
} else if (sd_version_is_anima(version)) { } else if (sd_version_is_anima(version)) {
@ -2265,7 +2282,7 @@ public:
return output; return output;
}; };
auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta, is_flow_denoiser, extra_sample_args); auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta, is_flow_denoiser, extra_sample_args, denoiser);
if (x0_opt.empty()) { if (x0_opt.empty()) {
LOG_ERROR("Diffusion model sampling failed"); LOG_ERROR("Diffusion model sampling failed");
if (control_net) { if (control_net) {
@ -2326,6 +2343,8 @@ public:
latent_channel = 3; latent_channel = 3;
} else if (sd_version_is_pid(version)) { } else if (sd_version_is_pid(version)) {
latent_channel = 3; latent_channel = 3;
} else if (sd_version_is_sefi_image(version)) {
latent_channel = 144;
} else if (sd_version_uses_flux2_vae(version)) { } else if (sd_version_uses_flux2_vae(version)) {
latent_channel = 128; latent_channel = 128;
} else { } else {