feat: add SeFi-Image support (#1707)

2026-06-29 09:36:40 +00:00 · 2026-06-28 16:49:24 +02:00 · 2026-06-28 16:49:24 +02:00 · 03e9a22f4d
commit 03e9a22f4d
parent f54e45e81c
16 changed files with 736 additions and 17 deletions
--- a/README.md
+++ b/README.md
@ -53,6 +53,7 @@ API and command-line option may change frequently.***
    - [ERNIE-Image](./docs/ernie_image.md)
    - [Boogu Image](./docs/boogu_image.md)
    - [Krea2](./docs/krea2.md)
    - [SeFi-Image](./docs/sefi_image.md)
    - [HiDream-O1-Image](./docs/hidream_o1_image.md)
    - [Ideogram4](./docs/ideogram4.md)
  - Image Edit Models
--- a/assets/sefi_image/example.png
+++ b/assets/sefi_image/example.png
--- a/docs/sefi_image.md
+++ b/docs/sefi_image.md
@ -0,0 +1,50 @@
 # How to Use
 SeFi-Image uses a Flux2-style dual-time transformer (semantic + texture streams), the standard Flux2 VAE, and Qwen3-VL as the LLM text encoder. Tech report: [arXiv:2606.22568](https://arxiv.org/abs/2606.22568).
 ## Download weights
 The SeFi-Image family ships in three scales (1B / 2B / 5B) and three families (Base / RL / turbo), all gated on Hugging Face under https://huggingface.co/SeFi-Image.
 - 1B and 2B variants pair with Qwen3-VL-2B-Instruct.
 - 5B variants pair with Qwen3-VL-4B-Instruct.
 - All variants use the standard Flux2 VAE (`flux2_ae.safetensors` from https://huggingface.co/black-forest-labs/FLUX.2-dev).
 Convert the transformer and text encoder to sd.cpp safetensors:
 ```bash
 python3 script/convert_sefi.py     <hf_repo_dir>                          <out_dir>/sefi_<scale>_<family>.safetensors
 python3 script/convert_qwen3_vl.py  <hf_repo_dir>/Qwen3-VL-<X>B-Instruct  <out_dir>/qwen3_vl_<X>b.safetensors
 ```
 ## Variant defaults
 | Family | timestep_shift_alpha | steps | cfg-scale |
 |---|---|---|---|
 | Base | 0.3 | 50 | 4.0 |
 | RL | 0.3 | 50 | 4.0 |
 | turbo | 1.0 | 4 | 1.0 |
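 The dispatcher picks `alpha` (the `timestep_shift_alpha` column above) from the model filename: a name containing `turbo` gets 1.0, anything else gets 0.3. To override it, pass `--extra-sample-args sefi_alpha=<value>`; `delta_t` can be overridden the same way with `sefi_delta_t=<value>`.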
 ## Examples
 ### 1B / 2B turbo
 ```
 ./build/bin/sd-cli --diffusion-model /path/to/sefi_1b_turbo.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_2b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 1.0 --steps 4 -W 1024 -H 1024 -s 42 --diffusion-fa --offload-to-cpu -o out.png
 ```
 ### 1B / 2B base
 ```
 ./build/bin/sd-cli --diffusion-model /path/to/sefi_1b_base.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_2b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 4.0 --steps 50 -W 1024 -H 1024 -s 42 --diffusion-fa --offload-to-cpu -o out.png
 ```
 ### 5B (needs streaming on 12 GiB VRAM)
 ```
 ./build/bin/sd-cli --diffusion-model /path/to/sefi_5b_turbo.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_4b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 1.0 --steps 4 -W 1024 -H 1024 -s 42 --diffusion-fa --max-vram 8 --stream-layers --offload-to-cpu -o out.png
 ```
 <img alt="SeFi-Image 5B turbo example" src="../assets/sefi_image/example.png" />
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@ -81,6 +81,7 @@ enum prediction_t {
    FLOW_PRED,
    FLUX_FLOW_PRED,
    FLUX2_FLOW_PRED,
    SEFI_FLOW_PRED,
    PREDICTION_COUNT
 };
--- a/script/convert_qwen3_vl.py
+++ b/script/convert_qwen3_vl.py
@ -0,0 +1,112 @@
 #!/usr/bin/env python3
 """Convert a Qwen3-VL HF safetensors checkpoint into a sd.cpp-loadable form.
 The HF dump prefixes text-tower keys with ``model.language_model.`` and
 vision-tower keys with ``model.visual.``. sd.cpp expects ``model.<rest>`` for
 the text side; the vision side is converted by sd.cpp's own
 ``convert_qwen3_vl_vision_name`` and is left as-is here.
 Operates on raw safetensors bytes so any dtype (BF16/F16/F32) is preserved.
 Usage:
    python3 script/convert_qwen3_vl.py <hf_qwen3_vl_dir_or_safetensors> <output.safetensors>
 """
 import argparse
 import json
 import os
 import struct
 import sys
 def rewrite_key(key: str) -> str:
    """Return the output tensor name for one HF Qwen3-VL checkpoint key.
    Keys starting with ``model.language_model.`` have that prefix replaced by
    ``model.``; every other key (``model.visual.*`` included) passes through.
    """
    text_prefix = "model.language_model."
    if not key.startswith(text_prefix):
        return key
    suffix = key[len(text_prefix):]
    return f"model.{suffix}"
 def read_safetensors_header(path: str):
    """Read the JSON header of a safetensors file.
    The file begins with a little-endian uint64 holding the header length,
    followed by that many bytes of JSON.
    Args:
        path: Path to the safetensors file.
    Returns:
        ``(header, data_start)``: the parsed header dict and the byte offset
        (``8 + header_length``) at which the tensor data section begins.
    Raises:
        ValueError: If the file is too short for the length prefix or for the
            header it announces, or if the header is not valid JSON.
    """
    file_size = os.path.getsize(path)
    with open(path, "rb") as f:
        len_bytes = f.read(8)
        if len(len_bytes) != 8:
            raise ValueError(f"{path}: too short to be a safetensors file")
        (hdr_len,) = struct.unpack("<Q", len_bytes)
        # Check the announced length before reading so a corrupt prefix cannot
        # trigger an enormous read.
        if hdr_len > file_size - 8:
            raise ValueError(f"{path}: header length {hdr_len} exceeds file size {file_size}")
        hdr_bytes = f.read(hdr_len)
    return json.loads(hdr_bytes), 8 + hdr_len
 def collect_shard_paths(path: str):
    """Resolve ``path`` to the list of safetensors files to read.
    A regular file is returned as a one-element list. For a directory, the
    shard names listed in ``model.safetensors.index.json`` are used when that
    index exists (deduplicated and sorted); otherwise ``model.safetensors`` is
    used. Raises ``FileNotFoundError`` when nothing usable is found.
    """
    if not os.path.isdir(path):
        if os.path.isfile(path):
            return [path]
        raise FileNotFoundError(path)
    index_file = os.path.join(path, "model.safetensors.index.json")
    if os.path.isfile(index_file):
        with open(index_file) as fh:
            weight_map = json.load(fh)["weight_map"]
        return sorted({os.path.join(path, shard) for shard in weight_map.values()})
    lone_file = os.path.join(path, "model.safetensors")
    if not os.path.isfile(lone_file):
        raise FileNotFoundError(f"No Qwen3-VL safetensors in {path}")
    return [lone_file]
 def stage_tensors(input_path: str):
    """List every tensor found under ``input_path`` with its renamed key.
    Returns a list of ``(new_key, shard_path, data_off, info)`` tuples, one per
    header record of every shard; the ``__metadata__`` record is skipped.
    """
    staged = []
    for shard in collect_shard_paths(input_path):
        header, payload_start = read_safetensors_header(shard)
        staged.extend(
            (rewrite_key(name), shard, payload_start, meta)
            for name, meta in header.items()
            if name != "__metadata__"
        )
    return staged
 def write_consolidated(out_path: str, entries):
    """Write ``entries`` as one safetensors file, streaming bytes from the shards.
    Tensors are written in sorted key order and their ``data_offsets`` are
    recomputed for the new file. The JSON header is padded with spaces to a
    multiple of 8 bytes.
    Args:
        out_path: Destination file; created or overwritten.
        entries: Iterable of ``(new_key, shard_path, data_off, info)`` tuples.
            ``data_off`` is the byte offset of the shard's data section and
            ``info`` is the tensor's header record (``dtype``, ``shape``,
            ``data_offsets``).
    Raises:
        ValueError: If two entries share an output key, or if ``out_path`` is
            one of the source shards.
        IOError: If a shard ends before a tensor's declared byte range.
    """
    entries = sorted(entries, key=lambda e: e[0])
    # Opening the output truncates it, so refuse to write over a file that
    # still has to be read.
    if os.path.exists(out_path):
        for src_path in {e[1] for e in entries}:
            if os.path.samefile(out_path, src_path):
                raise ValueError(f"Output {out_path} would overwrite input shard {src_path}")
    new_header = {}
    cur_offset = 0
    for new_key, shard_path, data_off, info in entries:
        # A repeated key would leave bytes in the file that no header record
        # points at.
        if new_key in new_header:
            raise ValueError(f"Duplicate output tensor name: {new_key}")
        start, end = info["data_offsets"]
        size = end - start
        new_header[new_key] = {
            "dtype": info["dtype"],
            "shape": info["shape"],
            "data_offsets": [cur_offset, cur_offset + size],
        }
        cur_offset += size
    header_json = json.dumps(new_header, separators=(",", ":")).encode("utf-8")
    pad = (-len(header_json)) % 8
    header_json = header_json + (b" " * pad)
    with open(out_path, "wb") as out:
        out.write(struct.pack("<Q", len(header_json)))
        out.write(header_json)
        for new_key, shard_path, data_off, info in entries:
            start, end = info["data_offsets"]
            with open(shard_path, "rb") as src:
                src.seek(data_off + start)
                remaining = end - start
                # Copy in 8 MiB chunks so large tensors are never held in memory whole.
                while remaining > 0:
                    chunk = src.read(min(8 * 1024 * 1024, remaining))
                    if not chunk:
                        raise IOError(f"Truncated tensor in {shard_path}")
                    out.write(chunk)
                    remaining -= len(chunk)
 def main():
    """Command-line entry point: parse the two paths, stage tensors, write the output."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("input", help="HF Qwen3-VL directory or single safetensors file")
    cli.add_argument("output", help="Output single safetensors path")
    opts = cli.parse_args()
    staged = stage_tensors(opts.input)
    print(f"Tensors: {len(staged)}")
    print(f"Writing -> {opts.output}")
    # An output path without a directory component is written to the current directory.
    out_dir = os.path.dirname(opts.output) or "."
    os.makedirs(out_dir, exist_ok=True)
    write_consolidated(opts.output, staged)
    size_gb = os.path.getsize(opts.output) / 1e9
    print(f"Done. Output size: {size_gb:.2f} GB")
 if __name__ == "__main__":
    main()
--- a/script/convert_sefi.py
+++ b/script/convert_sefi.py
@ -0,0 +1,279 @@
 #!/usr/bin/env python3
 """Convert a SeFi-Image diffusers checkpoint into a single sd.cpp-compatible safetensors.
 Operates on raw safetensors bytes so any dtype (BF16, F32, ...) is preserved exactly.
 No numpy or torch dependency required.
 Usage:
    python3 script/convert_sefi.py <sefi_diffusers_dir> <output.safetensors>
 """
 import argparse
 import json
 import os
 import re
 import struct
 import sys
 # Matches a ".linear." path segment; rewrite_transformer_key replaces its first
 # occurrence with ".lin." for keys under the prefixes listed below.
 _LINEAR_TO_LIN = re.compile(r"\.linear\.")
 # Top-level module names whose keys get the ".linear." -> ".lin." rename.
 _SHARED_MOD_PREFIXES = (
    "double_stream_modulation_img",
    "double_stream_modulation_txt",
    "single_stream_modulation",
 )
 def rewrite_transformer_key(key: str) -> str:
    """Translate one SeFi diffusers transformer tensor name to its output name.
    ``dual_time_embed.*`` keys are returned untouched. Otherwise a leading
    ``backbone.`` is stripped, the first ``.linear.`` of a shared-modulation
    key becomes ``.lin.``, fixed top-level names are renamed, and per-block
    keys are delegated to the double/single stream rewriters. Keys that match
    no rule are returned as they stand after prefix stripping.
    """
    if key.startswith("dual_time_embed."):
        return key
    if key.startswith("backbone."):
        key = key[len("backbone."):]
    shared_heads = tuple(prefix + "." for prefix in _SHARED_MOD_PREFIXES)
    if key.startswith(shared_heads):
        key = _LINEAR_TO_LIN.sub(".lin.", key, count=1)
    top_level_renames = {
        "context_embedder.weight": "txt_in.weight",
        "context_embedder.bias": "txt_in.bias",
        "x_embedder.weight": "img_in.weight",
        "x_embedder.bias": "img_in.bias",
        "proj_out.weight": "final_layer.linear.weight",
        "proj_out.bias": "final_layer.linear.bias",
        "norm_out.linear.weight": "final_layer.adaLN_modulation.1.weight",
        "norm_out.linear.bias": "final_layer.adaLN_modulation.1.bias",
    }
    renamed = top_level_renames.get(key)
    if renamed is not None:
        return renamed
    block_rewriters = (
        (r"transformer_blocks\.(\d+)\.(.*)$", _rewrite_double_stream),
        (r"single_transformer_blocks\.(\d+)\.(.*)$", _rewrite_single_stream),
    )
    for pattern, rewrite_block in block_rewriters:
        hit = re.match(pattern, key)
        if hit:
            return rewrite_block(hit.group(1), hit.group(2))
    return key
 def _rewrite_double_stream(idx: str, tail: str) -> str:
    """Build the ``double_blocks.<idx>.`` name for one double-stream block tensor.
    ``tail`` is the part of the key after ``transformer_blocks.<idx>.``. Known
    tails are renamed via the table below; unknown tails are kept verbatim.
    """
    tail_renames = dict((
        ("norm1.linear.weight", "img_mod.lin.weight"),
        ("norm1_context.linear.weight", "txt_mod.lin.weight"),
        ("attn.norm_q.weight", "img_attn.norm.query_norm.scale"),
        ("attn.norm_k.weight", "img_attn.norm.key_norm.scale"),
        ("attn.norm_added_q.weight", "txt_attn.norm.query_norm.scale"),
        ("attn.norm_added_k.weight", "txt_attn.norm.key_norm.scale"),
        ("attn.to_out.0.weight", "img_attn.proj.weight"),
        ("attn.to_add_out.weight", "txt_attn.proj.weight"),
        # Two spellings of the feed-forward layers map to the same targets.
        ("ff.net.0.proj.weight", "img_mlp.0.weight"),
        ("ff.net.2.weight", "img_mlp.2.weight"),
        ("ff_context.net.0.proj.weight", "txt_mlp.0.weight"),
        ("ff_context.net.2.weight", "txt_mlp.2.weight"),
        ("ff.linear_in.weight", "img_mlp.0.weight"),
        ("ff.linear_out.weight", "img_mlp.2.weight"),
        ("ff_context.linear_in.weight", "txt_mlp.0.weight"),
        ("ff_context.linear_out.weight", "txt_mlp.2.weight"),
    ))
    new_tail = tail_renames[tail] if tail in tail_renames else tail
    return f"double_blocks.{idx}.{new_tail}"
 # QKV triplets to fuse on output: source tails -> target fused tail.
 # Each tuple is (q_tail, k_tail, v_tail, fused_target_tail). The source tails
 # are the part of a key after "backbone.transformer_blocks.<idx>."; the fused
 # tail is appended to "double_blocks.<idx>." by stage_tensors_for_section.
 QKV_DOUBLE_TRIPLETS = [
    ("attn.to_q.weight",       "attn.to_k.weight",       "attn.to_v.weight",       "img_attn.qkv.weight"),
    ("attn.add_q_proj.weight", "attn.add_k_proj.weight", "attn.add_v_proj.weight", "txt_attn.qkv.weight"),
 ]
 def _rewrite_single_stream(idx: str, tail: str) -> str:
    """Build the ``single_blocks.<idx>.`` name for one single-stream block tensor.
    ``tail`` is the part of the key after ``single_transformer_blocks.<idx>.``.
    Known tails are renamed; unknown tails are kept verbatim.
    """
    tail_renames = dict((
        ("norm.linear.weight", "modulation.lin.weight"),
        ("attn.norm_q.weight", "norm.query_norm.scale"),
        ("attn.norm_k.weight", "norm.key_norm.scale"),
        ("attn.to_qkv_mlp_proj.weight", "linear1.weight"),
        ("attn.to_out.weight", "linear2.weight"),
    ))
    new_tail = tail_renames[tail] if tail in tail_renames else tail
    return f"single_blocks.{idx}.{new_tail}"
 def read_safetensors_header(path: str):
    """Return (header dict, data start byte offset) for a safetensors file.
    The file begins with a little-endian uint64 holding the header length,
    followed by that many bytes of JSON; tensor data starts right after.
    Args:
        path: Path to the safetensors file.
    Returns:
        ``(header, data_start)`` where ``data_start`` is ``8 + header_length``.
    Raises:
        ValueError: If the file is too short for the length prefix or for the
            header it announces, or if the header is not valid JSON.
    """
    file_size = os.path.getsize(path)
    with open(path, "rb") as f:
        len_bytes = f.read(8)
        if len(len_bytes) != 8:
            raise ValueError(f"{path}: too short to be a safetensors file")
        (hdr_len,) = struct.unpack("<Q", len_bytes)
        # Check the announced length before reading so a corrupt prefix cannot
        # trigger an enormous read.
        if hdr_len > file_size - 8:
            raise ValueError(f"{path}: header length {hdr_len} exceeds file size {file_size}")
        hdr_bytes = f.read(hdr_len)
    return json.loads(hdr_bytes), 8 + hdr_len
 def collect_shard_paths(directory: str, weight_pattern: str):
    """Return the safetensors files that make up one checkpoint section.
    If ``<weight_pattern>.safetensors.index.json`` exists in ``directory``, the
    shard names in its ``weight_map`` are returned (deduplicated, sorted, and
    joined onto ``directory``). Otherwise ``<weight_pattern>.safetensors`` is
    returned as a one-element list, or ``FileNotFoundError`` is raised if that
    file is missing too.
    """
    index_file = os.path.join(directory, f"{weight_pattern}.safetensors.index.json")
    if not os.path.isfile(index_file):
        lone_file = os.path.join(directory, f"{weight_pattern}.safetensors")
        if os.path.isfile(lone_file):
            return [lone_file]
        raise FileNotFoundError(f"No checkpoint at {directory}: missing {weight_pattern}")
    with open(index_file) as fh:
        weight_map = json.load(fh)["weight_map"]
    return sorted({os.path.join(directory, shard) for shard in weight_map.values()})
 def stage_tensors_for_section(section_dir: str, rewrite_fn):
    """Return a list of (new_key, shard_path, data_start_offset, info_dict) entries.
    Every tensor in the section's ``diffusion_pytorch_model`` shards yields one
    entry, renamed through ``rewrite_fn``, except that complete q/k/v triplets
    under ``backbone.transformer_blocks.<idx>.`` (see ``QKV_DOUBLE_TRIPLETS``)
    are replaced by a single fused pseudo-entry. A fused entry has ``None`` for
    shard path and offset, and its info carries ``_qkv_sources`` (the three
    source tuples) so the writer can concatenate them. Entries for the final
    adaLN modulation weight/bias are tagged ``_chunk_swap_halves``.
    Args:
        section_dir: Directory containing the section's safetensors shard(s).
        rewrite_fn: Callable mapping a source key to its output key.
    Returns:
        The list of staged entries, in no particular order.
    Raises:
        ValueError: If the q, k and v tensors of a triplet differ in shape or
            in dtype.
        FileNotFoundError: If no checkpoint is found in ``section_dir``.
    """
    # Flat set of all q/k/v source tails, so the per-key test is a set lookup.
    qkv_tails = {tail for trip in QKV_DOUBLE_TRIPLETS for tail in trip[:3]}
    entries = []
    # First, index all raw keys per shard so we can detect qkv triplets.
    raw_by_block = {}  # block_idx -> {tail: (key, shard_path, data_off, info)}
    raw_others = []
    for shard_path in collect_shard_paths(section_dir, "diffusion_pytorch_model"):
        hdr, data_off = read_safetensors_header(shard_path)
        for key, info in hdr.items():
            if key == "__metadata__":
                continue
            m = re.match(r"backbone\.transformer_blocks\.(\d+)\.(.*)$", key)
            if m and m.group(2) in qkv_tails:
                raw_by_block.setdefault(m.group(1), {})[m.group(2)] = (key, shard_path, data_off, info)
            else:
                raw_others.append((key, shard_path, data_off, info))
    for key, shard_path, data_off, info in raw_others:
        new_key = rewrite_fn(key)
        # Swap the (scale, shift) halves to (shift, scale) at conversion time so
        # the on-disk weight matches BFL flux ordering and the runtime stays
        # version-agnostic. norm_out.linear weight shape is [2*dim, dim] and bias
        # is [2*dim]; both split along axis 0 (outermost == row-major outer).
        if new_key in ("final_layer.adaLN_modulation.1.weight",
                       "final_layer.adaLN_modulation.1.bias"):
            info = dict(info)  # copy so the tag does not leak into the parsed header
            info["_chunk_swap_halves"] = True
        entries.append((new_key, shard_path, data_off, info))
    for block_idx, tails in raw_by_block.items():
        fused = set()
        for q_tail, k_tail, v_tail, fused_tail in QKV_DOUBLE_TRIPLETS:
            if not (q_tail in tails and k_tail in tails and v_tail in tails):
                continue
            q, k, v = tails[q_tail], tails[k_tail], tails[v_tail]
            q_shape, k_shape, v_shape = (src[3]["shape"] for src in (q, k, v))
            if q_shape != k_shape or q_shape != v_shape:
                raise ValueError(f"qkv shape mismatch at block {block_idx} {q_tail}: q={q_shape} k={k_shape} v={v_shape}")
            # The fused tensor is a raw byte concatenation labelled with one
            # dtype, so all three sources must already share it.
            q_dtype, k_dtype, v_dtype = (src[3]["dtype"] for src in (q, k, v))
            if q_dtype != k_dtype or q_dtype != v_dtype:
                raise ValueError(f"qkv dtype mismatch at block {block_idx} {q_tail}: q={q_dtype} k={k_dtype} v={v_dtype}")
            fused_info = {
                "dtype": q_dtype,
                "shape": [q_shape[0] * 3] + list(q_shape[1:]),
                "_qkv_sources": [q, k, v],  # pseudo field consumed by writer
            }
            entries.append((f"double_blocks.{block_idx}.{fused_tail}",
                            None, None, fused_info))
            fused.update((q_tail, k_tail, v_tail))
        # Anything not fused was an incomplete triplet - pass it through.
        for tail, payload in tails.items():
            if tail not in fused:
                entries.append((rewrite_fn(payload[0]),) + payload[1:])
    return entries
 # Bytes per element for each safetensors dtype tag. Only needed to size fused
 # q/k/v tensors, whose total byte length is not recorded in any source header.
 _DTYPE_BYTES = {
    "BF16": 2, "F16": 2, "F32": 4, "F64": 8,
    "U8": 1, "I8": 1, "I16": 2, "I32": 4, "I64": 8,
    "U16": 2, "U32": 4, "U64": 8,
    "F8_E4M3": 1, "F8_E5M2": 1,
    "BOOL": 1,
 }
 def _total_bytes(info: dict) -> int:
    """Return the number of payload bytes the entry described by ``info`` occupies.
    For a plain entry this is the span of its ``data_offsets``. For a fused
    entry (one carrying ``_qkv_sources``) it is the element count of ``shape``
    times the element size of ``dtype``.
    Raises:
        ValueError: If a fused entry uses a dtype missing from ``_DTYPE_BYTES``.
    """
    if "_qkv_sources" not in info:
        start, end = info["data_offsets"]
        return end - start
    dtype = info["dtype"]
    if dtype not in _DTYPE_BYTES:
        raise ValueError(f"Unsupported dtype {dtype!r} for fused tensor")
    elems = 1
    for d in info["shape"]:
        elems *= d
    return elems * _DTYPE_BYTES[dtype]
 def write_consolidated(out_path: str, entries):
    """Write a single safetensors file by streaming raw bytes from each shard.
    Entries are written in sorted key order with recomputed ``data_offsets``.
    Three kinds of entry are handled:
    - plain entries are copied byte-for-byte;
    - qkv-fused entries (info carries ``_qkv_sources``) are written as the q, k
      and v sources back to back, i.e. concatenated along axis 0 (row-major),
      so a byte-level concatenation gives the fused layout for any dtype;
    - entries tagged ``_chunk_swap_halves`` are written second half first,
      which exchanges the two axis-0 halves.
    Args:
        out_path: Destination file; created or overwritten.
        entries: Iterable of ``(new_key, shard_path, data_off, info)`` tuples as
            produced by ``stage_tensors_for_section``.
    Raises:
        ValueError: If two entries share an output key, if ``out_path`` is one
            of the source shards, or if a half-swap tensor cannot be split into
            two equal halves along axis 0.
        IOError: If a shard ends before a tensor's declared byte range.
    """
    entries = sorted(entries, key=lambda e: e[0])
    # Opening the output truncates it, so refuse to write over a file that
    # still has to be read. Fused entries keep their sources inside info.
    if os.path.exists(out_path):
        source_paths = set()
        for _, shard_path, _, info in entries:
            if "_qkv_sources" in info:
                source_paths.update(src[1] for src in info["_qkv_sources"])
            else:
                source_paths.add(shard_path)
        for src_path in source_paths:
            if os.path.samefile(out_path, src_path):
                raise ValueError(f"Output {out_path} would overwrite input shard {src_path}")
    new_header = {}
    cur_offset = 0
    for new_key, shard_path, data_off, info in entries:
        # A repeated key would leave bytes in the file that no header record
        # points at.
        if new_key in new_header:
            raise ValueError(f"Duplicate output tensor name: {new_key}")
        size = _total_bytes(info)
        if info.get("_chunk_swap_halves"):
            # Validate before anything is written so a bad tensor does not
            # leave a partial output file behind.
            if size % 2:
                raise ValueError(f"{new_key}: odd byte size {size} cannot be split into halves")
            shape = info["shape"]
            # Swapping byte halves equals swapping axis-0 halves only when the
            # leading dimension is even.
            if not shape or shape[0] % 2:
                raise ValueError(f"{new_key}: shape {shape} cannot be split into two halves along axis 0")
        new_header[new_key] = {
            "dtype": info["dtype"],
            "shape": info["shape"],
            "data_offsets": [cur_offset, cur_offset + size],
        }
        cur_offset += size
    header_json = json.dumps(new_header, separators=(",", ":")).encode("utf-8")
    pad = (-len(header_json)) % 8
    header_json = header_json + (b" " * pad)
    def copy_range(src_path, src_data_off, src_info, out, byte_range=None):
        # Copy one tensor (or the byte_range sub-span of it, relative to the
        # tensor start) from its shard into ``out`` in 8 MiB chunks.
        start, end = src_info["data_offsets"]
        if byte_range is not None:
            sub_start, sub_end = byte_range
            start, end = start + sub_start, start + sub_end
        with open(src_path, "rb") as src:
            src.seek(src_data_off + start)
            remaining = end - start
            while remaining > 0:
                chunk = src.read(min(8 * 1024 * 1024, remaining))
                if not chunk:
                    raise IOError(f"Truncated tensor in {src_path}")
                out.write(chunk)
                remaining -= len(chunk)
    with open(out_path, "wb") as out:
        out.write(struct.pack("<Q", len(header_json)))
        out.write(header_json)
        for new_key, shard_path, data_off, info in entries:
            if "_qkv_sources" in info:
                for _, src_path, src_data_off, src_info in info["_qkv_sources"]:
                    copy_range(src_path, src_data_off, src_info, out)
            elif info.get("_chunk_swap_halves"):
                size = _total_bytes(info)
                half = size // 2
                copy_range(shard_path, data_off, info, out, byte_range=(half, size))
                copy_range(shard_path, data_off, info, out, byte_range=(0, half))
            else:
                copy_range(shard_path, data_off, info, out)
 def main():
    """Command-line entry point: convert ``<input_dir>/transformer`` into one output file."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("input_dir", help="SeFi diffusers checkpoint directory")
    cli.add_argument("output", help="Output transformer safetensors path (load via --diffusion-model)")
    opts = cli.parse_args()
    transformer_dir = os.path.join(opts.input_dir, "transformer")
    staged = stage_tensors_for_section(transformer_dir, rewrite_transformer_key)
    print(f"Transformer tensors: {len(staged)}")
    print(f"Writing {len(staged)} tensors -> {opts.output}")
    # An output path without a directory component is written to the current directory.
    out_dir = os.path.dirname(opts.output) or "."
    os.makedirs(out_dir, exist_ok=True)
    write_consolidated(opts.output, staged)
    size_gb = os.path.getsize(opts.output) / 1e9
    print(f"Done. Output size: {size_gb:.2f} GB")
 if __name__ == "__main__":
    main()
--- a/src/conditioning/conditioner.hpp
+++ b/src/conditioning/conditioner.hpp
@ -1518,7 +1518,7 @@ struct LLMEmbedder : public Conditioner {
            arch = LLM::LLMArch::GPT_OSS_20B;
        } else if (sd_version_is_pid(version)) {
            arch = LLM::LLMArch::GEMMA2_2B;
-        } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version) || sd_version_is_krea2(version)) {
+        } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version) || sd_version_is_sefi_image(version) || sd_version_is_krea2(version)) {
            arch = LLM::LLMArch::QWEN3_VL;
        } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
            arch = LLM::LLMArch::QWEN3;
@ -1997,6 +1997,18 @@ struct LLMEmbedder : public Conditioner {
            prompt_attn_range.second = static_cast<int>(prompt.size());
            prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
        } else if (sd_version_is_sefi_image(version)) {
            prompt_template_encode_start_idx = 0;
            min_length                       = 1024;
            out_layers                       = {9, 18, 27};
            prompt = "<|im_start|>user\n";
            prompt_attn_range.first = static_cast<int>(prompt.size());
            prompt += conditioner_params.text;
            prompt_attn_range.second = static_cast<int>(prompt.size());
            prompt += "<|im_end|>\n<|im_start|>assistant\n";
        } else if (version == VERSION_OVIS_IMAGE) {
            prompt_template_encode_start_idx = 28;
            min_length                       = prompt_template_encode_start_idx + 256;
--- a/src/model.h
+++ b/src/model.h
@ -49,6 +49,7 @@ enum SDVersion {
    VERSION_LONGCAT,
    VERSION_PID,
    VERSION_IDEOGRAM4,
    VERSION_SEFI_IMAGE,
    VERSION_KREA2,
    VERSION_ESRGAN,
    VERSION_COUNT,
@ -187,6 +188,13 @@ static inline bool sd_version_is_ideogram4(SDVersion version) {
    return false;
 }
 // Returns true exactly when `version` is VERSION_SEFI_IMAGE.
 static inline bool sd_version_is_sefi_image(SDVersion version) {
    return version == VERSION_SEFI_IMAGE;
 }
 static inline bool sd_version_is_krea2(SDVersion version) {
    if (version == VERSION_KREA2) {
        return true;
@ -202,7 +210,7 @@ static inline bool sd_version_uses_flux_vae(SDVersion version) {
 }
 static inline bool sd_version_uses_flux2_vae(SDVersion version) {
-    if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version)) {
+    if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version) || sd_version_is_sefi_image(version)) {
        return true;
    }
    return false;
@ -242,6 +250,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
        sd_version_is_longcat(version) ||
        sd_version_is_pid(version) ||
        sd_version_is_ideogram4(version) ||
        sd_version_is_sefi_image(version) ||
        sd_version_is_krea2(version)) {
        return true;
    }
--- a/src/model/diffusion/flux.hpp
+++ b/src/model/diffusion/flux.hpp
@ -8,6 +8,7 @@
 #include "model/common/rope.hpp"
 #include "model/diffusion/dit.hpp"
 #include "model/diffusion/model.hpp"
 #include "model/diffusion/sefi_image.hpp"
 #include "model_loader.h"
 #define FLUX_GRAPH_SIZE 10240
@ -26,6 +27,9 @@ namespace Flux {
    struct FluxConfig {
        SDVersion version         = VERSION_FLUX;
        bool is_chroma            = false;
        bool is_sefi              = false;
        int64_t semantic_channels = 0;
        float sefi_delta_t        = 0.1f;
        int patch_size            = 2;
        int64_t in_channels       = 64;
        int64_t out_channels      = 64;
@ -88,6 +92,21 @@ namespace Flux {
                config.share_modulation = true;
                config.ref_index_scale  = 10.f;
                config.use_mlp_silu_act = true;
            } else if (sd_version_is_sefi_image(version)) {
                config.is_sefi           = true;
                config.semantic_channels = 16;
                config.in_channels       = 128 + config.semantic_channels;
                config.patch_size        = 1;
                config.out_channels      = 128 + config.semantic_channels;
                config.mlp_ratio         = 3.f;
                config.theta             = 2000;
                config.axes_dim          = {32, 32, 32, 32};
                config.vec_in_dim        = 0;
                config.qkv_bias          = false;
                config.disable_bias      = true;
                config.share_modulation  = true;
                config.ref_index_scale   = 10.f;
                config.use_mlp_silu_act  = true;
            } else if (sd_version_is_longcat(version)) {
                config.context_in_dim = 3584;
                config.vec_in_dim     = 0;
@ -723,8 +742,8 @@ namespace Flux {
                auto m     = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c));  // [N, 2 * hidden_size]
                auto m_vec = ggml_ext_chunk(ctx->ggml_ctx, m, 2, 0);
-                shift      = m_vec[0];  // [N, hidden_size]
+                shift      = m_vec[0];
-                scale      = m_vec[1];  // [N, hidden_size]
+                scale      = m_vec[1];
            }
            x = Flux::modulate(ctx->ggml_ctx, norm_final->forward(ctx, x), shift, scale);
@ -902,6 +921,8 @@ namespace Flux {
            }
            if (config.is_chroma) {
                blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(config.in_dim, config.hidden_size);
            } else if (config.is_sefi) {
                blocks["dual_time_embed"] = std::make_shared<SefiImage::SefiDualTimestepEmbeddings>(256, config.hidden_size);
            } else {
                blocks["time_in"] = std::make_shared<MLPEmbedder>(256, config.hidden_size, !config.disable_bias);
                if (config.vec_in_dim > 0) {
@ -1027,6 +1048,11 @@ namespace Flux {
                if (y != nullptr) {
                    txt_img_mask = ggml_pad(ctx->ggml_ctx, y, static_cast<int>(img->ne[1]), 0, 0, 0);
                }
            } else if (config.is_sefi) {
                auto dual_time_embed = std::dynamic_pointer_cast<SefiImage::SefiDualTimestepEmbeddings>(blocks["dual_time_embed"]);
                auto timestep_sem    = ggml_view_1d(ctx->ggml_ctx, timesteps, 1, 0);
                auto timestep_tex    = ggml_view_1d(ctx->ggml_ctx, timesteps, 1, ggml_element_size(timesteps));
                vec                  = dual_time_embed->forward(ctx, timestep_sem, timestep_tex);
            } else {
                auto time_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
                vec          = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f));
@ -1500,7 +1526,7 @@ namespace Flux {
                set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data());
            }
            std::set<int> txt_arange_dims;
-            if (sd_version_is_flux2(version)) {
+            if (sd_version_is_flux2(version) || sd_version_is_sefi_image(version)) {
                txt_arange_dims    = {3};
                increase_ref_index = true;
            } else if (version == VERSION_OVIS_IMAGE) {
--- a/src/model/diffusion/sefi_image.hpp
+++ b/src/model/diffusion/sefi_image.hpp
@ -0,0 +1,91 @@
 #ifndef __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__
 #define __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__
 #include <memory>
 #include "model/common/block.hpp"
 namespace SefiImage {
    // Default SeFi-Image hyper-parameters plus a helper that refines two of
    // them from the shapes of the checkpoint's tensors.
    struct SefiImageConfig {
        int64_t semantic_channels        = 16;
        int64_t texture_latent_channels  = 32;
        int64_t timestep_guidance_in_dim = 256;
        int64_t hidden_size              = 3072;
        float timestep_shift_alpha       = 0.3f;
        float delta_t                    = 0.1f;

        // texture_latent_channels scaled by patch_size squared.
        int64_t packed_texture_channels(int patch_size) const {
            const int64_t patch_area = static_cast<int64_t>(patch_size) * patch_size;
            return texture_latent_channels * patch_area;
        }

        // Semantic channels plus the packed texture channels.
        int64_t packed_input_channels(int patch_size) const {
            const int64_t texture_part = packed_texture_channels(patch_size);
            return semantic_channels + texture_part;
        }

        // Scans tensors whose name starts with `prefix`. For each 2-D tensor
        // whose name ends in the semantic embedder's linear_1 weight, takes
        // timestep_guidance_in_dim from ne[0] and hidden_size as twice ne[1].
        // All other fields keep their defaults.
        static SefiImageConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
                                                   const std::string& prefix) {
            SefiImageConfig detected;
            for (const auto& [tensor_name, storage] : tensor_storage_map) {
                const bool under_prefix = starts_with(tensor_name, prefix);
                if (under_prefix &&
                    ends_with(tensor_name, "dual_time_embed.semantic_embedder.linear_1.weight") &&
                    storage.n_dims == 2) {
                    detected.timestep_guidance_in_dim = storage.ne[0];
                    detected.hidden_size              = storage.ne[1] * 2;
                }
            }
            LOG_DEBUG("sefi_image: semantic_channels = %" PRId64 ", texture_latent_channels = %" PRId64 ", hidden_size = %" PRId64,
                      detected.semantic_channels,
                      detected.texture_latent_channels,
                      detected.hidden_size);
            return detected;
        }
    };
    struct SefiTimestepEmbedding : public GGMLBlock {
    public:
        SefiTimestepEmbedding(int64_t in_channels, int64_t time_embed_dim) {
            blocks["linear_1"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, time_embed_dim, false));
            blocks["linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim, false));
        }
        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* sample) {
            auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
            auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
            sample = linear_1->forward(ctx, sample);
            sample = ggml_silu_inplace(ctx->ggml_ctx, sample);
            sample = linear_2->forward(ctx, sample);
            return sample;
        }
    };
    struct SefiDualTimestepEmbeddings : public GGMLBlock {
    public:
        SefiDualTimestepEmbeddings(int64_t in_channels, int64_t embedding_dim) {
            GGML_ASSERT(embedding_dim % 2 == 0);
            int64_t half_dim            = embedding_dim / 2;
            blocks["semantic_embedder"] = std::make_shared<SefiTimestepEmbedding>(in_channels, half_dim);
            blocks["texture_embedder"]  = std::make_shared<SefiTimestepEmbedding>(in_channels, half_dim);
            timestep_guidance_in_dim    = in_channels;
        }
        ggml_tensor* forward(GGMLRunnerContext* ctx,
                             ggml_tensor* timestep_sem,
                             ggml_tensor* timestep_tex) {
            auto semantic_embedder = std::dynamic_pointer_cast<SefiTimestepEmbedding>(blocks["semantic_embedder"]);
            auto texture_embedder  = std::dynamic_pointer_cast<SefiTimestepEmbedding>(blocks["texture_embedder"]);
            auto sem_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep_sem, timestep_guidance_in_dim, 10000, 1.f);
            auto tex_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep_tex, timestep_guidance_in_dim, 10000, 1.f);
            auto sem_emb  = semantic_embedder->forward(ctx, sem_proj);
            auto tex_emb  = texture_embedder->forward(ctx, tex_proj);
            return ggml_concat(ctx->ggml_ctx, sem_emb, tex_emb, 0);
        }
    private:
        int64_t timestep_guidance_in_dim = 256;
    };
 }  // namespace SefiImage
 #endif  // __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__
--- a/src/model/te/llm.hpp
+++ b/src/model/te/llm.hpp
@ -250,7 +250,7 @@ namespace LLM {
                    config.intermediate_size = tensor_storage.ne[1];
                }
            }
-            if (arch == LLMArch::QWEN3 && config.num_layers == 28) {
+            if ((arch == LLMArch::QWEN3 || arch == LLMArch::QWEN3_VL) && config.num_layers == 28) {
                config.num_heads = 16;
            }
            if (detected_vision_layers > 0) {
--- a/src/model/vae/auto_encoder_kl.hpp
+++ b/src/model/vae/auto_encoder_kl.hpp
@ -816,12 +816,13 @@ struct AutoEncoderKL : public VAE {
    }
    sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents) override {
        auto latents_ = sd_version_is_sefi_image(version) ? sd::ops::slice(latents, 2, 16, 144) : latents;
        if (sd_version_uses_flux2_vae(version)) {
            int channel_dim                = 2;
-            auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim);
+            auto [mean_tensor, std_tensor] = get_latents_mean_std(latents_, channel_dim);
-            return (latents * std_tensor) / scale_factor + mean_tensor;
+            return (latents_ * std_tensor) / scale_factor + mean_tensor;
        }
-        return (latents / scale_factor) + shift_factor;
+        return (latents_ / scale_factor) + shift_factor;
    }
    sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents) override {
--- a/src/model_loader.cpp
+++ b/src/model_loader.cpp
@ -66,7 +66,6 @@ const char* unused_tensors[] = {
    // "v_pred", // Used to detect SDXL vpred models
    "text_encoders.llm.output.weight",
    "text_encoders.llm.lm_head.",
    "first_stage_model.bn.",
 };
 bool is_unused_tensor(const std::string& name) {
@ -480,6 +479,9 @@ SDVersion ModelLoader::get_sd_version() {
        if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) {
            is_flux2 = true;
        }
        if (tensor_storage.name.find("dual_time_embed.semantic_embedder.linear_1.weight") != std::string::npos) {
            return VERSION_SEFI_IMAGE;
        }
        if (tensor_storage.name.find("single_blocks.47.linear1.weight") != std::string::npos) {
            has_single_block_47 = true;
        }
--- a/src/name_conversion.cpp
+++ b/src/name_conversion.cpp
@ -743,7 +743,7 @@ std::string convert_diffusion_model_name(std::string name, std::string prefix, S
        name = convert_diffusers_unet_to_original_sdxl(name);
    } else if (sd_version_is_sd3(version)) {
        name = convert_diffusers_dit_to_original_sd3(name);
-    } else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version)) {
+    } else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version) || sd_version_is_sefi_image(version)) {
        name = convert_diffusers_dit_to_original_flux(name);
    } else if (sd_version_is_z_image(version)) {
        name = convert_diffusers_dit_to_original_lumina2(name);
--- a/src/runtime/denoiser.hpp
+++ b/src/runtime/denoiser.hpp
@ -1005,6 +1005,8 @@ struct FluxFlowDenoiser : public DiscreteFlowDenoiser {
    }
 };
 struct SefiFlowDenoiser;
 struct Flux2FlowDenoiser : public FluxFlowDenoiser {
    Flux2FlowDenoiser() = default;
@ -1037,6 +1039,80 @@ struct Flux2FlowDenoiser : public FluxFlowDenoiser {
    }
 };
 // Denoiser that builds the dual-time (semantic + texture) schedule used by
 // sample_sefi_euler.
 //
 // get_sigmas() fills four parallel vectors with n + 1 entries each (timesteps
 // and sigmas for the semantic stream and for the texture stream) and returns
 // the texture sigmas.
 struct SefiFlowDenoiser : public Flux2FlowDenoiser {
    // Number of discrete timesteps that unit time [0, 1] is quantised onto.
    static constexpr int kNumTrainTimesteps = 1000;
    // Channel split along dim 2 as used by sample_sefi_euler:
    // [0, kSemChannels) is stepped with the semantic schedule,
    // [kSemChannels, kTotalChannels) with the texture schedule.
    static constexpr int kSemChannels       = 16;
    static constexpr int kTotalChannels     = 144;
    // Offset between the semantic and texture unit times (see get_sigmas).
    // Overridable with the extra sample arg "sefi_delta_t".
    float delta_t              = 0.1f;
    // Alpha of the timestep shift (see apply_alpha_shift); 1.0 disables it.
    // Overridable with the extra sample arg "sefi_alpha".
    float timestep_shift_alpha = 1.0f;
    // Schedules produced by the most recent get_sigmas() call.
    std::vector<float> sem_sigmas;
    std::vector<float> tex_sigmas;
    std::vector<float> sem_timesteps;
    std::vector<float> tex_timesteps;
    SefiFlowDenoiser() = default;
    // Maps u_unit to alpha * u / (1 + (alpha - 1) * u).
    // Returns u_unit unchanged when alpha == 1. The caller must pass alpha > 0
    // for u_unit in [0, 1]; otherwise the denominator can reach zero.
    static float apply_alpha_shift(float u_unit, float alpha) {
        if (alpha == 1.0f) {
            return u_unit;
        }
        float denom = 1.0f + (alpha - 1.0f) * u_unit;
        return (alpha * u_unit) / denom;
    }
    // Rebuilds the semantic and texture schedules for n sampling steps.
    //
    // extra_sample_args may carry "sefi_alpha=<float>" and
    // "sefi_delta_t=<float>". Accepted overrides are stored in the members, so
    // they stay in effect for later calls on the same instance. Values that do
    // not parse, are not finite, or (for sefi_alpha) are not strictly positive
    // are ignored with a warning and the previous value is kept.
    //
    // image_seq_len, scheduler_type and version are not used by this override.
    //
    // Returns the texture sigmas (n + 1 entries; a single entry when n == 0).
    std::vector<float> get_sigmas(uint32_t n,
                                  int image_seq_len,
                                  scheduler_t scheduler_type,
                                  SDVersion version,
                                  const char* extra_sample_args = nullptr) override {
        sem_sigmas.clear();
        tex_sigmas.clear();
        sem_timesteps.clear();
        tex_timesteps.clear();
        for (const auto& [key, value] : parse_key_value_args(extra_sample_args, "sefi scheduler arg")) {
            const bool is_alpha = key == "sefi_alpha";
            const bool is_delta = key == "sefi_delta_t";
            if (!is_alpha && !is_delta) {
                continue;
            }
            // Parse into a temporary so a rejected value never overwrites the
            // configured one.
            float parsed = 0.0f;
            bool usable  = parse_strict_float(value, parsed);
            // x - x is 0 only for finite x (inf and NaN both yield NaN), which
            // keeps NaN/inf out of the float-to-int conversions below.
            usable = usable && (parsed - parsed == 0.0f);
            if (is_alpha) {
                // alpha <= 0 lets the shift denominator reach zero on [0, 1].
                usable = usable && parsed > 0.0f;
            }
            if (!usable) {
                LOG_WARN("ignoring invalid sefi scheduler arg '%s=%s'", key.c_str(), value.c_str());
                continue;
            }
            if (is_alpha) {
                timestep_shift_alpha = parsed;
            } else {
                delta_t = parsed;
            }
        }
        const size_t entry_count = static_cast<size_t>(n) + 1;
        sem_sigmas.reserve(entry_count);
        tex_sigmas.reserve(entry_count);
        sem_timesteps.reserve(entry_count);
        tex_timesteps.reserve(entry_count);
        // Quantises a unit time onto an index in [0, kNumTrainTimesteps - 1]
        // and converts it to a timestep that counts down from kNumTrainTimesteps.
        const auto to_timestep = [](float u_unit) {
            int idx = std::min(kNumTrainTimesteps - 1,
                               std::max(0, static_cast<int>(u_unit * (kNumTrainTimesteps - 1))));
            return static_cast<float>(kNumTrainTimesteps - idx);
        };
        for (uint32_t i = 0; i <= n; ++i) {
            // n == 0 would otherwise evaluate 0/0 (NaN) and feed it to an int
            // conversion; treat it as the start of the schedule instead.
            float u_base    = n > 0 ? static_cast<float>(i) / static_cast<float>(n) : 0.0f;
            float u_shifted = apply_alpha_shift(u_base, timestep_shift_alpha);
            // The semantic time is stretched by (1 + delta_t) and clamped to 1;
            // the texture time is the unclamped semantic time minus delta_t,
            // clamped to [0, 1].
            float u_sem_raw = u_shifted * (1.0f + delta_t);
            float u_sem     = std::min(u_sem_raw, 1.0f);
            float u_tex     = std::max(0.0f, std::min(u_sem_raw - delta_t, 1.0f));
            float t_sem     = to_timestep(u_sem);
            float t_tex     = to_timestep(u_tex);
            sem_timesteps.push_back(t_sem);
            tex_timesteps.push_back(t_tex);
            sem_sigmas.push_back(t_sem / static_cast<float>(kNumTrainTimesteps));
            tex_sigmas.push_back(t_tex / static_cast<float>(kNumTrainTimesteps));
        }
        LOG_DEBUG("SefiFlowDenoiser: built %u-step dual schedule (alpha=%.2f delta_t=%.2f)",
                  n, timestep_shift_alpha, delta_t);
        return tex_sigmas;
    }
 };
 typedef std::function<sd::guidance::GuiderOutput(const sd::Tensor<float>&, float, int)> denoise_cb_t;
 static std::pair<float, float> get_ancestral_step(float sigma_from,
@ -1140,6 +1216,40 @@ static sd::Tensor<float> sample_euler_ancestral(denoise_cb_t model,
    return x;
 }
 // Euler sampler for the SeFi dual schedule stored on `sefi`.
 //
 // Each step calls `model` once with the current texture sigma and derives one
 // velocity from its prediction. The semantic channels
 // [0, kSemChannels) of dim 2 are then advanced by the semantic sigma delta and
 // the remaining channels up to kTotalChannels by the texture sigma delta.
 // Steps whose current texture sigma is <= 1e-9 are skipped.
 //
 // Returns the final latent, or an empty tensor as soon as `model` yields an
 // empty prediction.
 static sd::Tensor<float> sample_sefi_euler(SefiFlowDenoiser* sefi,
                                            denoise_cb_t model,
                                            sd::Tensor<float> x) {
    constexpr int sem_end = SefiFlowDenoiser::kSemChannels;
    constexpr int tex_end = SefiFlowDenoiser::kTotalChannels;
    const std::vector<float>& tex_schedule = sefi->tex_sigmas;
    const std::vector<float>& sem_schedule = sefi->sem_sigmas;
    const int step_count                   = static_cast<int>(tex_schedule.size()) - 1;
    for (int step = 0; step < step_count; step++) {
        const float tex_now  = tex_schedule[step];
        const float tex_then = tex_schedule[step + 1];
        const float sem_now  = sem_schedule[step];
        const float sem_then = sem_schedule[step + 1];
        // Nothing left to integrate (and no safe divisor) at a vanishing sigma.
        if (tex_now <= 1e-9f) {
            continue;
        }
        // The callback receives a 1-based step number.
        auto guided = model(x, tex_now, step + 1);
        if (guided.pred.empty()) {
            return {};
        }
        sd::Tensor<float> prediction = std::move(guided.pred);
        sd::Tensor<float> flow       = (x - prediction) / tex_now;
        // Advance the semantic channels along the semantic schedule.
        auto sem_state    = sd::ops::slice(x, 2, 0, sem_end);
        auto sem_flow     = sd::ops::slice(flow, 2, 0, sem_end);
        auto sem_advanced = sem_state + sem_flow * (sem_then - sem_now);
        // Advance the texture channels along the texture schedule.
        auto tex_state    = sd::ops::slice(x, 2, sem_end, tex_end);
        auto tex_flow     = sd::ops::slice(flow, 2, sem_end, tex_end);
        auto tex_advanced = tex_state + tex_flow * (tex_then - tex_now);
        sd::ops::slice_assign(&x, 2, 0, sem_end, sem_advanced);
        sd::ops::slice_assign(&x, 2, sem_end, tex_end, tex_advanced);
    }
    return x;
 }
 static sd::Tensor<float> sample_euler(denoise_cb_t model,
                                      sd::Tensor<float> x,
                                      const std::vector<float>& sigmas) {
@ -2055,7 +2165,13 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
                                            std::shared_ptr<RNG> rng,
                                            float eta,
                                            bool is_flow_denoiser,
-                                            const char* extra_sample_args) {
+                                            const char* extra_sample_args,
                                            std::shared_ptr<Denoiser> denoiser_for_dispatch = nullptr) {
    if (denoiser_for_dispatch) {
        if (auto sefi = std::dynamic_pointer_cast<SefiFlowDenoiser>(denoiser_for_dispatch)) {
            return sample_sefi_euler(sefi.get(), model, std::move(x));
        }
    }
    SamplerExtraArgs extra_args = parse_key_value_args(extra_sample_args, "extra sample arg");
    switch (method) {
        case EULER_A_SAMPLE_METHOD:
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -96,6 +96,7 @@ const char* model_version_to_str[] = {
    "Longcat-Image",
    "PiD",
    "Ideogram 4",
    "SeFi-Image",
    "Krea2",
    "ESRGAN",
 };
@ -691,7 +692,7 @@ public:
                                                                     version,
                                                                     sd_ctx_params->chroma_use_dit_mask,
                                                                     model_manager);
-            } else if (sd_version_is_flux2(version)) {
+            } else if (sd_version_is_flux2(version) || sd_version_is_sefi_image(version)) {
                bool is_chroma   = false;
                cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
                                                                 tensor_storage_map,
@ -1295,6 +1296,8 @@ public:
                    } else if (sd_version_is_krea2(version)) {
                        default_flow_shift = 1.15f;
                    }
                } else if (sd_version_is_sefi_image(version)) {
                    pred_type = SEFI_FLOW_PRED;
                } else if (sd_version_is_flux2(version)) {
                    pred_type = FLUX2_FLOW_PRED;
                } else {
@ -1334,6 +1337,11 @@ public:
                    denoiser = std::make_shared<Flux2FlowDenoiser>();
                    break;
                }
                case SEFI_FLOW_PRED: {
                    LOG_INFO("running in SeFi-Image dual-time FLOW mode");
                    denoiser = std::make_shared<SefiFlowDenoiser>();
                    break;
                }
                default: {
                    LOG_ERROR("Unknown predition type %i", pred_type);
                    return false;
@ -1639,7 +1647,16 @@ public:
    std::vector<float> process_timesteps(const std::vector<float>& timesteps,
                                         const sd::Tensor<float>& init_latent,
-                                         const sd::Tensor<float>& denoise_mask) {
+                                         const sd::Tensor<float>& denoise_mask,
                                         int step) {
        if (auto sefi_denoiser = std::dynamic_pointer_cast<SefiFlowDenoiser>(denoiser)) {
            int sched_idx = step > 0 ? step - 1 : 0;
            if (sched_idx >= static_cast<int>(sefi_denoiser->tex_timesteps.size())) {
                sched_idx = static_cast<int>(sefi_denoiser->tex_timesteps.size()) - 1;
            }
            return {sefi_denoiser->sem_timesteps[sched_idx],
                    sefi_denoiser->tex_timesteps[sched_idx]};
        }
        if (diffusion_model->get_desc() == "Wan2.2-TI2V-5B") {
            int64_t frame_count = init_latent.shape()[2];
            auto new_timesteps  = std::vector<float>(static_cast<size_t>(frame_count), timesteps[0]);
@ -2051,7 +2068,7 @@ public:
                timesteps_vec          = process_ltxav_video_timesteps(base_timesteps_vec, init_latent, denoise_mask);
                audio_timesteps_tensor = sd::Tensor<float>({static_cast<int64_t>(base_timesteps_vec.size())}, base_timesteps_vec);
            } else {
-                timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask);
+                timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask, step);
            }
            const std::vector<float>& scaling_timesteps_vec = (sd_version_is_ltxav(version) && !denoise_mask.empty())
                                                                  ? base_timesteps_vec
@ -2121,7 +2138,7 @@ public:
                    diffusion_params.extra = UNetDiffusionExtra{-1, &controls, control_strength};
                } else if (sd_version_is_sd3(version)) {
                    diffusion_params.extra = SkipLayerDiffusionExtra{local_skip_layers};
-                } else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version)) {
+                } else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version) || sd_version_is_sefi_image(version)) {
                    diffusion_params.extra = FluxDiffusionExtra{&guidance_tensor,
                                                                local_skip_layers};
                } else if (sd_version_is_anima(version)) {
@ -2265,7 +2282,7 @@ public:
            return output;
        };
-        auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta, is_flow_denoiser, extra_sample_args);
+        auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta, is_flow_denoiser, extra_sample_args, denoiser);
        if (x0_opt.empty()) {
            LOG_ERROR("Diffusion model sampling failed");
            if (control_net) {
@ -2326,6 +2343,8 @@ public:
                latent_channel = 3;
            } else if (sd_version_is_pid(version)) {
                latent_channel = 3;
            } else if (sd_version_is_sefi_image(version)) {
                latent_channel = 144;
            } else if (sd_version_uses_flux2_vae(version)) {
                latent_channel = 128;
            } else {