feat: add SeFi-Image support (#1707)

2026-06-29 09:36:40 +00:00 · 2026-06-28 16:49:24 +02:00 · 2026-06-28 16:49:24 +02:00 · 03e9a22f4d
commit 03e9a22f4d
parent f54e45e81c
16 changed files with 736 additions and 17 deletions
--- a/README.md
+++ b/README.md
@ -53,6 +53,7 @@ API and command-line option may change frequently.***
    - [ERNIE-Image](./docs/ernie_image.md)
    - [Boogu Image](./docs/boogu_image.md)
    - [Krea2](./docs/krea2.md)
+    - [SeFi-Image](./docs/sefi_image.md)
    - [HiDream-O1-Image](./docs/hidream_o1_image.md)
    - [Ideogram4](./docs/ideogram4.md)
  - Image Edit Models
--- a/assets/sefi_image/example.png
+++ b/assets/sefi_image/example.png
--- a/docs/sefi_image.md
+++ b/docs/sefi_image.md
@ -0,0 +1,50 @@
+# How to Use
+
+SeFi-Image uses a Flux2-style dual-time transformer (semantic + texture streams), the standard Flux2 VAE, and Qwen3-VL as the LLM text encoder. Tech report: [arXiv:2606.22568](https://arxiv.org/abs/2606.22568).
+
+## Download weights
+
+SeFi-Image ships in three scales (1B / 2B / 5B) and three families (Base / RL / turbo); all checkpoints are gated on Hugging Face under https://huggingface.co/SeFi-Image.
+
+- 1B and 2B variants pair with Qwen3-VL-2B-Instruct.
+- 5B variants pair with Qwen3-VL-4B-Instruct.
+- All variants use the standard Flux2 VAE (`flux2_ae.safetensors` from https://huggingface.co/black-forest-labs/FLUX.2-dev).
+
+Convert the transformer and text encoder to sd.cpp safetensors:
+
+```bash
+python3 script/convert_sefi.py     <hf_repo_dir>                          <out_dir>/sefi_<scale>_<family>.safetensors
+python3 script/convert_qwen3_vl.py  <hf_repo_dir>/Qwen3-VL-<X>B-Instruct  <out_dir>/qwen3_vl_<X>b.safetensors
+```
+
+## Variant defaults
+
+| Family | timestep_shift_alpha | steps | cfg-scale |
+|---|---|---|---|
+| Base | 0.3 | 50 | 4.0 |
+| RL | 0.3 | 50 | 4.0 |
+| turbo | 1.0 | 4 | 1.0 |
+
+The dispatcher picks `timestep_shift_alpha` from the filename (`turbo` substring => 1.0, otherwise 0.3). Override it via `--extra-sample-args sefi_alpha=<value>`; `sefi_delta_t=<value>` can be passed the same way.
+
+## Examples
+
+### 1B / 2B turbo
+
+```
+./build/bin/sd-cli --diffusion-model /path/to/sefi_1b_turbo.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_2b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 1.0 --steps 4 -W 1024 -H 1024 -s 42 --diffusion-fa --offload-to-cpu -o out.png
+```
+
+### 1B / 2B base
+
+```
+./build/bin/sd-cli --diffusion-model /path/to/sefi_1b_base.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_2b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 4.0 --steps 50 -W 1024 -H 1024 -s 42 --diffusion-fa --offload-to-cpu -o out.png
+```
+
+### 5B (needs streaming on 12 GiB VRAM)
+
+```
+./build/bin/sd-cli --diffusion-model /path/to/sefi_5b_turbo.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_4b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 1.0 --steps 4 -W 1024 -H 1024 -s 42 --diffusion-fa --max-vram 8 --stream-layers --offload-to-cpu -o out.png
+```
+
+<img alt="SeFi-Image 5B turbo example" src="../assets/sefi_image/example.png" />
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@ -81,6 +81,7 @@ enum prediction_t {
    FLOW_PRED,
    FLUX_FLOW_PRED,
    FLUX2_FLOW_PRED,
+    SEFI_FLOW_PRED,
    PREDICTION_COUNT
 };

--- a/script/convert_qwen3_vl.py
+++ b/script/convert_qwen3_vl.py
@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+"""Convert a Qwen3-VL HF safetensors checkpoint into a sd.cpp-loadable form.
+
+The HF dump prefixes text-tower keys with ``model.language_model.`` and
+vision-tower keys with ``model.visual.``. sd.cpp expects ``model.<rest>`` for
+the text side; the vision side is converted by sd.cpp's own
+``convert_qwen3_vl_vision_name`` and is left as-is here.
+
+Operates on raw safetensors bytes so any dtype (BF16/F16/F32) is preserved.
+
+Usage:
+    python3 script/convert_qwen3_vl.py <hf_qwen3_vl_dir_or_safetensors> <output.safetensors>
+"""
+
+import argparse
+import json
+import os
+import struct
+import sys
+
+
+def rewrite_key(key: str) -> str:
+    """Map an HF Qwen3-VL tensor name to the name written to the output file.
+
+    Keys under ``model.language_model.`` are re-rooted directly under
+    ``model.``; every other key is returned unchanged.
+    """
+    hf_text_prefix = "model.language_model."
+    if not key.startswith(hf_text_prefix):
+        return key
+    _, _, remainder = key.partition(hf_text_prefix)
+    return "model." + remainder
+
+
+def read_safetensors_header(path: str):
+    """Read the JSON header of a safetensors file.
+
+    Returns ``(header, data_start)``: the parsed header and the byte offset
+    just past the 8-byte little-endian length prefix and the header itself.
+    """
+    length_prefix = struct.Struct("<Q")
+    with open(path, "rb") as handle:
+        (header_size,) = length_prefix.unpack(handle.read(length_prefix.size))
+        raw_header = handle.read(header_size)
+    data_start = length_prefix.size + header_size
+    return json.loads(raw_header), data_start
+
+
+def collect_shard_paths(path: str):
+    """Resolve ``path`` to the list of safetensors files to read.
+
+    A directory yields the de-duplicated, sorted shards named in
+    ``model.safetensors.index.json`` if that index exists, otherwise its
+    single ``model.safetensors``. A regular file yields itself.
+
+    Raises:
+        FileNotFoundError: ``path`` is neither a file nor a directory, or the
+            directory holds neither an index nor ``model.safetensors``.
+    """
+    if not os.path.isdir(path):
+        if os.path.isfile(path):
+            return [path]
+        raise FileNotFoundError(path)
+
+    index_file = os.path.join(path, "model.safetensors.index.json")
+    if os.path.isfile(index_file):
+        with open(index_file) as fh:
+            weight_map = json.load(fh)["weight_map"]
+        shard_names = set(weight_map.values())
+        return sorted(os.path.join(path, name) for name in shard_names)
+
+    lone_file = os.path.join(path, "model.safetensors")
+    if not os.path.isfile(lone_file):
+        raise FileNotFoundError(f"No Qwen3-VL safetensors in {path}")
+    return [lone_file]
+
+
+def stage_tensors(input_path: str):
+    """Collect one ``(output_key, shard_path, data_start, info)`` tuple per tensor.
+
+    Every shard resolved from ``input_path`` is scanned; the ``__metadata__``
+    header record is skipped and each tensor name goes through ``rewrite_key``.
+    ``info`` is the tensor's header record as read from its shard.
+    """
+    staged = []
+    for shard in collect_shard_paths(input_path):
+        header, data_start = read_safetensors_header(shard)
+        staged.extend(
+            (rewrite_key(name), shard, data_start, meta)
+            for name, meta in header.items()
+            if name != "__metadata__"
+        )
+    return staged
+
+
+def write_consolidated(out_path: str, entries):
+    """Stream every staged tensor into a single safetensors file.
+
+    Args:
+        out_path: destination file; it is created or truncated.
+        entries: iterable of ``(new_key, shard_path, data_off, info)`` tuples,
+            where ``info`` is the tensor's header record from its source shard
+            and ``data_off`` is the byte offset of that shard's data section.
+
+    Tensors are written in sorted ``new_key`` order and the JSON header is
+    space-padded to a multiple of 8 bytes.
+
+    Raises:
+        ValueError: two entries share the same output key (checked before the
+            output file is opened).
+        IOError: a source shard ends before a tensor's bytes are fully read.
+    """
+    entries = sorted(entries, key=lambda e: e[0])
+
+    new_header = {}
+    cur_offset = 0
+    for new_key, shard_path, data_off, info in entries:
+        # A repeated key would overwrite its header record while both payloads
+        # were still written, leaving bytes that no header entry accounts for.
+        if new_key in new_header:
+            raise ValueError(f"Duplicate output tensor name: {new_key}")
+        start, end = info["data_offsets"]
+        size = end - start
+        new_header[new_key] = {
+            "dtype": info["dtype"],
+            "shape": info["shape"],
+            "data_offsets": [cur_offset, cur_offset + size],
+        }
+        cur_offset += size
+
+    header_json = json.dumps(new_header, separators=(",", ":")).encode("utf-8")
+    pad = (-len(header_json)) % 8
+    header_json = header_json + (b" " * pad)
+
+    with open(out_path, "wb") as out:
+        out.write(struct.pack("<Q", len(header_json)))
+        out.write(header_json)
+        for new_key, shard_path, data_off, info in entries:
+            start, end = info["data_offsets"]
+            with open(shard_path, "rb") as src:
+                # Offsets in ``info`` are relative to the shard's data section.
+                src.seek(data_off + start)
+                remaining = end - start
+                while remaining > 0:
+                    # Copy in bounded chunks so large tensors are never held
+                    # in memory whole.
+                    chunk = src.read(min(8 * 1024 * 1024, remaining))
+                    if not chunk:
+                        raise IOError(f"Truncated tensor in {shard_path}")
+                    out.write(chunk)
+                    remaining -= len(chunk)
+
+
+def main():
+    """Command-line entry point: parse arguments, stage tensors, write the output.
+
+    Exits with an argparse error if the output path is one of the input
+    shards, because the output file is truncated before the shards are read.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("input", help="HF Qwen3-VL directory or single safetensors file")
+    parser.add_argument("output", help="Output single safetensors path")
+    args = parser.parse_args()
+
+    # Only an existing output can alias an input; samefile needs both to exist.
+    if os.path.exists(args.output):
+        for shard_path in collect_shard_paths(args.input):
+            if os.path.exists(shard_path) and os.path.samefile(shard_path, args.output):
+                parser.error(f"output {args.output} would overwrite input shard {shard_path}")
+
+    entries = stage_tensors(args.input)
+    print(f"Tensors: {len(entries)}")
+    print(f"Writing -> {args.output}")
+    # ``or "."`` covers a bare filename, whose dirname is the empty string.
+    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
+    write_consolidated(args.output, entries)
+    print(f"Done. Output size: {os.path.getsize(args.output) / 1e9:.2f} GB")
+
+
+if __name__ == "__main__":
+    main()
--- a/script/convert_sefi.py
+++ b/script/convert_sefi.py
@ -0,0 +1,279 @@
+#!/usr/bin/env python3
+"""Convert a SeFi-Image diffusers checkpoint into a single sd.cpp-compatible safetensors.
+
+Operates on raw safetensors bytes so any dtype (BF16, F32, ...) is preserved exactly.
+No numpy or torch dependency required.
+
+Usage:
+    python3 script/convert_sefi.py <sefi_diffusers_dir> <output.safetensors>
+"""
+
+import argparse
+import json
+import os
+import re
+import struct
+import sys
+
+
+# Matches a ``.linear.`` path component inside a tensor name.
+_LINEAR_TO_LIN = re.compile(r"\.linear\.")
+# Top-level modulation modules; ``rewrite_transformer_key`` renames the first
+# ``.linear.`` component of keys under these prefixes to ``.lin.``.
+_SHARED_MOD_PREFIXES = (
+    "double_stream_modulation_img",
+    "double_stream_modulation_txt",
+    "single_stream_modulation",
+)
+
+
+def rewrite_transformer_key(key: str) -> str:
+    """Translate one SeFi diffusers tensor name into its sd.cpp name.
+
+    ``dual_time_embed.*`` keys are returned untouched. A leading ``backbone.``
+    is stripped, shared-modulation keys get their first ``.linear.`` renamed to
+    ``.lin.``, a fixed set of top-level names is renamed, and per-block keys
+    are delegated to the double/single stream helpers. Anything else passes
+    through unchanged.
+    """
+    if key.startswith("dual_time_embed."):
+        return key
+    if key.startswith("backbone."):
+        key = key[len("backbone."):]
+
+    for shared_prefix in _SHARED_MOD_PREFIXES:
+        if key.startswith(shared_prefix + "."):
+            key = _LINEAR_TO_LIN.sub(".lin.", key, count=1)
+            break
+
+    top_level_renames = {
+        "context_embedder.weight": "txt_in.weight",
+        "context_embedder.bias": "txt_in.bias",
+        "x_embedder.weight": "img_in.weight",
+        "x_embedder.bias": "img_in.bias",
+        "proj_out.weight": "final_layer.linear.weight",
+        "proj_out.bias": "final_layer.linear.bias",
+        "norm_out.linear.weight": "final_layer.adaLN_modulation.1.weight",
+        "norm_out.linear.bias": "final_layer.adaLN_modulation.1.bias",
+    }
+    renamed = top_level_renames.get(key)
+    if renamed is not None:
+        return renamed
+
+    block_rewriters = (
+        (r"transformer_blocks\.(\d+)\.(.*)$", _rewrite_double_stream),
+        (r"single_transformer_blocks\.(\d+)\.(.*)$", _rewrite_single_stream),
+    )
+    for pattern, rewrite in block_rewriters:
+        hit = re.match(pattern, key)
+        if hit:
+            return rewrite(hit.group(1), hit.group(2))
+
+    return key
+
+
+def _rewrite_double_stream(idx: str, tail: str) -> str:
+    """Build the ``double_blocks.<idx>.`` name for a double-stream block tensor.
+
+    ``tail`` is the part of the key after the block index. Known tails are
+    renamed; unknown tails are kept as they are.
+    """
+    renames = dict((
+        ("norm1.linear.weight", "img_mod.lin.weight"),
+        ("norm1_context.linear.weight", "txt_mod.lin.weight"),
+        ("attn.norm_q.weight", "img_attn.norm.query_norm.scale"),
+        ("attn.norm_k.weight", "img_attn.norm.key_norm.scale"),
+        ("attn.norm_added_q.weight", "txt_attn.norm.query_norm.scale"),
+        ("attn.norm_added_k.weight", "txt_attn.norm.key_norm.scale"),
+        ("attn.to_out.0.weight", "img_attn.proj.weight"),
+        ("attn.to_add_out.weight", "txt_attn.proj.weight"),
+        # Two feed-forward spellings map to the same targets.
+        ("ff.net.0.proj.weight", "img_mlp.0.weight"),
+        ("ff.net.2.weight", "img_mlp.2.weight"),
+        ("ff_context.net.0.proj.weight", "txt_mlp.0.weight"),
+        ("ff_context.net.2.weight", "txt_mlp.2.weight"),
+        ("ff.linear_in.weight", "img_mlp.0.weight"),
+        ("ff.linear_out.weight", "img_mlp.2.weight"),
+        ("ff_context.linear_in.weight", "txt_mlp.0.weight"),
+        ("ff_context.linear_out.weight", "txt_mlp.2.weight"),
+    ))
+    new_tail = renames[tail] if tail in renames else tail
+    return "double_blocks.{}.".format(idx) + new_tail
+
+
+# QKV triplets to fuse on output: source tails -> target fused tail.
+# Each tuple is (q_tail, k_tail, v_tail, fused_target_tail). Source tails are
+# matched after the ``backbone.transformer_blocks.<idx>.`` prefix; the fused
+# tensor is emitted as ``double_blocks.<idx>.<fused_target_tail>``.
+QKV_DOUBLE_TRIPLETS = [
+    ("attn.to_q.weight",       "attn.to_k.weight",       "attn.to_v.weight",       "img_attn.qkv.weight"),
+    ("attn.add_q_proj.weight", "attn.add_k_proj.weight", "attn.add_v_proj.weight", "txt_attn.qkv.weight"),
+]
+
+
+def _rewrite_single_stream(idx: str, tail: str) -> str:
+    """Build the ``single_blocks.<idx>.`` name for a single-stream block tensor.
+
+    ``tail`` is the part of the key after the block index. Known tails are
+    renamed; unknown tails are kept as they are.
+    """
+    renames = dict((
+        ("norm.linear.weight", "modulation.lin.weight"),
+        ("attn.norm_q.weight", "norm.query_norm.scale"),
+        ("attn.norm_k.weight", "norm.key_norm.scale"),
+        ("attn.to_qkv_mlp_proj.weight", "linear1.weight"),
+        ("attn.to_out.weight", "linear2.weight"),
+    ))
+    new_tail = renames[tail] if tail in renames else tail
+    return "single_blocks.{}.".format(idx) + new_tail
+
+
+
+
+def read_safetensors_header(path: str):
+    """Parse a safetensors header.
+
+    Returns a ``(header, data_start)`` pair, where ``data_start`` is the byte
+    offset at which tensor data begins: the 8-byte little-endian length prefix
+    plus the header length it declares.
+    """
+    with open(path, "rb") as stream:
+        prefix = stream.read(8)
+        (json_len,) = struct.unpack("<Q", prefix)
+        payload = stream.read(json_len)
+    header = json.loads(payload)
+    return header, 8 + json_len
+
+
+def collect_shard_paths(directory: str, weight_pattern: str):
+    """List the safetensors files that make up one checkpoint section.
+
+    If ``<weight_pattern>.safetensors.index.json`` exists in ``directory``, the
+    de-duplicated, sorted shard paths from its ``weight_map`` are returned.
+    Otherwise the single ``<weight_pattern>.safetensors`` file is returned.
+
+    Raises:
+        FileNotFoundError: neither the index nor the single file exists.
+    """
+    index_file = os.path.join(directory, "{}.safetensors.index.json".format(weight_pattern))
+    if not os.path.isfile(index_file):
+        lone_file = os.path.join(directory, "{}.safetensors".format(weight_pattern))
+        if os.path.isfile(lone_file):
+            return [lone_file]
+        raise FileNotFoundError(f"No checkpoint at {directory}: missing {weight_pattern}")
+
+    with open(index_file) as fh:
+        weight_map = json.load(fh)["weight_map"]
+    return sorted({os.path.join(directory, shard) for shard in weight_map.values()})
+
+
+def stage_tensors_for_section(section_dir: str, rewrite_fn):
+    """Return a list of (new_key, shard_path, data_start_offset, info_dict) entries.
+
+    Tensors are read from the ``diffusion_pytorch_model`` shard(s) in
+    ``section_dir`` and renamed with ``rewrite_fn``.
+
+    When a ``backbone.transformer_blocks.<idx>`` block has a complete split
+    q/k/v triplet, one fused entry replaces the three. Its ``shard_path`` and
+    ``data_start_offset`` are ``None`` and its info dict carries the three
+    source entries under the ``_qkv_sources`` pseudo field, so the writer can
+    concatenate them into a single output tensor.
+
+    Entries renamed to ``final_layer.adaLN_modulation.1.{weight,bias}`` get a
+    copied info dict with ``_chunk_swap_halves`` set.
+
+    Raises:
+        ValueError: the q, k and v tensors of a triplet differ in shape.
+    """
+    entries = []
+    # First, index all raw keys per shard so we can detect qkv triplets.
+    raw_by_block = {}  # block_idx -> {tail: (key, shard_path, data_off, info)}
+    raw_others = []
+    for shard_path in collect_shard_paths(section_dir, "diffusion_pytorch_model"):
+        hdr, data_off = read_safetensors_header(shard_path)
+        for key, info in hdr.items():
+            if key == "__metadata__":
+                continue
+            m = re.match(r"backbone\.transformer_blocks\.(\d+)\.(.*)$", key)
+            # Only tails that appear as q/k/v sources are held back for fusion.
+            if m and any(m.group(2) in trip[:3] for trip in QKV_DOUBLE_TRIPLETS):
+                idx = m.group(1)
+                raw_by_block.setdefault(idx, {})[m.group(2)] = (key, shard_path, data_off, info)
+            else:
+                raw_others.append((key, shard_path, data_off, info))
+
+    for key, shard_path, data_off, info in raw_others:
+        new_key = rewrite_fn(key)
+        # Swap the (scale, shift) halves to (shift, scale) at conversion time so
+        # the on-disk weight matches BFL flux ordering and the runtime stays
+        # version-agnostic. norm_out.linear weight shape is [2*dim, dim] and bias
+        # is [2*dim]; both split along axis 0 (outermost == row-major outer).
+        if new_key in ("final_layer.adaLN_modulation.1.weight",
+                       "final_layer.adaLN_modulation.1.bias"):
+            # Copy first so the flag is not written into the parsed header record.
+            info = dict(info)
+            info["_chunk_swap_halves"] = True
+        entries.append((new_key, shard_path, data_off, info))
+
+    for block_idx, tails in raw_by_block.items():
+        for q_tail, k_tail, v_tail, fused_tail in QKV_DOUBLE_TRIPLETS:
+            if q_tail in tails and k_tail in tails and v_tail in tails:
+                q = tails[q_tail]; k = tails[k_tail]; v = tails[v_tail]
+                # Validate shapes match.
+                q_shape = q[3]["shape"]; k_shape = k[3]["shape"]; v_shape = v[3]["shape"]
+                if q_shape != k_shape or q_shape != v_shape:
+                    raise ValueError(f"qkv shape mismatch at block {block_idx} {q_tail}: q={q_shape} k={k_shape} v={v_shape}")
+                # Fusing stacks q, k and v along axis 0, tripling that dimension.
+                fused_shape = [q_shape[0] * 3] + list(q_shape[1:])
+                fused_info = {
+                    "dtype": q[3]["dtype"],
+                    "shape": fused_shape,
+                    "_qkv_sources": [q, k, v],  # pseudo field consumed by writer
+                }
+                entries.append((f"double_blocks.{block_idx}.{fused_tail}",
+                                None, None, fused_info))
+                # Remove the consumed sources so only unmatched tails remain.
+                del tails[q_tail]; del tails[k_tail]; del tails[v_tail]
+        # Anything left in tails was an unmatched single - pass through.
+        for tail, payload in tails.items():
+            entries.append((rewrite_fn(payload[0]),) + payload[1:])
+    return entries
+
+
+# Bytes per element for the dtypes this script can cross-check. A dtype that
+# is missing here is still converted; it just skips the consistency check.
+_DTYPE_BYTES = {
+    "BF16": 2, "F16": 2, "F32": 4, "F64": 8,
+    "U8": 1, "I8": 1, "I16": 2, "I32": 4, "I64": 8,
+    "BOOL": 1,
+}
+
+
+def _total_bytes(info: dict) -> int:
+    """Return how many payload bytes the writer emits for one entry.
+
+    For a fused entry (one carrying ``_qkv_sources``) this is the sum of the
+    source tensors' byte ranges, which is exactly what the writer copies, so
+    it works for any dtype. When the dtype is listed in ``_DTYPE_BYTES`` the
+    sum is also checked against ``shape`` times the element size.
+
+    For every other entry it is the length of its ``data_offsets`` range.
+
+    Raises:
+        ValueError: a fused entry's declared shape and dtype disagree with the
+            combined size of its sources.
+    """
+    if "_qkv_sources" in info:
+        total = 0
+        for _, _, _, src_info in info["_qkv_sources"]:
+            src_start, src_end = src_info["data_offsets"]
+            total += src_end - src_start
+
+        elem_size = _DTYPE_BYTES.get(info["dtype"])
+        if elem_size is not None:
+            elems = 1
+            for d in info["shape"]:
+                elems *= d
+            if elems * elem_size != total:
+                raise ValueError(
+                    f"fused tensor size mismatch: shape {info['shape']} dtype {info['dtype']} "
+                    f"needs {elems * elem_size} bytes but sources hold {total}")
+        return total
+    start, end = info["data_offsets"]
+    return end - start
+
+
+def write_consolidated(out_path: str, entries):
+    """Write a single safetensors file by streaming raw bytes from each shard.
+
+    Args:
+        out_path: destination file; it is created or truncated.
+        entries: iterable of ``(new_key, shard_path, data_off, info)`` tuples.
+
+    Entries are written in sorted ``new_key`` order and the JSON header is
+    space-padded to a multiple of 8 bytes. Three kinds of entry are handled:
+
+    * qkv-fused (``_qkv_sources`` in ``info``): q/k/v are concatenated along
+      axis 0 (row-major), so a simple byte-level concatenation produces the
+      correct fused layout for any standard dtype.
+    * ``_chunk_swap_halves``: the second half of the tensor's bytes is written
+      before the first half.
+    * everything else: the tensor's bytes are copied unchanged.
+
+    Raises:
+        ValueError: two entries share an output key (checked before the output
+            file is opened), or a half-swapped tensor has an odd byte size.
+        IOError: a source shard ends before a tensor's bytes are fully read.
+    """
+    entries = sorted(entries, key=lambda e: e[0])
+
+    new_header = {}
+    cur_offset = 0
+    for new_key, shard_path, data_off, info in entries:
+        # A repeated key would overwrite its header record while both payloads
+        # were still written, leaving bytes that no header entry accounts for.
+        if new_key in new_header:
+            raise ValueError(f"Duplicate output tensor name: {new_key}")
+        size = _total_bytes(info)
+        new_header[new_key] = {
+            "dtype": info["dtype"],
+            "shape": info["shape"],
+            "data_offsets": [cur_offset, cur_offset + size],
+        }
+        cur_offset += size
+
+    header_json = json.dumps(new_header, separators=(",", ":")).encode("utf-8")
+    pad = (-len(header_json)) % 8
+    header_json = header_json + (b" " * pad)
+
+    def copy_range(src_path, src_data_off, src_info, out, byte_range=None):
+        # Copy one tensor, or the sub-range ``byte_range`` (relative to the
+        # tensor's first byte), from its shard to ``out`` in bounded chunks.
+        start, end = src_info["data_offsets"]
+        if byte_range is not None:
+            sub_start, sub_end = byte_range
+            start, end = start + sub_start, start + sub_end
+        with open(src_path, "rb") as src:
+            src.seek(src_data_off + start)
+            remaining = end - start
+            while remaining > 0:
+                chunk = src.read(min(8 * 1024 * 1024, remaining))
+                if not chunk:
+                    raise IOError(f"Truncated tensor in {src_path}")
+                out.write(chunk)
+                remaining -= len(chunk)
+
+    with open(out_path, "wb") as out:
+        out.write(struct.pack("<Q", len(header_json)))
+        out.write(header_json)
+        for new_key, shard_path, data_off, info in entries:
+            if "_qkv_sources" in info:
+                for q_entry in info["_qkv_sources"]:
+                    _, src_path, src_data_off, src_info = q_entry
+                    copy_range(src_path, src_data_off, src_info, out)
+            elif info.get("_chunk_swap_halves"):
+                size = _total_bytes(info)
+                half = size // 2
+                if size != half * 2:
+                    raise ValueError(f"{new_key}: odd byte size {size} cannot be split into halves")
+                # Second half first, then the first half.
+                copy_range(shard_path, data_off, info, out, byte_range=(half, size))
+                copy_range(shard_path, data_off, info, out, byte_range=(0, half))
+            else:
+                copy_range(shard_path, data_off, info, out)
+
+
+def main():
+    """Command-line entry point: convert the ``transformer`` section of a SeFi checkpoint."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("input_dir", help="SeFi diffusers checkpoint directory")
+    cli.add_argument("output", help="Output transformer safetensors path (load via --diffusion-model)")
+    opts = cli.parse_args()
+
+    transformer_dir = os.path.join(opts.input_dir, "transformer")
+    transformer_entries = stage_tensors_for_section(transformer_dir, rewrite_transformer_key)
+    tensor_count = len(transformer_entries)
+
+    print(f"Transformer tensors: {tensor_count}")
+    print(f"Writing {tensor_count} tensors -> {opts.output}")
+    # A bare filename has an empty dirname; fall back to the current directory.
+    out_dir = os.path.dirname(opts.output) or "."
+    os.makedirs(out_dir, exist_ok=True)
+    write_consolidated(opts.output, transformer_entries)
+    size_gb = os.path.getsize(opts.output) / 1e9
+    print(f"Done. Output size: {size_gb:.2f} GB")
+
+
+if __name__ == "__main__":
+    main()
--- a/src/conditioning/conditioner.hpp
+++ b/src/conditioning/conditioner.hpp
@ -1518,7 +1518,7 @@ struct LLMEmbedder : public Conditioner {
            arch = LLM::LLMArch::GPT_OSS_20B;
        } else if (sd_version_is_pid(version)) {
            arch = LLM::LLMArch::GEMMA2_2B;
-        } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version) || sd_version_is_krea2(version)) {
+        } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version) || sd_version_is_sefi_image(version) || sd_version_is_krea2(version)) {
            arch = LLM::LLMArch::QWEN3_VL;
        } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
            arch = LLM::LLMArch::QWEN3;
@ -1997,6 +1997,18 @@ struct LLMEmbedder : public Conditioner {
            prompt_attn_range.second = static_cast<int>(prompt.size());

            prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
+        } else if (sd_version_is_sefi_image(version)) {
+            prompt_template_encode_start_idx = 0;
+            min_length                       = 1024;
+            out_layers                       = {9, 18, 27};
+
+            prompt = "<|im_start|>user\n";
+
+            prompt_attn_range.first = static_cast<int>(prompt.size());
+            prompt += conditioner_params.text;
+            prompt_attn_range.second = static_cast<int>(prompt.size());
+
+            prompt += "<|im_end|>\n<|im_start|>assistant\n";
        } else if (version == VERSION_OVIS_IMAGE) {
            prompt_template_encode_start_idx = 28;
            min_length                       = prompt_template_encode_start_idx + 256;
--- a/src/model.h
+++ b/src/model.h
@ -49,6 +49,7 @@ enum SDVersion {
    VERSION_LONGCAT,
    VERSION_PID,
    VERSION_IDEOGRAM4,
+    VERSION_SEFI_IMAGE,
    VERSION_KREA2,
    VERSION_ESRGAN,
    VERSION_COUNT,
@ -187,6 +188,13 @@ static inline bool sd_version_is_ideogram4(SDVersion version) {
    return false;
 }

+// Reports whether `version` is the SeFi-Image model version.
+static inline bool sd_version_is_sefi_image(SDVersion version) {
+    return version == VERSION_SEFI_IMAGE;
+}
+
 static inline bool sd_version_is_krea2(SDVersion version) {
    if (version == VERSION_KREA2) {
        return true;
@ -202,7 +210,7 @@ static inline bool sd_version_uses_flux_vae(SDVersion version) {
 }

 static inline bool sd_version_uses_flux2_vae(SDVersion version) {
-    if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version)) {
+    if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version) || sd_version_is_sefi_image(version)) {
        return true;
    }
    return false;
@ -242,6 +250,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
        sd_version_is_longcat(version) ||
        sd_version_is_pid(version) ||
        sd_version_is_ideogram4(version) ||
+        sd_version_is_sefi_image(version) ||
        sd_version_is_krea2(version)) {
        return true;
    }
--- a/src/model/diffusion/flux.hpp
+++ b/src/model/diffusion/flux.hpp
@ -8,6 +8,7 @@
 #include "model/common/rope.hpp"
 #include "model/diffusion/dit.hpp"
 #include "model/diffusion/model.hpp"
+#include "model/diffusion/sefi_image.hpp"
 #include "model_loader.h"

 #define FLUX_GRAPH_SIZE 10240
@ -26,6 +27,9 @@ namespace Flux {
    struct FluxConfig {
        SDVersion version         = VERSION_FLUX;
        bool is_chroma            = false;
+        bool is_sefi              = false;
+        int64_t semantic_channels = 0;
+        float sefi_delta_t        = 0.1f;
        int patch_size            = 2;
        int64_t in_channels       = 64;
        int64_t out_channels      = 64;
@ -88,6 +92,21 @@ namespace Flux {
                config.share_modulation = true;
                config.ref_index_scale  = 10.f;
                config.use_mlp_silu_act = true;
+            } else if (sd_version_is_sefi_image(version)) {
+                config.is_sefi           = true;
+                config.semantic_channels = 16;
+                config.in_channels       = 128 + config.semantic_channels;
+                config.patch_size        = 1;
+                config.out_channels      = 128 + config.semantic_channels;
+                config.mlp_ratio         = 3.f;
+                config.theta             = 2000;
+                config.axes_dim          = {32, 32, 32, 32};
+                config.vec_in_dim        = 0;
+                config.qkv_bias          = false;
+                config.disable_bias      = true;
+                config.share_modulation  = true;
+                config.ref_index_scale   = 10.f;
+                config.use_mlp_silu_act  = true;
            } else if (sd_version_is_longcat(version)) {
                config.context_in_dim = 3584;
                config.vec_in_dim     = 0;
@ -723,8 +742,8 @@ namespace Flux {

                auto m     = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c));  // [N, 2 * hidden_size]
                auto m_vec = ggml_ext_chunk(ctx->ggml_ctx, m, 2, 0);
-                shift      = m_vec[0];  // [N, hidden_size]
-                scale      = m_vec[1];  // [N, hidden_size]
+                shift      = m_vec[0];
+                scale      = m_vec[1];
            }

            x = Flux::modulate(ctx->ggml_ctx, norm_final->forward(ctx, x), shift, scale);
@ -902,6 +921,8 @@ namespace Flux {
            }
            if (config.is_chroma) {
                blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(config.in_dim, config.hidden_size);
+            } else if (config.is_sefi) {
+                blocks["dual_time_embed"] = std::make_shared<SefiImage::SefiDualTimestepEmbeddings>(256, config.hidden_size);
            } else {
                blocks["time_in"] = std::make_shared<MLPEmbedder>(256, config.hidden_size, !config.disable_bias);
                if (config.vec_in_dim > 0) {
@ -1027,6 +1048,11 @@ namespace Flux {
                if (y != nullptr) {
                    txt_img_mask = ggml_pad(ctx->ggml_ctx, y, static_cast<int>(img->ne[1]), 0, 0, 0);
                }
+            } else if (config.is_sefi) {
+                auto dual_time_embed = std::dynamic_pointer_cast<SefiImage::SefiDualTimestepEmbeddings>(blocks["dual_time_embed"]);
+                auto timestep_sem    = ggml_view_1d(ctx->ggml_ctx, timesteps, 1, 0);
+                auto timestep_tex    = ggml_view_1d(ctx->ggml_ctx, timesteps, 1, ggml_element_size(timesteps));
+                vec                  = dual_time_embed->forward(ctx, timestep_sem, timestep_tex);
            } else {
                auto time_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
                vec          = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f));
@ -1500,7 +1526,7 @@ namespace Flux {
                set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data());
            }
            std::set<int> txt_arange_dims;
-            if (sd_version_is_flux2(version)) {
+            if (sd_version_is_flux2(version) || sd_version_is_sefi_image(version)) {
                txt_arange_dims    = {3};
                increase_ref_index = true;
            } else if (version == VERSION_OVIS_IMAGE) {
--- a/src/model/diffusion/sefi_image.hpp
+++ b/src/model/diffusion/sefi_image.hpp
@ -0,0 +1,91 @@
+#ifndef __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__
+#define __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__
+
+#include <memory>
+
+#include "model/common/block.hpp"
+
+namespace SefiImage {
+    // Hyper-parameters of a SeFi-Image checkpoint, with built-in defaults.
+    struct SefiImageConfig {
+        int64_t semantic_channels        = 16;
+        int64_t texture_latent_channels  = 32;
+        int64_t timestep_guidance_in_dim = 256;
+        int64_t hidden_size              = 3072;
+        float timestep_shift_alpha       = 0.3f;
+        float delta_t                    = 0.1f;
+
+        // Texture channel count once a patch_size x patch_size patch is packed.
+        int64_t packed_texture_channels(int patch_size) const {
+            const int64_t patch_area = static_cast<int64_t>(patch_size) * patch_size;
+            return texture_latent_channels * patch_area;
+        }
+
+        // Semantic channels plus the packed texture channels.
+        int64_t packed_input_channels(int patch_size) const {
+            return packed_texture_channels(patch_size) + semantic_channels;
+        }
+
+        // Builds a config from the tensors stored under `prefix`. Only the
+        // 2-D semantic embedder `linear_1` weight is inspected: ne[0] gives
+        // the timestep input width and ne[1] is doubled to get hidden_size.
+        // All other fields keep their defaults.
+        static SefiImageConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
+                                                   const std::string& prefix) {
+            SefiImageConfig detected;
+            for (const auto& [tensor_name, storage] : tensor_storage_map) {
+                const bool is_semantic_linear_1 =
+                    starts_with(tensor_name, prefix) &&
+                    ends_with(tensor_name, "dual_time_embed.semantic_embedder.linear_1.weight") &&
+                    storage.n_dims == 2;
+                if (is_semantic_linear_1) {
+                    detected.timestep_guidance_in_dim = storage.ne[0];
+                    detected.hidden_size              = storage.ne[1] * 2;
+                }
+            }
+            LOG_DEBUG("sefi_image: semantic_channels = %" PRId64 ", texture_latent_channels = %" PRId64 ", hidden_size = %" PRId64,
+                      detected.semantic_channels,
+                      detected.texture_latent_channels,
+                      detected.hidden_size);
+            return detected;
+        }
+    };
+
+    // Timestep embedder for one stream: linear_1 -> SiLU -> linear_2.
+    struct SefiTimestepEmbedding : public GGMLBlock {
+    public:
+        SefiTimestepEmbedding(int64_t in_channels, int64_t time_embed_dim) {
+            blocks["linear_1"] = std::make_shared<Linear>(in_channels, time_embed_dim, false);
+            blocks["linear_2"] = std::make_shared<Linear>(time_embed_dim, time_embed_dim, false);
+        }
+
+        // Applies the two projections to `sample` with an in-place SiLU between them.
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* sample) {
+            auto proj_in  = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
+            auto proj_out = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
+
+            auto hidden    = proj_in->forward(ctx, sample);
+            auto activated = ggml_silu_inplace(ctx->ggml_ctx, hidden);
+            return proj_out->forward(ctx, activated);
+        }
+    };
+
+    // Embeds the semantic and texture timesteps with separate embedders, each
+    // producing half of `embedding_dim`, and concatenates the two results.
+    struct SefiDualTimestepEmbeddings : public GGMLBlock {
+    public:
+        SefiDualTimestepEmbeddings(int64_t in_channels, int64_t embedding_dim)
+            : timestep_guidance_in_dim(in_channels) {
+            GGML_ASSERT(embedding_dim % 2 == 0);
+            const int64_t per_stream_dim = embedding_dim / 2;
+            blocks["semantic_embedder"]  = std::make_shared<SefiTimestepEmbedding>(in_channels, per_stream_dim);
+            blocks["texture_embedder"]   = std::make_shared<SefiTimestepEmbedding>(in_channels, per_stream_dim);
+        }
+
+        // Returns the semantic embedding followed by the texture embedding,
+        // joined along dimension 0.
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* timestep_sem,
+                             ggml_tensor* timestep_tex) {
+            auto sem_block = std::dynamic_pointer_cast<SefiTimestepEmbedding>(blocks["semantic_embedder"]);
+            auto tex_block = std::dynamic_pointer_cast<SefiTimestepEmbedding>(blocks["texture_embedder"]);
+
+            // Sinusoidal projections are built first for both streams, then each
+            // is passed through its own embedder.
+            auto sem_sinusoid = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep_sem, timestep_guidance_in_dim, 10000, 1.f);
+            auto tex_sinusoid = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep_tex, timestep_guidance_in_dim, 10000, 1.f);
+            auto sem_vec      = sem_block->forward(ctx, sem_sinusoid);
+            auto tex_vec      = tex_block->forward(ctx, tex_sinusoid);
+            return ggml_concat(ctx->ggml_ctx, sem_vec, tex_vec, 0);
+        }
+
+    private:
+        int64_t timestep_guidance_in_dim = 256;
+    };
+}  // namespace SefiImage
+
+#endif  // __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__
--- a/src/model/te/llm.hpp
+++ b/src/model/te/llm.hpp
@ -250,7 +250,7 @@ namespace LLM {
                    config.intermediate_size = tensor_storage.ne[1];
                }
            }
-            if (arch == LLMArch::QWEN3 && config.num_layers == 28) {
+            if ((arch == LLMArch::QWEN3 || arch == LLMArch::QWEN3_VL) && config.num_layers == 28) {
                config.num_heads = 16;
            }
            if (detected_vision_layers > 0) {
--- a/src/model/vae/auto_encoder_kl.hpp
+++ b/src/model/vae/auto_encoder_kl.hpp
@ -816,12 +816,13 @@ struct AutoEncoderKL : public VAE {
    }

    sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents) override {
+        auto latents_ = sd_version_is_sefi_image(version) ? sd::ops::slice(latents, 2, 16, 144) : latents;  // SeFi-Image only: keep the 16..144 slice of dim 2 (same bounds as SefiFlowDenoiser::kSemChannels / kTotalChannels); other versions use latents as-is. NOTE(review): the ternary presumably copies latents for non-SeFi versions - confirm.
        if (sd_version_uses_flux2_vae(version)) {
            int channel_dim                = 2;
-            auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim);
-            return (latents * std_tensor) / scale_factor + mean_tensor;
+            auto [mean_tensor, std_tensor] = get_latents_mean_std(latents_, channel_dim);  // statistics are requested for the (possibly sliced) tensor, along dim 2
+            return (latents_ * std_tensor) / scale_factor + mean_tensor;  // Flux2-VAE path: multiply by std, divide by scale_factor, add mean
        }
-        return (latents / scale_factor) + shift_factor;
+        return (latents_ / scale_factor) + shift_factor;  // all other VAEs: divide by scale_factor, add shift_factor
    }

    sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents) override {
--- a/src/model_loader.cpp
+++ b/src/model_loader.cpp
@ -66,7 +66,6 @@ const char* unused_tensors[] = {
    // "v_pred", // Used to detect SDXL vpred models
    "text_encoders.llm.output.weight",
    "text_encoders.llm.lm_head.",
-    "first_stage_model.bn.",
 };

 bool is_unused_tensor(const std::string& name) {
@ -480,6 +479,9 @@ SDVersion ModelLoader::get_sd_version() {
        if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) {
            is_flux2 = true;
        }
+        if (tensor_storage.name.find("dual_time_embed.semantic_embedder.linear_1.weight") != std::string::npos) {
+            return VERSION_SEFI_IMAGE;
+        }
        if (tensor_storage.name.find("single_blocks.47.linear1.weight") != std::string::npos) {
            has_single_block_47 = true;
        }
--- a/src/name_conversion.cpp
+++ b/src/name_conversion.cpp
@ -743,7 +743,7 @@ std::string convert_diffusion_model_name(std::string name, std::string prefix, S
        name = convert_diffusers_unet_to_original_sdxl(name);
    } else if (sd_version_is_sd3(version)) {
        name = convert_diffusers_dit_to_original_sd3(name);
-    } else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version)) {
+    } else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version) || sd_version_is_sefi_image(version)) {
        name = convert_diffusers_dit_to_original_flux(name);
    } else if (sd_version_is_z_image(version)) {
        name = convert_diffusers_dit_to_original_lumina2(name);
--- a/src/runtime/denoiser.hpp
+++ b/src/runtime/denoiser.hpp
@ -1005,6 +1005,8 @@ struct FluxFlowDenoiser : public DiscreteFlowDenoiser {
    }
 };

+struct SefiFlowDenoiser;
+
 struct Flux2FlowDenoiser : public FluxFlowDenoiser {
    Flux2FlowDenoiser() = default;

@ -1037,6 +1039,80 @@ struct Flux2FlowDenoiser : public FluxFlowDenoiser {
    }
 };

+// Denoiser for SeFi-Image. It builds two parallel schedules over the same
+// step index: one for the "semantic" stream and one for the "texture" stream.
+//
+// For step i of n (i = 0..n):
+//   u_base    = i / n
+//   u_shifted = alpha * u_base / (1 + (alpha - 1) * u_base)
+//   u_sem     = min(u_shifted * (1 + delta_t), 1)
+//   u_tex     = clamp(u_shifted * (1 + delta_t) - delta_t, 0, 1)
+// Each u is truncated to an index in [0, kNumTrainTimesteps - 1]; the timestep
+// is kNumTrainTimesteps - index and sigma is timestep / kNumTrainTimesteps.
+// With the default parameters both schedules therefore start at sigma 1.0 and
+// end at 1 / kNumTrainTimesteps.
+struct SefiFlowDenoiser : public Flux2FlowDenoiser {
+    static constexpr int kNumTrainTimesteps = 1000;  // size of the timestep grid
+    static constexpr int kSemChannels       = 16;    // sample_sefi_euler slices dim 2 at [0, 16) for the semantic stream
+    static constexpr int kTotalChannels     = 144;   // ... and at [16, 144) for the texture stream
+
+    // Schedule parameters. get_sigmas() overwrites them from the
+    // "sefi_delta_t" / "sefi_alpha" keys of extra_sample_args; a value set that
+    // way stays in effect for later calls that do not pass the key again.
+    float delta_t              = 0.1f;
+    float timestep_shift_alpha = 1.0f;
+
+    // Outputs of the most recent get_sigmas() call, n + 1 entries each.
+    std::vector<float> sem_sigmas;
+    std::vector<float> tex_sigmas;
+    std::vector<float> sem_timesteps;
+    std::vector<float> tex_timesteps;
+
+    SefiFlowDenoiser() = default;
+
+    // Warps a unit-interval position: alpha * u / (1 + (alpha - 1) * u).
+    // alpha == 1 is the identity and is returned without arithmetic.
+    static float apply_alpha_shift(float u_unit, float alpha) {
+        if (alpha == 1.0f) {
+            return u_unit;
+        }
+        float denom = 1.0f + (alpha - 1.0f) * u_unit;
+        return (alpha * u_unit) / denom;
+    }
+
+    // Converts a unit-interval position to a timestep-grid index in
+    // [0, kNumTrainTimesteps - 1], truncating toward zero.
+    //
+    // The clamp is done on the float value *before* the cast: converting NaN
+    // or an out-of-range float to int is undefined behaviour, and degenerate
+    // user-supplied alpha / delta_t values can produce exactly that. NaN maps
+    // to index 0.
+    static int unit_to_train_index(float u_unit) {
+        const float max_index = static_cast<float>(kNumTrainTimesteps - 1);
+        const float scaled    = u_unit * max_index;
+        if (!(scaled >= 0.0f)) {  // negative or NaN
+            return 0;
+        }
+        if (scaled > max_index) {
+            return kNumTrainTimesteps - 1;
+        }
+        return static_cast<int>(scaled);
+    }
+
+    // Rebuilds the dual schedule for `n` sampling steps and returns the
+    // texture sigmas (n + 1 values). The semantic sigmas and both timestep
+    // lists are left in the public members for the sampler / timestep lookup.
+    //
+    // image_seq_len, scheduler_type and version are not used by this
+    // implementation. extra_sample_args may carry "sefi_alpha=<float>" and
+    // "sefi_delta_t=<float>"; values that fail to parse are reported with a
+    // warning.
+    std::vector<float> get_sigmas(uint32_t n,
+                                  int image_seq_len,
+                                  scheduler_t scheduler_type,
+                                  SDVersion version,
+                                  const char* extra_sample_args = nullptr) override {
+        sem_sigmas.clear();
+        tex_sigmas.clear();
+        sem_timesteps.clear();
+        tex_timesteps.clear();
+
+        for (const auto& [key, value] : parse_key_value_args(extra_sample_args, "sefi scheduler arg")) {
+            if (key == "sefi_alpha") {
+                if (!parse_strict_float(value, timestep_shift_alpha)) {
+                    LOG_WARN("ignoring invalid sefi scheduler arg '%s=%s'", key.c_str(), value.c_str());
+                }
+            } else if (key == "sefi_delta_t") {
+                if (!parse_strict_float(value, delta_t)) {
+                    LOG_WARN("ignoring invalid sefi scheduler arg '%s=%s'", key.c_str(), value.c_str());
+                }
+            }
+        }
+
+        const auto entry_count = static_cast<std::vector<float>::size_type>(n) + 1;
+        sem_sigmas.reserve(entry_count);
+        tex_sigmas.reserve(entry_count);
+        sem_timesteps.reserve(entry_count);
+        tex_timesteps.reserve(entry_count);
+
+        for (uint32_t i = 0; i <= n; ++i) {
+            // n == 0 would turn i / n into 0 / 0 (NaN); emit only the starting
+            // point of the schedule in that case.
+            float u_base    = n > 0 ? static_cast<float>(i) / static_cast<float>(n) : 0.0f;
+            float u_shifted = apply_alpha_shift(u_base, timestep_shift_alpha);
+            float u_sem_raw = u_shifted * (1.0f + delta_t);
+
+            // The semantic stream runs ahead by delta_t; the texture stream
+            // trails it and is held at 0 until the semantic one has advanced
+            // past delta_t.
+            float u_sem = std::min(u_sem_raw, 1.0f);
+            float u_tex = std::max(0.0f, std::min(u_sem_raw - delta_t, 1.0f));
+
+            int idx_sem = unit_to_train_index(u_sem);
+            int idx_tex = unit_to_train_index(u_tex);
+
+            float t_sem     = static_cast<float>(kNumTrainTimesteps - idx_sem);
+            float t_tex     = static_cast<float>(kNumTrainTimesteps - idx_tex);
+            float sigma_sem = t_sem / static_cast<float>(kNumTrainTimesteps);
+            float sigma_tex = t_tex / static_cast<float>(kNumTrainTimesteps);
+
+            sem_timesteps.push_back(t_sem);
+            tex_timesteps.push_back(t_tex);
+            sem_sigmas.push_back(sigma_sem);
+            tex_sigmas.push_back(sigma_tex);
+        }
+        LOG_DEBUG("SefiFlowDenoiser: built %u-step dual schedule (alpha=%.2f delta_t=%.2f)",
+                  n, timestep_shift_alpha, delta_t);
+        return tex_sigmas;
+    }
+};
+
 typedef std::function<sd::guidance::GuiderOutput(const sd::Tensor<float>&, float, int)> denoise_cb_t;

 static std::pair<float, float> get_ancestral_step(float sigma_from,
@ -1140,6 +1216,40 @@ static sd::Tensor<float> sample_euler_ancestral(denoise_cb_t model,
    return x;
 }

+// Euler sampler for the SeFi-Image dual schedule.
+//
+// The schedule is read from `sefi` (tex_sigmas / sem_sigmas as filled by
+// SefiFlowDenoiser::get_sigmas), not passed in. At every step the model is
+// called with the current texture sigma and a 1-based step number; the
+// velocity derived from its prediction is then integrated separately for the
+// semantic slice [0, kSemChannels) and the texture slice
+// [kSemChannels, kTotalChannels) of dimension 2, each with its own sigma delta.
+// Returns an empty tensor if the model yields an empty prediction.
+static sd::Tensor<float> sample_sefi_euler(SefiFlowDenoiser* sefi,
+                                           denoise_cb_t model,
+                                           sd::Tensor<float> x) {
+    constexpr int sem_end = SefiFlowDenoiser::kSemChannels;
+    constexpr int tex_end = SefiFlowDenoiser::kTotalChannels;
+
+    const std::vector<float>& tex_schedule = sefi->tex_sigmas;
+    const std::vector<float>& sem_schedule = sefi->sem_sigmas;
+    const int step_count                   = static_cast<int>(tex_schedule.size()) - 1;
+
+    for (int step = 0; step < step_count; ++step) {
+        const float tex_now = tex_schedule[step];
+        // A (near-)zero sigma cannot be divided by below; leave x untouched.
+        if (tex_now <= 1e-9f) {
+            continue;
+        }
+        const float tex_delta = tex_schedule[step + 1] - tex_now;
+        const float sem_delta = sem_schedule[step + 1] - sem_schedule[step];
+
+        auto guided = model(x, tex_now, step + 1);
+        if (guided.pred.empty()) {
+            return {};
+        }
+        sd::Tensor<float> prediction = std::move(guided.pred);
+        sd::Tensor<float> flow       = (x - prediction) / tex_now;
+
+        // Take both slices before writing anything back into x.
+        auto sem_state   = sd::ops::slice(x, 2, 0, sem_end);
+        auto tex_state   = sd::ops::slice(x, 2, sem_end, tex_end);
+        auto sem_flow    = sd::ops::slice(flow, 2, 0, sem_end);
+        auto tex_flow    = sd::ops::slice(flow, 2, sem_end, tex_end);
+        auto sem_updated = sem_state + sem_flow * sem_delta;
+        auto tex_updated = tex_state + tex_flow * tex_delta;
+
+        sd::ops::slice_assign(&x, 2, 0, sem_end, sem_updated);
+        sd::ops::slice_assign(&x, 2, sem_end, tex_end, tex_updated);
+    }
+    return x;
+}
+
 static sd::Tensor<float> sample_euler(denoise_cb_t model,
                                      sd::Tensor<float> x,
                                      const std::vector<float>& sigmas) {
@ -2055,7 +2165,13 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
                                            std::shared_ptr<RNG> rng,
                                            float eta,
                                            bool is_flow_denoiser,
-                                            const char* extra_sample_args) {
+                                            const char* extra_sample_args,
+                                            std::shared_ptr<Denoiser> denoiser_for_dispatch = nullptr) {
+    if (denoiser_for_dispatch) {
+        if (auto sefi = std::dynamic_pointer_cast<SefiFlowDenoiser>(denoiser_for_dispatch)) {
+            return sample_sefi_euler(sefi.get(), model, std::move(x));
+        }
+    }
    SamplerExtraArgs extra_args = parse_key_value_args(extra_sample_args, "extra sample arg");
    switch (method) {
        case EULER_A_SAMPLE_METHOD:
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -96,6 +96,7 @@ const char* model_version_to_str[] = {
    "Longcat-Image",
    "PiD",
    "Ideogram 4",
+    "SeFi-Image",
    "Krea2",
    "ESRGAN",
 };
@ -691,7 +692,7 @@ public:
                                                                     version,
                                                                     sd_ctx_params->chroma_use_dit_mask,
                                                                     model_manager);
-            } else if (sd_version_is_flux2(version)) {
+            } else if (sd_version_is_flux2(version) || sd_version_is_sefi_image(version)) {
                bool is_chroma   = false;
                cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
                                                                 tensor_storage_map,
@ -1295,6 +1296,8 @@ public:
                    } else if (sd_version_is_krea2(version)) {
                        default_flow_shift = 1.15f;
                    }
+                } else if (sd_version_is_sefi_image(version)) {
+                    pred_type = SEFI_FLOW_PRED;
                } else if (sd_version_is_flux2(version)) {
                    pred_type = FLUX2_FLOW_PRED;
                } else {
@ -1334,6 +1337,11 @@ public:
                    denoiser = std::make_shared<Flux2FlowDenoiser>();
                    break;
                }
+                case SEFI_FLOW_PRED: {
+                    LOG_INFO("running in SeFi-Image dual-time FLOW mode");
+                    denoiser = std::make_shared<SefiFlowDenoiser>();
+                    break;
+                }
                default: {
                    LOG_ERROR("Unknown predition type %i", pred_type);
                    return false;
@ -1639,7 +1647,16 @@ public:

    std::vector<float> process_timesteps(const std::vector<float>& timesteps,
                                         const sd::Tensor<float>& init_latent,
-                                         const sd::Tensor<float>& denoise_mask) {
+                                         const sd::Tensor<float>& denoise_mask,
+                                         int step) {
+        if (auto sefi_denoiser = std::dynamic_pointer_cast<SefiFlowDenoiser>(denoiser)) {
+            int sched_idx = step > 0 ? step - 1 : 0;
+            if (sched_idx >= static_cast<int>(sefi_denoiser->tex_timesteps.size())) {
+                sched_idx = static_cast<int>(sefi_denoiser->tex_timesteps.size()) - 1;
+            }
+            return {sefi_denoiser->sem_timesteps[sched_idx],
+                    sefi_denoiser->tex_timesteps[sched_idx]};
+        }
        if (diffusion_model->get_desc() == "Wan2.2-TI2V-5B") {
            int64_t frame_count = init_latent.shape()[2];
            auto new_timesteps  = std::vector<float>(static_cast<size_t>(frame_count), timesteps[0]);
@ -2051,7 +2068,7 @@ public:
                timesteps_vec          = process_ltxav_video_timesteps(base_timesteps_vec, init_latent, denoise_mask);
                audio_timesteps_tensor = sd::Tensor<float>({static_cast<int64_t>(base_timesteps_vec.size())}, base_timesteps_vec);
            } else {
-                timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask);
+                timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask, step);
            }
            const std::vector<float>& scaling_timesteps_vec = (sd_version_is_ltxav(version) && !denoise_mask.empty())
                                                                  ? base_timesteps_vec
@ -2121,7 +2138,7 @@ public:
                    diffusion_params.extra = UNetDiffusionExtra{-1, &controls, control_strength};
                } else if (sd_version_is_sd3(version)) {
                    diffusion_params.extra = SkipLayerDiffusionExtra{local_skip_layers};
-                } else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version)) {
+                } else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version) || sd_version_is_sefi_image(version)) {
                    diffusion_params.extra = FluxDiffusionExtra{&guidance_tensor,
                                                                local_skip_layers};
                } else if (sd_version_is_anima(version)) {
@ -2265,7 +2282,7 @@ public:
            return output;
        };

-        auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta, is_flow_denoiser, extra_sample_args);
+        auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta, is_flow_denoiser, extra_sample_args, denoiser);
        if (x0_opt.empty()) {
            LOG_ERROR("Diffusion model sampling failed");
            if (control_net) {
@ -2326,6 +2343,8 @@ public:
                latent_channel = 3;
            } else if (sd_version_is_pid(version)) {
                latent_channel = 3;
+            } else if (sd_version_is_sefi_image(version)) {
+                latent_channel = 144;
            } else if (sd_version_uses_flux2_vae(version)) {
                latent_channel = 128;
            } else {