mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-29 09:36:40 +00:00
feat: add SeFi-Image support (#1707)
This commit is contained in:
parent
f54e45e81c
commit
03e9a22f4d
@ -53,6 +53,7 @@ API and command-line option may change frequently.***
|
||||
- [ERNIE-Image](./docs/ernie_image.md)
|
||||
- [Boogu Image](./docs/boogu_image.md)
|
||||
- [Krea2](./docs/krea2.md)
|
||||
- [SeFi-Image](./docs/sefi_image.md)
|
||||
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
|
||||
- [Ideogram4](./docs/ideogram4.md)
|
||||
- Image Edit Models
|
||||
|
||||
BIN
assets/sefi_image/example.png
Normal file
BIN
assets/sefi_image/example.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.7 MiB |
50
docs/sefi_image.md
Normal file
50
docs/sefi_image.md
Normal file
@ -0,0 +1,50 @@
|
||||
# How to Use
|
||||
|
||||
SeFi-Image uses a Flux2-style dual-time transformer (semantic + texture streams), the standard Flux2 VAE, and Qwen3-VL as the LLM text encoder. Tech report: [arXiv:2606.22568](https://arxiv.org/abs/2606.22568).
|
||||
|
||||
## Download weights
|
||||
|
||||
The SeFi-Image family ships in three scales (1B / 2B / 5B) and three families (Base / RL / turbo), all gated on Hugging Face under https://huggingface.co/SeFi-Image.
|
||||
|
||||
- 1B and 2B variants pair with Qwen3-VL-2B-Instruct.
|
||||
- 5B variants pair with Qwen3-VL-4B-Instruct.
|
||||
- All variants use the standard Flux2 VAE (`flux2_ae.safetensors` from https://huggingface.co/black-forest-labs/FLUX.2-dev).
|
||||
|
||||
Convert the transformer and text encoder to sd.cpp safetensors:
|
||||
|
||||
```bash
|
||||
python3 script/convert_sefi.py <hf_repo_dir> <out_dir>/sefi_<scale>_<family>.safetensors
|
||||
python3 script/convert_qwen3_vl.py <hf_repo_dir>/Qwen3-VL-<X>B-Instruct <out_dir>/qwen3_vl_<X>b.safetensors
|
||||
```
|
||||
|
||||
## Variant defaults
|
||||
|
||||
| Family | timestep_shift_alpha | steps | cfg-scale |
|
||||
|---|---|---|---|
|
||||
| Base | 0.3 | 50 | 4.0 |
|
||||
| RL | 0.3 | 50 | 4.0 |
|
||||
| turbo | 1.0 | 4 | 1.0 |
|
||||
|
||||
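The dispatcher picks `alpha` (the `timestep_shift_alpha` column above) from the filename: a name containing the substring `turbo` gets 1.0, anything else gets 0.3. Override it with `--extra-sample-args sefi_alpha=<value>`; `delta_t` (default 0.1) can be set the same way with `sefi_delta_t=<value>`.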
|
||||
|
||||
## Examples
|
||||
|
||||
### 1B / 2B turbo
|
||||
|
||||
```
|
||||
./build/bin/sd-cli --diffusion-model /path/to/sefi_1b_turbo.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_2b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 1.0 --steps 4 -W 1024 -H 1024 -s 42 --diffusion-fa --offload-to-cpu -o out.png
|
||||
```
|
||||
|
||||
### 1B / 2B base
|
||||
|
||||
```
|
||||
./build/bin/sd-cli --diffusion-model /path/to/sefi_1b_base.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_2b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 4.0 --steps 50 -W 1024 -H 1024 -s 42 --diffusion-fa --offload-to-cpu -o out.png
|
||||
```
|
||||
|
||||
### 5B (needs streaming on 12 GiB VRAM)
|
||||
|
||||
```
|
||||
./build/bin/sd-cli --diffusion-model /path/to/sefi_5b_turbo.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_4b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 1.0 --steps 4 -W 1024 -H 1024 -s 42 --diffusion-fa --max-vram 8 --stream-layers --offload-to-cpu -o out.png
|
||||
```
|
||||
|
||||
<img alt="SeFi-Image 5B turbo example" src="../assets/sefi_image/example.png" />
|
||||
@ -81,6 +81,7 @@ enum prediction_t {
|
||||
FLOW_PRED,
|
||||
FLUX_FLOW_PRED,
|
||||
FLUX2_FLOW_PRED,
|
||||
SEFI_FLOW_PRED,
|
||||
PREDICTION_COUNT
|
||||
};
|
||||
|
||||
|
||||
112
script/convert_qwen3_vl.py
Normal file
112
script/convert_qwen3_vl.py
Normal file
@ -0,0 +1,112 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Convert a Qwen3-VL HF safetensors checkpoint into a sd.cpp-loadable form.
|
||||
|
||||
The HF dump prefixes text-tower keys with ``model.language_model.`` and
|
||||
vision-tower keys with ``model.visual.``. sd.cpp expects ``model.<rest>`` for
|
||||
the text side; the vision side is converted by sd.cpp's own
|
||||
``convert_qwen3_vl_vision_name`` and is left as-is here.
|
||||
|
||||
Operates on raw safetensors bytes so any dtype (BF16/F16/F32) is preserved.
|
||||
|
||||
Usage:
|
||||
python3 script/convert_qwen3_vl.py <hf_qwen3_vl_dir_or_safetensors> <output.safetensors>
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
|
||||
|
||||
def rewrite_key(key: str) -> str:
    """Translate one HF Qwen3-VL tensor name into the name written to the output.

    Names rooted at ``model.language_model.`` are re-rooted directly under
    ``model.``; any other name is handed back untouched.
    """
    hf_text_root = "model.language_model."
    if not key.startswith(hf_text_root):
        return key
    remainder = key[len(hf_text_root):]
    return "model." + remainder
|
||||
|
||||
|
||||
def read_safetensors_header(path: str):
    """Parse the JSON header at the front of a safetensors file.

    The file is read as an 8-byte little-endian unsigned length followed by
    that many bytes of JSON.

    Returns:
        ``(header, data_start)`` where ``header`` is the decoded JSON object and
        ``data_start`` is ``8 + header_length``, i.e. the file offset that the
        per-tensor ``data_offsets`` are relative to.

    Raises:
        ValueError: the file is too short to hold the length prefix, or the
            declared header length runs past the end of the file.
    """
    with open(path, "rb") as f:
        prefix = f.read(8)
        if len(prefix) != 8:
            raise ValueError(f"{path}: too short to be a safetensors file (no 8-byte header length)")
        hdr_len = struct.unpack("<Q", prefix)[0]
        # Bound the read by the real file size so a corrupt length prefix is
        # reported cleanly instead of triggering a huge allocation.
        file_size = os.fstat(f.fileno()).st_size
        if hdr_len > file_size - 8:
            raise ValueError(
                f"{path}: header length {hdr_len} exceeds file size {file_size}")
        hdr_bytes = f.read(hdr_len)
    return json.loads(hdr_bytes), 8 + hdr_len
|
||||
|
||||
|
||||
def collect_shard_paths(path: str):
    """Resolve ``path`` to the list of safetensors files that must be read.

    * A regular file is returned as a one-element list.
    * A directory containing ``model.safetensors.index.json`` yields the
      sorted, de-duplicated shard paths named in its ``weight_map``.
    * A directory containing only ``model.safetensors`` yields that file.

    Raises:
        FileNotFoundError: ``path`` does not exist, or is a directory with
            neither an index nor a single ``model.safetensors``.
    """
    if os.path.isfile(path):
        return [path]
    if not os.path.isdir(path):
        raise FileNotFoundError(path)

    index_file = os.path.join(path, "model.safetensors.index.json")
    if os.path.isfile(index_file):
        with open(index_file) as f:
            index = json.load(f)
        # Many tensors share a shard, so collapse to unique file names first.
        shard_names = set(index["weight_map"].values())
        return sorted(os.path.join(path, shard) for shard in shard_names)

    lone_file = os.path.join(path, "model.safetensors")
    if not os.path.isfile(lone_file):
        raise FileNotFoundError(f"No Qwen3-VL safetensors in {path}")
    return [lone_file]
|
||||
|
||||
|
||||
def stage_tensors(input_path: str):
    """List every tensor found under ``input_path`` together with where its bytes live.

    Returns a list of ``(new_key, shard_path, data_start, info)`` tuples, where
    ``new_key`` is the renamed tensor key, ``data_start`` is the shard's data
    section offset and ``info`` is the tensor's original header entry. The
    ``__metadata__`` header entry is not a tensor and is skipped.
    """
    staged = []
    for shard in collect_shard_paths(input_path):
        header, payload_start = read_safetensors_header(shard)
        staged.extend(
            (rewrite_key(name), shard, payload_start, meta)
            for name, meta in header.items()
            if name != "__metadata__"
        )
    return staged
|
||||
|
||||
|
||||
def write_consolidated(out_path: str, entries):
    """Write all staged tensors into one safetensors file, streaming raw bytes.

    Args:
        out_path: destination file; it is created or truncated.
        entries: iterable of ``(new_key, shard_path, data_start, info)`` tuples
            as produced by ``stage_tensors``. ``info["data_offsets"]`` is
            relative to ``data_start`` in ``shard_path``.

    Tensors are laid out back to back in ascending key order. The JSON header is
    space-padded to a multiple of 8 bytes.

    Raises:
        ValueError: two entries share the same output key. Without this check
            the later entry would replace the earlier one in the header while
            both payloads were still written, leaving bytes no header entry
            describes. Raised before ``out_path`` is opened.
        IOError: a source shard ends before a tensor's declared byte range.
    """
    entries = sorted(entries, key=lambda e: e[0])

    new_header = {}
    cur_offset = 0
    for new_key, shard_path, _data_off, info in entries:
        if new_key in new_header:
            raise ValueError(
                f"Duplicate tensor name after key rewrite: {new_key} (from {shard_path})")
        start, end = info["data_offsets"]
        size = end - start
        new_header[new_key] = {
            "dtype": info["dtype"],
            "shape": info["shape"],
            "data_offsets": [cur_offset, cur_offset + size],
        }
        cur_offset += size

    header_json = json.dumps(new_header, separators=(",", ":")).encode("utf-8")
    # Trailing spaces are valid JSON whitespace; pad so the data section starts 8-byte aligned.
    pad = (-len(header_json)) % 8
    header_json = header_json + (b" " * pad)

    with open(out_path, "wb") as out:
        out.write(struct.pack("<Q", len(header_json)))
        out.write(header_json)
        for new_key, shard_path, data_off, info in entries:
            start, end = info["data_offsets"]
            with open(shard_path, "rb") as src:
                src.seek(data_off + start)
                remaining = end - start
                # Copy in bounded chunks so large tensors never sit fully in memory.
                while remaining > 0:
                    chunk = src.read(min(8 * 1024 * 1024, remaining))
                    if not chunk:
                        raise IOError(f"Truncated tensor in {shard_path}")
                    out.write(chunk)
                    remaining -= len(chunk)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: consolidate a Qwen3-VL checkpoint into one file."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("input", help="HF Qwen3-VL directory or single safetensors file")
    cli.add_argument("output", help="Output single safetensors path")
    opts = cli.parse_args()

    staged = stage_tensors(opts.input)
    print(f"Tensors: {len(staged)}")
    print(f"Writing -> {opts.output}")

    # A bare filename has an empty dirname; fall back to the current directory.
    target_dir = os.path.dirname(opts.output) or "."
    os.makedirs(target_dir, exist_ok=True)
    write_consolidated(opts.output, staged)

    written_gb = os.path.getsize(opts.output) / 1e9
    print(f"Done. Output size: {written_gb:.2f} GB")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
279
script/convert_sefi.py
Normal file
279
script/convert_sefi.py
Normal file
@ -0,0 +1,279 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Convert a SeFi-Image diffusers checkpoint into a single sd.cpp-compatible safetensors.
|
||||
|
||||
Operates on raw safetensors bytes so any dtype (BF16, F32, ...) is preserved exactly.
|
||||
No numpy or torch dependency required.
|
||||
|
||||
Usage:
|
||||
python3 script/convert_sefi.py <sefi_diffusers_dir> <output.safetensors>
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import struct
|
||||
import sys
|
||||
|
||||
|
||||
_LINEAR_TO_LIN = re.compile(r"\.linear\.")
|
||||
_SHARED_MOD_PREFIXES = (
|
||||
"double_stream_modulation_img",
|
||||
"double_stream_modulation_txt",
|
||||
"single_stream_modulation",
|
||||
)
|
||||
|
||||
|
||||
def rewrite_transformer_key(key: str) -> str:
    """Rename one SeFi transformer tensor key to its output name.

    Steps, in order:
      1. ``dual_time_embed.*`` keys are returned unchanged.
      2. A leading ``backbone.`` is stripped.
      3. For the shared modulation modules, the first ``.linear.`` becomes ``.lin.``.
      4. Fixed top-level names (embedders, output projection, output norm) are
         looked up in a rename table.
      5. ``transformer_blocks.N.*`` / ``single_transformer_blocks.N.*`` are
         delegated to the per-block rewriters.
    Anything that matches none of the above is returned as it stands after
    steps 2-3.
    """
    if key.startswith("dual_time_embed."):
        return key
    if key.startswith("backbone."):
        key = key[len("backbone."):]

    shared_roots = tuple(prefix + "." for prefix in _SHARED_MOD_PREFIXES)
    if key.startswith(shared_roots):
        key = _LINEAR_TO_LIN.sub(".lin.", key, count=1)

    top_level_renames = {
        "context_embedder.weight": "txt_in.weight",
        "context_embedder.bias": "txt_in.bias",
        "x_embedder.weight": "img_in.weight",
        "x_embedder.bias": "img_in.bias",
        "proj_out.weight": "final_layer.linear.weight",
        "proj_out.bias": "final_layer.linear.bias",
        "norm_out.linear.weight": "final_layer.adaLN_modulation.1.weight",
        "norm_out.linear.bias": "final_layer.adaLN_modulation.1.bias",
    }
    if key in top_level_renames:
        return top_level_renames[key]

    # re.match anchors at the start, so "single_transformer_blocks" can never
    # be captured by the first pattern.
    block_rewriters = (
        (r"transformer_blocks\.(\d+)\.(.*)$", _rewrite_double_stream),
        (r"single_transformer_blocks\.(\d+)\.(.*)$", _rewrite_single_stream),
    )
    for pattern, rewriter in block_rewriters:
        found = re.match(pattern, key)
        if found:
            return rewriter(found.group(1), found.group(2))

    return key
|
||||
|
||||
|
||||
def _rewrite_double_stream(idx: str, tail: str) -> str:
|
||||
dst = f"double_blocks.{idx}."
|
||||
mapping = {
|
||||
"norm1.linear.weight": "img_mod.lin.weight",
|
||||
"norm1_context.linear.weight": "txt_mod.lin.weight",
|
||||
"attn.norm_q.weight": "img_attn.norm.query_norm.scale",
|
||||
"attn.norm_k.weight": "img_attn.norm.key_norm.scale",
|
||||
"attn.norm_added_q.weight": "txt_attn.norm.query_norm.scale",
|
||||
"attn.norm_added_k.weight": "txt_attn.norm.key_norm.scale",
|
||||
"attn.to_out.0.weight": "img_attn.proj.weight",
|
||||
"attn.to_add_out.weight": "txt_attn.proj.weight",
|
||||
"ff.net.0.proj.weight": "img_mlp.0.weight",
|
||||
"ff.net.2.weight": "img_mlp.2.weight",
|
||||
"ff_context.net.0.proj.weight": "txt_mlp.0.weight",
|
||||
"ff_context.net.2.weight": "txt_mlp.2.weight",
|
||||
"ff.linear_in.weight": "img_mlp.0.weight",
|
||||
"ff.linear_out.weight": "img_mlp.2.weight",
|
||||
"ff_context.linear_in.weight": "txt_mlp.0.weight",
|
||||
"ff_context.linear_out.weight": "txt_mlp.2.weight",
|
||||
}
|
||||
return dst + mapping.get(tail, tail)
|
||||
|
||||
|
||||
# Separate q/k/v projection weights of a double-stream block that get fused
# into one tensor in the output file.
# Row layout: (q_tail, k_tail, v_tail, fused_target_tail).
QKV_DOUBLE_TRIPLETS = [
    (
        "attn.to_q.weight",
        "attn.to_k.weight",
        "attn.to_v.weight",
        "img_attn.qkv.weight",
    ),
    (
        "attn.add_q_proj.weight",
        "attn.add_k_proj.weight",
        "attn.add_v_proj.weight",
        "txt_attn.qkv.weight",
    ),
]
|
||||
|
||||
|
||||
def _rewrite_single_stream(idx: str, tail: str) -> str:
|
||||
dst = f"single_blocks.{idx}."
|
||||
mapping = {
|
||||
"norm.linear.weight": "modulation.lin.weight",
|
||||
"attn.norm_q.weight": "norm.query_norm.scale",
|
||||
"attn.norm_k.weight": "norm.key_norm.scale",
|
||||
"attn.to_qkv_mlp_proj.weight": "linear1.weight",
|
||||
"attn.to_out.weight": "linear2.weight",
|
||||
}
|
||||
return dst + mapping.get(tail, tail)
|
||||
|
||||
|
||||
|
||||
|
||||
def read_safetensors_header(path: str):
    """Return (header dict, data start byte offset) for a safetensors file.

    The file is read as an 8-byte little-endian unsigned length followed by
    that many bytes of JSON. The returned offset is ``8 + header_length``, the
    position that per-tensor ``data_offsets`` are relative to.

    Raises:
        ValueError: the file is too short to hold the length prefix, or the
            declared header length runs past the end of the file.
    """
    with open(path, "rb") as f:
        prefix = f.read(8)
        if len(prefix) != 8:
            raise ValueError(f"{path}: too short to be a safetensors file (no 8-byte header length)")
        hdr_len = struct.unpack("<Q", prefix)[0]
        # Bound the read by the real file size so a corrupt length prefix is
        # reported cleanly instead of triggering a huge allocation.
        file_size = os.fstat(f.fileno()).st_size
        if hdr_len > file_size - 8:
            raise ValueError(
                f"{path}: header length {hdr_len} exceeds file size {file_size}")
        hdr_bytes = f.read(hdr_len)
    return json.loads(hdr_bytes), 8 + hdr_len
|
||||
|
||||
|
||||
def collect_shard_paths(directory: str, weight_pattern: str):
    """Find the safetensors file(s) for ``weight_pattern`` inside ``directory``.

    If ``<weight_pattern>.safetensors.index.json`` exists, the sorted,
    de-duplicated shard paths named in its ``weight_map`` are returned.
    Otherwise the single file ``<weight_pattern>.safetensors`` is returned.

    Raises:
        FileNotFoundError: neither the index nor the single file exists.
    """
    base = os.path.join(directory, weight_pattern)

    index_file = f"{base}.safetensors.index.json"
    if os.path.isfile(index_file):
        with open(index_file) as f:
            index = json.load(f)
        # Many tensors share a shard, so collapse to unique file names first.
        shard_names = set(index["weight_map"].values())
        return sorted(os.path.join(directory, shard) for shard in shard_names)

    lone_file = f"{base}.safetensors"
    if os.path.isfile(lone_file):
        return [lone_file]
    raise FileNotFoundError(f"No checkpoint at {directory}: missing {weight_pattern}")
|
||||
|
||||
|
||||
def stage_tensors_for_section(section_dir: str, rewrite_fn):
    """Return a list of (new_key, shard_path, data_start_offset, info_dict) entries.

    Args:
        section_dir: directory holding ``diffusion_pytorch_model`` safetensors
            (single file or sharded with an index).
        rewrite_fn: callable mapping a source tensor key to its output key.

    Two kinds of pseudo-fields may be added to ``info_dict`` for the writer:

    * ``_qkv_sources``: emitted when a complete q/k/v triplet from
      ``QKV_DOUBLE_TRIPLETS`` is found under ``backbone.transformer_blocks.N``.
      The entry has ``shard_path``/``data_start_offset`` set to ``None`` and
      carries the three source descriptors so the writer can concatenate them.
    * ``_chunk_swap_halves``: set on the final-layer adaLN modulation tensors
      so the writer emits the second half of the bytes before the first.

    Raises:
        ValueError: the q/k/v tensors of a triplet differ in shape or dtype.
            The fused tensor is a plain byte concatenation labelled with a
            single dtype, so mixed dtypes would yield a mislabelled tensor.
    """
    entries = []
    # Index q/k/v candidates per block so complete triplets can be detected
    # even when the three tensors live in different shards.
    raw_by_block = {}  # block_idx -> {tail: (key, shard_path, data_off, info)}
    raw_others = []
    qkv_source_tails = {tail for trip in QKV_DOUBLE_TRIPLETS for tail in trip[:3]}
    for shard_path in collect_shard_paths(section_dir, "diffusion_pytorch_model"):
        hdr, data_off = read_safetensors_header(shard_path)
        for key, info in hdr.items():
            if key == "__metadata__":
                continue
            m = re.match(r"backbone\.transformer_blocks\.(\d+)\.(.*)$", key)
            if m and m.group(2) in qkv_source_tails:
                raw_by_block.setdefault(m.group(1), {})[m.group(2)] = (key, shard_path, data_off, info)
            else:
                raw_others.append((key, shard_path, data_off, info))

    for key, shard_path, data_off, info in raw_others:
        new_key = rewrite_fn(key)
        # Swap the (scale, shift) halves to (shift, scale) at conversion time so
        # the on-disk weight matches BFL flux ordering and the runtime stays
        # version-agnostic. norm_out.linear weight shape is [2*dim, dim] and bias
        # is [2*dim]; both split along axis 0 (outermost == row-major outer).
        if new_key in ("final_layer.adaLN_modulation.1.weight",
                       "final_layer.adaLN_modulation.1.bias"):
            info = dict(info)  # copy: do not mutate the parsed shard header
            info["_chunk_swap_halves"] = True
        entries.append((new_key, shard_path, data_off, info))

    for block_idx, tails in raw_by_block.items():
        for q_tail, k_tail, v_tail, fused_tail in QKV_DOUBLE_TRIPLETS:
            if q_tail in tails and k_tail in tails and v_tail in tails:
                q = tails[q_tail]
                k = tails[k_tail]
                v = tails[v_tail]
                # Validate shapes match.
                q_shape = q[3]["shape"]
                k_shape = k[3]["shape"]
                v_shape = v[3]["shape"]
                if q_shape != k_shape or q_shape != v_shape:
                    raise ValueError(f"qkv shape mismatch at block {block_idx} {q_tail}: q={q_shape} k={k_shape} v={v_shape}")
                # Validate dtypes match: the fused tensor carries one dtype label.
                q_dtype = q[3]["dtype"]
                k_dtype = k[3]["dtype"]
                v_dtype = v[3]["dtype"]
                if q_dtype != k_dtype or q_dtype != v_dtype:
                    raise ValueError(f"qkv dtype mismatch at block {block_idx} {q_tail}: q={q_dtype} k={k_dtype} v={v_dtype}")
                fused_shape = [q_shape[0] * 3] + list(q_shape[1:])
                fused_info = {
                    "dtype": q_dtype,
                    "shape": fused_shape,
                    "_qkv_sources": [q, k, v],  # pseudo field consumed by writer
                }
                entries.append((f"double_blocks.{block_idx}.{fused_tail}",
                                None, None, fused_info))
                del tails[q_tail]
                del tails[k_tail]
                del tails[v_tail]
        # Anything left in tails was an unmatched single - pass through.
        for tail, payload in tails.items():
            entries.append((rewrite_fn(payload[0]),) + payload[1:])
    return entries
|
||||
|
||||
|
||||
_DTYPE_BYTES = {
|
||||
"BF16": 2, "F16": 2, "F32": 4, "F64": 8,
|
||||
"U8": 1, "I8": 1, "I16": 2, "I32": 4, "I64": 8,
|
||||
"BOOL": 1,
|
||||
}
|
||||
|
||||
|
||||
def _total_bytes(info: dict) -> int:
|
||||
if "_qkv_sources" in info:
|
||||
elems = 1
|
||||
for d in info["shape"]:
|
||||
elems *= d
|
||||
return elems * _DTYPE_BYTES[info["dtype"]]
|
||||
start, end = info["data_offsets"]
|
||||
return end - start
|
||||
|
||||
|
||||
def write_consolidated(out_path: str, entries):
    """Write a single safetensors file by streaming raw bytes from each shard.

    Args:
        out_path: destination file; it is created or truncated.
        entries: iterable of ``(new_key, shard_path, data_start, info)`` tuples
            as produced by ``stage_tensors_for_section``.

    Tensors are laid out back to back in ascending key order and the JSON
    header is space-padded to a multiple of 8 bytes. Three payload forms exist:

    * qkv-fused entries (``_qkv_sources``): q, k, v are concatenated along
      axis 0 (row-major), so a simple byte-level concatenation produces the
      fused layout.
    * ``_chunk_swap_halves`` entries: the second half of the bytes is written
      before the first half.
    * everything else: bytes are copied unchanged.

    Raises:
        ValueError: two entries map to the same output key (several source
            names share a target in the rename tables, so this can happen),
            or a half-swapped tensor has an odd byte size. The duplicate check
            runs before ``out_path`` is opened.
        IOError: a source shard ends before a tensor's declared byte range.
    """
    entries = sorted(entries, key=lambda e: e[0])

    new_header = {}
    cur_offset = 0
    for new_key, shard_path, _data_off, info in entries:
        if new_key in new_header:
            # Overwriting the header entry while still writing both payloads
            # would leave bytes that no header entry describes.
            raise ValueError(f"Duplicate output tensor name: {new_key}")
        size = _total_bytes(info)
        new_header[new_key] = {
            "dtype": info["dtype"],
            "shape": info["shape"],
            "data_offsets": [cur_offset, cur_offset + size],
        }
        cur_offset += size

    header_json = json.dumps(new_header, separators=(",", ":")).encode("utf-8")
    # Trailing spaces are valid JSON whitespace; pad so the data section starts 8-byte aligned.
    pad = (-len(header_json)) % 8
    header_json = header_json + (b" " * pad)

    def copy_range(src_path, src_data_off, src_info, out, byte_range=None):
        """Copy a tensor's bytes (or the sub-range ``byte_range`` of them) to ``out``."""
        start, end = src_info["data_offsets"]
        if byte_range is not None:
            sub_start, sub_end = byte_range
            # Both bounds are relative to the tensor's own start.
            start, end = start + sub_start, start + sub_end
        with open(src_path, "rb") as src:
            src.seek(src_data_off + start)
            remaining = end - start
            # Copy in bounded chunks so large tensors never sit fully in memory.
            while remaining > 0:
                chunk = src.read(min(8 * 1024 * 1024, remaining))
                if not chunk:
                    raise IOError(f"Truncated tensor in {src_path}")
                out.write(chunk)
                remaining -= len(chunk)

    with open(out_path, "wb") as out:
        out.write(struct.pack("<Q", len(header_json)))
        out.write(header_json)
        for new_key, shard_path, data_off, info in entries:
            if "_qkv_sources" in info:
                for q_entry in info["_qkv_sources"]:
                    _, src_path, src_data_off, src_info = q_entry
                    copy_range(src_path, src_data_off, src_info, out)
            elif info.get("_chunk_swap_halves"):
                size = _total_bytes(info)
                half = size // 2
                if size != half * 2:
                    raise ValueError(f"{new_key}: odd byte size {size} cannot be split into halves")
                copy_range(shard_path, data_off, info, out, byte_range=(half, size))
                copy_range(shard_path, data_off, info, out, byte_range=(0, half))
            else:
                copy_range(shard_path, data_off, info, out)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: convert the ``transformer`` section of a SeFi checkpoint."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("input_dir", help="SeFi diffusers checkpoint directory")
    cli.add_argument("output", help="Output transformer safetensors path (load via --diffusion-model)")
    opts = cli.parse_args()

    transformer_dir = os.path.join(opts.input_dir, "transformer")
    staged = stage_tensors_for_section(transformer_dir, rewrite_transformer_key)

    print(f"Transformer tensors: {len(staged)}")
    print(f"Writing {len(staged)} tensors -> {opts.output}")

    # A bare filename has an empty dirname; fall back to the current directory.
    target_dir = os.path.dirname(opts.output) or "."
    os.makedirs(target_dir, exist_ok=True)
    write_consolidated(opts.output, staged)

    written_gb = os.path.getsize(opts.output) / 1e9
    print(f"Done. Output size: {written_gb:.2f} GB")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1518,7 +1518,7 @@ struct LLMEmbedder : public Conditioner {
|
||||
arch = LLM::LLMArch::GPT_OSS_20B;
|
||||
} else if (sd_version_is_pid(version)) {
|
||||
arch = LLM::LLMArch::GEMMA2_2B;
|
||||
} else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version) || sd_version_is_krea2(version)) {
|
||||
} else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version) || sd_version_is_sefi_image(version) || sd_version_is_krea2(version)) {
|
||||
arch = LLM::LLMArch::QWEN3_VL;
|
||||
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
|
||||
arch = LLM::LLMArch::QWEN3;
|
||||
@ -1997,6 +1997,18 @@ struct LLMEmbedder : public Conditioner {
|
||||
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||
|
||||
prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
|
||||
} else if (sd_version_is_sefi_image(version)) {
|
||||
prompt_template_encode_start_idx = 0;
|
||||
min_length = 1024;
|
||||
out_layers = {9, 18, 27};
|
||||
|
||||
prompt = "<|im_start|>user\n";
|
||||
|
||||
prompt_attn_range.first = static_cast<int>(prompt.size());
|
||||
prompt += conditioner_params.text;
|
||||
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||
|
||||
prompt += "<|im_end|>\n<|im_start|>assistant\n";
|
||||
} else if (version == VERSION_OVIS_IMAGE) {
|
||||
prompt_template_encode_start_idx = 28;
|
||||
min_length = prompt_template_encode_start_idx + 256;
|
||||
|
||||
11
src/model.h
11
src/model.h
@ -49,6 +49,7 @@ enum SDVersion {
|
||||
VERSION_LONGCAT,
|
||||
VERSION_PID,
|
||||
VERSION_IDEOGRAM4,
|
||||
VERSION_SEFI_IMAGE,
|
||||
VERSION_KREA2,
|
||||
VERSION_ESRGAN,
|
||||
VERSION_COUNT,
|
||||
@ -187,6 +188,13 @@ static inline bool sd_version_is_ideogram4(SDVersion version) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// True only when `version` is VERSION_SEFI_IMAGE.
static inline bool sd_version_is_sefi_image(SDVersion version) {
    return version == VERSION_SEFI_IMAGE;
}
|
||||
|
||||
static inline bool sd_version_is_krea2(SDVersion version) {
|
||||
if (version == VERSION_KREA2) {
|
||||
return true;
|
||||
@ -202,7 +210,7 @@ static inline bool sd_version_uses_flux_vae(SDVersion version) {
|
||||
}
|
||||
|
||||
static inline bool sd_version_uses_flux2_vae(SDVersion version) {
|
||||
if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version)) {
|
||||
if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version) || sd_version_is_sefi_image(version)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
@ -242,6 +250,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
|
||||
sd_version_is_longcat(version) ||
|
||||
sd_version_is_pid(version) ||
|
||||
sd_version_is_ideogram4(version) ||
|
||||
sd_version_is_sefi_image(version) ||
|
||||
sd_version_is_krea2(version)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
#include "model/common/rope.hpp"
|
||||
#include "model/diffusion/dit.hpp"
|
||||
#include "model/diffusion/model.hpp"
|
||||
#include "model/diffusion/sefi_image.hpp"
|
||||
#include "model_loader.h"
|
||||
|
||||
#define FLUX_GRAPH_SIZE 10240
|
||||
@ -26,6 +27,9 @@ namespace Flux {
|
||||
struct FluxConfig {
|
||||
SDVersion version = VERSION_FLUX;
|
||||
bool is_chroma = false;
|
||||
bool is_sefi = false;
|
||||
int64_t semantic_channels = 0;
|
||||
float sefi_delta_t = 0.1f;
|
||||
int patch_size = 2;
|
||||
int64_t in_channels = 64;
|
||||
int64_t out_channels = 64;
|
||||
@ -88,6 +92,21 @@ namespace Flux {
|
||||
config.share_modulation = true;
|
||||
config.ref_index_scale = 10.f;
|
||||
config.use_mlp_silu_act = true;
|
||||
} else if (sd_version_is_sefi_image(version)) {
|
||||
config.is_sefi = true;
|
||||
config.semantic_channels = 16;
|
||||
config.in_channels = 128 + config.semantic_channels;
|
||||
config.patch_size = 1;
|
||||
config.out_channels = 128 + config.semantic_channels;
|
||||
config.mlp_ratio = 3.f;
|
||||
config.theta = 2000;
|
||||
config.axes_dim = {32, 32, 32, 32};
|
||||
config.vec_in_dim = 0;
|
||||
config.qkv_bias = false;
|
||||
config.disable_bias = true;
|
||||
config.share_modulation = true;
|
||||
config.ref_index_scale = 10.f;
|
||||
config.use_mlp_silu_act = true;
|
||||
} else if (sd_version_is_longcat(version)) {
|
||||
config.context_in_dim = 3584;
|
||||
config.vec_in_dim = 0;
|
||||
@ -723,8 +742,8 @@ namespace Flux {
|
||||
|
||||
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, 2 * hidden_size]
|
||||
auto m_vec = ggml_ext_chunk(ctx->ggml_ctx, m, 2, 0);
|
||||
shift = m_vec[0]; // [N, hidden_size]
|
||||
scale = m_vec[1]; // [N, hidden_size]
|
||||
shift = m_vec[0];
|
||||
scale = m_vec[1];
|
||||
}
|
||||
|
||||
x = Flux::modulate(ctx->ggml_ctx, norm_final->forward(ctx, x), shift, scale);
|
||||
@ -902,6 +921,8 @@ namespace Flux {
|
||||
}
|
||||
if (config.is_chroma) {
|
||||
blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(config.in_dim, config.hidden_size);
|
||||
} else if (config.is_sefi) {
|
||||
blocks["dual_time_embed"] = std::make_shared<SefiImage::SefiDualTimestepEmbeddings>(256, config.hidden_size);
|
||||
} else {
|
||||
blocks["time_in"] = std::make_shared<MLPEmbedder>(256, config.hidden_size, !config.disable_bias);
|
||||
if (config.vec_in_dim > 0) {
|
||||
@ -1027,6 +1048,11 @@ namespace Flux {
|
||||
if (y != nullptr) {
|
||||
txt_img_mask = ggml_pad(ctx->ggml_ctx, y, static_cast<int>(img->ne[1]), 0, 0, 0);
|
||||
}
|
||||
} else if (config.is_sefi) {
|
||||
auto dual_time_embed = std::dynamic_pointer_cast<SefiImage::SefiDualTimestepEmbeddings>(blocks["dual_time_embed"]);
|
||||
auto timestep_sem = ggml_view_1d(ctx->ggml_ctx, timesteps, 1, 0);
|
||||
auto timestep_tex = ggml_view_1d(ctx->ggml_ctx, timesteps, 1, ggml_element_size(timesteps));
|
||||
vec = dual_time_embed->forward(ctx, timestep_sem, timestep_tex);
|
||||
} else {
|
||||
auto time_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
|
||||
vec = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f));
|
||||
@ -1500,7 +1526,7 @@ namespace Flux {
|
||||
set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data());
|
||||
}
|
||||
std::set<int> txt_arange_dims;
|
||||
if (sd_version_is_flux2(version)) {
|
||||
if (sd_version_is_flux2(version) || sd_version_is_sefi_image(version)) {
|
||||
txt_arange_dims = {3};
|
||||
increase_ref_index = true;
|
||||
} else if (version == VERSION_OVIS_IMAGE) {
|
||||
|
||||
91
src/model/diffusion/sefi_image.hpp
Normal file
91
src/model/diffusion/sefi_image.hpp
Normal file
@ -0,0 +1,91 @@
|
||||
#ifndef __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__
|
||||
#define __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "model/common/block.hpp"
|
||||
|
||||
namespace SefiImage {
|
||||
struct SefiImageConfig {
|
||||
int64_t semantic_channels = 16;
|
||||
int64_t texture_latent_channels = 32;
|
||||
int64_t timestep_guidance_in_dim = 256;
|
||||
int64_t hidden_size = 3072;
|
||||
float timestep_shift_alpha = 0.3f;
|
||||
float delta_t = 0.1f;
|
||||
|
||||
int64_t packed_texture_channels(int patch_size) const {
|
||||
return texture_latent_channels * patch_size * patch_size;
|
||||
}
|
||||
|
||||
int64_t packed_input_channels(int patch_size) const {
|
||||
return semantic_channels + packed_texture_channels(patch_size);
|
||||
}
|
||||
|
||||
static SefiImageConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
|
||||
const std::string& prefix) {
|
||||
SefiImageConfig config;
|
||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||
if (!starts_with(name, prefix)) {
|
||||
continue;
|
||||
}
|
||||
if (ends_with(name, "dual_time_embed.semantic_embedder.linear_1.weight") && tensor_storage.n_dims == 2) {
|
||||
config.timestep_guidance_in_dim = tensor_storage.ne[0];
|
||||
config.hidden_size = tensor_storage.ne[1] * 2;
|
||||
}
|
||||
}
|
||||
LOG_DEBUG("sefi_image: semantic_channels = %" PRId64 ", texture_latent_channels = %" PRId64 ", hidden_size = %" PRId64,
|
||||
config.semantic_channels,
|
||||
config.texture_latent_channels,
|
||||
config.hidden_size);
|
||||
return config;
|
||||
}
|
||||
};
|
||||
|
||||
struct SefiTimestepEmbedding : public GGMLBlock {
|
||||
public:
|
||||
SefiTimestepEmbedding(int64_t in_channels, int64_t time_embed_dim) {
|
||||
blocks["linear_1"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, time_embed_dim, false));
|
||||
blocks["linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim, false));
|
||||
}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* sample) {
|
||||
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
|
||||
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
|
||||
|
||||
sample = linear_1->forward(ctx, sample);
|
||||
sample = ggml_silu_inplace(ctx->ggml_ctx, sample);
|
||||
sample = linear_2->forward(ctx, sample);
|
||||
return sample;
|
||||
}
|
||||
};
|
||||
|
||||
struct SefiDualTimestepEmbeddings : public GGMLBlock {
|
||||
public:
|
||||
SefiDualTimestepEmbeddings(int64_t in_channels, int64_t embedding_dim) {
|
||||
GGML_ASSERT(embedding_dim % 2 == 0);
|
||||
int64_t half_dim = embedding_dim / 2;
|
||||
blocks["semantic_embedder"] = std::make_shared<SefiTimestepEmbedding>(in_channels, half_dim);
|
||||
blocks["texture_embedder"] = std::make_shared<SefiTimestepEmbedding>(in_channels, half_dim);
|
||||
timestep_guidance_in_dim = in_channels;
|
||||
}
|
||||
|
||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
ggml_tensor* timestep_sem,
|
||||
ggml_tensor* timestep_tex) {
|
||||
auto semantic_embedder = std::dynamic_pointer_cast<SefiTimestepEmbedding>(blocks["semantic_embedder"]);
|
||||
auto texture_embedder = std::dynamic_pointer_cast<SefiTimestepEmbedding>(blocks["texture_embedder"]);
|
||||
|
||||
auto sem_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep_sem, timestep_guidance_in_dim, 10000, 1.f);
|
||||
auto tex_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep_tex, timestep_guidance_in_dim, 10000, 1.f);
|
||||
auto sem_emb = semantic_embedder->forward(ctx, sem_proj);
|
||||
auto tex_emb = texture_embedder->forward(ctx, tex_proj);
|
||||
return ggml_concat(ctx->ggml_ctx, sem_emb, tex_emb, 0);
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t timestep_guidance_in_dim = 256;
|
||||
};
|
||||
} // namespace SefiImage
|
||||
|
||||
#endif // __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__
|
||||
@ -250,7 +250,7 @@ namespace LLM {
|
||||
config.intermediate_size = tensor_storage.ne[1];
|
||||
}
|
||||
}
|
||||
if (arch == LLMArch::QWEN3 && config.num_layers == 28) {
|
||||
if ((arch == LLMArch::QWEN3 || arch == LLMArch::QWEN3_VL) && config.num_layers == 28) {
|
||||
config.num_heads = 16;
|
||||
}
|
||||
if (detected_vision_layers > 0) {
|
||||
|
||||
@ -816,12 +816,13 @@ struct AutoEncoderKL : public VAE {
|
||||
}
|
||||
|
||||
sd::Tensor<float> diffusion_to_vae_latents(const sd::Tensor<float>& latents) override {
|
||||
auto latents_ = sd_version_is_sefi_image(version) ? sd::ops::slice(latents, 2, 16, 144) : latents;
|
||||
if (sd_version_uses_flux2_vae(version)) {
|
||||
int channel_dim = 2;
|
||||
auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim);
|
||||
return (latents * std_tensor) / scale_factor + mean_tensor;
|
||||
auto [mean_tensor, std_tensor] = get_latents_mean_std(latents_, channel_dim);
|
||||
return (latents_ * std_tensor) / scale_factor + mean_tensor;
|
||||
}
|
||||
return (latents / scale_factor) + shift_factor;
|
||||
return (latents_ / scale_factor) + shift_factor;
|
||||
}
|
||||
|
||||
sd::Tensor<float> vae_to_diffusion_latents(const sd::Tensor<float>& latents) override {
|
||||
|
||||
@ -66,7 +66,6 @@ const char* unused_tensors[] = {
|
||||
// "v_pred", // Used to detect SDXL vpred models
|
||||
"text_encoders.llm.output.weight",
|
||||
"text_encoders.llm.lm_head.",
|
||||
"first_stage_model.bn.",
|
||||
};
|
||||
|
||||
bool is_unused_tensor(const std::string& name) {
|
||||
@ -480,6 +479,9 @@ SDVersion ModelLoader::get_sd_version() {
|
||||
if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) {
|
||||
is_flux2 = true;
|
||||
}
|
||||
if (tensor_storage.name.find("dual_time_embed.semantic_embedder.linear_1.weight") != std::string::npos) {
|
||||
return VERSION_SEFI_IMAGE;
|
||||
}
|
||||
if (tensor_storage.name.find("single_blocks.47.linear1.weight") != std::string::npos) {
|
||||
has_single_block_47 = true;
|
||||
}
|
||||
|
||||
@ -743,7 +743,7 @@ std::string convert_diffusion_model_name(std::string name, std::string prefix, S
|
||||
name = convert_diffusers_unet_to_original_sdxl(name);
|
||||
} else if (sd_version_is_sd3(version)) {
|
||||
name = convert_diffusers_dit_to_original_sd3(name);
|
||||
} else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version)) {
|
||||
} else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version) || sd_version_is_sefi_image(version)) {
|
||||
name = convert_diffusers_dit_to_original_flux(name);
|
||||
} else if (sd_version_is_z_image(version)) {
|
||||
name = convert_diffusers_dit_to_original_lumina2(name);
|
||||
|
||||
@ -1005,6 +1005,8 @@ struct FluxFlowDenoiser : public DiscreteFlowDenoiser {
|
||||
}
|
||||
};
|
||||
|
||||
struct SefiFlowDenoiser;
|
||||
|
||||
struct Flux2FlowDenoiser : public FluxFlowDenoiser {
|
||||
Flux2FlowDenoiser() = default;
|
||||
|
||||
@ -1037,6 +1039,80 @@ struct Flux2FlowDenoiser : public FluxFlowDenoiser {
|
||||
}
|
||||
};
|
||||
|
||||
struct SefiFlowDenoiser : public Flux2FlowDenoiser {
|
||||
static constexpr int kNumTrainTimesteps = 1000;
|
||||
static constexpr int kSemChannels = 16;
|
||||
static constexpr int kTotalChannels = 144;
|
||||
|
||||
float delta_t = 0.1f;
|
||||
float timestep_shift_alpha = 1.0f;
|
||||
|
||||
std::vector<float> sem_sigmas;
|
||||
std::vector<float> tex_sigmas;
|
||||
std::vector<float> sem_timesteps;
|
||||
std::vector<float> tex_timesteps;
|
||||
|
||||
SefiFlowDenoiser() = default;
|
||||
|
||||
static float apply_alpha_shift(float u_unit, float alpha) {
|
||||
if (alpha == 1.0f) {
|
||||
return u_unit;
|
||||
}
|
||||
float denom = 1.0f + (alpha - 1.0f) * u_unit;
|
||||
return (alpha * u_unit) / denom;
|
||||
}
|
||||
|
||||
std::vector<float> get_sigmas(uint32_t n,
|
||||
int image_seq_len,
|
||||
scheduler_t scheduler_type,
|
||||
SDVersion version,
|
||||
const char* extra_sample_args = nullptr) override {
|
||||
sem_sigmas.clear();
|
||||
tex_sigmas.clear();
|
||||
sem_timesteps.clear();
|
||||
tex_timesteps.clear();
|
||||
|
||||
for (const auto& [key, value] : parse_key_value_args(extra_sample_args, "sefi scheduler arg")) {
|
||||
if (key == "sefi_alpha") {
|
||||
if (!parse_strict_float(value, timestep_shift_alpha)) {
|
||||
LOG_WARN("ignoring invalid sefi scheduler arg '%s=%s'", key.c_str(), value.c_str());
|
||||
}
|
||||
} else if (key == "sefi_delta_t") {
|
||||
if (!parse_strict_float(value, delta_t)) {
|
||||
LOG_WARN("ignoring invalid sefi scheduler arg '%s=%s'", key.c_str(), value.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i <= n; ++i) {
|
||||
float u_base = static_cast<float>(i) / static_cast<float>(n);
|
||||
float u_shifted = apply_alpha_shift(u_base, timestep_shift_alpha);
|
||||
float u_sem_raw = u_shifted * (1.0f + delta_t);
|
||||
|
||||
float u_sem = std::min(u_sem_raw, 1.0f);
|
||||
float u_tex = std::max(0.0f, std::min(u_sem_raw - delta_t, 1.0f));
|
||||
|
||||
int idx_sem = std::min(kNumTrainTimesteps - 1,
|
||||
std::max(0, static_cast<int>(u_sem * (kNumTrainTimesteps - 1))));
|
||||
int idx_tex = std::min(kNumTrainTimesteps - 1,
|
||||
std::max(0, static_cast<int>(u_tex * (kNumTrainTimesteps - 1))));
|
||||
|
||||
float t_sem = static_cast<float>(kNumTrainTimesteps - idx_sem);
|
||||
float t_tex = static_cast<float>(kNumTrainTimesteps - idx_tex);
|
||||
float sigma_sem = t_sem / static_cast<float>(kNumTrainTimesteps);
|
||||
float sigma_tex = t_tex / static_cast<float>(kNumTrainTimesteps);
|
||||
|
||||
sem_timesteps.push_back(t_sem);
|
||||
tex_timesteps.push_back(t_tex);
|
||||
sem_sigmas.push_back(sigma_sem);
|
||||
tex_sigmas.push_back(sigma_tex);
|
||||
}
|
||||
LOG_DEBUG("SefiFlowDenoiser: built %u-step dual schedule (alpha=%.2f delta_t=%.2f)",
|
||||
n, timestep_shift_alpha, delta_t);
|
||||
return tex_sigmas;
|
||||
}
|
||||
};
|
||||
|
||||
// Denoising callback handed to the samplers. sample_sefi_euler invokes it as
// model(x, sigma, step) with a 1-based step, uses the returned
// GuiderOutput's `pred` tensor as the denoised prediction, and treats an
// empty `pred` as a failed evaluation.
typedef std::function<sd::guidance::GuiderOutput(const sd::Tensor<float>&, float, int)> denoise_cb_t;
|
||||
|
||||
static std::pair<float, float> get_ancestral_step(float sigma_from,
|
||||
@ -1140,6 +1216,40 @@ static sd::Tensor<float> sample_euler_ancestral(denoise_cb_t model,
|
||||
return x;
|
||||
}
|
||||
|
||||
// Euler sampler for the SeFi dual-time schedule.
//
// Each step evaluates the model once at the current texture sigma, derives a
// single velocity from the denoised prediction, and then advances the
// semantic channels and the texture channels of `x` with their own sigma
// deltas taken from the schedules stored on `sefi`.
//
// Returns the final latent, or an empty tensor when the model callback
// produces an empty prediction.
static sd::Tensor<float> sample_sefi_euler(SefiFlowDenoiser* sefi,
                                           denoise_cb_t model,
                                           sd::Tensor<float> x) {
    constexpr int sem_end = SefiFlowDenoiser::kSemChannels;
    constexpr int tex_end = SefiFlowDenoiser::kTotalChannels;

    const std::vector<float>& tex_schedule = sefi->tex_sigmas;
    const std::vector<float>& sem_schedule = sefi->sem_sigmas;
    const int step_count = static_cast<int>(tex_schedule.size()) - 1;

    for (int step = 0; step < step_count; ++step) {
        const float tex_now  = tex_schedule[step];
        const float tex_dt   = tex_schedule[step + 1] - tex_now;
        const float sem_dt   = sem_schedule[step + 1] - sem_schedule[step];

        // Nothing left to integrate once the texture sigma has reached zero.
        if (tex_now <= 1e-9f) {
            continue;
        }

        // The callback receives a 1-based step index.
        auto guided = model(x, tex_now, step + 1);
        if (guided.pred.empty()) {
            return {};
        }
        sd::Tensor<float> denoised = std::move(guided.pred);
        sd::Tensor<float> velocity = (x - denoised) / tex_now;

        // Semantic stream: channels [0, sem_end) along dimension 2.
        auto sem_updated = sd::ops::slice(x, 2, 0, sem_end) +
                           sd::ops::slice(velocity, 2, 0, sem_end) * sem_dt;
        // Texture stream: channels [sem_end, tex_end) along dimension 2.
        auto tex_updated = sd::ops::slice(x, 2, sem_end, tex_end) +
                           sd::ops::slice(velocity, 2, sem_end, tex_end) * tex_dt;

        sd::ops::slice_assign(&x, 2, 0, sem_end, sem_updated);
        sd::ops::slice_assign(&x, 2, sem_end, tex_end, tex_updated);
    }
    return x;
}
|
||||
|
||||
static sd::Tensor<float> sample_euler(denoise_cb_t model,
|
||||
sd::Tensor<float> x,
|
||||
const std::vector<float>& sigmas) {
|
||||
@ -2055,7 +2165,13 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
|
||||
std::shared_ptr<RNG> rng,
|
||||
float eta,
|
||||
bool is_flow_denoiser,
|
||||
const char* extra_sample_args) {
|
||||
const char* extra_sample_args,
|
||||
std::shared_ptr<Denoiser> denoiser_for_dispatch = nullptr) {
|
||||
if (denoiser_for_dispatch) {
|
||||
if (auto sefi = std::dynamic_pointer_cast<SefiFlowDenoiser>(denoiser_for_dispatch)) {
|
||||
return sample_sefi_euler(sefi.get(), model, std::move(x));
|
||||
}
|
||||
}
|
||||
SamplerExtraArgs extra_args = parse_key_value_args(extra_sample_args, "extra sample arg");
|
||||
switch (method) {
|
||||
case EULER_A_SAMPLE_METHOD:
|
||||
|
||||
@ -96,6 +96,7 @@ const char* model_version_to_str[] = {
|
||||
"Longcat-Image",
|
||||
"PiD",
|
||||
"Ideogram 4",
|
||||
"SeFi-Image",
|
||||
"Krea2",
|
||||
"ESRGAN",
|
||||
};
|
||||
@ -691,7 +692,7 @@ public:
|
||||
version,
|
||||
sd_ctx_params->chroma_use_dit_mask,
|
||||
model_manager);
|
||||
} else if (sd_version_is_flux2(version)) {
|
||||
} else if (sd_version_is_flux2(version) || sd_version_is_sefi_image(version)) {
|
||||
bool is_chroma = false;
|
||||
cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
|
||||
tensor_storage_map,
|
||||
@ -1295,6 +1296,8 @@ public:
|
||||
} else if (sd_version_is_krea2(version)) {
|
||||
default_flow_shift = 1.15f;
|
||||
}
|
||||
} else if (sd_version_is_sefi_image(version)) {
|
||||
pred_type = SEFI_FLOW_PRED;
|
||||
} else if (sd_version_is_flux2(version)) {
|
||||
pred_type = FLUX2_FLOW_PRED;
|
||||
} else {
|
||||
@ -1334,6 +1337,11 @@ public:
|
||||
denoiser = std::make_shared<Flux2FlowDenoiser>();
|
||||
break;
|
||||
}
|
||||
case SEFI_FLOW_PRED: {
|
||||
LOG_INFO("running in SeFi-Image dual-time FLOW mode");
|
||||
denoiser = std::make_shared<SefiFlowDenoiser>();
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
LOG_ERROR("Unknown predition type %i", pred_type);
|
||||
return false;
|
||||
@ -1639,7 +1647,16 @@ public:
|
||||
|
||||
std::vector<float> process_timesteps(const std::vector<float>& timesteps,
|
||||
const sd::Tensor<float>& init_latent,
|
||||
const sd::Tensor<float>& denoise_mask) {
|
||||
const sd::Tensor<float>& denoise_mask,
|
||||
int step) {
|
||||
if (auto sefi_denoiser = std::dynamic_pointer_cast<SefiFlowDenoiser>(denoiser)) {
|
||||
int sched_idx = step > 0 ? step - 1 : 0;
|
||||
if (sched_idx >= static_cast<int>(sefi_denoiser->tex_timesteps.size())) {
|
||||
sched_idx = static_cast<int>(sefi_denoiser->tex_timesteps.size()) - 1;
|
||||
}
|
||||
return {sefi_denoiser->sem_timesteps[sched_idx],
|
||||
sefi_denoiser->tex_timesteps[sched_idx]};
|
||||
}
|
||||
if (diffusion_model->get_desc() == "Wan2.2-TI2V-5B") {
|
||||
int64_t frame_count = init_latent.shape()[2];
|
||||
auto new_timesteps = std::vector<float>(static_cast<size_t>(frame_count), timesteps[0]);
|
||||
@ -2051,7 +2068,7 @@ public:
|
||||
timesteps_vec = process_ltxav_video_timesteps(base_timesteps_vec, init_latent, denoise_mask);
|
||||
audio_timesteps_tensor = sd::Tensor<float>({static_cast<int64_t>(base_timesteps_vec.size())}, base_timesteps_vec);
|
||||
} else {
|
||||
timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask);
|
||||
timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask, step);
|
||||
}
|
||||
const std::vector<float>& scaling_timesteps_vec = (sd_version_is_ltxav(version) && !denoise_mask.empty())
|
||||
? base_timesteps_vec
|
||||
@ -2121,7 +2138,7 @@ public:
|
||||
diffusion_params.extra = UNetDiffusionExtra{-1, &controls, control_strength};
|
||||
} else if (sd_version_is_sd3(version)) {
|
||||
diffusion_params.extra = SkipLayerDiffusionExtra{local_skip_layers};
|
||||
} else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version)) {
|
||||
} else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version) || sd_version_is_sefi_image(version)) {
|
||||
diffusion_params.extra = FluxDiffusionExtra{&guidance_tensor,
|
||||
local_skip_layers};
|
||||
} else if (sd_version_is_anima(version)) {
|
||||
@ -2265,7 +2282,7 @@ public:
|
||||
return output;
|
||||
};
|
||||
|
||||
auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta, is_flow_denoiser, extra_sample_args);
|
||||
auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta, is_flow_denoiser, extra_sample_args, denoiser);
|
||||
if (x0_opt.empty()) {
|
||||
LOG_ERROR("Diffusion model sampling failed");
|
||||
if (control_net) {
|
||||
@ -2326,6 +2343,8 @@ public:
|
||||
latent_channel = 3;
|
||||
} else if (sd_version_is_pid(version)) {
|
||||
latent_channel = 3;
|
||||
} else if (sd_version_is_sefi_image(version)) {
|
||||
latent_channel = 144;
|
||||
} else if (sd_version_uses_flux2_vae(version)) {
|
||||
latent_channel = 128;
|
||||
} else {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user