mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-24 23:26:43 +00:00
Compare commits
No commits in common. "b9254dda0d10b91ee6f17fb7f4420097dd29824b" and "1f9ee88e09c258053fa59d5e05e23dfb10fa0b13" have entirely different histories.
b9254dda0d
...
1f9ee88e09
@ -15,7 +15,6 @@ API and command-line option may change frequently.***
|
|||||||
|
|
||||||
## 🔥Important News
|
## 🔥Important News
|
||||||
|
|
||||||
* **2026/06/04** 🚀 stable-diffusion.cpp now supports **Ideogram4**
|
|
||||||
* **2026/05/31** 🚀 stable-diffusion.cpp now supports **PiD**
|
* **2026/05/31** 🚀 stable-diffusion.cpp now supports **PiD**
|
||||||
* **2026/05/27** 🚀 stable-diffusion.cpp now supports **Lens**
|
* **2026/05/27** 🚀 stable-diffusion.cpp now supports **Lens**
|
||||||
* **2026/05/17** 🚀 stable-diffusion.cpp now supports **LTX-2.3**
|
* **2026/05/17** 🚀 stable-diffusion.cpp now supports **LTX-2.3**
|
||||||
@ -51,7 +50,6 @@ API and command-line option may change frequently.***
|
|||||||
- [Anima](./docs/anima.md)
|
- [Anima](./docs/anima.md)
|
||||||
- [ERNIE-Image](./docs/ernie_image.md)
|
- [ERNIE-Image](./docs/ernie_image.md)
|
||||||
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
|
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
|
||||||
- [Ideogram4](./docs/ideogram4.md)
|
|
||||||
- Image Edit Models
|
- Image Edit Models
|
||||||
- [FLUX.1-Kontext-dev](./docs/kontext.md)
|
- [FLUX.1-Kontext-dev](./docs/kontext.md)
|
||||||
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
|
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 2.5 MiB |
@ -1,40 +0,0 @@
|
|||||||
# How to Use
|
|
||||||
|
|
||||||
## Download weights
|
|
||||||
|
|
||||||
- Download Ideogram4
|
|
||||||
- safetensors: https://huggingface.co/ideogram-ai/ideogram-4-fp8/tree/main/transformer
|
|
||||||
- Download Ideogram4 uncond
|
|
||||||
- safetensors: https://huggingface.co/ideogram-ai/ideogram-4-fp8/tree/main/unconditional_transformer
|
|
||||||
- Download vae
|
|
||||||
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
|
|
||||||
- Download Qwen3-VL-8B-Instruct
|
|
||||||
- gguf: https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main
|
|
||||||
|
|
||||||
## Convert weights
|
|
||||||
|
|
||||||
fp8 scale -> bf16
|
|
||||||
|
|
||||||
```
|
|
||||||
python .\convert_fp8_scale_to_bf16.py --input .\ideogram4_fp8.safetensors --output ideogram4_bf16.safetensors
|
|
||||||
python .\convert_fp8_scale_to_bf16.py --input .\ideogram4_uncond_fp8.safetensors --output ideogram4_uncond_bf16.safetensors
|
|
||||||
```
|
|
||||||
|
|
||||||
bf16 -> q8
|
|
||||||
|
|
||||||
```
|
|
||||||
.\bin\Release\sd-cli.exe -M convert -m ideogram4_bf16.safetensors -o ideogram4-Q8_0.gguf --tensor-type-rules "^layers.*adaln_modulation.*weight=q8_0,layers.*attention.o.*weight=q8_0,layers.*attention.qkv.*weight=q8_0,layers.*feed_forward.*weight=q8_0" -v
|
|
||||||
|
|
||||||
.\bin\Release\sd-cli.exe -M convert -m ideogram4_uncond_bf16.safetensors -o ideogram4_uncond-Q8_0.gguf --tensor-type-rules "^layers.*adaln_modulation.*weight=q8_0,layers.*attention.o.*weight=q8_0,layers.*attention.qkv.*weight=q8_0,layers.*feed_forward.*weight=q8_0" -v
|
|
||||||
```
|
|
||||||
|
|
||||||
To reduce VRAM usage, replace q8_0 in the rules above with a lower-bit quantization type, such as q4_0.
|
|
||||||
|
|
||||||
|
|
||||||
## Examples
|
|
||||||
|
|
||||||
```sh
|
|
||||||
.\bin\Release\sd-cli.exe --diffusion-model ideogram4-Q8_0.gguf --uncond-diffusion-model ideogram4_uncond-Q8_0.gguf --llm ..\..\llm\Qwen3VL-8B-Instruct-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors -p '{"high_level_description":"A square 1024 x 1024 luxury fashion magazine cover featuring exactly one short chubby fluffy cat as the main model. The cat sits on a soft ivory studio floor, facing the viewer with a stylish calm expression, wearing tiny black sunglasses, a red silk scarf, and a small gold collar charm. In front of the cat on the floor is a wide horizontal luxury nameplate that clearly reads ideogram4.cpp. The whole design feels premium, fashionable, clean, and editorial.","style_description":{"aesthetics":"luxury fashion magazine cover, high-end pet couture campaign, minimalist editorial design, elegant studio photography, soft paper texture, refined typography, fashionable and polished","lighting":"Soft diffused studio lighting, gentle spotlight on the cat, subtle floor shadow, warm ivory highlights, clean separation between subject and background","photo":"high-resolution fashion editorial photography look, front-facing cat portrait, crisp fur details, glossy sunglasses, clear readable nameplate text, shallow depth of field","medium":"mixed media fashion photography and premium editorial graphic design","color_palette":["#F4EFE7","#111111","#D8B56D","#B73A3A","#FFFFFF","#8A7A6A"]},"compositional_deconstruction":{"canvas":"Square 1024 x 1024 canvas with a normal upright orientation. Do not rotate the poster or any text. Use a clean fashion magazine cover layout.","background":"Warm ivory studio backdrop with subtle paper grain, a soft spotlight gradient, faint floor shadow, and a few minimal gold editorial lines. The background is spacious, premium, and uncluttered.","layout":"Top center has a small elegant headline. Center area features one cat as the main fashion model. 
Lower foreground has a wide horizontal luxury nameplate placed on the floor in front of the cat. Bottom center has a small footer. All text is horizontal, upright, and readable left to right.","elements":[{"type":"text","desc":"Top center headline reading LOOK WHAT I FOUND in a refined high-fashion serif font. The headline is horizontal, centered, elegant, and secondary to the nameplate text."},{"type":"obj","desc":"Exactly one short chubby fluffy cat sitting in the center like a luxury fashion model. The cat has a large round head, compact body, short legs, soft detailed fur, expressive eyes, and a calm confident pose. The cat is cute and rounded, not tall, not stretched, not duplicated."},{"type":"obj","desc":"Tiny glossy black sunglasses worn naturally by the cat, slightly oversized but still showing the cat face clearly. The sunglasses add a chic fashion-editorial attitude."},{"type":"obj","desc":"A red silk scarf tied neatly around the cat neck, with soft folds and a couture feeling. The scarf must not cover the cat face or the nameplate."},{"type":"obj","desc":"A small gold collar charm or fashion accessory under the scarf, subtle and premium, adding a luxury campaign detail."},{"type":"obj","desc":"In the lower foreground, place a wide horizontal luxury nameplate on the floor in front of the cat. The nameplate is low, flat, landscape-oriented, much wider than tall, like a fashion show seat card or premium display plaque. It is centered, front-facing, level, and fully visible. It must not become vertical, tall, standing, rotated, or side-facing."},{"type":"text","desc":"Print the exact text ideogram4.cpp only on the wide horizontal nameplate. Use clean bold black lettering, perfectly spelled, lowercase, with the number 4 and .cpp extension. 
The text must fit completely inside the nameplate, stay horizontal, and be readable from left to right."},{"type":"obj","desc":"Add sparse premium editorial accents around the edges: thin gold lines, small code brackets, tiny cursor marks, subtle dots, and minimal geometric details. No extra cats, no stickers, no animal faces, no busy decorations."},{"type":"text","desc":"Bottom center footer reading tiny paws, big compile energy in a small refined monospace or editorial font. The footer is horizontal, centered, understated, and much smaller than the nameplate text."}]}}' --diffusion-fa -v --offload-to-cpu -H 1024 -W 1024
|
|
||||||
```
|
|
||||||
|
|
||||||
<img alt="ideogram4 image example" src="../assets/ideogram4/example.png" />
|
|
||||||
@ -41,8 +41,6 @@ Context Options:
|
|||||||
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
|
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
|
||||||
--diffusion-model <string> path to the standalone diffusion model
|
--diffusion-model <string> path to the standalone diffusion model
|
||||||
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model
|
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model
|
||||||
--uncond-diffusion-model <string> path to the standalone unconditional diffusion model, currently used by
|
|
||||||
Ideogram4 CFG
|
|
||||||
--vae <string> path to standalone vae model
|
--vae <string> path to standalone vae model
|
||||||
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
|
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
|
||||||
--tae <string> alias of --taesd
|
--tae <string> alias of --taesd
|
||||||
|
|||||||
@ -169,9 +169,8 @@ struct SDCliParams {
|
|||||||
return 1;
|
return 1;
|
||||||
};
|
};
|
||||||
|
|
||||||
auto on_help_arg = [&](int argc, const char** argv, int index, bool& valid) {
|
auto on_help_arg = [&](int argc, const char** argv, int index) {
|
||||||
normal_exit = true;
|
normal_exit = true;
|
||||||
valid = true;
|
|
||||||
return -1;
|
return -1;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -245,7 +245,6 @@ bool parse_options(int argc, const char** argv, const std::vector<ArgOptions>& o
|
|||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
|
|
||||||
bool valid = false;
|
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
arg = argv[i];
|
arg = argv[i];
|
||||||
bool found_arg = false;
|
bool found_arg = false;
|
||||||
@ -288,7 +287,7 @@ bool parse_options(int argc, const char** argv, const std::vector<ArgOptions>& o
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
if (match_and_apply(options.manual_options, [&](auto& option) {
|
if (match_and_apply(options.manual_options, [&](auto& option) {
|
||||||
int ret = option.cb(argc, argv, i, valid);
|
int ret = option.cb(argc, argv, i);
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
invalid_arg = true;
|
invalid_arg = true;
|
||||||
return;
|
return;
|
||||||
@ -300,9 +299,7 @@ bool parse_options(int argc, const char** argv, const std::vector<ArgOptions>& o
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (invalid_arg) {
|
if (invalid_arg) {
|
||||||
if (!valid) {
|
|
||||||
LOG_ERROR("error: invalid parameter for argument: %s", arg.c_str());
|
LOG_ERROR("error: invalid parameter for argument: %s", arg.c_str());
|
||||||
}
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (!found_arg) {
|
if (!found_arg) {
|
||||||
@ -359,10 +356,6 @@ ArgOptions SDContextParams::get_options() {
|
|||||||
"--high-noise-diffusion-model",
|
"--high-noise-diffusion-model",
|
||||||
"path to the standalone high noise diffusion model",
|
"path to the standalone high noise diffusion model",
|
||||||
&high_noise_diffusion_model_path},
|
&high_noise_diffusion_model_path},
|
||||||
{"",
|
|
||||||
"--uncond-diffusion-model",
|
|
||||||
"path to the standalone unconditional diffusion model, currently used by Ideogram4 CFG",
|
|
||||||
&uncond_diffusion_model_path},
|
|
||||||
{"",
|
{"",
|
||||||
"--embeddings-connectors",
|
"--embeddings-connectors",
|
||||||
"path to LTXAV embeddings connectors",
|
"path to LTXAV embeddings connectors",
|
||||||
@ -713,7 +706,6 @@ std::string SDContextParams::to_string() const {
|
|||||||
<< " llm_vision_path: \"" << llm_vision_path << "\",\n"
|
<< " llm_vision_path: \"" << llm_vision_path << "\",\n"
|
||||||
<< " diffusion_model_path: \"" << diffusion_model_path << "\",\n"
|
<< " diffusion_model_path: \"" << diffusion_model_path << "\",\n"
|
||||||
<< " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
|
<< " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
|
||||||
<< " uncond_diffusion_model_path: \"" << uncond_diffusion_model_path << "\",\n"
|
|
||||||
<< " embeddings_connectors_path: \"" << embeddings_connectors_path << "\",\n"
|
<< " embeddings_connectors_path: \"" << embeddings_connectors_path << "\",\n"
|
||||||
<< " vae_path: \"" << vae_path << "\",\n"
|
<< " vae_path: \"" << vae_path << "\",\n"
|
||||||
<< " vae_format: \"" << vae_format << "\",\n"
|
<< " vae_format: \"" << vae_format << "\",\n"
|
||||||
@ -777,7 +769,6 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
|
|||||||
llm_vision_path.c_str(),
|
llm_vision_path.c_str(),
|
||||||
diffusion_model_path.c_str(),
|
diffusion_model_path.c_str(),
|
||||||
high_noise_diffusion_model_path.c_str(),
|
high_noise_diffusion_model_path.c_str(),
|
||||||
uncond_diffusion_model_path.c_str(),
|
|
||||||
embeddings_connectors_path.c_str(),
|
embeddings_connectors_path.c_str(),
|
||||||
vae_path.c_str(),
|
vae_path.c_str(),
|
||||||
audio_vae_path.c_str(),
|
audio_vae_path.c_str(),
|
||||||
@ -2528,7 +2519,6 @@ std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
|
|||||||
set_json_basename_if_not_empty(models, "llm_vision", ctx_params.llm_vision_path);
|
set_json_basename_if_not_empty(models, "llm_vision", ctx_params.llm_vision_path);
|
||||||
set_json_basename_if_not_empty(models, "diffusion_model", ctx_params.diffusion_model_path);
|
set_json_basename_if_not_empty(models, "diffusion_model", ctx_params.diffusion_model_path);
|
||||||
set_json_basename_if_not_empty(models, "high_noise_diffusion_model", ctx_params.high_noise_diffusion_model_path);
|
set_json_basename_if_not_empty(models, "high_noise_diffusion_model", ctx_params.high_noise_diffusion_model_path);
|
||||||
set_json_basename_if_not_empty(models, "uncond_diffusion_model", ctx_params.uncond_diffusion_model_path);
|
|
||||||
set_json_basename_if_not_empty(models, "vae", ctx_params.vae_path);
|
set_json_basename_if_not_empty(models, "vae", ctx_params.vae_path);
|
||||||
set_json_basename_if_not_empty(models, "taesd", ctx_params.taesd_path);
|
set_json_basename_if_not_empty(models, "taesd", ctx_params.taesd_path);
|
||||||
set_json_basename_if_not_empty(models, "control_net", ctx_params.control_net_path);
|
set_json_basename_if_not_empty(models, "control_net", ctx_params.control_net_path);
|
||||||
@ -2696,9 +2686,6 @@ std::string get_image_params(const SDContextParams& ctx_params,
|
|||||||
if (!ctx_params.diffusion_model_path.empty()) {
|
if (!ctx_params.diffusion_model_path.empty()) {
|
||||||
parameter_string += "Unet: " + sd_basename(ctx_params.diffusion_model_path) + ", ";
|
parameter_string += "Unet: " + sd_basename(ctx_params.diffusion_model_path) + ", ";
|
||||||
}
|
}
|
||||||
if (!ctx_params.uncond_diffusion_model_path.empty()) {
|
|
||||||
parameter_string += "Uncond Unet: " + sd_basename(ctx_params.uncond_diffusion_model_path) + ", ";
|
|
||||||
}
|
|
||||||
if (!ctx_params.vae_path.empty()) {
|
if (!ctx_params.vae_path.empty()) {
|
||||||
parameter_string += "VAE: " + sd_basename(ctx_params.vae_path) + ", ";
|
parameter_string += "VAE: " + sd_basename(ctx_params.vae_path) + ", ";
|
||||||
}
|
}
|
||||||
|
|||||||
@ -56,42 +56,11 @@ struct BoolOption {
|
|||||||
bool* target;
|
bool* target;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
 * Type-erased callback used by manually parsed command-line options.
 *
 * The stored signature is `int(int argc, const char** argv, int index, bool& valid)`.
 * Callables that only accept `(argc, argv, index)` are accepted as well; they
 * are wrapped in an adapter that ignores the `valid` flag, so older
 * three-argument callbacks keep working unchanged.
 */
struct ManualFunction {
    std::function<int(int, const char**, int, bool&)> _func;

    ManualFunction() = default;

    /// Adopts an already type-erased four-argument callback.
    ManualFunction(std::function<int(int argc, const char** argv, int index, bool& valid)> func)
        : _func(std::move(func)) {
    }

    /// Accepts any callable with either the three- or the four-argument shape.
    /// Intentionally non-explicit so lambdas convert implicitly.
    template <typename F>
    ManualFunction(F func)
        : _func(make_function(std::move(func))) {  // sink parameter: move, do not copy
    }

    /// Invokes the stored callback and returns its result.
    int operator()(int argc, const char** argv, int index, bool& valid) const {
        return _func(argc, argv, index, valid);
    }

private:
    /// Builds the stored std::function from `func`, adapting the
    /// three-argument form when `func` cannot take the `bool&` flag.
    template <typename F>
    static std::function<int(int, const char**, int, bool&)> make_function(F func) {
        if constexpr (std::is_invocable_v<F, int, const char**, int, bool&>) {
            // Explicit move: a plain `return func;` would copy the callable
            // into the std::function in C++17.
            return std::function<int(int, const char**, int, bool&)>(std::move(func));
        } else {
            // Adapter drops the trailing flag; the callable is moved into the closure.
            return [func = std::move(func)](int argc, const char** argv, int index, bool&) {
                return func(argc, argv, index);
            };
        }
    }
};
|
|
||||||
|
|
||||||
struct ManualOption {
|
struct ManualOption {
|
||||||
std::string short_name;
|
std::string short_name;
|
||||||
std::string long_name;
|
std::string long_name;
|
||||||
std::string desc;
|
std::string desc;
|
||||||
ManualFunction cb;
|
std::function<int(int argc, const char** argv, int index)> cb;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ArgOptions {
|
struct ArgOptions {
|
||||||
@ -123,7 +92,6 @@ struct SDContextParams {
|
|||||||
std::string llm_vision_path;
|
std::string llm_vision_path;
|
||||||
std::string diffusion_model_path;
|
std::string diffusion_model_path;
|
||||||
std::string high_noise_diffusion_model_path;
|
std::string high_noise_diffusion_model_path;
|
||||||
std::string uncond_diffusion_model_path;
|
|
||||||
std::string embeddings_connectors_path;
|
std::string embeddings_connectors_path;
|
||||||
std::string vae_path;
|
std::string vae_path;
|
||||||
std::string vae_format = "auto";
|
std::string vae_format = "auto";
|
||||||
|
|||||||
@ -143,8 +143,6 @@ Context Options:
|
|||||||
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
|
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
|
||||||
--diffusion-model <string> path to the standalone diffusion model
|
--diffusion-model <string> path to the standalone diffusion model
|
||||||
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model
|
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model
|
||||||
--uncond-diffusion-model <string> path to the standalone unconditional diffusion model, currently used by
|
|
||||||
Ideogram4 CFG
|
|
||||||
--vae <string> path to standalone vae model
|
--vae <string> path to standalone vae model
|
||||||
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
|
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
|
||||||
--tae <string> alias of --taesd
|
--tae <string> alias of --taesd
|
||||||
|
|||||||
@ -203,9 +203,8 @@ ArgOptions SDSvrParams::get_options() {
|
|||||||
{"", "--color", "colors the logging tags according to level", true, &color},
|
{"", "--color", "colors the logging tags according to level", true, &color},
|
||||||
};
|
};
|
||||||
|
|
||||||
auto on_help_arg = [&](int, const char**, int, bool& valid) {
|
auto on_help_arg = [&](int, const char**, int) {
|
||||||
normal_exit = true;
|
normal_exit = true;
|
||||||
valid = true;
|
|
||||||
return -1;
|
return -1;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -186,7 +186,6 @@ typedef struct {
|
|||||||
const char* llm_vision_path;
|
const char* llm_vision_path;
|
||||||
const char* diffusion_model_path;
|
const char* diffusion_model_path;
|
||||||
const char* high_noise_diffusion_model_path;
|
const char* high_noise_diffusion_model_path;
|
||||||
const char* uncond_diffusion_model_path;
|
|
||||||
const char* embeddings_connectors_path;
|
const char* embeddings_connectors_path;
|
||||||
const char* vae_path;
|
const char* vae_path;
|
||||||
const char* audio_vae_path;
|
const char* audio_vae_path;
|
||||||
|
|||||||
@ -1,283 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import math
|
|
||||||
import os
|
|
||||||
import struct
|
|
||||||
from collections import Counter
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from safetensors import safe_open
|
|
||||||
|
|
||||||
|
|
||||||
# Safetensors dtype tags of the 8-bit floating-point formats.
FP8_DTYPES = {"F8_E4M3", "F8_E4M3FN", "F8_E5M2"}

# Every floating-point dtype tag, the 8-bit formats included.
FLOAT_DTYPES = {"BF16", "F16", "F32", "F64"} | FP8_DTYPES

# Size in bytes of one element, keyed by dtype tag.
DTYPE_SIZES = {
    # 1 byte
    "BOOL": 1, "U8": 1, "I8": 1, "F8_E4M3": 1, "F8_E4M3FN": 1, "F8_E5M2": 1,
    # 2 bytes
    "U16": 2, "I16": 2, "F16": 2, "BF16": 2,
    # 4 bytes
    "U32": 4, "I32": 4, "F32": 4,
    # 8 bytes
    "U64": 8, "I64": 8, "F64": 8,
}
|
|
||||||
|
|
||||||
|
|
||||||
def read_safetensors_header(path: Path):
    """Read and parse the JSON header at the start of a safetensors file.

    The file begins with an 8-byte little-endian unsigned integer giving the
    header length, followed by that many bytes of UTF-8 encoded JSON.

    Args:
        path: Location of the safetensors file.

    Returns:
        The parsed JSON header.

    Raises:
        ValueError: If the file is too short to contain the length prefix or
            the header it announces (json.JSONDecodeError, raised for
            malformed JSON, is a ValueError subclass as well).
    """
    file_size = path.stat().st_size
    with path.open("rb") as f:
        prefix = f.read(8)
        if len(prefix) != 8:
            raise ValueError(f"{path}: truncated safetensors file (missing header length)")
        (header_len,) = struct.unpack("<Q", prefix)
        # Reject a corrupt length before attempting a potentially huge read.
        if header_len > file_size - 8:
            raise ValueError(
                f"{path}: header length {header_len} exceeds file size {file_size}"
            )
        raw_header = f.read(header_len)
    if len(raw_header) != header_len:
        raise ValueError(f"{path}: truncated safetensors header")
    # Trailing whitespace after the JSON text is stripped before parsing.
    return json.loads(raw_header.decode("utf-8").rstrip())
|
|
||||||
|
|
||||||
|
|
||||||
def numel(shape):
    """Return the number of elements described by ``shape``.

    An empty (or otherwise falsy) shape counts as a single element.
    """
    if not shape:
        return 1
    return math.prod(shape)
|
|
||||||
|
|
||||||
|
|
||||||
def scale_key_for_weight(name: str):
    """Return the key of the scale tensor that would accompany ``name``.

    Names ending in ``weight`` map to the same name with ``_scale`` appended
    (``foo.weight`` -> ``foo.weight_scale``); any other name yields ``None``.
    """
    if not name.endswith("weight"):
        return None
    return f"{name}_scale"
|
|
||||||
|
|
||||||
|
|
||||||
def tensor_nbytes(dtype: str, shape):
    """Return the byte size of a tensor with the given dtype tag and shape.

    Raises:
        KeyError: If ``dtype`` has no entry in ``DTYPE_SIZES``.
    """
    element_count = numel(shape)
    bytes_per_element = DTYPE_SIZES[dtype]
    return element_count * bytes_per_element
|
|
||||||
|
|
||||||
|
|
||||||
def build_output_plan(header):
    """Decide how every input tensor is written and lay out the output header.

    Args:
        header: Parsed input header; the ``__metadata__`` entry is not treated
            as a tensor.

    Returns:
        A ``(plan, output_header, data_size)`` tuple. ``plan`` lists one dict
        per tensor to write, in input order, with a ``mode`` of
        ``fp8_scaled_weight``, ``float_to_bf16`` or ``copy``. ``output_header``
        is the header for the converted file and ``data_size`` is the total
        number of tensor bytes it describes.
    """
    tensors = {key: value for key, value in header.items() if key != "__metadata__"}

    def paired_scale(tensor_name, tensor_info):
        """Return the scale key when an fp8 tensor has a matching scale entry, else None."""
        candidate = scale_key_for_weight(tensor_name)
        if tensor_info["dtype"] in FP8_DTYPES and candidate in tensors:
            return candidate
        return None

    # Scale tensors that get folded into their weight are not written themselves.
    consumed_scales = set()
    for tensor_name, tensor_info in tensors.items():
        match = paired_scale(tensor_name, tensor_info)
        if match is not None:
            consumed_scales.add(match)

    plan = []
    for tensor_name, tensor_info in tensors.items():
        if tensor_name in consumed_scales:
            continue

        src_dtype = tensor_info["dtype"]
        dims = tensor_info["shape"]
        step = {
            "name": tensor_name,
            "source_dtype": src_dtype,
            "output_dtype": "BF16",
            "shape": dims,
        }

        match = paired_scale(tensor_name, tensor_info)
        if match is not None:
            step["mode"] = "fp8_scaled_weight"
            step["scale_key"] = match
        elif src_dtype in FLOAT_DTYPES:
            step["mode"] = "float_to_bf16"
        else:
            # Non-float tensors keep their dtype and are copied verbatim.
            step["output_dtype"] = src_dtype
            step["mode"] = "copy"
        plan.append(step)

    metadata = dict(header.get("__metadata__", {}) or {})
    metadata.update(format="pt", conversion="fp8_weight_scale_to_bf16")

    # Tensors are laid out back to back in plan order.
    output_header = {"__metadata__": metadata}
    cursor = 0
    for step in plan:
        end = cursor + tensor_nbytes(step["output_dtype"], step["shape"])
        output_header[step["name"]] = {
            "dtype": step["output_dtype"],
            "shape": step["shape"],
            "data_offsets": [cursor, end],
        }
        cursor = end

    return plan, output_header, cursor
|
|
||||||
|
|
||||||
|
|
||||||
def write_tensor_bytes(out, tensor):
    """Append the raw bytes of ``tensor`` to the open binary file ``out``.

    The tensor is detached, moved to the CPU and made contiguous first.
    Empty tensors write nothing. bfloat16 and the float8 dtypes are
    reinterpreted as same-width integers before the ``.numpy()`` call; all
    other dtypes are converted with ``.numpy()`` directly.

    Args:
        out: Binary file object; it is passed to ``ndarray.tofile``.
        tensor: The torch tensor to serialize.
    """
    tensor = tensor.detach().cpu().contiguous()
    if tensor.numel() == 0:
        return

    # The float8 dtypes may be missing from the installed torch build; skip absent ones.
    fp8_dtypes = tuple(
        dtype
        for dtype in (
            getattr(torch, "float8_e4m3fn", None),
            getattr(torch, "float8_e5m2", None),
        )
        if dtype is not None
    )

    if tensor.dtype == torch.bfloat16:
        # int16 has the same width and bit pattern as uint16 but, unlike
        # torch.uint16, is available in every PyTorch release.
        tensor.view(torch.int16).numpy().tofile(out)
    elif tensor.dtype in fp8_dtypes:
        tensor.view(torch.uint8).numpy().tofile(out)
    else:
        tensor.numpy().tofile(out)
|
|
||||||
|
|
||||||
|
|
||||||
def scale_view_for_chunk(scale, chunk, first_dim_start=0, first_dim_end=None):
    """Return ``scale`` as float32, shaped to multiply against ``chunk``.

    * A single-element scale is reshaped to all-ones dimensions matching the
      rank of ``chunk``.
    * A 1-D scale is sliced to ``[first_dim_start:first_dim_end]`` when
      ``first_dim_end`` is given and the scale is at least that long; if its
      length then equals the first dimension of ``chunk`` it is reshaped to a
      column (trailing dimensions of size 1).
    * Anything else is returned as float32 without reshaping.
    """
    scale_f32 = scale.to(torch.float32)

    if scale_f32.numel() == 1:
        return scale_f32.reshape([1] * chunk.ndim)

    one_scale_per_row = chunk.ndim > 0 and scale_f32.ndim == 1
    if not one_scale_per_row:
        return scale_f32

    covers_chunk = first_dim_end is not None and scale_f32.shape[0] >= first_dim_end
    if covers_chunk:
        scale_f32 = scale_f32[first_dim_start:first_dim_end]

    if scale_f32.shape[0] != chunk.shape[0]:
        return scale_f32

    column_shape = [scale_f32.shape[0]] + [1] * (chunk.ndim - 1)
    return scale_f32.reshape(column_shape)
|
|
||||||
|
|
||||||
|
|
||||||
def write_scaled_fp8_weight(out, weight, scale, chunk_rows):
    """Multiply ``weight`` by ``scale`` in float32 and write the result as bfloat16.

    A 0-dim weight is handled in one step. Otherwise the weight is processed
    in slices of at most ``chunk_rows`` along its first dimension, each slice
    being scaled and written before the next one is converted.
    """
    if weight.ndim == 0:
        scaled = weight.to(torch.float32) * scale_view_for_chunk(scale, weight)
        write_tensor_bytes(out, scaled.to(torch.bfloat16))
        return

    total_rows = weight.shape[0]
    for row_begin in range(0, total_rows, chunk_rows):
        row_end = min(row_begin + chunk_rows, total_rows)
        block = weight[row_begin:row_end].to(torch.float32)
        block_scale = scale_view_for_chunk(scale, block, row_begin, row_end)
        scaled_block = block * block_scale
        write_tensor_bytes(out, scaled_block.to(torch.bfloat16))
|
|
||||||
|
|
||||||
|
|
||||||
def write_float_as_bf16(out, tensor, chunk_rows):
    """Write ``tensor`` to ``out`` converted to bfloat16.

    Tensors that are already bfloat16, and 0-dim tensors, are written in a
    single call. Everything else is converted and written in slices of at most
    ``chunk_rows`` along the first dimension.
    """
    if tensor.dtype == torch.bfloat16 or tensor.ndim == 0:
        # .to() is a no-op for a tensor that is already bfloat16.
        write_tensor_bytes(out, tensor.to(torch.bfloat16))
        return

    total_rows = tensor.shape[0]
    for row_begin in range(0, total_rows, chunk_rows):
        row_end = min(row_begin + chunk_rows, total_rows)
        block = tensor[row_begin:row_end]
        write_tensor_bytes(out, block.to(torch.bfloat16))
|
|
||||||
|
|
||||||
|
|
||||||
def convert(input_path: Path, output_path: Path, chunk_rows: int, dry_run: bool):
    """Convert an fp8 safetensors checkpoint to bf16 and write it to ``output_path``.

    A summary of the planned conversion is printed first. Unless ``dry_run``
    is set, the result is written to ``<output_path>.tmp`` and renamed over
    ``output_path`` only after the written size has been verified. If
    anything fails while writing, the temporary file is removed so that a
    later run is not blocked by a stale ``.tmp`` file.

    Args:
        input_path: Source safetensors file.
        output_path: Destination file; must not exist yet.
        chunk_rows: Maximum number of first-dimension rows converted at once.
        dry_run: When true, only print the summary and return.

    Raises:
        FileExistsError: If ``output_path`` or its ``.tmp`` sibling already exists.
        RuntimeError: If the number of bytes written differs from the expected size.
    """
    header = read_safetensors_header(input_path)
    plan, output_header, data_size = build_output_plan(header)

    source_counts = Counter(item["source_dtype"] for item in plan)
    output_counts = Counter(item["output_dtype"] for item in plan)
    scaled_count = sum(item["mode"] == "fp8_scaled_weight" for item in plan)
    # One scale tensor is folded into (and dropped for) each dequantized weight.
    dropped_scales = scaled_count
    header_bytes = json.dumps(output_header, separators=(",", ":")).encode("utf-8")
    # 8-byte length prefix + JSON header + tensor data.
    expected_size = 8 + len(header_bytes) + data_size

    print(f"input: {input_path}")
    print(f"output: {output_path}")
    print(f"tensors written: {len(plan)}")
    print(f"scaled fp8 weights dequantized: {scaled_count}")
    print(f"weight_scale tensors dropped: {dropped_scales}")
    print(f"source dtypes: {dict(sorted(source_counts.items()))}")
    print(f"output dtypes: {dict(sorted(output_counts.items()))}")
    print(f"expected output size: {expected_size / (1024 ** 3):.2f} GiB")

    if dry_run:
        return

    if output_path.exists():
        raise FileExistsError(f"{output_path} already exists; pass --overwrite to replace it")

    tmp_path = output_path.with_suffix(output_path.suffix + ".tmp")
    if tmp_path.exists():
        raise FileExistsError(f"{tmp_path} already exists; remove it or choose another output")

    try:
        with safe_open(str(input_path), framework="pt", device="cpu") as sf, tmp_path.open("wb") as out:
            out.write(struct.pack("<Q", len(header_bytes)))
            out.write(header_bytes)

            for index, item in enumerate(plan, 1):
                name = item["name"]
                print(f"[{index:04d}/{len(plan):04d}] {name} -> {item['output_dtype']}")

                tensor = sf.get_tensor(name)
                if item["mode"] == "fp8_scaled_weight":
                    scale = sf.get_tensor(item["scale_key"])
                    write_scaled_fp8_weight(out, tensor, scale, chunk_rows)
                elif item["mode"] == "float_to_bf16":
                    write_float_as_bf16(out, tensor, chunk_rows)
                else:
                    write_tensor_bytes(out, tensor)

            actual_size = out.tell()

        if actual_size != expected_size:
            raise RuntimeError(f"wrote {actual_size} bytes, expected {expected_size} bytes")
    except BaseException:
        # Never leave a partial temp file behind (covers Ctrl-C as well);
        # both files are closed by the time this runs.
        tmp_path.unlink(missing_ok=True)
        raise

    tmp_path.replace(output_path)
    print("done")
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Parse the command line and run the conversion.

    Raises:
        ValueError: If ``--chunk-rows`` is smaller than 1.
        FileNotFoundError: If the input file does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Convert an fp8 safetensors checkpoint with weight_scale tensors to bf16."
    )
    parser.add_argument("--input", default="ideogram4_fp8.safetensors", type=Path)
    parser.add_argument("--output", default="ideogram4_bf16.safetensors", type=Path)
    parser.add_argument("--chunk-rows", default=1024, type=int)
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args()

    input_path = args.input.resolve()
    output_path = args.output.resolve()

    if args.chunk_rows < 1:
        raise ValueError("--chunk-rows must be >= 1")
    if not input_path.exists():
        raise FileNotFoundError(input_path)
    # A dry run writes nothing, so it must not delete an existing output either.
    if args.overwrite and not args.dry_run and output_path.exists():
        output_path.unlink()

    convert(input_path, output_path, args.chunk_rows, args.dry_run)


if __name__ == "__main__":
    main()
|
|
||||||
@ -1759,8 +1759,6 @@ struct LLMEmbedder : public Conditioner {
|
|||||||
arch = LLM::LLMArch::GPT_OSS_20B;
|
arch = LLM::LLMArch::GPT_OSS_20B;
|
||||||
} else if (sd_version_is_pid(version)) {
|
} else if (sd_version_is_pid(version)) {
|
||||||
arch = LLM::LLMArch::GEMMA2_2B;
|
arch = LLM::LLMArch::GEMMA2_2B;
|
||||||
} else if (sd_version_is_ideogram4(version)) {
|
|
||||||
arch = LLM::LLMArch::QWEN3_VL;
|
|
||||||
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
|
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
|
||||||
arch = LLM::LLMArch::QWEN3;
|
arch = LLM::LLMArch::QWEN3;
|
||||||
}
|
}
|
||||||
@ -2103,14 +2101,6 @@ struct LLMEmbedder : public Conditioner {
|
|||||||
prompt_attn_range.second = static_cast<int>(prompt.size());
|
prompt_attn_range.second = static_cast<int>(prompt.size());
|
||||||
|
|
||||||
prompt += "[/INST]";
|
prompt += "[/INST]";
|
||||||
} else if (sd_version_is_ideogram4(version)) {
|
|
||||||
prompt_template_encode_start_idx = 0;
|
|
||||||
out_layers = {1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 36};
|
|
||||||
|
|
||||||
prompt = "<|im_start|>user\n";
|
|
||||||
prompt += conditioner_params.text;
|
|
||||||
prompt += "<|im_end|>\n<|im_start|>assistant\n";
|
|
||||||
prompt_attn_range = {0, 0};
|
|
||||||
} else if (sd_version_is_ernie_image(version)) {
|
} else if (sd_version_is_ernie_image(version)) {
|
||||||
prompt_template_encode_start_idx = 0;
|
prompt_template_encode_start_idx = 0;
|
||||||
out_layers = {25}; // -2
|
out_layers = {25}; // -2
|
||||||
|
|||||||
@ -1708,7 +1708,6 @@ protected:
|
|||||||
|
|
||||||
size_t max_graph_vram_bytes = 0;
|
size_t max_graph_vram_bytes = 0;
|
||||||
bool stream_layers_enabled = false;
|
bool stream_layers_enabled = false;
|
||||||
size_t observed_max_effective_budget_ = 0;
|
|
||||||
|
|
||||||
sd::layer_registry::LayerRegistry layer_registry_;
|
sd::layer_registry::LayerRegistry layer_registry_;
|
||||||
|
|
||||||
@ -2447,7 +2446,7 @@ protected:
|
|||||||
constexpr size_t safety_margin = 512ull * 1024 * 1024;
|
constexpr size_t safety_margin = 512ull * 1024 * 1024;
|
||||||
size_t free_clamp = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0;
|
size_t free_clamp = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0;
|
||||||
if (free_clamp < effective_budget) {
|
if (free_clamp < effective_budget) {
|
||||||
LOG_DEBUG("%s clamping streaming budget: actual free VRAM %.2f MB < user cap %.2f MB",
|
LOG_INFO("%s clamping streaming budget: actual free VRAM %.2f MB < user cap %.2f MB",
|
||||||
get_desc().c_str(),
|
get_desc().c_str(),
|
||||||
free_clamp / (1024.0 * 1024.0),
|
free_clamp / (1024.0 * 1024.0),
|
||||||
effective_budget / (1024.0 * 1024.0));
|
effective_budget / (1024.0 * 1024.0));
|
||||||
@ -2456,16 +2455,6 @@ protected:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool budget_increased = false;
|
|
||||||
if (stream_layers_enabled) {
|
|
||||||
if (effective_budget > observed_max_effective_budget_) {
|
|
||||||
observed_max_effective_budget_ = effective_budget;
|
|
||||||
budget_increased = true;
|
|
||||||
} else {
|
|
||||||
effective_budget = observed_max_effective_budget_;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (effective_budget_out != nullptr) {
|
if (effective_budget_out != nullptr) {
|
||||||
*effective_budget_out = effective_budget;
|
*effective_budget_out = effective_budget;
|
||||||
}
|
}
|
||||||
@ -2477,15 +2466,9 @@ protected:
|
|||||||
params_tensor_set_,
|
params_tensor_set_,
|
||||||
get_desc().c_str());
|
get_desc().c_str());
|
||||||
if (stream_layers_enabled) {
|
if (stream_layers_enabled) {
|
||||||
if (budget_increased) {
|
|
||||||
LOG_INFO("%s streaming budget = %.2f MB",
|
LOG_INFO("%s streaming budget = %.2f MB",
|
||||||
get_desc().c_str(),
|
get_desc().c_str(),
|
||||||
effective_budget / (1024.0 * 1024.0));
|
effective_budget / (1024.0 * 1024.0));
|
||||||
} else {
|
|
||||||
LOG_DEBUG("%s streaming budget = %.2f MB",
|
|
||||||
get_desc().c_str(),
|
|
||||||
effective_budget / (1024.0 * 1024.0));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -3034,18 +3017,7 @@ public:
|
|||||||
LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str());
|
LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str());
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
// Pinned host buffer when CPU-offloaded for DMA-direct H2D.
|
params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
|
||||||
ggml_backend_buffer_type_t params_buft = nullptr;
|
|
||||||
if (params_backend != runtime_backend) {
|
|
||||||
ggml_backend_dev_t runtime_dev = ggml_backend_get_device(runtime_backend);
|
|
||||||
if (runtime_dev != nullptr) {
|
|
||||||
params_buft = ggml_backend_dev_host_buffer_type(runtime_dev);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (params_buft == nullptr) {
|
|
||||||
params_buft = ggml_backend_get_default_buffer_type(params_backend);
|
|
||||||
}
|
|
||||||
params_buffer = ggml_backend_alloc_ctx_tensors_from_buft(params_ctx, params_buft);
|
|
||||||
if (params_buffer == nullptr) {
|
if (params_buffer == nullptr) {
|
||||||
LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
|
LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
|
||||||
get_desc().c_str(),
|
get_desc().c_str(),
|
||||||
@ -3070,7 +3042,6 @@ public:
|
|||||||
ggml_backend_buffer_free(params_buffer);
|
ggml_backend_buffer_free(params_buffer);
|
||||||
params_buffer = nullptr;
|
params_buffer = nullptr;
|
||||||
}
|
}
|
||||||
observed_max_effective_budget_ = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t get_params_buffer_size() {
|
size_t get_params_buffer_size() {
|
||||||
@ -3347,14 +3318,11 @@ protected:
|
|||||||
bool bias;
|
bool bias;
|
||||||
bool force_f32;
|
bool force_f32;
|
||||||
bool force_prec_f32;
|
bool force_prec_f32;
|
||||||
bool allow_weight_scale;
|
|
||||||
bool has_weight_scale = false;
|
|
||||||
float scale;
|
float scale;
|
||||||
std::string prefix;
|
std::string prefix;
|
||||||
|
|
||||||
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
|
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
|
||||||
this->prefix = prefix;
|
this->prefix = prefix;
|
||||||
has_weight_scale = false;
|
|
||||||
enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
|
enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
|
||||||
if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
|
if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
|
||||||
wtype = GGML_TYPE_F32;
|
wtype = GGML_TYPE_F32;
|
||||||
@ -3364,10 +3332,6 @@ protected:
|
|||||||
enum ggml_type wtype = GGML_TYPE_F32;
|
enum ggml_type wtype = GGML_TYPE_F32;
|
||||||
params["bias"] = ggml_new_tensor_1d(ctx, wtype, out_features);
|
params["bias"] = ggml_new_tensor_1d(ctx, wtype, out_features);
|
||||||
}
|
}
|
||||||
if (allow_weight_scale && tensor_storage_map.find(prefix + "weight_scale") != tensor_storage_map.end()) {
|
|
||||||
params["weight_scale"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_features);
|
|
||||||
has_weight_scale = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@ -3376,14 +3340,12 @@ public:
|
|||||||
bool bias = true,
|
bool bias = true,
|
||||||
bool force_f32 = false,
|
bool force_f32 = false,
|
||||||
bool force_prec_f32 = false,
|
bool force_prec_f32 = false,
|
||||||
float scale = 1.f,
|
float scale = 1.f)
|
||||||
bool allow_weight_scale = false)
|
|
||||||
: in_features(in_features),
|
: in_features(in_features),
|
||||||
out_features(out_features),
|
out_features(out_features),
|
||||||
bias(bias),
|
bias(bias),
|
||||||
force_f32(force_f32),
|
force_f32(force_f32),
|
||||||
force_prec_f32(force_prec_f32),
|
force_prec_f32(force_prec_f32),
|
||||||
allow_weight_scale(allow_weight_scale),
|
|
||||||
scale(scale) {}
|
scale(scale) {}
|
||||||
|
|
||||||
void set_scale(float scale_) {
|
void set_scale(float scale_) {
|
||||||
@ -3400,24 +3362,14 @@ public:
|
|||||||
if (bias) {
|
if (bias) {
|
||||||
b = params["bias"];
|
b = params["bias"];
|
||||||
}
|
}
|
||||||
ggml_tensor* linear_bias = has_weight_scale ? nullptr : b;
|
|
||||||
ggml_tensor* out = nullptr;
|
|
||||||
if (ctx->weight_adapter) {
|
if (ctx->weight_adapter) {
|
||||||
WeightAdapter::ForwardParams forward_params;
|
WeightAdapter::ForwardParams forward_params;
|
||||||
forward_params.op_type = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR;
|
forward_params.op_type = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR;
|
||||||
forward_params.linear.force_prec_f32 = force_prec_f32;
|
forward_params.linear.force_prec_f32 = force_prec_f32;
|
||||||
forward_params.linear.scale = scale;
|
forward_params.linear.scale = scale;
|
||||||
out = ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, ctx->backend, x, w, linear_bias, prefix, forward_params);
|
return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, ctx->backend, x, w, b, prefix, forward_params);
|
||||||
} else {
|
|
||||||
out = ggml_ext_linear(ctx->ggml_ctx, x, w, linear_bias, force_prec_f32, scale);
|
|
||||||
}
|
}
|
||||||
if (has_weight_scale) {
|
return ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale);
|
||||||
out = ggml_mul(ctx->ggml_ctx, out, params["weight_scale"]);
|
|
||||||
if (b != nullptr) {
|
|
||||||
out = ggml_add_inplace(ctx->ggml_ctx, out, b);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return out;
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -699,7 +699,7 @@ namespace sd::ggml_graph_cut {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (log_desc != nullptr) {
|
if (log_desc != nullptr) {
|
||||||
LOG_DEBUG("%s graph cut max_vram budget merge took %lld ms",
|
LOG_INFO("%s graph cut max_vram budget merge took %lld ms",
|
||||||
log_desc,
|
log_desc,
|
||||||
ggml_time_ms() - t_budget_begin);
|
ggml_time_ms() - t_budget_begin);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,527 +0,0 @@
|
|||||||
#ifndef __IDEOGRAM4_HPP__
|
|
||||||
#define __IDEOGRAM4_HPP__
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <cmath>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <memory>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "diffusion_model.hpp"
|
|
||||||
#include "ggml_extend.hpp"
|
|
||||||
#include "ggml_graph_cut.h"
|
|
||||||
#include "rope.hpp"
|
|
||||||
|
|
||||||
namespace Ideogram4 {
|
|
||||||
// Node capacity handed to new_graph_custom() when the runner builds its graph.
constexpr int IDEOGRAM4_GRAPH_SIZE = 65536;

// NOTE(review): not referenced anywhere else in this header — confirm whether
// it is still needed.
constexpr int OUTPUT_IMAGE_INDICATOR = 2;

// Base value added to every image-token position id in gen_ideogram4_pe().
constexpr int IMAGE_POSITION_OFFSET = 65536;

// Default per-axis MRoPE section sizes used by Ideogram4Config::mrope_section.
constexpr int DEFAULT_MROPE_SECTION_T = 24;
constexpr int DEFAULT_MROPE_SECTION_H = 20;
constexpr int DEFAULT_MROPE_SECTION_W = 20;

// max_period argument forwarded to ggml_ext_timestep_embedding().
constexpr int TIMESTEP_MAX_PERIOD = 10000;

// Number of LLM hidden-state layers packed into the conditioning tensor.
constexpr int LLM_HIDDEN_STATE_LAYERS = 13;
|
|
||||||
|
|
||||||
struct Ideogram4Config {
|
|
||||||
int64_t emb_dim = 4608;
|
|
||||||
int64_t num_layers = 34;
|
|
||||||
int64_t num_heads = 18;
|
|
||||||
int64_t intermediate_size = 12288;
|
|
||||||
int64_t adanln_dim = 512;
|
|
||||||
int64_t in_channels = 128;
|
|
||||||
int64_t llm_features_dim = 53248;
|
|
||||||
int64_t rope_theta = 5000000;
|
|
||||||
float norm_eps = 1e-5f;
|
|
||||||
int patch_size = 2;
|
|
||||||
int ae_channels = 32;
|
|
||||||
std::vector<int> mrope_section = {DEFAULT_MROPE_SECTION_T,
|
|
||||||
DEFAULT_MROPE_SECTION_H,
|
|
||||||
DEFAULT_MROPE_SECTION_W};
|
|
||||||
};
|
|
||||||
|
|
||||||
// Builds a timestep embedding of width `dim` and swaps its two halves along
// dimension 0, so the second half produced by ggml_ext_timestep_embedding()
// comes first.
// NOTE(review): presumably this turns a [cos, sin] layout into [sin, cos] —
// confirm against ggml_ext_timestep_embedding.
__STATIC_INLINE__ ggml_tensor* timestep_embedding_sin_cos(ggml_context* ctx,
                                                          ggml_tensor* timesteps,
                                                          int dim) {
    GGML_ASSERT(dim % 2 == 0);

    auto raw    = ggml_ext_timestep_embedding(ctx, timesteps, dim, TIMESTEP_MAX_PERIOD, 10.f);
    auto halves = ggml_ext_chunk(ctx, raw, 2, 0);

    auto leading  = halves[1];
    auto trailing = halves[0];
    return ggml_concat(ctx, leading, trailing, 0);
}
|
|
||||||
|
|
||||||
// Inserts a singleton token axis so a per-sample modulation vector broadcasts
// over tokens: [N, C] -> [N, 1, C] in PyTorch layout. A tensor that already
// has at least three dims with ne[1] == 1 is returned unchanged.
__STATIC_INLINE__ ggml_tensor* to_token_modulation(ggml_context* ctx, ggml_tensor* x) {
    const bool has_token_axis = ggml_n_dims(x) >= 3 && x->ne[1] == 1;
    if (has_token_axis) {
        return x;
    }
    return ggml_reshape_3d(ctx, x, x->ne[0], 1, x->ne[1]);
}
|
|
||||||
|
|
||||||
// Reorders the packed LLM features so the layer index becomes the fastest
// varying component inside each token, matching upstream
// stack(...).permute(1, 2, 3, 0).reshape(...):
// [layers * hidden, tokens, batch] -> [hidden * layers, tokens, batch].
__STATIC_INLINE__ ggml_tensor* interleave_hidden_state_layers(ggml_context* ctx, ggml_tensor* x) {
    GGML_ASSERT(x->ne[0] % LLM_HIDDEN_STATE_LAYERS == 0);

    const int64_t per_layer = x->ne[0] / LLM_HIDDEN_STATE_LAYERS;
    const int64_t n_tokens  = x->ne[1];
    const int64_t n_batch   = x->ne[2];

    // Split the feature axis into (hidden, layer), swap the two, then flatten again.
    ggml_tensor* split   = ggml_reshape_4d(ctx, x, per_layer, LLM_HIDDEN_STATE_LAYERS, n_tokens, n_batch);
    ggml_tensor* swapped = ggml_permute(ctx, split, 1, 0, 2, 3);
    swapped              = ggml_cont(ctx, swapped);
    return ggml_reshape_3d(ctx, swapped, per_layer * LLM_HIDDEN_STATE_LAYERS, n_tokens, n_batch);
}
|
|
||||||
|
|
||||||
// Applies a multiplicative modulation: returns x + x * scale, with `scale`
// first reshaped by to_token_modulation() so it broadcasts over tokens.
__STATIC_INLINE__ ggml_tensor* modulate(ggml_context* ctx, ggml_tensor* x, ggml_tensor* scale) {
    ggml_tensor* token_scale = to_token_modulation(ctx, scale);
    ggml_tensor* scaled      = ggml_mul(ctx, x, token_scale);
    return ggml_add(ctx, x, scaled);
}
|
|
||||||
|
|
||||||
// Converts a latent image into a token sequence.
//   x:      [N, 128, H, W] with channel order [ae, ph, pw]
//   return: [N, H*W, 128] with token channel order [ph, pw, ae]
// Only batch size 1 is supported (asserted).
__STATIC_INLINE__ ggml_tensor* patchify(ggml_context* ctx, ggml_tensor* x, const Ideogram4Config& config) {
    const int64_t W = x->ne[0];
    const int64_t H = x->ne[1];
    const int64_t C = x->ne[2];
    const int64_t N = x->ne[3];

    GGML_ASSERT(N == 1);
    GGML_ASSERT(C == config.ae_channels * config.patch_size * config.patch_size);

    const int64_t n_tokens = W * H;

    ggml_tensor* tokens = ggml_cont(ctx, x);
    tokens              = ggml_reshape_4d(ctx, tokens, n_tokens, config.patch_size, config.patch_size, config.ae_channels);
    // Swap the spatial axis with the ae axis so channels become the innermost dim.
    tokens = ggml_permute(ctx, tokens, 3, 1, 2, 0);
    tokens = ggml_cont(ctx, tokens);
    return ggml_reshape_3d(ctx, tokens, C, n_tokens, N);
}
|
|
||||||
|
|
||||||
// Inverse of patchify(): turns a token sequence with H * W tokens back into a
// latent image with ggml dims (W, H, C, N). Only batch size 1 is supported
// (asserted).
__STATIC_INLINE__ ggml_tensor* unpatchify(ggml_context* ctx,
                                          ggml_tensor* x,
                                          int64_t H,
                                          int64_t W,
                                          const Ideogram4Config& config) {
    const int64_t C = x->ne[0];
    const int64_t N = x->ne[2];

    GGML_ASSERT(N == 1);
    GGML_ASSERT(C == config.ae_channels * config.patch_size * config.patch_size);
    GGML_ASSERT(x->ne[1] == H * W);

    ggml_tensor* image = ggml_reshape_4d(ctx, x, config.ae_channels, config.patch_size, config.patch_size, H * W);
    // Swap the ae axis with the spatial axis so spatial positions become innermost.
    image = ggml_permute(ctx, image, 3, 1, 2, 0);
    image = ggml_cont(ctx, image);
    return ggml_reshape_4d(ctx, image, W, H, C, N);
}
|
|
||||||
|
|
||||||
// Factory for the Linear layers used throughout this model. It differs from a
// plain Linear only in passing allow_weight_scale = true, so a "weight_scale"
// tensor is picked up when the checkpoint provides one.
__STATIC_INLINE__ std::shared_ptr<Linear> make_linear(int64_t in_features,
                                                      int64_t out_features,
                                                      bool bias = true) {
    const bool force_f32          = false;
    const bool force_prec_f32     = false;
    const float scale             = 1.f;
    const bool allow_weight_scale = true;
    return std::make_shared<Linear>(in_features,
                                    out_features,
                                    bias,
                                    force_f32,
                                    force_prec_f32,
                                    scale,
                                    allow_weight_scale);
}
|
|
||||||
|
|
||||||
// Builds the rotary position table for a sequence of `context_len` text tokens
// followed by grid_h * grid_w image tokens (row-major).
//   - text token i gets the position id (i, i, i)
//   - image token (y, x) gets (OFFSET, OFFSET + y, OFFSET + x), where OFFSET is
//     IMAGE_POSITION_OFFSET
// The ids are then expanded by Rope::embed_interleaved_mrope(). Only bs == 1 is
// supported (asserted).
__STATIC_INLINE__ std::vector<float> gen_ideogram4_pe(int grid_h,
                                                      int grid_w,
                                                      int bs,
                                                      int context_len,
                                                      int head_dim,
                                                      int rope_theta,
                                                      const std::vector<int>& mrope_section) {
    GGML_ASSERT(bs == 1);

    const std::vector<float> zero_id(3, 0.f);
    std::vector<std::vector<float>> ids(static_cast<size_t>(bs) * (context_len + grid_h * grid_w), zero_id);

    // Text tokens: identical position on all three axes.
    for (int t = 0; t < context_len; ++t) {
        const float pos = static_cast<float>(t);
        ids[t]          = {pos, pos, pos};
    }

    // Image tokens follow the text tokens; walk the grid as one flat index.
    const int image_tokens = grid_h * grid_w;
    for (int flat = 0; flat < image_tokens; ++flat) {
        const int row             = flat / grid_w;
        const int col             = flat % grid_w;
        ids[context_len + flat] = {static_cast<float>(IMAGE_POSITION_OFFSET),
                                   static_cast<float>(IMAGE_POSITION_OFFSET + row),
                                   static_cast<float>(IMAGE_POSITION_OFFSET + col)};
    }

    return Rope::embed_interleaved_mrope(ids, bs, static_cast<float>(rope_theta), head_dim, mrope_section);
}
|
|
||||||
|
|
||||||
class Ideogram4Attention : public GGMLBlock {
|
|
||||||
protected:
|
|
||||||
int64_t hidden_size;
|
|
||||||
int64_t num_heads;
|
|
||||||
int64_t head_dim;
|
|
||||||
|
|
||||||
public:
|
|
||||||
Ideogram4Attention(int64_t hidden_size, int64_t num_heads, float eps)
|
|
||||||
: hidden_size(hidden_size), num_heads(num_heads), head_dim(hidden_size / num_heads) {
|
|
||||||
GGML_ASSERT(hidden_size % num_heads == 0);
|
|
||||||
blocks["qkv"] = make_linear(hidden_size, hidden_size * 3, false);
|
|
||||||
blocks["norm_q"] = std::make_shared<RMSNorm>(head_dim, eps);
|
|
||||||
blocks["norm_k"] = std::make_shared<RMSNorm>(head_dim, eps);
|
|
||||||
blocks["o"] = make_linear(hidden_size, hidden_size, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
|
||||||
ggml_tensor* x,
|
|
||||||
ggml_tensor* pe,
|
|
||||||
ggml_tensor* mask = nullptr) {
|
|
||||||
int64_t n_token = x->ne[1];
|
|
||||||
int64_t N = x->ne[2];
|
|
||||||
|
|
||||||
auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
|
|
||||||
auto norm_q = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
|
|
||||||
auto norm_k = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
|
|
||||||
auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["o"]);
|
|
||||||
|
|
||||||
auto qkv = qkv_proj->forward(ctx, x);
|
|
||||||
auto qkv_vec = split_qkv(ctx->ggml_ctx, qkv);
|
|
||||||
auto q = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, n_token, N);
|
|
||||||
auto k = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, n_token, N);
|
|
||||||
auto v = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[2], head_dim, num_heads, n_token, N);
|
|
||||||
|
|
||||||
q = norm_q->forward(ctx, q);
|
|
||||||
k = norm_k->forward(ctx, k);
|
|
||||||
|
|
||||||
x = Rope::attention(ctx, q, k, v, pe, mask, 1.f / 128.f, false);
|
|
||||||
x = out_proj->forward(ctx, x);
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
class Ideogram4MLP : public GGMLBlock {
|
|
||||||
public:
|
|
||||||
Ideogram4MLP(int64_t dim, int64_t hidden_dim) {
|
|
||||||
blocks["w1"] = make_linear(dim, hidden_dim, false);
|
|
||||||
blocks["w2"] = make_linear(hidden_dim, dim, false);
|
|
||||||
blocks["w3"] = make_linear(dim, hidden_dim, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
|
||||||
auto w1 = std::dynamic_pointer_cast<Linear>(blocks["w1"]);
|
|
||||||
auto w2 = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
|
|
||||||
auto w3 = std::dynamic_pointer_cast<Linear>(blocks["w3"]);
|
|
||||||
|
|
||||||
auto x1 = ggml_silu(ctx->ggml_ctx, w1->forward(ctx, x));
|
|
||||||
auto x3 = w3->forward(ctx, x);
|
|
||||||
x = ggml_mul(ctx->ggml_ctx, x1, x3);
|
|
||||||
x = w2->forward(ctx, x);
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
class Ideogram4TransformerBlock : public GGMLBlock {
|
|
||||||
public:
|
|
||||||
Ideogram4TransformerBlock(const Ideogram4Config& config) {
|
|
||||||
blocks["attention"] = std::make_shared<Ideogram4Attention>(config.emb_dim, config.num_heads, config.norm_eps);
|
|
||||||
blocks["feed_forward"] = std::make_shared<Ideogram4MLP>(config.emb_dim, config.intermediate_size);
|
|
||||||
blocks["attention_norm1"] = std::make_shared<RMSNorm>(config.emb_dim, config.norm_eps);
|
|
||||||
blocks["ffn_norm1"] = std::make_shared<RMSNorm>(config.emb_dim, config.norm_eps);
|
|
||||||
blocks["attention_norm2"] = std::make_shared<RMSNorm>(config.emb_dim, config.norm_eps);
|
|
||||||
blocks["ffn_norm2"] = std::make_shared<RMSNorm>(config.emb_dim, config.norm_eps);
|
|
||||||
blocks["adaln_modulation"] = make_linear(config.adanln_dim, 4 * config.emb_dim, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
|
||||||
ggml_tensor* x,
|
|
||||||
ggml_tensor* pe,
|
|
||||||
ggml_tensor* adaln_input,
|
|
||||||
ggml_tensor* mask = nullptr) {
|
|
||||||
auto attention = std::dynamic_pointer_cast<Ideogram4Attention>(blocks["attention"]);
|
|
||||||
auto feed_forward = std::dynamic_pointer_cast<Ideogram4MLP>(blocks["feed_forward"]);
|
|
||||||
auto attention_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["attention_norm1"]);
|
|
||||||
auto ffn_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm1"]);
|
|
||||||
auto attention_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["attention_norm2"]);
|
|
||||||
auto ffn_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm2"]);
|
|
||||||
auto adaln_modulation = std::dynamic_pointer_cast<Linear>(blocks["adaln_modulation"]);
|
|
||||||
|
|
||||||
auto mod = adaln_modulation->forward(ctx, adaln_input);
|
|
||||||
auto mods = ggml_ext_chunk(ctx->ggml_ctx, mod, 4, 0);
|
|
||||||
auto scale_msa = mods[0];
|
|
||||||
auto gate_msa = to_token_modulation(ctx->ggml_ctx, ggml_tanh(ctx->ggml_ctx, mods[1]));
|
|
||||||
auto scale_mlp = mods[2];
|
|
||||||
auto gate_mlp = to_token_modulation(ctx->ggml_ctx, ggml_tanh(ctx->ggml_ctx, mods[3]));
|
|
||||||
|
|
||||||
auto attn_out = attention_norm1->forward(ctx, x);
|
|
||||||
attn_out = modulate(ctx->ggml_ctx, attn_out, scale_msa);
|
|
||||||
attn_out = attention->forward(ctx, attn_out, pe, mask);
|
|
||||||
attn_out = attention_norm2->forward(ctx, attn_out);
|
|
||||||
x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out, gate_msa));
|
|
||||||
|
|
||||||
auto ffn_out = ffn_norm1->forward(ctx, x);
|
|
||||||
ffn_out = modulate(ctx->ggml_ctx, ffn_out, scale_mlp);
|
|
||||||
ffn_out = feed_forward->forward(ctx, ffn_out);
|
|
||||||
ffn_out = ffn_norm2->forward(ctx, ffn_out);
|
|
||||||
x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, ffn_out, gate_mlp));
|
|
||||||
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
class Ideogram4EmbedScalar : public GGMLBlock {
|
|
||||||
protected:
|
|
||||||
int64_t dim;
|
|
||||||
|
|
||||||
public:
|
|
||||||
Ideogram4EmbedScalar(int64_t dim)
|
|
||||||
: dim(dim) {
|
|
||||||
blocks["mlp_in"] = make_linear(dim, dim, true);
|
|
||||||
blocks["mlp_out"] = make_linear(dim, dim, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
|
|
||||||
auto mlp_in = std::dynamic_pointer_cast<Linear>(blocks["mlp_in"]);
|
|
||||||
auto mlp_out = std::dynamic_pointer_cast<Linear>(blocks["mlp_out"]);
|
|
||||||
|
|
||||||
x = timestep_embedding_sin_cos(ctx->ggml_ctx, x, static_cast<int>(dim));
|
|
||||||
x = ggml_silu(ctx->ggml_ctx, mlp_in->forward(ctx, x));
|
|
||||||
x = mlp_out->forward(ctx, x);
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
class Ideogram4FinalLayer : public GGMLBlock {
|
|
||||||
public:
|
|
||||||
Ideogram4FinalLayer(const Ideogram4Config& config) {
|
|
||||||
blocks["norm_final"] = std::make_shared<LayerNorm>(config.emb_dim, 1e-6f, false);
|
|
||||||
blocks["linear"] = make_linear(config.emb_dim, config.in_channels, true);
|
|
||||||
blocks["adaln_modulation"] = make_linear(config.adanln_dim, config.emb_dim, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* c) {
|
|
||||||
auto norm_final = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_final"]);
|
|
||||||
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
|
|
||||||
auto adaln_modulation = std::dynamic_pointer_cast<Linear>(blocks["adaln_modulation"]);
|
|
||||||
|
|
||||||
auto scale = adaln_modulation->forward(ctx, ggml_silu(ctx->ggml_ctx, c));
|
|
||||||
x = norm_final->forward(ctx, x);
|
|
||||||
x = modulate(ctx->ggml_ctx, x, scale);
|
|
||||||
x = linear->forward(ctx, x);
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
class Ideogram4Transformer : public GGMLBlock {
|
|
||||||
protected:
|
|
||||||
Ideogram4Config config;
|
|
||||||
|
|
||||||
public:
|
|
||||||
Ideogram4Transformer() = default;
|
|
||||||
explicit Ideogram4Transformer(Ideogram4Config config)
|
|
||||||
: config(std::move(config)) {
|
|
||||||
blocks["input_proj"] = make_linear(this->config.in_channels, this->config.emb_dim, true);
|
|
||||||
blocks["llm_cond_norm"] = std::make_shared<RMSNorm>(this->config.llm_features_dim, 1e-6f);
|
|
||||||
blocks["llm_cond_proj"] = make_linear(this->config.llm_features_dim, this->config.emb_dim, true);
|
|
||||||
blocks["t_embedding"] = std::make_shared<Ideogram4EmbedScalar>(this->config.emb_dim);
|
|
||||||
blocks["adaln_proj"] = make_linear(this->config.emb_dim, this->config.adanln_dim, true);
|
|
||||||
blocks["embed_image_indicator"] = std::make_shared<Embedding>(2, this->config.emb_dim);
|
|
||||||
|
|
||||||
for (int i = 0; i < this->config.num_layers; ++i) {
|
|
||||||
blocks["layers." + std::to_string(i)] = std::make_shared<Ideogram4TransformerBlock>(this->config);
|
|
||||||
}
|
|
||||||
blocks["final_layer"] = std::make_shared<Ideogram4FinalLayer>(this->config);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor* forward(GGMLRunnerContext* ctx,
|
|
||||||
ggml_tensor* x,
|
|
||||||
ggml_tensor* timestep,
|
|
||||||
ggml_tensor* context,
|
|
||||||
ggml_tensor* pe,
|
|
||||||
ggml_tensor* image_indicator_ids) {
|
|
||||||
int64_t W = x->ne[0];
|
|
||||||
int64_t H = x->ne[1];
|
|
||||||
int64_t N = x->ne[3];
|
|
||||||
GGML_ASSERT(N == 1);
|
|
||||||
|
|
||||||
auto input_proj = std::dynamic_pointer_cast<Linear>(blocks["input_proj"]);
|
|
||||||
auto llm_cond_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["llm_cond_norm"]);
|
|
||||||
auto llm_cond_proj = std::dynamic_pointer_cast<Linear>(blocks["llm_cond_proj"]);
|
|
||||||
auto t_embedding = std::dynamic_pointer_cast<Ideogram4EmbedScalar>(blocks["t_embedding"]);
|
|
||||||
auto adaln_proj = std::dynamic_pointer_cast<Linear>(blocks["adaln_proj"]);
|
|
||||||
auto embed_image_indicator = std::dynamic_pointer_cast<Embedding>(blocks["embed_image_indicator"]);
|
|
||||||
auto final_layer = std::dynamic_pointer_cast<Ideogram4FinalLayer>(blocks["final_layer"]);
|
|
||||||
|
|
||||||
auto img = patchify(ctx->ggml_ctx, x, config);
|
|
||||||
img = input_proj->forward(ctx, img);
|
|
||||||
|
|
||||||
ggml_tensor* h = img;
|
|
||||||
int64_t context_len = 0;
|
|
||||||
if (context != nullptr) {
|
|
||||||
if (ggml_n_dims(context) < 3) {
|
|
||||||
context = ggml_reshape_3d(ctx->ggml_ctx, context, context->ne[0], context->ne[1], 1);
|
|
||||||
}
|
|
||||||
context = interleave_hidden_state_layers(ctx->ggml_ctx, context);
|
|
||||||
context_len = context->ne[1];
|
|
||||||
auto txt = llm_cond_norm->forward(ctx, context);
|
|
||||||
txt = llm_cond_proj->forward(ctx, txt);
|
|
||||||
h = ggml_concat(ctx->ggml_ctx, txt, img, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
auto indicator_embedding = embed_image_indicator->forward(ctx, image_indicator_ids);
|
|
||||||
h = ggml_add(ctx->ggml_ctx, h, indicator_embedding);
|
|
||||||
|
|
||||||
auto t_cond = t_embedding->forward(ctx, timestep);
|
|
||||||
auto adaln_input = ggml_silu(ctx->ggml_ctx, adaln_proj->forward(ctx, t_cond));
|
|
||||||
|
|
||||||
for (int i = 0; i < config.num_layers; ++i) {
|
|
||||||
auto block = std::dynamic_pointer_cast<Ideogram4TransformerBlock>(blocks["layers." + std::to_string(i)]);
|
|
||||||
h = block->forward(ctx, h, pe, adaln_input, nullptr);
|
|
||||||
sd::ggml_graph_cut::mark_graph_cut(h, "ideogram4.layers." + std::to_string(i), "hidden");
|
|
||||||
}
|
|
||||||
|
|
||||||
h = final_layer->forward(ctx, h, adaln_input);
|
|
||||||
if (context_len > 0) {
|
|
||||||
h = ggml_ext_slice(ctx->ggml_ctx, h, 1, context_len, h->ne[1]);
|
|
||||||
}
|
|
||||||
|
|
||||||
h = unpatchify(ctx->ggml_ctx, h, H, W, config);
|
|
||||||
h = ggml_ext_scale(ctx->ggml_ctx, h, -1.f);
|
|
||||||
return h;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
class Ideogram4Runner : public DiffusionModelRunner {
|
|
||||||
protected:
|
|
||||||
static int64_t detect_num_layers(const String2TensorStorage& tensor_storage_map,
|
|
||||||
const std::string& prefix) {
|
|
||||||
int64_t detected_layers = 0;
|
|
||||||
std::string layer_prefix = prefix.empty() ? "layers." : prefix + ".layers.";
|
|
||||||
for (const auto& pair : tensor_storage_map) {
|
|
||||||
const std::string& name = pair.first;
|
|
||||||
if (name.find(layer_prefix) != 0) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
std::string tail = name.substr(layer_prefix.size());
|
|
||||||
size_t dot = tail.find('.');
|
|
||||||
if (dot == std::string::npos) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
int layer_idx = std::atoi(tail.substr(0, dot).c_str());
|
|
||||||
detected_layers = std::max<int64_t>(detected_layers, layer_idx + 1);
|
|
||||||
}
|
|
||||||
return detected_layers;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool should_use_uncond_model(const DiffusionParams& diffusion_params) const {
|
|
||||||
return has_uncond_model &&
|
|
||||||
diffusion_params.context == nullptr &&
|
|
||||||
diffusion_params.y != nullptr &&
|
|
||||||
!diffusion_params.y->empty();
|
|
||||||
}
|
|
||||||
|
|
||||||
public:
|
|
||||||
Ideogram4Config config;
|
|
||||||
Ideogram4Transformer model;
|
|
||||||
Ideogram4Transformer uncond_model;
|
|
||||||
bool has_uncond_model = false;
|
|
||||||
std::string uncond_prefix;
|
|
||||||
std::vector<float> pe_vec;
|
|
||||||
std::vector<int32_t> image_indicator_vec;
|
|
||||||
|
|
||||||
Ideogram4Runner(ggml_backend_t backend,
|
|
||||||
ggml_backend_t params_backend,
|
|
||||||
const String2TensorStorage& tensor_storage_map = {},
|
|
||||||
const std::string prefix = "")
|
|
||||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
|
||||||
uncond_prefix(prefix + ".uncond") {
|
|
||||||
int64_t detected_layers = detect_num_layers(tensor_storage_map, prefix);
|
|
||||||
if (detected_layers > 0) {
|
|
||||||
config.num_layers = detected_layers;
|
|
||||||
}
|
|
||||||
|
|
||||||
model = Ideogram4Transformer(config);
|
|
||||||
model.init(params_ctx, tensor_storage_map, prefix);
|
|
||||||
for (const auto& pair : tensor_storage_map) {
|
|
||||||
const std::string& name = pair.first;
|
|
||||||
if (starts_with(name, uncond_prefix)) {
|
|
||||||
has_uncond_model = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (has_uncond_model) {
|
|
||||||
LOG_DEBUG("using uncond model");
|
|
||||||
uncond_model = Ideogram4Transformer(config);
|
|
||||||
uncond_model.init(params_ctx, tensor_storage_map, uncond_prefix);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string get_desc() override {
|
|
||||||
return "ideogram4";
|
|
||||||
}
|
|
||||||
|
|
||||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
|
|
||||||
model.get_param_tensors(tensors, prefix);
|
|
||||||
if (has_uncond_model) {
|
|
||||||
uncond_model.get_param_tensors(tensors, this->uncond_prefix);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
|
|
||||||
const sd::Tensor<float>& timesteps_tensor,
|
|
||||||
const sd::Tensor<float>& context_tensor,
|
|
||||||
bool use_uncond_model = false) {
|
|
||||||
ggml_cgraph* gf = new_graph_custom(IDEOGRAM4_GRAPH_SIZE);
|
|
||||||
ggml_tensor* x = make_input(x_tensor);
|
|
||||||
ggml_tensor* timesteps = make_input(timesteps_tensor);
|
|
||||||
GGML_ASSERT(x->ne[3] == 1);
|
|
||||||
Ideogram4Transformer& active_model = use_uncond_model ? uncond_model : model;
|
|
||||||
|
|
||||||
ggml_tensor* context = nullptr;
|
|
||||||
int64_t context_len = 0;
|
|
||||||
if (!context_tensor.empty()) {
|
|
||||||
context = make_input(context_tensor);
|
|
||||||
context_len = context->ne[1];
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t grid_w = x->ne[0];
|
|
||||||
int64_t grid_h = x->ne[1];
|
|
||||||
int64_t pos_len = context_len + grid_h * grid_w;
|
|
||||||
int64_t head_dim = config.emb_dim / config.num_heads;
|
|
||||||
|
|
||||||
pe_vec = gen_ideogram4_pe(static_cast<int>(grid_h),
|
|
||||||
static_cast<int>(grid_w),
|
|
||||||
static_cast<int>(x->ne[3]),
|
|
||||||
static_cast<int>(context_len),
|
|
||||||
static_cast<int>(head_dim),
|
|
||||||
static_cast<int>(config.rope_theta),
|
|
||||||
config.mrope_section);
|
|
||||||
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len);
|
|
||||||
set_backend_tensor_data(pe, pe_vec.data());
|
|
||||||
|
|
||||||
image_indicator_vec.assign(static_cast<size_t>(pos_len), 1);
|
|
||||||
for (int64_t i = 0; i < context_len; ++i) {
|
|
||||||
image_indicator_vec[static_cast<size_t>(i)] = 0;
|
|
||||||
}
|
|
||||||
auto indicator = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_I32, pos_len, x->ne[3]);
|
|
||||||
set_backend_tensor_data(indicator, image_indicator_vec.data());
|
|
||||||
|
|
||||||
auto runner_ctx = get_context();
|
|
||||||
ggml_tensor* out = active_model.forward(&runner_ctx, x, timesteps, context, pe, indicator);
|
|
||||||
ggml_build_forward_expand(gf, out);
|
|
||||||
return gf;
|
|
||||||
}
|
|
||||||
|
|
||||||
sd::Tensor<float> compute(int n_threads,
|
|
||||||
const sd::Tensor<float>& x,
|
|
||||||
const sd::Tensor<float>& timesteps,
|
|
||||||
const sd::Tensor<float>& context,
|
|
||||||
bool use_uncond_model = false) {
|
|
||||||
auto get_graph = [&]() -> ggml_cgraph* {
|
|
||||||
return build_graph(x, timesteps, context, use_uncond_model);
|
|
||||||
};
|
|
||||||
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Interface entry point: unpacks a DiffusionParams bundle and delegates to the
// tensor-based compute() overload.
//
// `x` and `timesteps` are mandatory and asserted non-null. The context is
// routed through tensor_or_empty() (presumably yielding an empty tensor when no
// context was supplied - confirm against its definition). Whether the
// unconditional model is used is decided by should_use_uncond_model().
sd::Tensor<float> compute(int n_threads,
                          const DiffusionParams& diffusion_params) override {
    GGML_ASSERT(diffusion_params.x != nullptr);
    GGML_ASSERT(diffusion_params.timesteps != nullptr);

    const bool run_uncond = should_use_uncond_model(diffusion_params);

    const auto& latent = *diffusion_params.x;
    const auto& steps  = *diffusion_params.timesteps;

    return compute(n_threads,
                   latent,
                   steps,
                   tensor_or_empty(diffusion_params.context),
                   run_uncond);
}
|
|
||||||
};
|
|
||||||
} // namespace Ideogram4
|
|
||||||
|
|
||||||
#endif // __IDEOGRAM4_HPP__
|
|
||||||
@ -1460,18 +1460,13 @@ namespace LLM {
|
|||||||
params.num_kv_heads = 8;
|
params.num_kv_heads = 8;
|
||||||
params.qkv_bias = false;
|
params.qkv_bias = false;
|
||||||
params.rms_norm_eps = 1e-5f;
|
params.rms_norm_eps = 1e-5f;
|
||||||
} else if (arch == LLMArch::QWEN3 || arch == LLMArch::QWEN3_VL) {
|
} else if (arch == LLMArch::QWEN3) {
|
||||||
params.head_dim = 128;
|
params.head_dim = 128;
|
||||||
params.num_heads = 32;
|
params.num_heads = 32;
|
||||||
params.num_kv_heads = 8;
|
params.num_kv_heads = 8;
|
||||||
params.qkv_bias = false;
|
params.qkv_bias = false;
|
||||||
params.qk_norm = true;
|
params.qk_norm = true;
|
||||||
params.rms_norm_eps = 1e-6f;
|
params.rms_norm_eps = 1e-6f;
|
||||||
if (arch == LLMArch::QWEN3_VL) {
|
|
||||||
params.max_position_embeddings = 262144;
|
|
||||||
params.rope_thetas = {5000000.f};
|
|
||||||
params.vision.arch = LLMVisionArch::QWEN3_VL;
|
|
||||||
}
|
|
||||||
} else if (arch == LLMArch::GEMMA3_12B) {
|
} else if (arch == LLMArch::GEMMA3_12B) {
|
||||||
params.head_dim = 256;
|
params.head_dim = 256;
|
||||||
params.num_heads = 16;
|
params.num_heads = 16;
|
||||||
|
|||||||
@ -435,9 +435,6 @@ SDVersion ModelLoader::get_sd_version() {
|
|||||||
if (tensor_storage.name.find("model.diffusion_model.net.lq_proj.latent_proj.0.weight") != std::string::npos) {
|
if (tensor_storage.name.find("model.diffusion_model.net.lq_proj.latent_proj.0.weight") != std::string::npos) {
|
||||||
return VERSION_PID;
|
return VERSION_PID;
|
||||||
}
|
}
|
||||||
if (tensor_storage.name.find("embed_image_indicator.weight") != std::string::npos) {
|
|
||||||
return VERSION_IDEOGRAM4;
|
|
||||||
}
|
|
||||||
if (tensor_storage.name.find("model.diffusion_model.nerf_final_layer_conv.") != std::string::npos) {
|
if (tensor_storage.name.find("model.diffusion_model.nerf_final_layer_conv.") != std::string::npos) {
|
||||||
return VERSION_CHROMA_RADIANCE;
|
return VERSION_CHROMA_RADIANCE;
|
||||||
}
|
}
|
||||||
@ -1257,8 +1254,6 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage
|
|||||||
// Pass, do not convert
|
// Pass, do not convert
|
||||||
} else if (ends_with(name, ".scale")) {
|
} else if (ends_with(name, ".scale")) {
|
||||||
// Pass, do not convert
|
// Pass, do not convert
|
||||||
} else if (ends_with(name, ".weight_scale")) {
|
|
||||||
// Pass, do not convert
|
|
||||||
} else if (contains(name, "img_in.") ||
|
} else if (contains(name, "img_in.") ||
|
||||||
contains(name, "txt_in.") ||
|
contains(name, "txt_in.") ||
|
||||||
contains(name, "time_in.") ||
|
contains(name, "time_in.") ||
|
||||||
|
|||||||
13
src/model.h
13
src/model.h
@ -50,7 +50,6 @@ enum SDVersion {
|
|||||||
VERSION_LENS,
|
VERSION_LENS,
|
||||||
VERSION_LONGCAT,
|
VERSION_LONGCAT,
|
||||||
VERSION_PID,
|
VERSION_PID,
|
||||||
VERSION_IDEOGRAM4,
|
|
||||||
VERSION_COUNT,
|
VERSION_COUNT,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -173,15 +172,8 @@ static inline bool sd_version_is_pid(SDVersion version) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// True exactly when `version` identifies the Ideogram4 model family.
static inline bool sd_version_is_ideogram4(SDVersion version) {
    return version == VERSION_IDEOGRAM4;
}
|
|
||||||
|
|
||||||
static inline bool sd_version_uses_flux2_vae(SDVersion version) {
|
static inline bool sd_version_uses_flux2_vae(SDVersion version) {
|
||||||
if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version)) {
|
if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
@ -211,8 +203,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
|
|||||||
sd_version_is_ernie_image(version) ||
|
sd_version_is_ernie_image(version) ||
|
||||||
sd_version_is_lens(version) ||
|
sd_version_is_lens(version) ||
|
||||||
sd_version_is_longcat(version) ||
|
sd_version_is_longcat(version) ||
|
||||||
sd_version_is_pid(version) ||
|
sd_version_is_pid(version)) {
|
||||||
sd_version_is_ideogram4(version)) {
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
34
src/rope.hpp
34
src/rope.hpp
@ -249,40 +249,6 @@ namespace Rope {
|
|||||||
return embed_nd(ids, bs, axis_thetas, axes_dim, wrap_dims, layout);
|
return embed_nd(ids, bs, axis_thetas, axes_dim, wrap_dims, layout);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Builds an interleaved multi-axis rotary embedding table.
//
// A full-width rope() table is computed per axis from the transposed position
// ids. The axis-0 table is the base; for axis 1 and axis 2, the frequency slots
// freq_idx = axis, axis + 3, axis + 6, ... below
// min(mrope_section[axis] * 3, head_dim / 2) are overwritten with that axis'
// values. mrope_section[0] is never read: axis 0 simply keeps every slot the
// other two axes do not claim.
//
//   ids           - one row per position; after transpose() the first three
//                   rows are used as the three axes.
//   bs            - batch size (> 0); ids.size() / bs positions per batch item.
//   theta         - rope base, forwarded to rope().
//   head_dim      - per-head dimension, must be even.
//   mrope_section - per-axis section sizes, at least three entries.
//
// Returns flatten() of the merged table.
// NOTE(review): each rope() row is assumed to hold 4 floats per frequency
// (index 4 * freq_idx + k, k in 0..3) - confirm against rope().
__STATIC_INLINE__ std::vector<float> embed_interleaved_mrope(const std::vector<std::vector<float>>& ids,
                                                             int bs,
                                                             float theta,
                                                             int head_dim,
                                                             const std::vector<int>& mrope_section) {
    GGML_ASSERT(bs > 0);
    GGML_ASSERT(head_dim % 2 == 0);
    GGML_ASSERT(mrope_section.size() >= 3);

    std::vector<std::vector<float>> trans_ids = transpose(ids);
    // Axes 0..2 are indexed below; fail loudly instead of reading out of
    // bounds when ids is empty or carries fewer than three components.
    GGML_ASSERT(trans_ids.size() >= 3);

    const size_t pos_len   = ids.size() / bs;
    const size_t row_count = static_cast<size_t>(bs) * pos_len;
    const int half_dim     = head_dim / 2;

    // Axis 0 is the base table; build it in place rather than copying it out
    // of a temporary container.
    std::vector<std::vector<float>> emb = rope(trans_ids[0], head_dim, theta);

    for (int axis = 1; axis < 3; ++axis) {
        // Only one extra axis table needs to be alive at a time.
        const std::vector<std::vector<float>> axis_emb = rope(trans_ids[axis], head_dim, theta);

        const int length = std::min<int>(mrope_section[axis] * 3, half_dim);
        for (int freq_idx = axis; freq_idx < length; freq_idx += 3) {
            const int base = 4 * freq_idx;
            for (size_t pos_idx = 0; pos_idx < row_count; ++pos_idx) {
                for (int k = 0; k < 4; ++k) {
                    emb[pos_idx][base + k] = axis_emb[pos_idx][base + k];
                }
            }
        }
    }

    return flatten(emb);
}
|
|
||||||
|
|
||||||
__STATIC_INLINE__ std::vector<float> embed_2d_interleaved(int height,
|
__STATIC_INLINE__ std::vector<float> embed_2d_interleaved(int height,
|
||||||
int width,
|
int width,
|
||||||
int dim,
|
int dim,
|
||||||
|
|||||||
@ -23,7 +23,6 @@
|
|||||||
#include "flux.hpp"
|
#include "flux.hpp"
|
||||||
#include "guidance.h"
|
#include "guidance.h"
|
||||||
#include "hidream_o1.hpp"
|
#include "hidream_o1.hpp"
|
||||||
#include "ideogram4.hpp"
|
|
||||||
#include "lens.hpp"
|
#include "lens.hpp"
|
||||||
#include "lora.hpp"
|
#include "lora.hpp"
|
||||||
#include "ltx_audio_vae.h"
|
#include "ltx_audio_vae.h"
|
||||||
@ -85,7 +84,6 @@ const char* model_version_to_str[] = {
|
|||||||
"Lens",
|
"Lens",
|
||||||
"Longcat-Image",
|
"Longcat-Image",
|
||||||
"PiD",
|
"PiD",
|
||||||
"Ideogram 4",
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const char* sampling_methods_str[] = {
|
const char* sampling_methods_str[] = {
|
||||||
@ -317,13 +315,6 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (strlen(SAFE_STR(sd_ctx_params->uncond_diffusion_model_path)) > 0) {
|
|
||||||
LOG_INFO("loading unconditional diffusion model from '%s'", sd_ctx_params->uncond_diffusion_model_path);
|
|
||||||
if (!model_loader.init_from_file(sd_ctx_params->uncond_diffusion_model_path, "model.diffusion_model.uncond.")) {
|
|
||||||
LOG_WARN("loading unconditional diffusion model from '%s' failed", sd_ctx_params->uncond_diffusion_model_path);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool is_unet = sd_version_is_unet(model_loader.get_sd_version());
|
bool is_unet = sd_version_is_unet(model_loader.get_sd_version());
|
||||||
|
|
||||||
if (strlen(SAFE_STR(sd_ctx_params->clip_l_path)) > 0) {
|
if (strlen(SAFE_STR(sd_ctx_params->clip_l_path)) > 0) {
|
||||||
@ -556,17 +547,6 @@ public:
|
|||||||
params_backend_for(SDBackendModule::DIFFUSION),
|
params_backend_for(SDBackendModule::DIFFUSION),
|
||||||
tensor_storage_map,
|
tensor_storage_map,
|
||||||
"model.diffusion_model.net");
|
"model.diffusion_model.net");
|
||||||
} else if (sd_version_is_ideogram4(version)) {
|
|
||||||
cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
|
|
||||||
params_backend_for(SDBackendModule::TE),
|
|
||||||
tensor_storage_map,
|
|
||||||
version,
|
|
||||||
"",
|
|
||||||
false);
|
|
||||||
diffusion_model = std::make_shared<Ideogram4::Ideogram4Runner>(backend_for(SDBackendModule::DIFFUSION),
|
|
||||||
params_backend_for(SDBackendModule::DIFFUSION),
|
|
||||||
tensor_storage_map,
|
|
||||||
"model.diffusion_model");
|
|
||||||
} else if (sd_version_is_flux(version)) {
|
} else if (sd_version_is_flux(version)) {
|
||||||
bool is_chroma = false;
|
bool is_chroma = false;
|
||||||
for (auto pair : tensor_storage_map) {
|
for (auto pair : tensor_storage_map) {
|
||||||
@ -1044,12 +1024,6 @@ public:
|
|||||||
ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.gate_up_proj.weight_scale_2");
|
ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.gate_up_proj.weight_scale_2");
|
||||||
ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.down_proj.weight_scale_2");
|
ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.down_proj.weight_scale_2");
|
||||||
}
|
}
|
||||||
if (sd_version_is_ideogram4(version)) {
|
|
||||||
ignore_tensors.insert("text_encoders.llm.lm_head.");
|
|
||||||
ignore_tensors.insert("text_encoders.llm.visual.");
|
|
||||||
ignore_tensors.insert("text_encoders.llm.vision_model.");
|
|
||||||
ignore_tensors.insert("text_encoders.llm.tokenizer_json");
|
|
||||||
}
|
|
||||||
if (version == VERSION_HIDREAM_O1) {
|
if (version == VERSION_HIDREAM_O1) {
|
||||||
ignore_tensors.insert("lm_head.");
|
ignore_tensors.insert("lm_head.");
|
||||||
ignore_tensors.insert("model.visual.deepstack_merger_list.");
|
ignore_tensors.insert("model.visual.deepstack_merger_list.");
|
||||||
@ -1225,8 +1199,7 @@ public:
|
|||||||
sd_version_is_anima(version) ||
|
sd_version_is_anima(version) ||
|
||||||
sd_version_is_ernie_image(version) ||
|
sd_version_is_ernie_image(version) ||
|
||||||
sd_version_is_z_image(version) ||
|
sd_version_is_z_image(version) ||
|
||||||
sd_version_is_pid(version) ||
|
sd_version_is_pid(version)) {
|
||||||
sd_version_is_ideogram4(version)) {
|
|
||||||
pred_type = FLOW_PRED;
|
pred_type = FLOW_PRED;
|
||||||
if (sd_version_is_wan(version)) {
|
if (sd_version_is_wan(version)) {
|
||||||
default_flow_shift = 5.f;
|
default_flow_shift = 5.f;
|
||||||
@ -1234,8 +1207,6 @@ public:
|
|||||||
default_flow_shift = 4.f;
|
default_flow_shift = 4.f;
|
||||||
} else if (sd_version_is_pid(version)) {
|
} else if (sd_version_is_pid(version)) {
|
||||||
default_flow_shift = 1.5f;
|
default_flow_shift = 1.5f;
|
||||||
} else if (sd_version_is_ideogram4(version)) {
|
|
||||||
default_flow_shift = 1.0f;
|
|
||||||
} else {
|
} else {
|
||||||
default_flow_shift = 3.f;
|
default_flow_shift = 3.f;
|
||||||
}
|
}
|
||||||
@ -1898,7 +1869,7 @@ public:
|
|||||||
if (version == VERSION_HIDREAM_O1) {
|
if (version == VERSION_HIDREAM_O1) {
|
||||||
return std::vector<float>{1.0f - (t / static_cast<float>(TIMESTEPS))};
|
return std::vector<float>{1.0f - (t / static_cast<float>(TIMESTEPS))};
|
||||||
}
|
}
|
||||||
if (sd_version_is_z_image(version) || sd_version_is_ideogram4(version)) {
|
if (sd_version_is_z_image(version)) {
|
||||||
return std::vector<float>{1000.f - t};
|
return std::vector<float>{1000.f - t};
|
||||||
}
|
}
|
||||||
return std::vector<float>{t};
|
return std::vector<float>{t};
|
||||||
@ -2800,7 +2771,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
|||||||
"llm_vision_path: %s\n"
|
"llm_vision_path: %s\n"
|
||||||
"diffusion_model_path: %s\n"
|
"diffusion_model_path: %s\n"
|
||||||
"high_noise_diffusion_model_path: %s\n"
|
"high_noise_diffusion_model_path: %s\n"
|
||||||
"uncond_diffusion_model_path: %s\n"
|
|
||||||
"embeddings_connectors_path: %s\n"
|
"embeddings_connectors_path: %s\n"
|
||||||
"vae_path: %s\n"
|
"vae_path: %s\n"
|
||||||
"audio_vae_path: %s\n"
|
"audio_vae_path: %s\n"
|
||||||
@ -2840,7 +2810,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
|||||||
SAFE_STR(sd_ctx_params->llm_vision_path),
|
SAFE_STR(sd_ctx_params->llm_vision_path),
|
||||||
SAFE_STR(sd_ctx_params->diffusion_model_path),
|
SAFE_STR(sd_ctx_params->diffusion_model_path),
|
||||||
SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
|
SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
|
||||||
SAFE_STR(sd_ctx_params->uncond_diffusion_model_path),
|
|
||||||
SAFE_STR(sd_ctx_params->embeddings_connectors_path),
|
SAFE_STR(sd_ctx_params->embeddings_connectors_path),
|
||||||
SAFE_STR(sd_ctx_params->vae_path),
|
SAFE_STR(sd_ctx_params->vae_path),
|
||||||
SAFE_STR(sd_ctx_params->audio_vae_path),
|
SAFE_STR(sd_ctx_params->audio_vae_path),
|
||||||
@ -4209,9 +4178,6 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
|
|||||||
|
|
||||||
SDCondition uncond;
|
SDCondition uncond;
|
||||||
if (request->use_uncond || request->use_high_noise_uncond) {
|
if (request->use_uncond || request->use_high_noise_uncond) {
|
||||||
if (sd_version_is_ideogram4(sd_ctx->sd->version)) {
|
|
||||||
uncond.c_vector = sd::Tensor<float>::from_vector({1.0f});
|
|
||||||
} else {
|
|
||||||
bool zero_out_masked = false;
|
bool zero_out_masked = false;
|
||||||
if (sd_version_is_sdxl(sd_ctx->sd->version) &&
|
if (sd_version_is_sdxl(sd_ctx->sd->version) &&
|
||||||
request->negative_prompt.empty() &&
|
request->negative_prompt.empty() &&
|
||||||
@ -4222,7 +4188,6 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
|
|||||||
condition_params.zero_out_masked = zero_out_masked;
|
condition_params.zero_out_masked = zero_out_masked;
|
||||||
uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads,
|
uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads,
|
||||||
condition_params);
|
condition_params);
|
||||||
}
|
|
||||||
if (uncond.c_concat.empty()) {
|
if (uncond.c_concat.empty()) {
|
||||||
uncond.c_concat = latents->concat_latent; // TODO: optimize
|
uncond.c_concat = latents->concat_latent; // TODO: optimize
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user