Compare commits

...

11 Commits

Author SHA1 Message Date
leejet
8823dc48bc
feat: align the spatial size to the corresponding multiple (#1073) 2025-12-10 23:15:08 +08:00
Pedrito
1ac5a616de
feat: support custom upscale tile size (#896) 2025-12-10 22:25:19 +08:00
leejet
d939f6e86a
refactor: optimize the handling of LoRA models (#1070) 2025-12-10 00:26:07 +08:00
Wagner Bruna
e72aea796e
feat: embed version string and git commit hash (#1008) 2025-12-09 22:38:54 +08:00
wuhei
a908436729
docs: update download link for Stable Diffusion v1.5 (#1063) 2025-12-09 22:06:16 +08:00
stduhpf
583a02e29e
feat: add Flux.2 VAE proj matrix for previews (#1017) 2025-12-09 22:00:45 +08:00
leejet
96c3e64057
refactor: optimize the handling of embedding (#1068)
* optimize the handling of embedding

* support case-insensitive embedding names
2025-12-08 23:59:04 +08:00
Weiqi Gao
0392273e10
chore: add compute kernels to Windows CUDA build (#1062)
* Fix syntax for CUDA architecture definitions

* Extend CUDA support to GTX 10 Series to RTX 50 Series

* update cuda installer step version to install cuda 12.8.1

* Remove unsupported compute capability
2025-12-07 22:12:50 +08:00
leejet
bf1a388b44 docs: update logo 2025-12-07 15:09:32 +08:00
leejet
c9005337a8 docs: update logo 2025-12-07 14:56:21 +08:00
leejet
2f0bd31a84
feat: add ovis image support (#1057) 2025-12-07 12:32:56 +08:00
23 changed files with 789 additions and 288 deletions

View File

@ -163,7 +163,7 @@ jobs:
- build: "avx512"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "cuda12"
defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;86;80;75"
defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120'"
- build: 'vulkan'
defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
steps:
@ -176,9 +176,9 @@ jobs:
- name: Install cuda-toolkit
id: cuda-toolkit
if: ${{ matrix.build == 'cuda12' }}
uses: Jimver/cuda-toolkit@v0.2.19
uses: Jimver/cuda-toolkit@v0.2.22
with:
cuda: "12.6.2"
cuda: "12.8.1"
method: "network"
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'

View File

@ -87,6 +87,38 @@ file(GLOB SD_LIB_SOURCES
"*.hpp"
)
# Embed build identification into version.cpp.
# Look for a git executable; when one is found, query it from the source directory.
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
if(GIT_EXE)
# SDCPP_BUILD_VERSION <- output of `git describe --tags --abbrev=7 --dirty=+`.
# ERROR_QUIET hides git's stderr; on failure the variable is expected to stay empty
# and the "unknown" fallback below is used.
execute_process(COMMAND ${GIT_EXE} describe --tags --abbrev=7 --dirty=+
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE SDCPP_BUILD_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
# SDCPP_BUILD_COMMIT <- output of `git rev-parse --short HEAD`.
execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE SDCPP_BUILD_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
endif()
# Fall back to the literal "unknown" when git was not found or produced no output.
if(NOT SDCPP_BUILD_VERSION)
set(SDCPP_BUILD_VERSION unknown)
endif()
message(STATUS "stable-diffusion.cpp version ${SDCPP_BUILD_VERSION}")
if(NOT SDCPP_BUILD_COMMIT)
set(SDCPP_BUILD_COMMIT unknown)
endif()
message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
# Pass both values as compile definitions to version.cpp only (APPEND keeps any
# definitions already set on that source file). The values are passed unquoted.
set_property(
SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp
APPEND PROPERTY COMPILE_DEFINITIONS
SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
)
if(SD_BUILD_SHARED_LIBS)
message("-- Build shared library")
message(${SD_LIB_SOURCES})

View File

@ -1,5 +1,5 @@
<p align="center">
<img src="./assets/cat_with_sd_cpp_42.png" width="360x">
<img src="./assets/logo.png" width="360x">
</p>
# stable-diffusion.cpp
@ -49,6 +49,7 @@ API and command-line option may change frequently.***
- [Chroma1-Radiance](./docs/chroma_radiance.md)
- [Qwen Image](./docs/qwen_image.md)
- [Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
@ -104,7 +105,7 @@ API and command-line option may change frequently.***
### Download model weights
- download weights(.ckpt or .safetensors or .gguf). For example
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
- Stable Diffusion v1.5 from https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5
```sh
curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
@ -134,6 +135,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
- [🔥Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
- [🔥Wan2.1/Wan2.2](./docs/wan.md)
- [🔥Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)

BIN
assets/logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 401 KiB

103
clip.hpp
View File

@ -3,34 +3,10 @@
#include "ggml_extend.hpp"
#include "model.h"
#include "tokenize_util.h"
/*================================================== CLIPTokenizer ===================================================*/
// Strips every "<lora:NAME:MULTIPLIER>" tag from `text` and sums the multipliers per NAME.
// A tag whose multiplier parses to exactly 0 is removed from the text but not recorded.
// Returns {NAME -> summed multiplier, text with all tags removed}.
// std::stof is applied to the multiplier capture without a try/catch, so a non-numeric
// multiplier propagates its exception to the caller.
__STATIC_INLINE__ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
    std::regex lora_tag("<lora:([^:]+):([^>]+)>");
    std::unordered_map<std::string, float> multipliers;

    for (std::smatch hit; std::regex_search(text, hit, lora_tag);) {
        const std::string name = hit[1].str();
        const float weight     = std::stof(hit[2].str());

        // Remove only the tag that was just matched; the next iteration searches the shortened text.
        text = std::regex_replace(text, lora_tag, "", std::regex_constants::format_first_only);

        if (weight != 0.f) {
            // operator[] value-initialises a missing entry to 0, so += covers insert and accumulate.
            multipliers[name] += weight;
        }
    }

    return std::make_pair(multipliers, text);
}
__STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
std::set<int> byte_set;
@ -72,6 +48,8 @@ private:
int encoder_len;
int bpe_len;
std::vector<std::string> special_tokens;
public:
const std::string UNK_TOKEN = "<|endoftext|>";
const std::string BOS_TOKEN = "<|startoftext|>";
@ -117,6 +95,15 @@ private:
return pairs;
}
bool is_special_token(const std::string& token) {
for (auto& special_token : special_tokens) {
if (special_token == token) {
return true;
}
}
return false;
}
public:
CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "")
: PAD_TOKEN_ID(pad_token_id) {
@ -125,6 +112,8 @@ public:
} else {
load_from_merges(ModelLoader::load_merges());
}
add_special_token("<|startoftext|>");
add_special_token("<|endoftext|>");
}
void load_from_merges(const std::string& merges_utf8_str) {
@ -201,6 +190,10 @@ public:
}
}
// Appends `token` to `special_tokens`. No de-duplication is performed.
void add_special_token(const std::string& token) {
    special_tokens.emplace_back(token);
}
std::u32string bpe(const std::u32string& token) {
std::vector<std::u32string> word;
@ -379,25 +372,54 @@ public:
return trim(text);
}
// Splits `text` into word-level pieces, in order of appearance:
//   - the contractions 's 't 're 've 'm 'll 'd (matched case-insensitively),
//   - runs of alphabetic characters,
//   - single digits (each digit is its own piece),
//   - runs of characters that are neither whitespace, alphabetic nor digit.
// Whitespace is never part of a piece. Returns an empty vector for empty or
// whitespace-only input.
std::vector<std::string> token_split(const std::string& text) {
    // Compiled once and reused: building a std::regex is expensive and this function
    // runs once per prompt segment.
    static const std::regex pat(R"('s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
                                std::regex::icase);

    std::vector<std::string> result;
    for (std::sregex_iterator iter(text.begin(), text.end(), pat), end; iter != end; ++iter) {
        result.emplace_back(iter->str());
    }
    return result;
}
std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) {
std::string original_text = text;
std::vector<int32_t> bpe_tokens;
text = whitespace_clean(text);
std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });
std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
std::regex::icase);
std::smatch matches;
std::string str = text;
std::vector<std::string> token_strs;
while (std::regex_search(str, matches, pat)) {
bool skip = on_new_token_cb(str, bpe_tokens);
if (skip) {
auto splited_texts = split_with_special_tokens(text, special_tokens);
for (auto& splited_text : splited_texts) {
LOG_DEBUG("token %s", splited_text.c_str());
if (is_special_token(splited_text)) {
LOG_DEBUG("special %s", splited_text.c_str());
bool skip = on_new_token_cb(splited_text, bpe_tokens);
if (skip) {
token_strs.push_back(splited_text);
continue;
}
continue;
}
for (auto& token : matches) {
std::string token_str = token.str();
auto tokens = token_split(splited_text);
for (auto& token : tokens) {
if (on_new_token_cb != nullptr) {
bool skip = on_new_token_cb(token, bpe_tokens);
if (skip) {
token_strs.push_back(token);
continue;
}
}
std::string token_str = token;
std::u32string utf32_token;
for (int i = 0; i < token_str.length(); i++) {
unsigned char b = token_str[i];
@ -417,14 +439,13 @@ public:
bpe_tokens.push_back(encoder[bpe_str]);
token_strs.push_back(utf32_to_utf8(bpe_str));
}
str = matches.suffix();
}
std::stringstream ss;
ss << "[";
for (auto token : token_strs) {
ss << "\"" << token << "\", ";
}
ss << "]";
// std::stringstream ss;
// ss << "[";
// for (auto token : token_strs) {
// ss << "\"" << token << "\", ";
// }
// ss << "]";
// LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
// printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
return bpe_tokens;

View File

@ -56,7 +56,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
std::shared_ptr<CLIPTextModelRunner> text_model2;
std::string trigger_word = "img"; // should be user settable
std::string embd_dir;
std::map<std::string, std::string> embedding_map;
int32_t num_custom_embeddings = 0;
int32_t num_custom_embeddings_2 = 0;
std::vector<uint8_t> token_embed_custom;
@ -65,11 +65,17 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map,
const std::string& embd_dir,
const std::map<std::string, std::string>& orig_embedding_map,
SDVersion version = VERSION_SD1,
PMVersion pv = PM_VERSION_1)
: version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
bool force_clip_f32 = embd_dir.size() > 0;
: version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) {
for (const auto& kv : orig_embedding_map) {
std::string name = kv.first;
std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); });
embedding_map[name] = kv.second;
tokenizer.add_special_token(name);
}
bool force_clip_f32 = !embedding_map.empty();
if (sd_version_is_sd1(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
} else if (sd_version_is_sd2(version)) {
@ -196,25 +202,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
std::vector<int> convert_token_to_id(std::string text) {
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
size_t word_end = str.find(",");
std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
embd_name = trim(embd_name);
std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
auto iter = embedding_map.find(str);
if (iter == embedding_map.end()) {
return false;
}
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
}
if (embd_path.size() > 0) {
if (load_embedding(embd_name, embd_path, bpe_tokens)) {
if (word_end != std::string::npos) {
str = str.substr(word_end);
} else {
str = "";
}
return true;
}
std::string embedding_path = iter->second;
if (load_embedding(str, embedding_path, bpe_tokens)) {
return true;
}
return false;
};
@ -245,25 +239,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
}
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
size_t word_end = str.find(",");
std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
embd_name = trim(embd_name);
std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
auto iter = embedding_map.find(str);
if (iter == embedding_map.end()) {
return false;
}
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
}
if (embd_path.size() > 0) {
if (load_embedding(embd_name, embd_path, bpe_tokens)) {
if (word_end != std::string::npos) {
str = str.substr(word_end);
} else {
str = "";
}
return true;
}
std::string embedding_path = iter->second;
if (load_embedding(str, embedding_path, bpe_tokens)) {
return true;
}
return false;
};
@ -376,25 +358,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
}
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
size_t word_end = str.find(",");
std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
embd_name = trim(embd_name);
std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
auto iter = embedding_map.find(str);
if (iter == embedding_map.end()) {
return false;
}
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
}
if (embd_path.size() > 0) {
if (load_embedding(embd_name, embd_path, bpe_tokens)) {
if (word_end != std::string::npos) {
str = str.substr(word_end);
} else {
str = "";
}
return true;
}
std::string embedding_path = iter->second;
if (load_embedding(str, embedding_path, bpe_tokens)) {
return true;
}
return false;
};
@ -1638,7 +1608,7 @@ struct LLMEmbedder : public Conditioner {
LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL;
if (sd_version_is_flux2(version)) {
arch = LLM::LLMArch::MISTRAL_SMALL_3_2;
} else if (sd_version_is_z_image(version)) {
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE) {
arch = LLM::LLMArch::QWEN3;
}
if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2) {
@ -1728,6 +1698,7 @@ struct LLMEmbedder : public Conditioner {
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
std::pair<int, int> prompt_attn_range;
int prompt_template_encode_start_idx = 34;
int max_length = 0;
std::set<int> out_layers;
if (llm->enable_vision && conditioner_params.ref_images.size() > 0) {
LOG_INFO("QwenImageEditPlusPipeline");
@ -1825,6 +1796,17 @@ struct LLMEmbedder : public Conditioner {
prompt_attn_range.second = prompt.size();
prompt += "[/INST]";
} else if (version == VERSION_OVIS_IMAGE) {
prompt_template_encode_start_idx = 28;
max_length = prompt_template_encode_start_idx + 256;
prompt = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background:";
prompt_attn_range.first = static_cast<int>(prompt.size());
prompt += " " + conditioner_params.text;
prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
} else {
prompt_template_encode_start_idx = 34;
@ -1837,7 +1819,7 @@ struct LLMEmbedder : public Conditioner {
prompt += "<|im_end|>\n<|im_start|>assistant\n";
}
auto tokens_and_weights = tokenize(prompt, prompt_attn_range, 0, false);
auto tokens_and_weights = tokenize(prompt, prompt_attn_range, max_length, max_length > 0);
auto& tokens = std::get<0>(tokens_and_weights);
auto& weights = std::get<1>(tokens_and_weights);
@ -1870,9 +1852,13 @@ struct LLMEmbedder : public Conditioner {
GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx);
int64_t zero_pad_len = 0;
int64_t min_length = 0;
if (sd_version_is_flux2(version)) {
int64_t min_length = 512;
min_length = 512;
}
int64_t zero_pad_len = 0;
if (min_length > 0) {
if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) {
zero_pad_len = min_length - hidden_states->ne[1] + prompt_template_encode_start_idx;
}
@ -1892,6 +1878,8 @@ struct LLMEmbedder : public Conditioner {
ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
});
// print_ggml_tensor(new_hidden_states);
int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
return {new_hidden_states, nullptr, nullptr};

19
docs/ovis_image.md Normal file
View File

@ -0,0 +1,19 @@
# How to Use
## Download weights
- Download Ovis-Image-7B
- safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/leejet/Ovis-Image-7B-GGUF
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
- Download Ovis 2.5
- safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/text_encoders
## Examples
```
.\bin\Release\sd.exe --diffusion-model ovis_image-Q4_0.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\ovis_2.5.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa
```
<img alt="ovis image example" src="../assets/ovis_image/example.png" />

View File

@ -156,9 +156,10 @@ struct ESRGAN : public GGMLRunner {
// Constructs the runner: forwards `backend` and `offload_params_to_cpu` to GGMLRunner and
// stores `tile_size` (default 128) on the object.
// `tensor_storage_map` is accepted but not referenced in this constructor body.
ESRGAN(ggml_backend_t backend,
bool offload_params_to_cpu,
int tile_size = 128,
const String2TensorStorage& tensor_storage_map = {})
: GGMLRunner(backend, offload_params_to_cpu) {
// rrdb_net will be created in load_from_file
this->tile_size = tile_size;
}
std::string get_desc() override {

View File

@ -324,6 +324,7 @@ struct SDCliParams {
std::string output_path = "output.png";
bool verbose = false;
bool version = false;
bool canny_preprocess = false;
preview_t preview_method = PREVIEW_NONE;
@ -366,6 +367,10 @@ struct SDCliParams {
"--verbose",
"print extra info",
true, &verbose},
{"",
"--version",
"print stable-diffusion.cpp version",
true, &version},
{"",
"--color",
"colors the logging tags according to level",
@ -501,6 +506,9 @@ struct SDContextParams {
std::string tensor_type_rules;
std::string lora_model_dir;
std::map<std::string, std::string> embedding_map;
std::vector<sd_embedding_t> embedding_vec;
rng_type_t rng_type = CUDA_RNG;
rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
bool offload_params_to_cpu = false;
@ -828,6 +836,37 @@ struct SDContextParams {
return options;
}
// Populates `embedding_map` from the files directly inside `embedding_dir`.
// Every regular file whose extension is exactly ".pt", ".safetensors" or ".gguf"
// (case-sensitive comparison) is recorded as  <file name without extension> -> <path>.
// An entry with the same key is overwritten. Does nothing when `embedding_dir`
// does not exist or is not a directory.
void build_embedding_map() {
    static const std::vector<std::string> valid_ext = {".pt", ".safetensors", ".gguf"};

    if (!fs::exists(embedding_dir) || !fs::is_directory(embedding_dir)) {
        return;
    }

    for (auto& entry : fs::directory_iterator(embedding_dir)) {
        if (!entry.is_regular_file()) {
            continue;
        }

        const auto file_path        = entry.path();
        const std::string extension = file_path.extension().string();

        // Stop scanning the extension list as soon as one matches.
        bool supported = false;
        for (size_t i = 0; i < valid_ext.size() && !supported; ++i) {
            supported = (valid_ext[i] == extension);
        }

        if (supported) {
            embedding_map[file_path.stem().string()] = file_path.string();
        }
    }
}
bool process_and_check(SDMode mode) {
if (mode != UPSCALE && model_path.length() == 0 && diffusion_model_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n");
@ -845,10 +884,24 @@ struct SDContextParams {
n_threads = sd_get_num_physical_cores();
}
build_embedding_map();
return true;
}
std::string to_string() const {
std::ostringstream emb_ss;
emb_ss << "{\n";
for (auto it = embedding_map.begin(); it != embedding_map.end(); ++it) {
emb_ss << " \"" << it->first << "\": \"" << it->second << "\"";
if (std::next(it) != embedding_map.end()) {
emb_ss << ",";
}
emb_ss << "\n";
}
emb_ss << " }";
std::string embeddings_str = emb_ss.str();
std::ostringstream oss;
oss << "SDContextParams {\n"
<< " n_threads: " << n_threads << ",\n"
@ -866,6 +919,7 @@ struct SDContextParams {
<< " esrgan_path: \"" << esrgan_path << "\",\n"
<< " control_net_path: \"" << control_net_path << "\",\n"
<< " embedding_dir: \"" << embedding_dir << "\",\n"
<< " embeddings: " << embeddings_str << "\n"
<< " wtype: " << sd_type_name(wtype) << ",\n"
<< " tensor_type_rules: \"" << tensor_type_rules << "\",\n"
<< " lora_model_dir: \"" << lora_model_dir << "\",\n"
@ -898,6 +952,15 @@ struct SDContextParams {
}
sd_ctx_params_t to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview) {
embedding_vec.clear();
embedding_vec.reserve(embedding_map.size());
for (const auto& kv : embedding_map) {
sd_embedding_t item;
item.name = kv.first.c_str();
item.path = kv.second.c_str();
embedding_vec.emplace_back(item);
}
sd_ctx_params_t sd_ctx_params = {
model_path.c_str(),
clip_l_path.c_str(),
@ -912,7 +975,8 @@ struct SDContextParams {
taesd_path.c_str(),
control_net_path.c_str(),
lora_model_dir.c_str(),
embedding_dir.c_str(),
embedding_vec.data(),
static_cast<uint32_t>(embedding_vec.size()),
photo_maker_path.c_str(),
tensor_type_rules.c_str(),
vae_decode_only,
@ -966,6 +1030,15 @@ static std::string vec_str_to_string(const std::vector<std::string>& v) {
return oss.str();
}
// Reports whether `p` has the shape of an absolute path on the build platform:
// a "<letter>:" prefix on Windows, a leading '/' everywhere else.
// This is a purely textual check; the file system is not consulted.
static bool is_absolute_path(const std::string& p) {
#ifdef _WIN32
    // Windows: C:/path or C:\path
    if (p.size() < 2) {
        return false;
    }
    const bool starts_with_letter = std::isalpha(static_cast<unsigned char>(p[0])) != 0;
    return starts_with_letter && p[1] == ':';
#else
    // rfind anchored at position 0 only matches a leading '/', and yields npos for "".
    return p.rfind('/', 0) == 0;
#endif
}
struct SDGenerationParams {
std::string prompt;
std::string negative_prompt;
@ -1006,7 +1079,12 @@ struct SDGenerationParams {
std::string pm_id_embed_path;
float pm_style_strength = 20.f;
int upscale_repeats = 1;
int upscale_repeats = 1;
int upscale_tile_size = 128;
std::map<std::string, float> lora_map;
std::map<std::string, float> high_noise_lora_map;
std::vector<sd_lora_t> lora_vec;
SDGenerationParams() {
sd_sample_params_init(&sample_params);
@ -1099,6 +1177,10 @@ struct SDGenerationParams {
"--upscale-repeats",
"Run the ESRGAN upscaler this many times (default: 1)",
&upscale_repeats},
{"",
"--upscale-tile-size",
"tile size for ESRGAN upscaling (default: 128)",
&upscale_tile_size},
};
options.float_options = {
@ -1378,7 +1460,88 @@ struct SDGenerationParams {
return options;
}
bool process_and_check(SDMode mode) {
// Parses every "<lora:PATH:MULTIPLIER>" tag in `prompt`, removes the tag from `prompt`,
// and records the referenced LoRA file together with its multiplier.
//
//  - PATH may start with "|high_noise|"; such entries go to `high_noise_lora_map`,
//    all others to `lora_map`. Multipliers of repeated tags for the same file are summed.
//  - A PATH that is not absolute (per is_absolute_path) is resolved against `lora_model_dir`.
//    If the resulting file does not exist, the extensions .pt, .safetensors and .gguf are
//    appended in turn and the first existing candidate wins.
//  - Tags with an unparsable multiplier or an unresolvable file are still stripped from
//    the prompt but are otherwise ignored.
//
// Finally `lora_vec` is rebuilt from both maps. Its `path` pointers refer to the map keys,
// so they stay valid only while the maps are not modified.
void extract_and_remove_lora(const std::string& lora_model_dir) {
    static const std::regex re(R"(<lora:([^:>]+):([^>]+)>)");
    static const std::vector<std::string> valid_ext = {".pt", ".safetensors", ".gguf"};
    static const std::string prefix                 = "|high_noise|";

    std::smatch m;
    std::string tmp = prompt;  // search cursor: the not-yet-scanned tail of the prompt

    while (std::regex_search(tmp, m, re)) {
        // Copy the captures first: `m` refers into `tmp`, which is reassigned just below.
        std::string raw_path      = m[1].str();
        const std::string raw_mul = m[2].str();

        // Every tag is consumed regardless of whether it turns out to be usable:
        // drop it from the prompt and advance the cursor past it.
        prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only);
        tmp    = m.suffix().str();

        float mul = 0.f;
        try {
            mul = std::stof(raw_mul);
        } catch (...) {
            // Deliberate best effort: a malformed multiplier silently discards the tag.
            continue;
        }

        bool is_high_noise = false;
        if (raw_path.rfind(prefix, 0) == 0) {
            raw_path.erase(0, prefix.size());
            is_high_noise = true;
        }

        fs::path final_path;
        if (is_absolute_path(raw_path)) {
            final_path = raw_path;
        } else {
            final_path = fs::path(lora_model_dir) / raw_path;
        }

        if (!fs::exists(final_path)) {
            // The tag may omit the file extension; probe the supported ones.
            bool found = false;
            for (const auto& ext : valid_ext) {
                fs::path try_path = final_path;
                try_path += ext;
                if (fs::exists(try_path)) {
                    final_path = try_path;
                    found      = true;
                    break;
                }
            }
            if (!found) {
                printf("can not found lora %s\n", final_path.lexically_normal().string().c_str());
                continue;
            }
        }

        const std::string key = final_path.lexically_normal().string();
        if (is_high_noise) {
            high_noise_lora_map[key] += mul;
        } else {
            lora_map[key] += mul;
        }
    }

    // Rebuild from scratch so that calling this function more than once does not
    // leave duplicate entries behind.
    lora_vec.clear();
    lora_vec.reserve(lora_map.size() + high_noise_lora_map.size());
    for (const auto& kv : lora_map) {
        sd_lora_t item;
        item.is_high_noise = false;
        item.path          = kv.first.c_str();
        item.multiplier    = kv.second;
        lora_vec.emplace_back(item);
    }
    for (const auto& kv : high_noise_lora_map) {
        sd_lora_t item;
        item.is_high_noise = true;
        item.path          = kv.first.c_str();
        item.multiplier    = kv.second;
        lora_vec.emplace_back(item);
    }
}
bool process_and_check(SDMode mode, const std::string& lora_model_dir) {
if (width <= 0) {
fprintf(stderr, "error: the width must be greater than 0\n");
return false;
@ -1477,6 +1640,10 @@ struct SDGenerationParams {
return false;
}
if (upscale_tile_size < 1) {
return false;
}
if (mode == UPSCALE) {
if (init_image_path.length() == 0) {
fprintf(stderr, "error: upscale mode needs an init image (--init-img)\n");
@ -1489,14 +1656,44 @@ struct SDGenerationParams {
seed = rand();
}
extract_and_remove_lora(lora_model_dir);
return true;
}
std::string to_string() const {
char* sample_params_str = sd_sample_params_to_str(&sample_params);
char* high_noise_sample_params_str = sd_sample_params_to_str(&high_noise_sample_params);
std::ostringstream lora_ss;
lora_ss << "{\n";
for (auto it = lora_map.begin(); it != lora_map.end(); ++it) {
lora_ss << " \"" << it->first << "\": \"" << it->second << "\"";
if (std::next(it) != lora_map.end()) {
lora_ss << ",";
}
lora_ss << "\n";
}
lora_ss << " }";
std::string loras_str = lora_ss.str();
lora_ss = std::ostringstream();
;
lora_ss << "{\n";
for (auto it = high_noise_lora_map.begin(); it != high_noise_lora_map.end(); ++it) {
lora_ss << " \"" << it->first << "\": \"" << it->second << "\"";
if (std::next(it) != high_noise_lora_map.end()) {
lora_ss << ",";
}
lora_ss << "\n";
}
lora_ss << " }";
std::string high_noise_loras_str = lora_ss.str();
std::ostringstream oss;
oss << "SDGenerationParams {\n"
<< " loras: \"" << loras_str << "\",\n"
<< " high_noise_loras: \"" << high_noise_loras_str << "\",\n"
<< " prompt: \"" << prompt << "\",\n"
<< " negative_prompt: \"" << negative_prompt << "\",\n"
<< " clip_skip: " << clip_skip << ",\n"
@ -1532,6 +1729,7 @@ struct SDGenerationParams {
<< " control_strength: " << control_strength << ",\n"
<< " seed: " << seed << ",\n"
<< " upscale_repeats: " << upscale_repeats << ",\n"
<< " upscale_tile_size: " << upscale_tile_size << ",\n"
<< "}";
free(sample_params_str);
free(high_noise_sample_params_str);
@ -1539,7 +1737,12 @@ struct SDGenerationParams {
}
};
// Builds the one-line banner "stable-diffusion.cpp version <sd_version()>, commit <sd_commit()>".
static std::string version_string() {
    std::string banner = "stable-diffusion.cpp version ";
    banner += sd_version();
    banner += ", commit ";
    banner += sd_commit();
    return banner;
}
void print_usage(int argc, const char* argv[], const std::vector<ArgOptions>& options_list) {
std::cout << version_string() << "\n";
std::cout << "Usage: " << argv[0] << " [options]\n\n";
std::cout << "CLI Options:\n";
options_list[0].print();
@ -1557,7 +1760,9 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP
exit(cli_params.normal_exit ? 0 : 1);
}
if (!cli_params.process_and_check() || !ctx_params.process_and_check(cli_params.mode) || !gen_params.process_and_check(cli_params.mode)) {
if (!cli_params.process_and_check() ||
!ctx_params.process_and_check(cli_params.mode) ||
!gen_params.process_and_check(cli_params.mode, ctx_params.lora_model_dir)) {
print_usage(argc, argv, options_vec);
exit(1);
}
@ -1822,11 +2027,19 @@ void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy,
}
int main(int argc, const char* argv[]) {
if (argc > 1 && std::string(argv[1]) == "--version") {
std::cout << version_string() << "\n";
return EXIT_SUCCESS;
}
SDCliParams cli_params;
SDContextParams ctx_params;
SDGenerationParams gen_params;
parse_args(argc, argv, cli_params, ctx_params, gen_params);
if (cli_params.verbose || cli_params.version) {
std::cout << version_string() << "\n";
}
if (gen_params.video_frames > 4) {
size_t last_dot_pos = cli_params.preview_path.find_last_of(".");
std::string base_path = cli_params.preview_path;
@ -2062,6 +2275,8 @@ int main(int argc, const char* argv[]) {
if (cli_params.mode == IMG_GEN) {
sd_img_gen_params_t img_gen_params = {
gen_params.lora_vec.data(),
static_cast<uint32_t>(gen_params.lora_vec.size()),
gen_params.prompt.c_str(),
gen_params.negative_prompt.c_str(),
gen_params.clip_skip,
@ -2093,6 +2308,8 @@ int main(int argc, const char* argv[]) {
num_results = gen_params.batch_count;
} else if (cli_params.mode == VID_GEN) {
sd_vid_gen_params_t vid_gen_params = {
gen_params.lora_vec.data(),
static_cast<uint32_t>(gen_params.lora_vec.size()),
gen_params.prompt.c_str(),
gen_params.negative_prompt.c_str(),
gen_params.clip_skip,
@ -2129,7 +2346,8 @@ int main(int argc, const char* argv[]) {
upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(ctx_params.esrgan_path.c_str(),
ctx_params.offload_params_to_cpu,
ctx_params.diffusion_conv_direct,
ctx_params.n_threads);
ctx_params.n_threads,
gen_params.upscale_tile_size);
if (upscaler_ctx == nullptr) {
printf("new_upscaler_ctx failed\n");

134
flux.hpp
View File

@ -134,6 +134,54 @@ namespace Flux {
}
};
struct MLP : public UnaryBlock {
bool use_mlp_silu_act;
public:
MLP(int64_t hidden_size, int64_t intermediate_size, bool use_mlp_silu_act = false, bool bias = false)
: use_mlp_silu_act(use_mlp_silu_act) {
int64_t mlp_mult_factor = use_mlp_silu_act ? 2 : 1;
blocks["0"] = std::make_shared<Linear>(hidden_size, intermediate_size * mlp_mult_factor, bias);
blocks["2"] = std::make_shared<Linear>(intermediate_size, hidden_size, bias);
}
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["0"]);
auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["2"]);
x = mlp_0->forward(ctx, x);
if (use_mlp_silu_act) {
x = ggml_ext_silu_act(ctx->ggml_ctx, x);
} else {
x = ggml_gelu_inplace(ctx->ggml_ctx, x);
}
x = mlp_2->forward(ctx, x);
return x;
}
};
// Gated feed-forward block: down_proj( up_proj(x) * silu(gate_proj(x)) ).
// All three Linear layers share the same `bias` setting (default: true).
struct YakMLP : public UnaryBlock {
public:
    YakMLP(int64_t hidden_size, int64_t intermediate_size, bool bias = true) {
        blocks["gate_proj"] = std::make_shared<Linear>(hidden_size, intermediate_size, bias);
        blocks["up_proj"]   = std::make_shared<Linear>(hidden_size, intermediate_size, bias);
        blocks["down_proj"] = std::make_shared<Linear>(intermediate_size, hidden_size, bias);
    }

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        auto gate_layer = std::dynamic_pointer_cast<Linear>(blocks["gate_proj"]);
        auto up_layer   = std::dynamic_pointer_cast<Linear>(blocks["up_proj"]);
        auto down_layer = std::dynamic_pointer_cast<Linear>(blocks["down_proj"]);

        // Gate branch: projection followed by an in-place SiLU.
        struct ggml_tensor* gate = ggml_silu_inplace(ctx->ggml_ctx, gate_layer->forward(ctx, x));

        // Value branch, modulated element-wise by the gate, then projected back.
        struct ggml_tensor* up    = up_layer->forward(ctx, x);
        struct ggml_tensor* gated = ggml_mul(ctx->ggml_ctx, up, gate);
        return down_layer->forward(ctx, gated);
    }
};
struct ModulationOut {
ggml_tensor* shift = nullptr;
ggml_tensor* scale = nullptr;
@ -199,7 +247,6 @@ namespace Flux {
struct DoubleStreamBlock : public GGMLBlock {
bool prune_mod;
int idx = 0;
bool use_mlp_silu_act;
public:
DoubleStreamBlock(int64_t hidden_size,
@ -210,10 +257,10 @@ namespace Flux {
bool prune_mod = false,
bool share_modulation = false,
bool mlp_proj_bias = true,
bool use_yak_mlp = false,
bool use_mlp_silu_act = false)
: idx(idx), prune_mod(prune_mod), use_mlp_silu_act(use_mlp_silu_act) {
int64_t mlp_hidden_dim = hidden_size * mlp_ratio;
int64_t mlp_mult_factor = use_mlp_silu_act ? 2 : 1;
: idx(idx), prune_mod(prune_mod) {
int64_t mlp_hidden_dim = hidden_size * mlp_ratio;
if (!prune_mod && !share_modulation) {
blocks["img_mod"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
@ -222,9 +269,11 @@ namespace Flux {
blocks["img_attn"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias));
blocks["img_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
blocks["img_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
// img_mlp.1 is nn.GELU(approximate="tanh")
blocks["img_mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(mlp_hidden_dim, hidden_size, mlp_proj_bias));
if (use_yak_mlp) {
blocks["img_mlp"] = std::shared_ptr<GGMLBlock>(new YakMLP(hidden_size, mlp_hidden_dim, mlp_proj_bias));
} else {
blocks["img_mlp"] = std::shared_ptr<GGMLBlock>(new MLP(hidden_size, mlp_hidden_dim, use_mlp_silu_act, mlp_proj_bias));
}
if (!prune_mod && !share_modulation) {
blocks["txt_mod"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
@ -233,9 +282,11 @@ namespace Flux {
blocks["txt_attn"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias));
blocks["txt_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
blocks["txt_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
// img_mlp.1 is nn.GELU(approximate="tanh")
blocks["txt_mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(mlp_hidden_dim, hidden_size, mlp_proj_bias));
if (use_yak_mlp) {
blocks["txt_mlp"] = std::shared_ptr<GGMLBlock>(new YakMLP(hidden_size, mlp_hidden_dim, mlp_proj_bias));
} else {
blocks["txt_mlp"] = std::shared_ptr<GGMLBlock>(new MLP(hidden_size, mlp_hidden_dim, use_mlp_silu_act, mlp_proj_bias));
}
}
std::vector<ModulationOut> get_distil_img_mod(GGMLRunnerContext* ctx, struct ggml_tensor* vec) {
@ -272,15 +323,13 @@ namespace Flux {
auto img_attn = std::dynamic_pointer_cast<SelfAttention>(blocks["img_attn"]);
auto img_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["img_norm2"]);
auto img_mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["img_mlp.0"]);
auto img_mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["img_mlp.2"]);
auto img_mlp = std::dynamic_pointer_cast<UnaryBlock>(blocks["img_mlp"]);
auto txt_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm1"]);
auto txt_attn = std::dynamic_pointer_cast<SelfAttention>(blocks["txt_attn"]);
auto txt_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm2"]);
auto txt_mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["txt_mlp.0"]);
auto txt_mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["txt_mlp.2"]);
auto txt_mlp = std::dynamic_pointer_cast<UnaryBlock>(blocks["txt_mlp"]);
if (img_mods.empty()) {
if (prune_mod) {
@ -348,27 +397,15 @@ namespace Flux {
// calculate the img blocks
img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_attn->post_attention(ctx, img_attn_out), img_mod1.gate));
auto img_mlp_out = img_mlp_0->forward(ctx, Flux::modulate(ctx->ggml_ctx, img_norm2->forward(ctx, img), img_mod2.shift, img_mod2.scale));
if (use_mlp_silu_act) {
img_mlp_out = ggml_ext_silu_act(ctx->ggml_ctx, img_mlp_out);
} else {
img_mlp_out = ggml_gelu_inplace(ctx->ggml_ctx, img_mlp_out);
}
img_mlp_out = img_mlp_2->forward(ctx, img_mlp_out);
auto img_mlp_out = img_mlp->forward(ctx, Flux::modulate(ctx->ggml_ctx, img_norm2->forward(ctx, img), img_mod2.shift, img_mod2.scale));
img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_mlp_out, img_mod2.gate));
// calculate the txt blocks
txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn->post_attention(ctx, txt_attn_out), txt_mod1.gate));
auto txt_mlp_out = txt_mlp_0->forward(ctx, Flux::modulate(ctx->ggml_ctx, txt_norm2->forward(ctx, txt), txt_mod2.shift, txt_mod2.scale));
if (use_mlp_silu_act) {
txt_mlp_out = ggml_ext_silu_act(ctx->ggml_ctx, txt_mlp_out);
} else {
txt_mlp_out = ggml_gelu_inplace(ctx->ggml_ctx, txt_mlp_out);
}
txt_mlp_out = txt_mlp_2->forward(ctx, txt_mlp_out);
txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_mod2.gate));
auto txt_mlp_out = txt_mlp->forward(ctx, Flux::modulate(ctx->ggml_ctx, txt_norm2->forward(ctx, txt), txt_mod2.shift, txt_mod2.scale));
txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_mod2.gate));
return {img, txt};
}
@ -381,6 +418,7 @@ namespace Flux {
int64_t mlp_hidden_dim;
bool prune_mod;
int idx = 0;
bool use_yak_mlp;
bool use_mlp_silu_act;
int64_t mlp_mult_factor;
@ -393,8 +431,9 @@ namespace Flux {
bool prune_mod = false,
bool share_modulation = false,
bool mlp_proj_bias = true,
bool use_yak_mlp = false,
bool use_mlp_silu_act = false)
: hidden_size(hidden_size), num_heads(num_heads), idx(idx), prune_mod(prune_mod), use_mlp_silu_act(use_mlp_silu_act) {
: hidden_size(hidden_size), num_heads(num_heads), idx(idx), prune_mod(prune_mod), use_yak_mlp(use_yak_mlp), use_mlp_silu_act(use_mlp_silu_act) {
int64_t head_dim = hidden_size / num_heads;
float scale = qk_scale;
if (scale <= 0.f) {
@ -402,7 +441,7 @@ namespace Flux {
}
mlp_hidden_dim = hidden_size * mlp_ratio;
mlp_mult_factor = 1;
if (use_mlp_silu_act) {
if (use_yak_mlp || use_mlp_silu_act) {
mlp_mult_factor = 2;
}
@ -481,7 +520,9 @@ namespace Flux {
k = norm->key_norm(ctx, k);
auto attn = Rope::attention(ctx, q, k, v, pe, mask); // [N, n_token, hidden_size]
if (use_mlp_silu_act) {
if (use_yak_mlp) {
mlp = ggml_ext_silu_act(ctx->ggml_ctx, mlp, false);
} else if (use_mlp_silu_act) {
mlp = ggml_ext_silu_act(ctx->ggml_ctx, mlp);
} else {
mlp = ggml_gelu_inplace(ctx->ggml_ctx, mlp);
@ -726,6 +767,8 @@ namespace Flux {
int64_t in_dim = 64;
bool disable_bias = false;
bool share_modulation = false;
bool semantic_txt_norm = false;
bool use_yak_mlp = false;
bool use_mlp_silu_act = false;
float ref_index_scale = 1.f;
ChromaRadianceParams chroma_radiance_params;
@ -759,6 +802,9 @@ namespace Flux {
blocks["guidance_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size, !params.disable_bias);
}
}
if (params.semantic_txt_norm) {
blocks["txt_norm"] = std::make_shared<RMSNorm>(params.context_in_dim);
}
blocks["txt_in"] = std::make_shared<Linear>(params.context_in_dim, params.hidden_size, !params.disable_bias);
for (int i = 0; i < params.depth; i++) {
@ -770,6 +816,7 @@ namespace Flux {
params.is_chroma,
params.share_modulation,
!params.disable_bias,
params.use_yak_mlp,
params.use_mlp_silu_act);
}
@ -782,6 +829,7 @@ namespace Flux {
params.is_chroma,
params.share_modulation,
!params.disable_bias,
params.use_yak_mlp,
params.use_mlp_silu_act);
}
@ -948,6 +996,12 @@ namespace Flux {
ss_mods = single_stream_modulation->forward(ctx, vec);
}
if (params.semantic_txt_norm) {
auto semantic_txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm"]);
txt = semantic_txt_norm->forward(ctx, txt);
}
txt = txt_in->forward(ctx, txt);
for (int i = 0; i < params.depth; i++) {
@ -1206,6 +1260,11 @@ namespace Flux {
} else if (version == VERSION_CHROMA_RADIANCE) {
flux_params.in_channels = 3;
flux_params.patch_size = 16;
} else if (version == VERSION_OVIS_IMAGE) {
flux_params.semantic_txt_norm = true;
flux_params.use_yak_mlp = true;
flux_params.context_in_dim = 2048;
flux_params.vec_in_dim = 0;
} else if (sd_version_is_flux2(version)) {
flux_params.context_in_dim = 15360;
flux_params.in_channels = 128;
@ -1364,13 +1423,22 @@ namespace Flux {
ref_latents[i] = to_backend(ref_latents[i]);
}
std::set<int> txt_arange_dims;
if (sd_version_is_flux2(version)) {
txt_arange_dims = {3};
increase_ref_index = true;
} else if (version == VERSION_OVIS_IMAGE) {
txt_arange_dims = {1, 2};
}
pe_vec = Rope::gen_flux_pe(x->ne[1],
x->ne[0],
flux_params.patch_size,
x->ne[3],
context->ne[1],
txt_arange_dims,
ref_latents,
sd_version_is_flux2(version) ? true : increase_ref_index,
increase_ref_index,
flux_params.ref_index_scale,
flux_params.theta,
flux_params.axes_dim);

View File

@ -60,6 +60,14 @@
#define SD_UNUSED(x) (void)(x)
#endif
// Returns the amount that has to be added to `n` to reach the next multiple of
// `multiple`; the result is 0 when `n` is already aligned.
// `multiple` is expected to be positive, in which case the result lies in
// [0, multiple). A zero `multiple` would make the remainder below undefined
// behaviour, so it is treated as "nothing to align" and yields 0.
__STATIC_INLINE__ int align_up_offset(int n, int multiple) {
    if (multiple == 0) {
        return 0;
    }
    return (multiple - n % multiple) % multiple;
}
// Rounds `n` up to a multiple of `multiple` by adding the padding computed by
// align_up_offset(); an already aligned `n` is returned unchanged.
__STATIC_INLINE__ int align_up(int n, int multiple) {
    const int padding = align_up_offset(n, multiple);
    return padding + n;
}
__STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
switch (level) {
case GGML_LOG_LEVEL_DEBUG:
@ -760,17 +768,23 @@ __STATIC_INLINE__ std::vector<struct ggml_tensor*> ggml_ext_chunk(struct ggml_co
return chunks;
}
__STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor* x) {
__STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor* x, bool gate_first = true) {
// x: [ne3, ne2, ne1, ne0]
// return: [ne3, ne2, ne1, ne0/2]
auto x_vec = ggml_ext_chunk(ctx, x, 2, 0);
auto x1 = x_vec[0]; // [ne3, ne2, ne1, ne0/2]
auto x2 = x_vec[1]; // [ne3, ne2, ne1, ne0/2]
ggml_tensor* gate;
if (gate_first) {
gate = x_vec[0];
x = x_vec[1];
} else {
x = x_vec[0];
gate = x_vec[1];
}
x1 = ggml_silu_inplace(ctx, x1);
gate = ggml_silu_inplace(ctx, gate);
x = ggml_mul(ctx, x1, x2); // [ne3, ne2, ne1, ne0/2]
x = ggml_mul(ctx, x, gate); // [ne3, ne2, ne1, ne0/2]
return x;
}

View File

@ -91,6 +91,41 @@ const float flux_latent_rgb_proj[16][3] = {
{-0.111849f, -0.055589f, -0.032361f}};
float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};
// Latent -> RGB projection table for Flux.2 previews.
// One row per latent channel (32 rows); the three columns are that channel's
// weights for the R, G and B outputs.
// The preview path selects this table for Flux.2 latents with 128 channels and
// a patch size of 2, i.e. 128 / (2 * 2) = 32 projected channels per pixel.
const float flux2_latent_rgb_proj[32][3] = {
{0.000736f, -0.008385f, -0.019710f},
{-0.001352f, -0.016392f, 0.020693f},
{-0.006376f, 0.002428f, 0.036736f},
{0.039384f, 0.074167f, 0.119789f},
{0.007464f, -0.005705f, -0.004734f},
{-0.004086f, 0.005287f, -0.000409f},
{-0.032835f, 0.050802f, -0.028120f},
{-0.003158f, -0.000835f, 0.000406f},
{-0.112840f, -0.084337f, -0.023083f},
{0.001462f, -0.006656f, 0.000549f},
{-0.009980f, -0.007480f, 0.009702f},
{0.032540f, 0.000214f, -0.061388f},
{0.011023f, 0.000694f, 0.007143f},
{-0.001468f, -0.006723f, -0.001678f},
{-0.005921f, -0.010320f, -0.003907f},
{-0.028434f, 0.027584f, 0.018457f},
{0.014349f, 0.011523f, 0.000441f},
{0.009874f, 0.003081f, 0.001507f},
{0.002218f, 0.005712f, 0.001563f},
{0.053010f, -0.019844f, 0.008683f},
{-0.002507f, 0.005384f, 0.000938f},
{-0.002177f, -0.011366f, 0.003559f},
{-0.000261f, 0.015121f, -0.003240f},
{-0.003944f, -0.002083f, 0.005043f},
{-0.009138f, 0.011336f, 0.003781f},
{0.011429f, 0.003985f, -0.003855f},
{0.010518f, -0.005586f, 0.010131f},
{0.007883f, 0.002912f, -0.001473f},
{-0.003318f, -0.003160f, 0.003684f},
{-0.034560f, -0.008740f, 0.012996f},
{0.000166f, 0.001079f, -0.012153f},
{0.017772f, 0.000937f, -0.011953f}};
// RGB bias paired with flux2_latent_rgb_proj.
// NOTE(review): presumably added to the projected r/g/b values before
// clamping; the place where it is applied is not visible here, so confirm.
float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f};
// This one was taken straight from
// https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
// (MIT License)
@ -128,16 +163,42 @@ const float sd_latent_rgb_proj[4][3] = {
{-0.178022f, -0.200862f, -0.678514f}};
float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};
void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) {
size_t buffer_head = 0;
uint32_t latent_width = latents->ne[0];
uint32_t latent_height = latents->ne[1];
uint32_t dim = latents->ne[ggml_n_dims(latents) - 1];
uint32_t frames = 1;
if (ggml_n_dims(latents) == 4) {
frames = latents->ne[2];
}
uint32_t rgb_width = latent_width * patch_size;
uint32_t rgb_height = latent_height * patch_size;
uint32_t unpatched_dim = dim / (patch_size * patch_size);
for (int k = 0; k < frames; k++) {
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]);
for (int rgb_x = 0; rgb_x < rgb_width; rgb_x++) {
for (int rgb_y = 0; rgb_y < rgb_height; rgb_y++) {
int latent_x = rgb_x / patch_size;
int latent_y = rgb_y / patch_size;
int channel_offset = 0;
if (patch_size > 1) {
channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size));
}
size_t latent_id = (latent_x * latents->nb[0] + latent_y * latents->nb[1] + k * latents->nb[2]);
// should be incremented by 1 for each pixel
size_t pixel_id = k * rgb_width * rgb_height + rgb_y * rgb_width + rgb_x;
float r = 0, g = 0, b = 0;
if (latent_rgb_proj != nullptr) {
for (int d = 0; d < dim; d++) {
float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]);
for (int d = 0; d < unpatched_dim; d++) {
float value = *(float*)((char*)latents->data + latent_id + (d * patch_size * patch_size + channel_offset) * latents->nb[ggml_n_dims(latents) - 1]);
r += value * latent_rgb_proj[d][0];
g += value * latent_rgb_proj[d][1];
b += value * latent_rgb_proj[d][2];
@ -164,9 +225,9 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl
g = g >= 0 ? g <= 1 ? g : 1 : 0;
b = b >= 0 ? b <= 1 ? b : 1 : 0;
buffer[buffer_head++] = (uint8_t)(r * 255);
buffer[buffer_head++] = (uint8_t)(g * 255);
buffer[buffer_head++] = (uint8_t)(b * 255);
buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255);
buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255);
buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255);
}
}
}

71
llm.hpp
View File

@ -356,6 +356,10 @@ namespace LLM {
"<|fim_pad|>",
"<|repo_name|>",
"<|file_sep|>",
"<tool_response>",
"</tool_response>",
"<think>",
"</think>",
};
if (merges_utf8_str.size() > 0) {
@ -859,11 +863,11 @@ namespace LLM {
}
if (arch == LLMArch::MISTRAL_SMALL_3_2) {
q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 131072, 1000000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 131072, 1000000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 8192, 1000000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 8192, 1000000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
} else if (arch == LLMArch::QWEN3) {
q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 151936, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 151936, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
} else {
int sections[4] = {16, 24, 24, 0};
q = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
@ -1073,29 +1077,22 @@ namespace LLM {
: GGMLRunner(backend, offload_params_to_cpu), enable_vision(enable_vision_) {
params.arch = arch;
if (arch == LLMArch::MISTRAL_SMALL_3_2) {
params.num_layers = 40;
params.hidden_size = 5120;
params.intermediate_size = 32768;
params.head_dim = 128;
params.num_heads = 32;
params.num_kv_heads = 8;
params.qkv_bias = false;
params.vocab_size = 131072;
params.rms_norm_eps = 1e-5f;
params.head_dim = 128;
params.num_heads = 32;
params.num_kv_heads = 8;
params.qkv_bias = false;
params.rms_norm_eps = 1e-5f;
} else if (arch == LLMArch::QWEN3) {
params.num_layers = 36;
params.hidden_size = 2560;
params.intermediate_size = 9728;
params.head_dim = 128;
params.num_heads = 32;
params.num_kv_heads = 8;
params.qkv_bias = false;
params.qk_norm = true;
params.vocab_size = 151936;
params.rms_norm_eps = 1e-6f;
params.head_dim = 128;
params.num_heads = 32;
params.num_kv_heads = 8;
params.qkv_bias = false;
params.qk_norm = true;
params.rms_norm_eps = 1e-6f;
}
bool have_vision_weight = false;
bool llama_cpp_style = false;
params.num_layers = 0;
for (auto pair : tensor_storage_map) {
std::string tensor_name = pair.first;
if (tensor_name.find(prefix) == std::string::npos)
@ -1105,10 +1102,36 @@ namespace LLM {
have_vision_weight = true;
if (contains(tensor_name, "attn.q_proj")) {
llama_cpp_style = true;
break;
}
continue;
}
pos = tensor_name.find("layers.");
if (pos != std::string::npos) {
tensor_name = tensor_name.substr(pos); // remove prefix
auto items = split_string(tensor_name, '.');
if (items.size() > 1) {
int block_index = atoi(items[1].c_str());
if (block_index + 1 > params.num_layers) {
params.num_layers = block_index + 1;
}
}
}
if (contains(tensor_name, "embed_tokens.weight")) {
params.hidden_size = pair.second.ne[0];
params.vocab_size = pair.second.ne[1];
}
if (contains(tensor_name, "layers.0.mlp.gate_proj.weight")) {
params.intermediate_size = pair.second.ne[1];
}
}
if (arch == LLMArch::QWEN3 && params.num_layers == 28) { // Qwen3 2B
params.num_heads = 16;
}
LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
params.num_layers,
params.vocab_size,
params.hidden_size,
params.intermediate_size);
if (enable_vision && !have_vision_weight) {
LOG_WARN("no vision weights detected, vision disabled");
enable_vision = false;

View File

@ -1056,6 +1056,9 @@ SDVersion ModelLoader::get_sd_version() {
if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) {
return VERSION_FLUX2;
}
if (tensor_storage.name.find("model.diffusion_model.double_blocks.0.img_mlp.gate_proj.weight") != std::string::npos) {
return VERSION_OVIS_IMAGE;
}
if (tensor_storage.name.find("model.diffusion_model.cap_embedder.0.weight") != std::string::npos) {
return VERSION_Z_IMAGE;
}

View File

@ -45,6 +45,7 @@ enum SDVersion {
VERSION_QWEN_IMAGE,
VERSION_FLUX2,
VERSION_Z_IMAGE,
VERSION_OVIS_IMAGE,
VERSION_COUNT,
};
@ -90,6 +91,7 @@ static inline bool sd_version_is_flux(SDVersion version) {
version == VERSION_FLUX_FILL ||
version == VERSION_FLUX_CONTROLS ||
version == VERSION_FLEX_2 ||
version == VERSION_OVIS_IMAGE ||
version == VERSION_CHROMA_RADIANCE) {
return true;
}

View File

@ -72,11 +72,13 @@ namespace Rope {
}
// Generate IDs for image patches and text
__STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_txt_ids(int bs, int context_len, int axes_dim_num) {
__STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_txt_ids(int bs, int context_len, int axes_dim_num, std::set<int> arange_dims) {
auto txt_ids = std::vector<std::vector<float>>(bs * context_len, std::vector<float>(axes_dim_num, 0.0f));
if (axes_dim_num == 4) {
for (int i = 0; i < bs * context_len; i++) {
txt_ids[i][3] = (i % context_len);
for (int dim = 0; dim < axes_dim_num; dim++) {
if (arange_dims.find(dim) != arange_dims.end()) {
for (int i = 0; i < bs * context_len; i++) {
txt_ids[i][dim] = (i % context_len);
}
}
}
return txt_ids;
@ -211,10 +213,11 @@ namespace Rope {
int bs,
int axes_dim_num,
int context_len,
std::set<int> txt_arange_dims,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index,
float ref_index_scale) {
auto txt_ids = gen_flux_txt_ids(bs, context_len, axes_dim_num);
auto txt_ids = gen_flux_txt_ids(bs, context_len, axes_dim_num, txt_arange_dims);
auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);
auto ids = concat_ids(txt_ids, img_ids, bs);
@ -231,6 +234,7 @@ namespace Rope {
int patch_size,
int bs,
int context_len,
std::set<int> txt_arange_dims,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index,
float ref_index_scale,
@ -242,6 +246,7 @@ namespace Rope {
bs,
static_cast<int>(axes_dim.size()),
context_len,
txt_arange_dims,
ref_latents,
increase_ref_index,
ref_index_scale);

View File

@ -46,6 +46,7 @@ const char* model_version_to_str[] = {
"Qwen Image",
"Flux.2",
"Z-Image",
"Ovis Image",
};
const char* sampling_methods_str[] = {
@ -424,6 +425,13 @@ public:
tensor_storage_map,
sd_ctx_params->chroma_use_t5_mask,
sd_ctx_params->chroma_t5_mask_pad);
} else if (version == VERSION_OVIS_IMAGE) {
cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
offload_params_to_cpu,
tensor_storage_map,
version,
"",
false);
} else {
cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend,
offload_params_to_cpu,
@ -500,18 +508,22 @@ public:
"model.diffusion_model",
version);
} else { // SD1.x SD2.x SDXL
std::map<std::string, std::string> embbeding_map;
for (int i = 0; i < sd_ctx_params->embedding_count; i++) {
embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path));
}
if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
offload_params_to_cpu,
tensor_storage_map,
SAFE_STR(sd_ctx_params->embedding_dir),
embbeding_map,
version,
PM_VERSION_2);
} else {
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
offload_params_to_cpu,
tensor_storage_map,
SAFE_STR(sd_ctx_params->embedding_dir),
embbeding_map,
version);
}
diffusion_model = std::make_shared<UNetModel>(backend,
@ -690,6 +702,11 @@ public:
ignore_tensors.insert("first_stage_model.quant");
ignore_tensors.insert("text_encoders.llm.visual.");
}
if (version == VERSION_OVIS_IMAGE) {
ignore_tensors.insert("text_encoders.llm.vision_model.");
ignore_tensors.insert("text_encoders.llm.visual_tokenizer.");
ignore_tensors.insert("text_encoders.llm.vte.");
}
if (version == VERSION_SVD) {
ignore_tensors.insert("conditioner.embedders.3");
}
@ -920,28 +937,17 @@ public:
float multiplier,
ggml_backend_t backend,
LoraModel::filter_t lora_tensor_filter = nullptr) {
std::string lora_name = lora_id;
std::string high_noise_tag = "|high_noise|";
bool is_high_noise = false;
if (starts_with(lora_name, high_noise_tag)) {
lora_name = lora_name.substr(high_noise_tag.size());
std::string lora_path = lora_id;
static std::string high_noise_tag = "|high_noise|";
bool is_high_noise = false;
if (starts_with(lora_path, high_noise_tag)) {
lora_path = lora_path.substr(high_noise_tag.size());
is_high_noise = true;
LOG_DEBUG("high noise lora: %s", lora_name.c_str());
LOG_DEBUG("high noise lora: %s", lora_path.c_str());
}
std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors");
std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt");
std::string file_path;
if (file_exists(st_file_path)) {
file_path = st_file_path;
} else if (file_exists(ckpt_file_path)) {
file_path = ckpt_file_path;
} else {
LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str());
return nullptr;
}
auto lora = std::make_shared<LoraModel>(lora_id, backend, file_path, is_high_noise ? "model.high_noise_" : "", version);
auto lora = std::make_shared<LoraModel>(lora_id, backend, lora_path, is_high_noise ? "model.high_noise_" : "", version);
if (!lora->load_from_file(n_threads, lora_tensor_filter)) {
LOG_WARN("load lora tensors from %s failed", file_path.c_str());
LOG_WARN("load lora tensors from %s failed", lora_path.c_str());
return nullptr;
}
@ -1126,12 +1132,15 @@ public:
}
}
std::string apply_loras_from_prompt(const std::string& prompt) {
auto result_pair = extract_and_remove_lora(prompt);
std::unordered_map<std::string, float> lora_f2m = result_pair.first; // lora_name -> multiplier
for (auto& kv : lora_f2m) {
LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second);
void apply_loras(const sd_lora_t* loras, uint32_t lora_count) {
std::unordered_map<std::string, float> lora_f2m;
for (int i = 0; i < lora_count; i++) {
std::string lora_id = SAFE_STR(loras[i].path);
if (loras[i].is_high_noise) {
lora_id = "|high_noise|" + lora_id;
}
lora_f2m[lora_id] = loras[i].multiplier;
LOG_DEBUG("lora %s:%.2f", lora_id.c_str(), loras[i].multiplier);
}
int64_t t0 = ggml_time_ms();
if (apply_lora_immediately) {
@ -1142,9 +1151,7 @@ public:
int64_t t1 = ggml_time_ms();
if (!lora_f2m.empty()) {
LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
LOG_DEBUG("prompt after extract and remove lora: \"%s\"", result_pair.second.c_str());
}
return result_pair.second;
}
ggml_tensor* id_encoder(ggml_context* work_ctx,
@ -1309,10 +1316,17 @@ public:
uint32_t dim = latents->ne[ggml_n_dims(latents) - 1];
if (preview_mode == PREVIEW_PROJ) {
int64_t patch_sz = 1;
const float(*latent_rgb_proj)[channel] = nullptr;
float* latent_rgb_bias = nullptr;
if (dim == 48) {
if (dim == 128) {
if (sd_version_is_flux2(version)) {
latent_rgb_proj = flux2_latent_rgb_proj;
latent_rgb_bias = flux2_latent_rgb_bias;
patch_sz = 2;
}
} else if (dim == 48) {
if (sd_version_is_wan(version)) {
latent_rgb_proj = wan_22_latent_rgb_proj;
latent_rgb_bias = wan_22_latent_rgb_bias;
@ -1365,12 +1379,15 @@ public:
frames = latents->ne[2];
}
uint8_t* data = (uint8_t*)malloc(frames * width * height * channel * sizeof(uint8_t));
uint32_t img_width = width * patch_sz;
uint32_t img_height = height * patch_sz;
preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, width, height, frames, dim);
uint8_t* data = (uint8_t*)malloc(frames * img_width * img_height * channel * sizeof(uint8_t));
preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, patch_sz);
sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
for (int i = 0; i < frames; i++) {
images[i] = {width, height, channel, data + i * width * height * channel};
images[i] = {img_width, img_height, channel, data + i * img_width * img_height * channel};
}
step_callback(step, frames, images, is_noisy, step_callback_data);
free(data);
@ -1881,6 +1898,18 @@ public:
return vae_scale_factor;
}
// Returns the diffusion model's spatial down factor for the loaded model
// version: 8 for UNet models, 2 for Wan models and 1 for every other DiT model.
// Callers multiply it by the VAE scale factor to get the multiple that the
// requested width and height are aligned up to.
int get_diffusion_model_down_factor() {
    if (!sd_version_is_dit(version)) {
        return 8;  // unet
    }
    return sd_version_is_wan(version) ? 2 : 1;
}
int get_latent_channel() {
int latent_channel = 4;
if (sd_version_is_dit(version)) {
@ -2508,7 +2537,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
"taesd_path: %s\n"
"control_net_path: %s\n"
"lora_model_dir: %s\n"
"embedding_dir: %s\n"
"photo_maker_path: %s\n"
"tensor_type_rules: %s\n"
"vae_decode_only: %s\n"
@ -2539,7 +2567,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
SAFE_STR(sd_ctx_params->taesd_path),
SAFE_STR(sd_ctx_params->control_net_path),
SAFE_STR(sd_ctx_params->lora_model_dir),
SAFE_STR(sd_ctx_params->embedding_dir),
SAFE_STR(sd_ctx_params->photo_maker_path),
SAFE_STR(sd_ctx_params->tensor_type_rules),
BOOL_STR(sd_ctx_params->vae_decode_only),
@ -2790,8 +2817,6 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
int sample_steps = sigmas.size() - 1;
int64_t t0 = ggml_time_ms();
// Apply lora
prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);
// Photo Maker
std::string prompt_text_only;
@ -3120,22 +3145,19 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params;
int width = sd_img_gen_params->width;
int height = sd_img_gen_params->height;
int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
if (sd_version_is_dit(sd_ctx->sd->version)) {
if (width % 16 || height % 16) {
LOG_ERROR("Image dimensions must be must be a multiple of 16 on each axis for %s models. (Got %dx%d)",
model_version_to_str[sd_ctx->sd->version],
width,
height);
return nullptr;
}
} else if (width % 64 || height % 64) {
LOG_ERROR("Image dimensions must be must be a multiple of 64 on each axis for %s models. (Got %dx%d)",
model_version_to_str[sd_ctx->sd->version],
width,
height);
return nullptr;
int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
int spatial_multiple = vae_scale_factor * diffusion_model_down_factor;
int width_offset = align_up_offset(width, spatial_multiple);
int height_offset = align_up_offset(height, spatial_multiple);
if (width_offset > 0 || height_offset > 0) {
width += width_offset;
height += height_offset;
LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_img_gen_params->width, sd_img_gen_params->height, width, height, spatial_multiple);
}
LOG_DEBUG("generate_image %dx%d", width, height);
if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
return nullptr;
@ -3163,6 +3185,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
size_t t0 = ggml_time_ms();
// Apply lora
sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count);
enum sample_method_t sample_method = sd_img_gen_params->sample_params.sample_method;
if (sample_method == SAMPLE_METHOD_COUNT) {
sample_method = sd_get_default_sample_method(sd_ctx);
@ -3406,9 +3431,19 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
int frames = sd_vid_gen_params->video_frames;
frames = (frames - 1) / 4 * 4 + 1;
int sample_steps = sd_vid_gen_params->sample_params.sample_steps;
LOG_INFO("generate_video %dx%dx%d", width, height, frames);
int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
int spatial_multiple = vae_scale_factor * diffusion_model_down_factor;
int width_offset = align_up_offset(width, spatial_multiple);
int height_offset = align_up_offset(height, spatial_multiple);
if (width_offset > 0 || height_offset > 0) {
width += width_offset;
height += height_offset;
LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_vid_gen_params->width, sd_vid_gen_params->height, width, height, spatial_multiple);
}
LOG_INFO("generate_video %dx%dx%d", width, height, frames);
enum sample_method_t sample_method = sd_vid_gen_params->sample_params.sample_method;
if (sample_method == SAMPLE_METHOD_COUNT) {
@ -3462,7 +3497,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
int64_t t0 = ggml_time_ms();
// Apply lora
prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);
sd_ctx->sd->apply_loras(sd_vid_gen_params->loras, sd_vid_gen_params->lora_count);
ggml_tensor* init_latent = nullptr;
ggml_tensor* clip_vision_output = nullptr;

View File

@ -150,6 +150,11 @@ typedef struct {
float rel_size_y;
} sd_tiling_params_t;
// One custom embedding entry: associates an embedding name with its path.
// The context setup copies these entries into a name -> path map that is
// handed to the CLIP text encoder.
typedef struct {
// name under which the embedding is registered (used as the map key)
const char* name;
// path associated with `name` (used as the map value)
const char* path;
} sd_embedding_t;
typedef struct {
const char* model_path;
const char* clip_l_path;
@ -164,7 +169,8 @@ typedef struct {
const char* taesd_path;
const char* control_net_path;
const char* lora_model_dir;
const char* embedding_dir;
const sd_embedding_t* embeddings;
uint32_t embedding_count;
const char* photo_maker_path;
const char* tensor_type_rules;
bool vae_decode_only;
@ -236,6 +242,14 @@ typedef struct {
} sd_easycache_params_t;
// One LoRA to apply for a generation request.
typedef struct {
// when true, the LoRA id is tagged with "|high_noise|" and the LoRA is loaded
// with the "model.high_noise_" tensor prefix
bool is_high_noise;
// multiplier recorded for this LoRA when the set of LoRAs is applied
float multiplier;
// path of the LoRA model file; also used as the LoRA's id
const char* path;
} sd_lora_t;
typedef struct {
const sd_lora_t* loras;
uint32_t lora_count;
const char* prompt;
const char* negative_prompt;
int clip_skip;
@ -259,6 +273,8 @@ typedef struct {
} sd_img_gen_params_t;
typedef struct {
const sd_lora_t* loras;
uint32_t lora_count;
const char* prompt;
const char* negative_prompt;
int clip_skip;
@ -331,7 +347,8 @@ typedef struct upscaler_ctx_t upscaler_ctx_t;
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
bool offload_params_to_cpu,
bool direct,
int n_threads);
int n_threads,
int tile_size);
SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
@ -353,6 +370,9 @@ SD_API bool preprocess_canny(sd_image_t image,
float strong,
bool inverse);
SD_API const char* sd_commit(void);
SD_API const char* sd_version(void);
#ifdef __cplusplus
}
#endif

View File

@ -9,12 +9,15 @@ struct UpscalerGGML {
std::shared_ptr<ESRGAN> esrgan_upscaler;
std::string esrgan_path;
int n_threads;
bool direct = false;
bool direct = false;
int tile_size = 128;
UpscalerGGML(int n_threads,
bool direct = false)
bool direct = false,
int tile_size = 128)
: n_threads(n_threads),
direct(direct) {
direct(direct),
tile_size(tile_size) {
}
bool load_from_file(const std::string& esrgan_path,
@ -51,7 +54,7 @@ struct UpscalerGGML {
backend = ggml_backend_cpu_init();
}
LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, model_loader.get_tensor_storage_map());
esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map());
if (direct) {
esrgan_upscaler->set_conv2d_direct_enabled(true);
}
@ -113,14 +116,15 @@ struct upscaler_ctx_t {
upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
bool offload_params_to_cpu,
bool direct,
int n_threads) {
int n_threads,
int tile_size) {
upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
if (upscaler_ctx == nullptr) {
return nullptr;
}
std::string esrgan_path(esrgan_path_c_str);
upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct);
upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size);
if (upscaler_ctx->upscaler == nullptr) {
return nullptr;
}

View File

@ -95,20 +95,6 @@ bool is_directory(const std::string& path) {
return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
}
std::string get_full_path(const std::string& dir, const std::string& filename) {
std::string full_path = dir + "\\" + filename;
WIN32_FIND_DATA find_file_data;
HANDLE hFind = FindFirstFile(full_path.c_str(), &find_file_data);
if (hFind != INVALID_HANDLE_VALUE) {
FindClose(hFind);
return full_path;
} else {
return "";
}
}
#else // Unix
#include <dirent.h>
#include <sys/stat.h>
@ -123,26 +109,6 @@ bool is_directory(const std::string& path) {
return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
}
// TODO: add windows version
std::string get_full_path(const std::string& dir, const std::string& filename) {
DIR* dp = opendir(dir.c_str());
if (dp != nullptr) {
struct dirent* entry;
while ((entry = readdir(dp)) != nullptr) {
if (strcasecmp(entry->d_name, filename.c_str()) == 0) {
closedir(dp);
return dir + "/" + entry->d_name;
}
}
closedir(dp);
}
return "";
}
#endif
// get_num_physical_cores is copy from

1
util.h
View File

@ -22,7 +22,6 @@ int round_up_to(int value, int base);
bool file_exists(const std::string& filename);
bool is_directory(const std::string& path);
std::string get_full_path(const std::string& dir, const std::string& filename);
std::u32string utf8_to_utf32(const std::string& utf8_str);
std::string utf32_to_utf8(const std::u32string& utf32_str);

20
version.cpp Normal file
View File

@ -0,0 +1,20 @@
#include "stable-diffusion.h"
// Build identification strings.
// SDCPP_BUILD_COMMIT and SDCPP_BUILD_VERSION are expected to arrive as
// compiler definitions; when either one is absent it falls back to `unknown`.
#if !defined(SDCPP_BUILD_COMMIT)
#define SDCPP_BUILD_COMMIT unknown
#endif

#if !defined(SDCPP_BUILD_VERSION)
#define SDCPP_BUILD_VERSION unknown
#endif

// Two-step stringification: the outer macro lets its argument expand first,
// the inner one turns the expanded tokens into a string literal.
#define SD_STRINGIFY_EXPANDED(x) #x
#define SD_STRINGIFY(x) SD_STRINGIFY_EXPANDED(x)

static const char sd_build_commit_str[]  = SD_STRINGIFY(SDCPP_BUILD_COMMIT);
static const char sd_build_version_str[] = SD_STRINGIFY(SDCPP_BUILD_VERSION);

// Returns the commit identifier embedded at build time.
const char* sd_commit(void) {
    return sd_build_commit_str;
}

// Returns the version string embedded at build time.
const char* sd_version(void) {
    return sd_build_version_str;
}