Compare commits

...

3 Commits

Author SHA1 Message Date
leejet
9e28be6479
feat: add chroma radiance support (#910)
* add chroma radiance support

* fix ci

* simply generate_init_latent

* workaround: avoid ggml cuda error

* format code

* add chroma radiance doc
2025-10-25 23:56:14 +08:00
akleine
062490aa7c
feat: add SSD1B and tiny-sd support (#897)
* feat: add code and doc for running SSD1B models

* Added some more lines to support SD1.x with TINY U-Nets too.

* support SSD-1B.safetensors

* fix sdv1.5 diffusers format loader

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-10-25 23:35:54 +08:00
stduhpf
faabc5ad3c
feat: allow models to run without all text encoder(s) (#645) 2025-10-25 22:00:56 +08:00
13 changed files with 1085 additions and 356 deletions


@ -35,9 +35,11 @@ API and command-line option may change frequently.***
 - Image Models
   - SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
   - SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
+  - [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
   - [SD3/SD3.5](./docs/sd3.md)
   - [Flux-dev/Flux-schnell](./docs/flux.md)
   - [Chroma](./docs/chroma.md)
+  - [Chroma1-Radiance](./docs/chroma_radiance.md)
   - [Qwen Image](./docs/qwen_image.md)
 - Image Edit Models
   - [FLUX.1-Kontext-dev](./docs/kontext.md)

Binary file not shown (new image asset, 477 KiB).


@ -673,33 +673,80 @@ struct SD3CLIPEmbedder : public Conditioner {
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}) const String2GGMLType& tensor_types = {})
: clip_g_tokenizer(0) { : clip_g_tokenizer(0) {
clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); bool use_clip_l = false;
clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); bool use_clip_g = false;
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); bool use_t5 = false;
for (auto pair : tensor_types) {
if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
use_clip_l = true;
} else if (pair.first.find("text_encoders.clip_g") != std::string::npos) {
use_clip_g = true;
} else if (pair.first.find("text_encoders.t5xxl") != std::string::npos) {
use_t5 = true;
}
}
if (!use_clip_l && !use_clip_g && !use_t5) {
LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
return;
}
if (use_clip_l) {
clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
}
if (use_clip_g) {
clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
}
if (use_t5) {
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
}
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override { void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model"); if (clip_l) {
clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model"); clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); }
if (clip_g) {
clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model");
}
if (t5) {
t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
}
} }
void alloc_params_buffer() override { void alloc_params_buffer() override {
clip_l->alloc_params_buffer(); if (clip_l) {
clip_g->alloc_params_buffer(); clip_l->alloc_params_buffer();
t5->alloc_params_buffer(); }
if (clip_g) {
clip_g->alloc_params_buffer();
}
if (t5) {
t5->alloc_params_buffer();
}
} }
void free_params_buffer() override { void free_params_buffer() override {
clip_l->free_params_buffer(); if (clip_l) {
clip_g->free_params_buffer(); clip_l->free_params_buffer();
t5->free_params_buffer(); }
if (clip_g) {
clip_g->free_params_buffer();
}
if (t5) {
t5->free_params_buffer();
}
} }
size_t get_params_buffer_size() override { size_t get_params_buffer_size() override {
size_t buffer_size = clip_l->get_params_buffer_size(); size_t buffer_size = 0;
buffer_size += clip_g->get_params_buffer_size(); if (clip_l) {
buffer_size += t5->get_params_buffer_size(); buffer_size += clip_l->get_params_buffer_size();
}
if (clip_g) {
buffer_size += clip_g->get_params_buffer_size();
}
if (t5) {
buffer_size += t5->get_params_buffer_size();
}
return buffer_size; return buffer_size;
} }
@ -731,23 +778,32 @@ struct SD3CLIPEmbedder : public Conditioner {
for (const auto& item : parsed_attention) { for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first; const std::string& curr_text = item.first;
float curr_weight = item.second; float curr_weight = item.second;
if (clip_l) {
std::vector<int> curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb); std::vector<int> curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb);
clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end()); clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end());
clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight); clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight);
}
curr_tokens = clip_g_tokenizer.encode(curr_text, on_new_token_cb); if (clip_g) {
clip_g_tokens.insert(clip_g_tokens.end(), curr_tokens.begin(), curr_tokens.end()); std::vector<int> curr_tokens = clip_g_tokenizer.encode(curr_text, on_new_token_cb);
clip_g_weights.insert(clip_g_weights.end(), curr_tokens.size(), curr_weight); clip_g_tokens.insert(clip_g_tokens.end(), curr_tokens.begin(), curr_tokens.end());
clip_g_weights.insert(clip_g_weights.end(), curr_tokens.size(), curr_weight);
curr_tokens = t5_tokenizer.Encode(curr_text, true); }
t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); if (t5) {
t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
}
} }
clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, max_length, padding); if (clip_l) {
clip_g_tokenizer.pad_tokens(clip_g_tokens, clip_g_weights, max_length, padding); clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, max_length, padding);
t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding); }
if (clip_g) {
clip_g_tokenizer.pad_tokens(clip_g_tokens, clip_g_weights, max_length, padding);
}
if (t5) {
t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding);
}
// for (int i = 0; i < clip_l_tokens.size(); i++) { // for (int i = 0; i < clip_l_tokens.size(); i++) {
// std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", "; // std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", ";
@ -795,10 +851,10 @@ struct SD3CLIPEmbedder : public Conditioner {
std::vector<float> hidden_states_vec; std::vector<float> hidden_states_vec;
size_t chunk_len = 77; size_t chunk_len = 77;
size_t chunk_count = clip_l_tokens.size() / chunk_len; size_t chunk_count = std::max(std::max(clip_l_tokens.size(), clip_g_tokens.size()), t5_tokens.size()) / chunk_len;
for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) { for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
// clip_l // clip_l
{ if (clip_l) {
std::vector<int> chunk_tokens(clip_l_tokens.begin() + chunk_idx * chunk_len, std::vector<int> chunk_tokens(clip_l_tokens.begin() + chunk_idx * chunk_len,
clip_l_tokens.begin() + (chunk_idx + 1) * chunk_len); clip_l_tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector<float> chunk_weights(clip_l_weights.begin() + chunk_idx * chunk_len, std::vector<float> chunk_weights(clip_l_weights.begin() + chunk_idx * chunk_len,
@ -845,10 +901,17 @@ struct SD3CLIPEmbedder : public Conditioner {
&pooled_l, &pooled_l,
work_ctx); work_ctx);
} }
} else {
chunk_hidden_states_l = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, chunk_len);
ggml_set_f32(chunk_hidden_states_l, 0.f);
if (chunk_idx == 0) {
pooled_l = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
ggml_set_f32(pooled_l, 0.f);
}
} }
// clip_g // clip_g
{ if (clip_g) {
std::vector<int> chunk_tokens(clip_g_tokens.begin() + chunk_idx * chunk_len, std::vector<int> chunk_tokens(clip_g_tokens.begin() + chunk_idx * chunk_len,
clip_g_tokens.begin() + (chunk_idx + 1) * chunk_len); clip_g_tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector<float> chunk_weights(clip_g_weights.begin() + chunk_idx * chunk_len, std::vector<float> chunk_weights(clip_g_weights.begin() + chunk_idx * chunk_len,
@ -896,10 +959,17 @@ struct SD3CLIPEmbedder : public Conditioner {
&pooled_g, &pooled_g,
work_ctx); work_ctx);
} }
} else {
chunk_hidden_states_g = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 1280, chunk_len);
ggml_set_f32(chunk_hidden_states_g, 0.f);
if (chunk_idx == 0) {
pooled_g = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1280);
ggml_set_f32(pooled_g, 0.f);
}
} }
// t5 // t5
{ if (t5) {
std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len, std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
t5_tokens.begin() + (chunk_idx + 1) * chunk_len); t5_tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len, std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len,
@ -927,6 +997,9 @@ struct SD3CLIPEmbedder : public Conditioner {
float new_mean = ggml_tensor_mean(tensor); float new_mean = ggml_tensor_mean(tensor);
ggml_tensor_scale(tensor, (original_mean / new_mean)); ggml_tensor_scale(tensor, (original_mean / new_mean));
} }
} else {
chunk_hidden_states_t5 = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len);
ggml_set_f32(chunk_hidden_states_t5, 0.f);
} }
auto chunk_hidden_states_lg_pad = ggml_new_tensor_3d(work_ctx, auto chunk_hidden_states_lg_pad = ggml_new_tensor_3d(work_ctx,
@ -969,11 +1042,20 @@ struct SD3CLIPEmbedder : public Conditioner {
((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states)); ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states));
} }
hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); if (hidden_states_vec.size() > 0) {
hidden_states = ggml_reshape_2d(work_ctx, hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
hidden_states, hidden_states = ggml_reshape_2d(work_ctx,
chunk_hidden_states->ne[0], hidden_states,
ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); chunk_hidden_states->ne[0],
ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
} else {
hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256);
ggml_set_f32(hidden_states, 0.f);
}
if (pooled == nullptr) {
pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2048);
ggml_set_f32(pooled, 0.f);
}
return {hidden_states, pooled, nullptr}; return {hidden_states, pooled, nullptr};
} }
@ -999,28 +1081,68 @@ struct FluxCLIPEmbedder : public Conditioner {
FluxCLIPEmbedder(ggml_backend_t backend, FluxCLIPEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {}) { const String2GGMLType& tensor_types = {}) {
clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); bool use_clip_l = false;
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); bool use_t5 = false;
for (auto pair : tensor_types) {
if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
use_clip_l = true;
} else if (pair.first.find("text_encoders.t5xxl") != std::string::npos) {
use_t5 = true;
}
}
if (!use_clip_l && !use_t5) {
LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
return;
}
if (use_clip_l) {
clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
} else {
LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded.");
}
if (use_t5) {
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
} else {
LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded.");
}
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override { void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model"); if (clip_l) {
t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
}
if (t5) {
t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
}
} }
void alloc_params_buffer() override { void alloc_params_buffer() override {
clip_l->alloc_params_buffer(); if (clip_l) {
t5->alloc_params_buffer(); clip_l->alloc_params_buffer();
}
if (t5) {
t5->alloc_params_buffer();
}
} }
void free_params_buffer() override { void free_params_buffer() override {
clip_l->free_params_buffer(); if (clip_l) {
t5->free_params_buffer(); clip_l->free_params_buffer();
}
if (t5) {
t5->free_params_buffer();
}
} }
size_t get_params_buffer_size() override { size_t get_params_buffer_size() override {
size_t buffer_size = clip_l->get_params_buffer_size(); size_t buffer_size = 0;
buffer_size += t5->get_params_buffer_size(); if (clip_l) {
buffer_size += clip_l->get_params_buffer_size();
}
if (t5) {
buffer_size += t5->get_params_buffer_size();
}
return buffer_size; return buffer_size;
} }
@ -1050,18 +1172,24 @@ struct FluxCLIPEmbedder : public Conditioner {
for (const auto& item : parsed_attention) { for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first; const std::string& curr_text = item.first;
float curr_weight = item.second; float curr_weight = item.second;
if (clip_l) {
std::vector<int> curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb); std::vector<int> curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb);
clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end()); clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end());
clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight); clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight);
}
curr_tokens = t5_tokenizer.Encode(curr_text, true); if (t5) {
t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
}
} }
clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, 77, padding); if (clip_l) {
t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding); clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, 77, padding);
}
if (t5) {
t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding);
}
// for (int i = 0; i < clip_l_tokens.size(); i++) { // for (int i = 0; i < clip_l_tokens.size(); i++) {
// std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", "; // std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", ";
@ -1096,35 +1224,37 @@ struct FluxCLIPEmbedder : public Conditioner {
struct ggml_tensor* pooled = nullptr; // [768,] struct ggml_tensor* pooled = nullptr; // [768,]
std::vector<float> hidden_states_vec; std::vector<float> hidden_states_vec;
size_t chunk_count = t5_tokens.size() / chunk_len; size_t chunk_count = std::max(clip_l_tokens.size() > 0 ? chunk_len : 0, t5_tokens.size()) / chunk_len;
for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) { for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
// clip_l // clip_l
if (chunk_idx == 0) { if (chunk_idx == 0) {
size_t chunk_len_l = 77; if (clip_l) {
std::vector<int> chunk_tokens(clip_l_tokens.begin(), size_t chunk_len_l = 77;
clip_l_tokens.begin() + chunk_len_l); std::vector<int> chunk_tokens(clip_l_tokens.begin(),
std::vector<float> chunk_weights(clip_l_weights.begin(), clip_l_tokens.begin() + chunk_len_l);
clip_l_weights.begin() + chunk_len_l); std::vector<float> chunk_weights(clip_l_weights.begin(),
clip_l_weights.begin() + chunk_len_l);
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
size_t max_token_idx = 0; size_t max_token_idx = 0;
auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID); auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1); max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
clip_l->compute(n_threads, clip_l->compute(n_threads,
input_ids, input_ids,
0, 0,
nullptr, nullptr,
max_token_idx, max_token_idx,
true, true,
clip_skip, clip_skip,
&pooled, &pooled,
work_ctx); work_ctx);
}
} }
// t5 // t5
{ if (t5) {
std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len, std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
t5_tokens.begin() + (chunk_idx + 1) * chunk_len); t5_tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len, std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len,
@ -1152,6 +1282,9 @@ struct FluxCLIPEmbedder : public Conditioner {
float new_mean = ggml_tensor_mean(tensor); float new_mean = ggml_tensor_mean(tensor);
ggml_tensor_scale(tensor, (original_mean / new_mean)); ggml_tensor_scale(tensor, (original_mean / new_mean));
} }
} else {
chunk_hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len);
ggml_set_f32(chunk_hidden_states, 0.f);
} }
int64_t t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
@ -1168,11 +1301,20 @@ struct FluxCLIPEmbedder : public Conditioner {
((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states)); ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states));
} }
hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); if (hidden_states_vec.size() > 0) {
hidden_states = ggml_reshape_2d(work_ctx, hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
hidden_states, hidden_states = ggml_reshape_2d(work_ctx,
chunk_hidden_states->ne[0], hidden_states,
ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); chunk_hidden_states->ne[0],
ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
} else {
hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256);
ggml_set_f32(hidden_states, 0.f);
}
if (pooled == nullptr) {
pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
ggml_set_f32(pooled, 0.f);
}
return {hidden_states, pooled, nullptr}; return {hidden_states, pooled, nullptr};
} }
@ -1203,26 +1345,44 @@ struct T5CLIPEmbedder : public Conditioner {
int mask_pad = 1, int mask_pad = 1,
bool is_umt5 = false) bool is_umt5 = false)
: use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) { : use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) {
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5); bool use_t5 = false;
for (auto pair : tensor_types) {
if (pair.first.find("text_encoders.t5xxl") != std::string::npos) {
use_t5 = true;
}
}
if (!use_t5) {
LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
return;
} else {
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
}
} }
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override { void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); if (t5) {
t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
}
} }
void alloc_params_buffer() override { void alloc_params_buffer() override {
t5->alloc_params_buffer(); if (t5) {
t5->alloc_params_buffer();
}
} }
void free_params_buffer() override { void free_params_buffer() override {
t5->free_params_buffer(); if (t5) {
t5->free_params_buffer();
}
} }
size_t get_params_buffer_size() override { size_t get_params_buffer_size() override {
size_t buffer_size = 0; size_t buffer_size = 0;
if (t5) {
buffer_size += t5->get_params_buffer_size(); buffer_size += t5->get_params_buffer_size();
}
return buffer_size; return buffer_size;
} }
@ -1248,17 +1408,18 @@ struct T5CLIPEmbedder : public Conditioner {
std::vector<int> t5_tokens; std::vector<int> t5_tokens;
std::vector<float> t5_weights; std::vector<float> t5_weights;
std::vector<float> t5_mask; std::vector<float> t5_mask;
for (const auto& item : parsed_attention) { if (t5) {
const std::string& curr_text = item.first; for (const auto& item : parsed_attention) {
float curr_weight = item.second; const std::string& curr_text = item.first;
float curr_weight = item.second;
std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true); std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
}
t5_tokenizer.pad_tokens(t5_tokens, t5_weights, &t5_mask, max_length, padding);
} }
t5_tokenizer.pad_tokens(t5_tokens, t5_weights, &t5_mask, max_length, padding);
return {t5_tokens, t5_weights, t5_mask}; return {t5_tokens, t5_weights, t5_mask};
} }
@ -1282,6 +1443,13 @@ struct T5CLIPEmbedder : public Conditioner {
std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> token_and_weights, std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> token_and_weights,
int clip_skip, int clip_skip,
bool zero_out_masked = false) { bool zero_out_masked = false) {
if (!t5) {
auto hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256);
ggml_set_f32(hidden_states, 0.f);
auto t5_attn_mask = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 256);
ggml_set_f32(t5_attn_mask, -HUGE_VALF);
return {hidden_states, t5_attn_mask, nullptr};
}
auto& t5_tokens = std::get<0>(token_and_weights); auto& t5_tokens = std::get<0>(token_and_weights);
auto& t5_weights = std::get<1>(token_and_weights); auto& t5_weights = std::get<1>(token_and_weights);
auto& t5_attn_mask_vec = std::get<2>(token_and_weights); auto& t5_attn_mask_vec = std::get<2>(token_and_weights);

docs/chroma_radiance.md (new file, 21 lines)

@ -0,0 +1,21 @@
# How to Use
## Download weights
- Download Chroma1-Radiance
- safetensors: https://huggingface.co/lodestones/Chroma1-Radiance/tree/main
- gguf: https://huggingface.co/silveroxides/Chroma1-Radiance-GGUF/tree/main
- Download t5xxl
- safetensors: https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Examples
```
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Chroma1-Radiance-v0.4-Q8_0.gguf --t5xxl ..\..\ComfyUI\models\clip\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma radiance cpp'" --cfg-scale 4.0 --sampling-method euler -v
```
<img alt="Chroma1-Radiance" src="../assets/flux/chroma1-radiance.png" />

docs/distilled_sd.md (new file, 86 lines)

@ -0,0 +1,86 @@
# Running distilled models: SSD1B and SD1.x with tiny U-Nets
## Preface
These kinds of models have a reduced U-Net part.
Unlike other SDXL models, the SSD-1B U-Net has only one middle block and fewer attention layers in its up and down blocks, resulting in noticeably smaller files. Running these models saves more than 33% of the time. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1
Unlike other SD 1.x models, tiny U-Net models consist of only 6 U-Net blocks, resulting in smaller files (approximately 1 GB). Running these models saves almost 50% of the time. For more details, refer to the paper: https://arxiv.org/pdf/2305.15798.pdf
## SSD1B
Unfortunately, not all of these models follow the standard model parameter naming mapping.
Still, there are some very useful SSD-1B models available online, such as:
* https://huggingface.co/segmind/SSD-1B/resolve/main/SSD-1B-A1111.safetensors
* https://huggingface.co/hassenhamdi/SSD-1B-fp8_e4m3fn/resolve/main/SSD-1B_fp8_e4m3fn.safetensors
There are also useful LoRAs available:
* https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors
* https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors
You can use these files **out-of-the-box**, unlike the models in the next section.
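A minimal sketch of running one of these checkpoints with the sd.cpp CLI (the binary path, model path, and prompt below are placeholders; adjust them to your build and download locations):
```bash
# hypothetical example: SSD-1B checkpoint downloaded into ./models/
./build/bin/sd -m ./models/SSD-1B-A1111.safetensors \
    -p "a photo of an astronaut riding a horse on the moon" \
    -o ./ssd1b_output.png -v
```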
## SD1.x with tiny U-Nets
There are some Tiny SD 1.x models available online, such as:
* https://huggingface.co/segmind/tiny-sd
* https://huggingface.co/segmind/portrait-finetuned
* https://huggingface.co/nota-ai/bk-sdm-tiny
These models need some conversion, partly because some tensors are stored **non-contiguously**. To create a usable checkpoint file, follow these **easy** steps:
### Download model from Hugging Face
Download the model using Python on your computer, for example this way:
```python
import torch
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("segmind/tiny-sd")
unet = pipe.unet
for param in unet.parameters():
    param.data = param.data.contiguous()  # <- important here
pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
```
### Convert that to a ckpt file
To convert the downloaded model to a checkpoint file, you need another Python script. Download the conversion script from here:
* https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py
### Run the conversion script
Now, run that conversion script:
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path ./segmindtiny-sd \
--checkpoint_path ./segmind_tiny-sd.ckpt --half
```
The file **segmind_tiny-sd.ckpt** will be generated and is then ready to use with sd.cpp.
You can follow a similar process for the other Hugging Face models mentioned above.
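Once converted, a run is a sketch along the same lines as the SSD-1B example above (paths and prompt are again placeholders):
```bash
# hypothetical invocation using the checkpoint produced by the conversion step
./build/bin/sd -m ./segmind_tiny-sd.ckpt -p "portrait photo of a woman, studio lighting" -o ./tiny_sd_output.png -v
```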
### Another ckpt file on the net
There is another model file available online:
* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
If you want to use it, you have to adjust its **non-contiguous tensors** first:
```python
import torch
ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
for key, value in ckpt['state_dict'].items():
    if isinstance(value, torch.Tensor):
        ckpt['state_dict'][key] = value.contiguous()
torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
```

flux.hpp (594 lines changed)

@ -399,7 +399,7 @@ namespace Flux {
ModulationOut get_distil_mod(struct ggml_context* ctx, struct ggml_tensor* vec) { ModulationOut get_distil_mod(struct ggml_context* ctx, struct ggml_tensor* vec) {
int64_t offset = 3 * idx; int64_t offset = 3 * idx;
return {ctx, vec, offset}; return ModulationOut(ctx, vec, offset);
} }
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* forward(struct ggml_context* ctx,
@ -549,7 +549,135 @@ namespace Flux {
} }
}; };
struct NerfEmbedder : public GGMLBlock {
NerfEmbedder(int64_t in_channels,
int64_t hidden_size_input,
int64_t max_freqs) {
blocks["embedder.0"] = std::make_shared<Linear>(in_channels + max_freqs * max_freqs, hidden_size_input);
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* dct) {
// x: (B, P^2, C)
// dct: (1, P^2, max_freqs^2)
// return: (B, P^2, hidden_size_input)
auto embedder = std::dynamic_pointer_cast<Linear>(blocks["embedder.0"]);
dct = ggml_repeat_4d(ctx, dct, dct->ne[0], dct->ne[1], x->ne[2], x->ne[3]);
x = ggml_concat(ctx, x, dct, 0);
x = embedder->forward(ctx, x);
return x;
}
};
struct NerfGLUBlock : public GGMLBlock {
int64_t mlp_ratio;
NerfGLUBlock(int64_t hidden_size_s,
int64_t hidden_size_x,
int64_t mlp_ratio)
: mlp_ratio(mlp_ratio) {
int64_t total_params = 3 * hidden_size_x * hidden_size_x * mlp_ratio;
blocks["param_generator"] = std::make_shared<Linear>(hidden_size_s, total_params);
blocks["norm"] = std::make_shared<RMSNorm>(hidden_size_x);
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* s) {
// x: (batch_size, n_token, hidden_size_x)
// s: (batch_size, hidden_size_s)
// return: (batch_size, n_token, hidden_size_x)
auto param_generator = std::dynamic_pointer_cast<Linear>(blocks["param_generator"]);
auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
int64_t batch_size = x->ne[2];
int64_t hidden_size_x = x->ne[0];
auto mlp_params = param_generator->forward(ctx, s);
auto fc_params = ggml_chunk(ctx, mlp_params, 3, 0);
auto fc1_gate = ggml_reshape_3d(ctx, fc_params[0], hidden_size_x * mlp_ratio, hidden_size_x, batch_size);
auto fc1_value = ggml_reshape_3d(ctx, fc_params[1], hidden_size_x * mlp_ratio, hidden_size_x, batch_size);
auto fc2 = ggml_reshape_3d(ctx, fc_params[2], hidden_size_x, mlp_ratio * hidden_size_x, batch_size);
fc1_gate = ggml_cont(ctx, ggml_torch_permute(ctx, fc1_gate, 1, 0, 2, 3)); // [batch_size, hidden_size_x*mlp_ratio, hidden_size_x]
fc1_gate = ggml_l2_norm(ctx, fc1_gate, 1e-12f);
fc1_value = ggml_cont(ctx, ggml_torch_permute(ctx, fc1_value, 1, 0, 2, 3)); // [batch_size, hidden_size_x*mlp_ratio, hidden_size_x]
fc1_value = ggml_l2_norm(ctx, fc1_value, 1e-12f);
fc2 = ggml_cont(ctx, ggml_torch_permute(ctx, fc2, 1, 0, 2, 3)); // [batch_size, hidden_size_x, hidden_size_x*mlp_ratio]
fc2 = ggml_l2_norm(ctx, fc2, 1e-12f);
auto res_x = x;
x = norm->forward(ctx, x); // [batch_size, n_token, hidden_size_x]
auto x1 = ggml_mul_mat(ctx, fc1_gate, x); // [batch_size, n_token, hidden_size_x*mlp_ratio]
x1 = ggml_silu_inplace(ctx, x1);
auto x2 = ggml_mul_mat(ctx, fc1_value, x); // [batch_size, n_token, hidden_size_x*mlp_ratio]
x = ggml_mul_inplace(ctx, x1, x2); // [batch_size, n_token, hidden_size_x*mlp_ratio]
x = ggml_mul_mat(ctx, fc2, x); // [batch_size, n_token, hidden_size_x]
x = ggml_add_inplace(ctx, x, res_x);
return x;
}
};
struct NerfFinalLayer : public GGMLBlock {
NerfFinalLayer(int64_t hidden_size,
int64_t out_channels) {
blocks["norm"] = std::make_shared<RMSNorm>(hidden_size);
blocks["linear"] = std::make_shared<Linear>(hidden_size, out_channels);
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x) {
auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
x = norm->forward(ctx, x);
x = linear->forward(ctx, x);
return x;
}
};
struct NerfFinalLayerConv : public GGMLBlock {
NerfFinalLayerConv(int64_t hidden_size,
int64_t out_channels) {
blocks["norm"] = std::make_shared<RMSNorm>(hidden_size);
blocks["conv"] = std::make_shared<Conv2d>(hidden_size, out_channels, std::pair{3, 3}, std::pair{1, 1}, std::pair{1, 1});
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x) {
// x: [N, C, H, W]
auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
x = ggml_cont(ctx, ggml_torch_permute(ctx, x, 2, 0, 1, 3)); // [N, H, W, C]
x = norm->forward(ctx, x);
x = ggml_cont(ctx, ggml_torch_permute(ctx, x, 1, 2, 0, 3)); // [N, C, H, W]
x = conv->forward(ctx, x);
return x;
}
};
struct ChromaRadianceParams {
int64_t nerf_hidden_size = 64;
int64_t nerf_mlp_ratio = 4;
int64_t nerf_depth = 4;
int64_t nerf_max_freqs = 8;
};
struct FluxParams { struct FluxParams {
SDVersion version = VERSION_FLUX;
bool is_chroma = false;
int64_t patch_size = 2;
int64_t in_channels = 64; int64_t in_channels = 64;
int64_t out_channels = 64; int64_t out_channels = 64;
int64_t vec_in_dim = 768; int64_t vec_in_dim = 768;
@ -565,8 +693,8 @@ namespace Flux {
bool qkv_bias = true; bool qkv_bias = true;
bool guidance_embed = true; bool guidance_embed = true;
bool flash_attn = true; bool flash_attn = true;
bool is_chroma = false; int64_t in_dim = 64;
SDVersion version = VERSION_FLUX; ChromaRadianceParams chroma_radiance_params;
}; };
struct Flux : public GGMLBlock { struct Flux : public GGMLBlock {
@ -575,53 +703,89 @@ namespace Flux {
Flux() {} Flux() {}
Flux(FluxParams params) Flux(FluxParams params)
: params(params) { : params(params) {
blocks["img_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size, true)); if (params.version == VERSION_CHROMA_RADIANCE) {
if (params.is_chroma) { std::pair<int, int> kernel_size = {(int)params.patch_size, (int)params.patch_size};
blocks["distilled_guidance_layer"] = std::shared_ptr<GGMLBlock>(new ChromaApproximator(params.in_channels, params.hidden_size)); std::pair<int, int> stride = kernel_size;
blocks["img_in_patch"] = std::make_shared<Conv2d>(params.in_channels,
params.hidden_size,
kernel_size,
stride);
} else { } else {
blocks["time_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size)); blocks["img_in"] = std::make_shared<Linear>(params.in_channels, params.hidden_size, true);
blocks["vector_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(params.vec_in_dim, params.hidden_size)); }
if (params.is_chroma) {
blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(params.in_dim, params.hidden_size);
} else {
blocks["time_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size);
blocks["vector_in"] = std::make_shared<MLPEmbedder>(params.vec_in_dim, params.hidden_size);
if (params.guidance_embed) { if (params.guidance_embed) {
blocks["guidance_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size)); blocks["guidance_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size);
} }
} }
blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.context_in_dim, params.hidden_size, true)); blocks["txt_in"] = std::make_shared<Linear>(params.context_in_dim, params.hidden_size, true);
for (int i = 0; i < params.depth; i++) { for (int i = 0; i < params.depth; i++) {
blocks["double_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new DoubleStreamBlock(params.hidden_size, blocks["double_blocks." + std::to_string(i)] = std::make_shared<DoubleStreamBlock>(params.hidden_size,
params.num_heads, params.num_heads,
params.mlp_ratio, params.mlp_ratio,
i, i,
params.qkv_bias, params.qkv_bias,
params.flash_attn, params.flash_attn,
params.is_chroma)); params.is_chroma);
} }
for (int i = 0; i < params.depth_single_blocks; i++) { for (int i = 0; i < params.depth_single_blocks; i++) {
blocks["single_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new SingleStreamBlock(params.hidden_size, blocks["single_blocks." + std::to_string(i)] = std::make_shared<SingleStreamBlock>(params.hidden_size,
params.num_heads, params.num_heads,
params.mlp_ratio, params.mlp_ratio,
i, i,
0.f, 0.f,
params.flash_attn, params.flash_attn,
params.is_chroma)); params.is_chroma);
} }
blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new LastLayer(params.hidden_size, 1, params.out_channels, params.is_chroma)); if (params.version == VERSION_CHROMA_RADIANCE) {
blocks["nerf_image_embedder"] = std::make_shared<NerfEmbedder>(params.in_channels,
params.chroma_radiance_params.nerf_hidden_size,
params.chroma_radiance_params.nerf_max_freqs);
for (int i = 0; i < params.chroma_radiance_params.nerf_depth; i++) {
blocks["nerf_blocks." + std::to_string(i)] = std::make_shared<NerfGLUBlock>(params.hidden_size,
params.chroma_radiance_params.nerf_hidden_size,
params.chroma_radiance_params.nerf_mlp_ratio);
}
blocks["nerf_final_layer_conv"] = std::make_shared<NerfFinalLayerConv>(params.chroma_radiance_params.nerf_hidden_size,
params.in_channels);
} else {
blocks["final_layer"] = std::make_shared<LastLayer>(params.hidden_size, 1, params.out_channels, params.is_chroma);
}
}
struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx,
struct ggml_tensor* x) {
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w]
return x;
} }
struct ggml_tensor* patchify(struct ggml_context* ctx, struct ggml_tensor* patchify(struct ggml_context* ctx,
struct ggml_tensor* x, struct ggml_tensor* x) {
int64_t patch_size) {
// x: [N, C, H, W] // x: [N, C, H, W]
// return: [N, h*w, C * patch_size * patch_size] // return: [N, h*w, C * patch_size * patch_size]
int64_t N = x->ne[3]; int64_t N = x->ne[3];
int64_t C = x->ne[2]; int64_t C = x->ne[2];
int64_t H = x->ne[1]; int64_t H = x->ne[1];
int64_t W = x->ne[0]; int64_t W = x->ne[0];
int64_t p = patch_size; int64_t p = params.patch_size;
int64_t h = H / patch_size; int64_t h = H / params.patch_size;
int64_t w = W / patch_size; int64_t w = W / params.patch_size;
GGML_ASSERT(h * p == H && w * p == W); GGML_ASSERT(h * p == H && w * p == W);
@ -633,18 +797,25 @@ namespace Flux {
return x; return x;
} }
struct ggml_tensor* process_img(struct ggml_context* ctx,
struct ggml_tensor* x) {
// img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
x = pad_to_patch_size(ctx, x);
x = patchify(ctx, x);
return x;
}
struct ggml_tensor* unpatchify(struct ggml_context* ctx, struct ggml_tensor* unpatchify(struct ggml_context* ctx,
struct ggml_tensor* x, struct ggml_tensor* x,
int64_t h, int64_t h,
int64_t w, int64_t w) {
int64_t patch_size) {
// x: [N, h*w, C*patch_size*patch_size] // x: [N, h*w, C*patch_size*patch_size]
// return: [N, C, H, W] // return: [N, C, H, W]
int64_t N = x->ne[2]; int64_t N = x->ne[2];
int64_t C = x->ne[0] / patch_size / patch_size; int64_t C = x->ne[0] / params.patch_size / params.patch_size;
int64_t H = h * patch_size; int64_t H = h * params.patch_size;
int64_t W = w * patch_size; int64_t W = w * params.patch_size;
int64_t p = patch_size; int64_t p = params.patch_size;
GGML_ASSERT(C * p * p == x->ne[0]); GGML_ASSERT(C * p * p == x->ne[0]);
@ -671,7 +842,10 @@ namespace Flux {
auto txt_in = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]); auto txt_in = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
auto final_layer = std::dynamic_pointer_cast<LastLayer>(blocks["final_layer"]); auto final_layer = std::dynamic_pointer_cast<LastLayer>(blocks["final_layer"]);
img = img_in->forward(ctx, img); if (img_in) {
img = img_in->forward(ctx, img);
}
struct ggml_tensor* vec; struct ggml_tensor* vec;
struct ggml_tensor* txt_img_mask = nullptr; struct ggml_tensor* txt_img_mask = nullptr;
if (params.is_chroma) { if (params.is_chroma) {
@ -682,7 +856,7 @@ namespace Flux {
// auto mod_index_arange = ggml_arange(ctx, 0, (float)mod_index_length, 1); // auto mod_index_arange = ggml_arange(ctx, 0, (float)mod_index_length, 1);
// ggml_arange tot working on a lot of backends, precomputing it on CPU instead // ggml_arange tot working on a lot of backends, precomputing it on CPU instead
GGML_ASSERT(arange != nullptr); GGML_ASSERT(mod_index_arange != nullptr);
auto modulation_index = ggml_nn_timestep_embedding(ctx, mod_index_arange, 32, 10000, 1000.f); // [1, 344, 32] auto modulation_index = ggml_nn_timestep_embedding(ctx, mod_index_arange, 32, 10000, 1000.f); // [1, 344, 32]
// Batch broadcast (will it ever be useful) // Batch broadcast (will it ever be useful)
@ -749,52 +923,96 @@ namespace Flux {
txt_img->nb[2] * txt->ne[1]); // [n_img_token, N, hidden_size] txt_img->nb[2] * txt->ne[1]); // [n_img_token, N, hidden_size]
img = ggml_cont(ctx, ggml_permute(ctx, img, 0, 2, 1, 3)); // [N, n_img_token, hidden_size] img = ggml_cont(ctx, ggml_permute(ctx, img, 0, 2, 1, 3)); // [N, n_img_token, hidden_size]
img = final_layer->forward(ctx, img, vec); // (N, T, patch_size ** 2 * out_channels) if (final_layer) {
img = final_layer->forward(ctx, img, vec); // (N, T, patch_size ** 2 * out_channels)
}
return img; return img;
} }
struct ggml_tensor* process_img(struct ggml_context* ctx, struct ggml_tensor* forward_chroma_radiance(struct ggml_context* ctx,
struct ggml_tensor* x) { ggml_backend_t backend,
int64_t W = x->ne[0]; struct ggml_tensor* x,
int64_t H = x->ne[1]; struct ggml_tensor* timestep,
int64_t patch_size = 2; struct ggml_tensor* context,
int pad_h = (patch_size - H % patch_size) % patch_size; struct ggml_tensor* c_concat,
int pad_w = (patch_size - W % patch_size) % patch_size; struct ggml_tensor* y,
x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] struct ggml_tensor* guidance,
struct ggml_tensor* pe,
// img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size) struct ggml_tensor* mod_index_arange = nullptr,
auto img = patchify(ctx, x, patch_size); // [N, h*w, C * patch_size * patch_size] struct ggml_tensor* dct = nullptr,
return img; std::vector<ggml_tensor*> ref_latents = {},
} std::vector<int> skip_layers = {}) {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* timestep,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
struct ggml_tensor* guidance,
struct ggml_tensor* pe,
struct ggml_tensor* mod_index_arange = nullptr,
std::vector<ggml_tensor*> ref_latents = {},
std::vector<int> skip_layers = {}) {
// Forward pass of DiT.
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
// timestep: (N,) tensor of diffusion timesteps
// context: (N, L, D)
// c_concat: nullptr, or for (N,C+M, H, W) for Fill
// y: (N, adm_in_channels) tensor of class labels
// guidance: (N,)
// pe: (L, d_head/2, 2, 2)
// return: (N, C, H, W)
GGML_ASSERT(x->ne[3] == 1); GGML_ASSERT(x->ne[3] == 1);
int64_t W = x->ne[0]; int64_t W = x->ne[0];
int64_t H = x->ne[1]; int64_t H = x->ne[1];
int64_t C = x->ne[2]; int64_t C = x->ne[2];
int64_t patch_size = 2; int64_t patch_size = params.patch_size;
int pad_h = (patch_size - H % patch_size) % patch_size;
int pad_w = (patch_size - W % patch_size) % patch_size;
auto img = pad_to_patch_size(ctx, x);
auto orig_img = img;
auto img_in_patch = std::dynamic_pointer_cast<Conv2d>(blocks["img_in_patch"]);
img = img_in_patch->forward(ctx, img); // [N, hidden_size, H/patch_size, W/patch_size]
img = ggml_reshape_3d(ctx, img, img->ne[0] * img->ne[1], img->ne[2], img->ne[3]); // [N, hidden_size, H/patch_size*W/patch_size]
img = ggml_cont(ctx, ggml_torch_permute(ctx, img, 1, 0, 2, 3)); // [N, H/patch_size*W/patch_size, hidden_size]
auto out = forward_orig(ctx, backend, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers); // [N, n_img_token, hidden_size]
// nerf decode
auto nerf_image_embedder = std::dynamic_pointer_cast<NerfEmbedder>(blocks["nerf_image_embedder"]);
auto nerf_final_layer_conv = std::dynamic_pointer_cast<NerfFinalLayerConv>(blocks["nerf_final_layer_conv"]);
auto nerf_pixels = patchify(ctx, orig_img); // [N, num_patches, C * patch_size * patch_size]
int64_t num_patches = nerf_pixels->ne[1];
nerf_pixels = ggml_reshape_3d(ctx,
nerf_pixels,
nerf_pixels->ne[0] / C,
C,
nerf_pixels->ne[1] * nerf_pixels->ne[2]); // [N*num_patches, C, patch_size*patch_size]
nerf_pixels = ggml_cont(ctx, ggml_torch_permute(ctx, nerf_pixels, 1, 0, 2, 3)); // [N*num_patches, patch_size*patch_size, C]
auto nerf_hidden = ggml_reshape_2d(ctx, out, out->ne[0], out->ne[1] * out->ne[2]); // [N*num_patches, hidden_size]
auto img_dct = nerf_image_embedder->forward(ctx, nerf_pixels, dct); // [N*num_patches, patch_size*patch_size, nerf_hidden_size]
for (int i = 0; i < params.chroma_radiance_params.nerf_depth; i++) {
auto block = std::dynamic_pointer_cast<NerfGLUBlock>(blocks["nerf_blocks." + std::to_string(i)]);
img_dct = block->forward(ctx, img_dct, nerf_hidden);
}
img_dct = ggml_cont(ctx, ggml_torch_permute(ctx, img_dct, 1, 0, 2, 3)); // [N*num_patches, nerf_hidden_size, patch_size*patch_size]
img_dct = ggml_reshape_3d(ctx, img_dct, img_dct->ne[0] * img_dct->ne[1], num_patches, img_dct->ne[2] / num_patches); // [N, num_patches, nerf_hidden_size*patch_size*patch_size]
img_dct = unpatchify(ctx, img_dct, (H + pad_h) / patch_size, (W + pad_w) / patch_size); // [N, nerf_hidden_size, H, W]
out = nerf_final_layer_conv->forward(ctx, img_dct); // [N, C, H, W]
return out;
}
struct ggml_tensor* forward_flux_chroma(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* timestep,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
struct ggml_tensor* guidance,
struct ggml_tensor* pe,
struct ggml_tensor* mod_index_arange = nullptr,
struct ggml_tensor* dct = nullptr,
std::vector<ggml_tensor*> ref_latents = {},
std::vector<int> skip_layers = {}) {
GGML_ASSERT(x->ne[3] == 1);
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int64_t C = x->ne[2];
int64_t patch_size = params.patch_size;
int pad_h = (patch_size - H % patch_size) % patch_size; int pad_h = (patch_size - H % patch_size) % patch_size;
int pad_w = (patch_size - W % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size;
@ -816,21 +1034,16 @@ namespace Flux {
ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
ggml_tensor* control = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1)); ggml_tensor* control = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1));
masked = ggml_pad(ctx, masked, pad_w, pad_h, 0, 0); masked = process_img(ctx, masked);
mask = ggml_pad(ctx, mask, pad_w, pad_h, 0, 0); mask = process_img(ctx, mask);
control = ggml_pad(ctx, control, pad_w, pad_h, 0, 0); control = process_img(ctx, control);
masked = patchify(ctx, masked, patch_size);
mask = patchify(ctx, mask, patch_size);
control = patchify(ctx, control, patch_size);
img = ggml_concat(ctx, img, ggml_concat(ctx, ggml_concat(ctx, masked, mask, 0), control, 0), 0); img = ggml_concat(ctx, img, ggml_concat(ctx, ggml_concat(ctx, masked, mask, 0), control, 0), 0);
} else if (params.version == VERSION_FLUX_CONTROLS) { } else if (params.version == VERSION_FLUX_CONTROLS) {
GGML_ASSERT(c_concat != nullptr); GGML_ASSERT(c_concat != nullptr);
ggml_tensor* control = ggml_pad(ctx, c_concat, pad_w, pad_h, 0, 0); auto control = process_img(ctx, c_concat);
control = patchify(ctx, control, patch_size); img = ggml_concat(ctx, img, control, 0);
img = ggml_concat(ctx, img, control, 0);
} }
if (ref_latents.size() > 0) { if (ref_latents.size() > 0) {
@ -849,10 +1062,63 @@ namespace Flux {
} }
// rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2) // rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)
out = unpatchify(ctx, out, (H + pad_h) / patch_size, (W + pad_w) / patch_size, patch_size); // [N, C, H + pad_h, W + pad_w] out = unpatchify(ctx, out, (H + pad_h) / patch_size, (W + pad_w) / patch_size); // [N, C, H + pad_h, W + pad_w]
return out; return out;
} }
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* timestep,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
struct ggml_tensor* guidance,
struct ggml_tensor* pe,
struct ggml_tensor* mod_index_arange = nullptr,
struct ggml_tensor* dct = nullptr,
std::vector<ggml_tensor*> ref_latents = {},
std::vector<int> skip_layers = {}) {
// Forward pass of DiT.
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
// timestep: (N,) tensor of diffusion timesteps
// context: (N, L, D)
// c_concat: nullptr, or for (N,C+M, H, W) for Fill
// y: (N, adm_in_channels) tensor of class labels
// guidance: (N,)
// pe: (L, d_head/2, 2, 2)
// return: (N, C, H, W)
if (params.version == VERSION_CHROMA_RADIANCE) {
return forward_chroma_radiance(ctx,
backend,
x,
timestep,
context,
c_concat,
y,
guidance,
pe,
mod_index_arange,
dct,
ref_latents,
skip_layers);
} else {
return forward_flux_chroma(ctx,
backend,
x,
timestep,
context,
c_concat,
y,
guidance,
pe,
mod_index_arange,
dct,
ref_latents,
skip_layers);
}
}
}; };
struct FluxRunner : public GGMLRunner { struct FluxRunner : public GGMLRunner {
@ -860,7 +1126,8 @@ namespace Flux {
FluxParams flux_params; FluxParams flux_params;
Flux flux; Flux flux;
std::vector<float> pe_vec; std::vector<float> pe_vec;
std::vector<float> mod_index_arange_vec; // for cache std::vector<float> mod_index_arange_vec;
std::vector<float> dct_vec;
SDVersion version; SDVersion version;
bool use_mask = false; bool use_mask = false;
@ -883,6 +1150,9 @@ namespace Flux {
flux_params.in_channels = 128; flux_params.in_channels = 128;
} else if (version == VERSION_FLEX_2) { } else if (version == VERSION_FLEX_2) {
flux_params.in_channels = 196; flux_params.in_channels = 196;
} else if (version == VERSION_CHROMA_RADIANCE) {
flux_params.in_channels = 3;
flux_params.patch_size = 16;
} }
for (auto pair : tensor_types) { for (auto pair : tensor_types) {
std::string tensor_name = pair.first; std::string tensor_name = pair.first;
@ -933,6 +1203,56 @@ namespace Flux {
flux.get_param_tensors(tensors, prefix); flux.get_param_tensors(tensors, prefix);
} }
std::vector<float> fetch_dct_pos(int patch_size, int max_freqs) {
const float PI = 3.14159265358979323846f;
std::vector<float> pos(patch_size);
for (int i = 0; i < patch_size; ++i) {
pos[i] = static_cast<float>(i) / static_cast<float>(patch_size - 1);
}
std::vector<float> pos_x(patch_size * patch_size);
std::vector<float> pos_y(patch_size * patch_size);
for (int i = 0; i < patch_size; ++i) {
for (int j = 0; j < patch_size; ++j) {
pos_x[i * patch_size + j] = pos[j];
pos_y[i * patch_size + j] = pos[i];
}
}
std::vector<float> freqs(max_freqs);
for (int i = 0; i < max_freqs; ++i) {
freqs[i] = static_cast<float>(i);
}
std::vector<float> coeffs(max_freqs * max_freqs);
for (int fx = 0; fx < max_freqs; ++fx) {
for (int fy = 0; fy < max_freqs; ++fy) {
coeffs[fx * max_freqs + fy] = 1.0f / (1.0f + freqs[fx] * freqs[fy]);
}
}
int num_positions = patch_size * patch_size;
int num_features = max_freqs * max_freqs;
std::vector<float> dct(num_positions * num_features);
for (int p = 0; p < num_positions; ++p) {
float px = pos_x[p];
float py = pos_y[p];
for (int fx = 0; fx < max_freqs; ++fx) {
float cx = std::cos(px * freqs[fx] * PI);
for (int fy = 0; fy < max_freqs; ++fy) {
float cy = std::cos(py * freqs[fy] * PI);
float val = cx * cy * coeffs[fx * max_freqs + fy];
dct[p * num_features + (fx * max_freqs + fy)] = val;
}
}
}
return dct;
}
struct ggml_cgraph* build_graph(struct ggml_tensor* x, struct ggml_cgraph* build_graph(struct ggml_tensor* x,
struct ggml_tensor* timesteps, struct ggml_tensor* timesteps,
struct ggml_tensor* context, struct ggml_tensor* context,
@ -946,6 +1266,7 @@ namespace Flux {
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false); struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
struct ggml_tensor* mod_index_arange = nullptr; struct ggml_tensor* mod_index_arange = nullptr;
struct ggml_tensor* dct = nullptr; // for chroma radiance
x = to_backend(x); x = to_backend(x);
context = to_backend(context); context = to_backend(context);
@ -976,7 +1297,7 @@ namespace Flux {
pe_vec = Rope::gen_flux_pe(x->ne[1], pe_vec = Rope::gen_flux_pe(x->ne[1],
x->ne[0], x->ne[0],
2, flux_params.patch_size,
x->ne[3], x->ne[3],
context->ne[1], context->ne[1],
ref_latents, ref_latents,
@ -991,6 +1312,17 @@ namespace Flux {
// pe->data = nullptr; // pe->data = nullptr;
set_backend_tensor_data(pe, pe_vec.data()); set_backend_tensor_data(pe, pe_vec.data());
if (version == VERSION_CHROMA_RADIANCE) {
int64_t patch_size = flux_params.patch_size;
int64_t nerf_max_freqs = flux_params.chroma_radiance_params.nerf_max_freqs;
dct_vec = fetch_dct_pos(patch_size, nerf_max_freqs);
dct = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, nerf_max_freqs * nerf_max_freqs, patch_size * patch_size);
// dct->data = dct_vec.data();
// print_ggml_tensor(dct);
// dct->data = nullptr;
set_backend_tensor_data(dct, dct_vec.data());
}
struct ggml_tensor* out = flux.forward(compute_ctx, struct ggml_tensor* out = flux.forward(compute_ctx,
runtime_backend, runtime_backend,
x, x,
@ -1001,6 +1333,7 @@ namespace Flux {
guidance, guidance,
pe, pe,
mod_index_arange, mod_index_arange,
dct,
ref_latents, ref_latents,
skip_layers); skip_layers);
@ -1035,7 +1368,7 @@ namespace Flux {
void test() { void test() {
struct ggml_init_params params; struct ggml_init_params params;
params.mem_size = static_cast<size_t>(20 * 1024 * 1024); // 20 MB params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB
params.mem_buffer = nullptr; params.mem_buffer = nullptr;
params.no_alloc = false; params.no_alloc = false;
@ -1046,22 +1379,25 @@ namespace Flux {
// cpu f16: // cpu f16:
// cuda f16: nan // cuda f16: nan
// cuda q8_0: pass // cuda q8_0: pass
auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1); // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1);
ggml_set_f32(x, 0.01f); // ggml_set_f32(x, 0.01f);
auto x = load_tensor_from_file(work_ctx, "chroma_x.bin");
// print_ggml_tensor(x); // print_ggml_tensor(x);
std::vector<float> timesteps_vec(1, 999.f); std::vector<float> timesteps_vec(1, 1.f);
auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
std::vector<float> guidance_vec(1, 3.5f); std::vector<float> guidance_vec(1, 0.f);
auto guidance = vector_to_ggml_tensor(work_ctx, guidance_vec); auto guidance = vector_to_ggml_tensor(work_ctx, guidance_vec);
auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 4096, 256, 1); // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 4096, 256, 1);
ggml_set_f32(context, 0.01f); // ggml_set_f32(context, 0.01f);
auto context = load_tensor_from_file(work_ctx, "chroma_context.bin");
// print_ggml_tensor(context); // print_ggml_tensor(context);
auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, 1); // auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, 1);
ggml_set_f32(y, 0.01f); // ggml_set_f32(y, 0.01f);
auto y = nullptr;
// print_ggml_tensor(y); // print_ggml_tensor(y);
struct ggml_tensor* out = nullptr; struct ggml_tensor* out = nullptr;
@@ -1076,32 +1412,44 @@ namespace Flux {
    }
    static void load_from_file_and_test(const std::string& file_path) {
        // ggml_backend_t backend = ggml_backend_cuda_init(0);
        ggml_backend_t backend    = ggml_backend_cpu_init();
        ggml_type model_data_type = GGML_TYPE_Q8_0;
-       std::shared_ptr<FluxRunner> flux = std::make_shared<FluxRunner>(backend, false);
-       {
-           LOG_INFO("loading from '%s'", file_path.c_str());
-           flux->alloc_params_buffer();
-           std::map<std::string, ggml_tensor*> tensors;
-           flux->get_param_tensors(tensors, "model.diffusion_model");
-           ModelLoader model_loader;
-           if (!model_loader.init_from_file(file_path, "model.diffusion_model.")) {
-               LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
-               return;
-           }
-           bool success = model_loader.load_tensors(tensors);
-           if (!success) {
-               LOG_ERROR("load tensors from model loader failed");
-               return;
-           }
-           LOG_INFO("flux model loaded");
-       }
+       ModelLoader model_loader;
+       if (!model_loader.init_from_file(file_path, "model.diffusion_model.")) {
+           LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
+           return;
+       }
+       auto tensor_types = model_loader.tensor_storages_types;
+       for (auto& item : tensor_types) {
+           // LOG_DEBUG("%s %u", item.first.c_str(), item.second);
+           if (ends_with(item.first, "weight")) {
+               // item.second = model_data_type;
+           }
+       }
+       std::shared_ptr<FluxRunner> flux = std::make_shared<FluxRunner>(backend,
+                                                                       false,
+                                                                       tensor_types,
+                                                                       "model.diffusion_model",
+                                                                       VERSION_CHROMA_RADIANCE,
+                                                                       false,
+                                                                       true);
+       flux->alloc_params_buffer();
+       std::map<std::string, ggml_tensor*> tensors;
+       flux->get_param_tensors(tensors, "model.diffusion_model");
+       bool success = model_loader.load_tensors(tensors);
+       if (!success) {
+           LOG_ERROR("load tensors from model loader failed");
+           return;
+       }
+       LOG_INFO("flux model loaded");
        flux->test();
    }
};


@@ -954,7 +954,16 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx,
    if (scale != 1.f) {
        x = ggml_scale(ctx, x, scale);
    }
-   x = ggml_mul_mat(ctx, w, x);
+   if (x->ne[2] * x->ne[3] > 1024) {
+       // workaround: avoid ggml cuda error
+       int64_t ne2 = x->ne[2];
+       int64_t ne3 = x->ne[3];
+       x = ggml_reshape_2d(ctx, x, x->ne[0], x->ne[1] * x->ne[2] * x->ne[3]);
+       x = ggml_mul_mat(ctx, w, x);
+       x = ggml_reshape_4d(ctx, x, x->ne[0], x->ne[1] / ne2 / ne3, ne2, ne3);
+   } else {
+       x = ggml_mul_mat(ctx, w, x);
+   }
    if (force_prec_f32) {
        ggml_mul_mat_set_prec(x, GGML_PREC_F32);
    }
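A note on the hunk above: it works around a ggml CUDA failure in ggml_mul_mat by flattening the extra batch dimensions of the activation to 2-D for the multiply and restoring them afterwards. Below is a minimal standalone sketch of the same flatten-multiply-restore pattern; the tensor sizes are illustrative and the 1024 threshold is simply copied from the hunk, not a documented ggml limit.

#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = {/*mem_size*/ 64 * 1024 * 1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false};
    struct ggml_context* ctx       = ggml_init(params);

    // w: [in=8, out=4], x: [in=8, rows=5, 32, 64]; ne[2]*ne[3] = 2048 > 1024 would trigger the workaround path
    struct ggml_tensor* w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    struct ggml_tensor* x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 8, 5, 32, 64);
    ggml_set_f32(w, 0.5f);
    ggml_set_f32(x, 0.01f);

    int64_t ne2 = x->ne[2];
    int64_t ne3 = x->ne[3];
    struct ggml_tensor* x2d = ggml_reshape_2d(ctx, x, x->ne[0], x->ne[1] * ne2 * ne3);  // collapse batch dims
    struct ggml_tensor* y   = ggml_mul_mat(ctx, w, x2d);                                // [out, rows*ne2*ne3]
    y = ggml_reshape_4d(ctx, y, y->ne[0], y->ne[1] / ne2 / ne3, ne2, ne3);              // restore batch dims

    struct ggml_cgraph* gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, 1);

    printf("out: %lld x %lld x %lld x %lld\n",
           (long long)y->ne[0], (long long)y->ne[1], (long long)y->ne[2], (long long)y->ne[3]);
    ggml_free(ctx);
    return 0;
}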


@@ -330,6 +330,10 @@ std::string convert_cond_model_name(const std::string& name) {
        return new_name;
    }
+   if (new_name == "model.text_projection.weight") {
+       new_name = "transformer.text_model.text_projection";
+   }
    if (open_clip_to_hf_clip_model.find(new_name) != open_clip_to_hf_clip_model.end()) {
        new_name = open_clip_to_hf_clip_model[new_name];
    }
@@ -623,6 +627,14 @@ std::string convert_tensor_name(std::string name) {
    if (starts_with(name, "diffusion_model")) {
        name = "model." + name;
    }
+   if (starts_with(name, "model.diffusion_model.up_blocks.0.attentions.0.")) {
+       name.replace(0, sizeof("model.diffusion_model.up_blocks.0.attentions.0.") - 1,
+                    "model.diffusion_model.output_blocks.0.1.");
+   }
+   if (starts_with(name, "model.diffusion_model.up_blocks.0.attentions.1.")) {
+       name.replace(0, sizeof("model.diffusion_model.up_blocks.0.attentions.1.") - 1,
+                    "model.diffusion_model.output_blocks.1.1.");
+   }
    // size_t pos = name.find("lora_A");
    // if (pos != std::string::npos) {
    //     name.replace(pos, strlen("lora_A"), "lora_up");
@@ -1766,7 +1778,6 @@ bool ModelLoader::model_is_unet() {
SDVersion ModelLoader::get_sd_version() {
    TensorStorage token_embedding_weight, input_block_weight;
-   bool input_block_checked   = false;
    bool has_multiple_encoders = false;
    bool is_unet               = false;
@@ -1776,14 +1787,15 @@ SDVersion ModelLoader::get_sd_version() {
    bool is_wan                      = false;
    int64_t patch_embedding_channels = 0;
    bool has_img_emb                 = false;
+   bool has_middle_block_1          = false;
    for (auto& tensor_storage : tensor_storages) {
-       if (!(is_xl || is_flux)) {
+       if (!(is_xl)) {
            if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) {
                is_flux = true;
-               if (input_block_checked) {
-                   break;
-               }
+           }
+           if (tensor_storage.name.find("model.diffusion_model.nerf_final_layer_conv.") != std::string::npos) {
+               return VERSION_CHROMA_RADIANCE;
            }
            if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) {
                return VERSION_SD3;
@@ -1800,28 +1812,29 @@ SDVersion ModelLoader::get_sd_version() {
            if (tensor_storage.name.find("model.diffusion_model.img_emb") != std::string::npos) {
                has_img_emb = true;
            }
-           if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos || tensor_storage.name.find("unet.down_blocks.") != std::string::npos) {
+           if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos ||
+               tensor_storage.name.find("unet.down_blocks.") != std::string::npos) {
                is_unet = true;
                if (has_multiple_encoders) {
                    is_xl = true;
-                   if (input_block_checked) {
-                       break;
-                   }
                }
            }
-           if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos || tensor_storage.name.find("cond_stage_model.1") != std::string::npos || tensor_storage.name.find("te.1") != std::string::npos) {
+           if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos ||
+               tensor_storage.name.find("cond_stage_model.1") != std::string::npos ||
+               tensor_storage.name.find("te.1") != std::string::npos) {
                has_multiple_encoders = true;
                if (is_unet) {
                    is_xl = true;
-                   if (input_block_checked) {
-                       break;
-                   }
                }
            }
            if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) {
                return VERSION_SVD;
            }
        }
+       if (tensor_storage.name.find("model.diffusion_model.middle_block.1.") != std::string::npos ||
+           tensor_storage.name.find("unet.mid_block.resnets.1.") != std::string::npos) {
+           has_middle_block_1 = true;
+       }
        if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
            tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
            tensor_storage.name == "text_model.embeddings.token_embedding.weight" ||
@@ -1831,12 +1844,10 @@ SDVersion ModelLoader::get_sd_version() {
            token_embedding_weight = tensor_storage;
            // break;
        }
-       if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight" || tensor_storage.name == "model.diffusion_model.img_in.weight" || tensor_storage.name == "unet.conv_in.weight") {
+       if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight" ||
+           tensor_storage.name == "model.diffusion_model.img_in.weight" ||
+           tensor_storage.name == "unet.conv_in.weight") {
            input_block_weight = tensor_storage;
-           input_block_checked = true;
-           if (is_xl || is_flux) {
-               break;
-           }
        }
    }
    if (is_wan) {
@@ -1858,6 +1869,9 @@ SDVersion ModelLoader::get_sd_version() {
        if (is_ip2p) {
            return VERSION_SDXL_PIX2PIX;
        }
+       if (!has_middle_block_1) {
+           return VERSION_SDXL_SSD1B;
+       }
        return VERSION_SDXL;
    }
@@ -1881,6 +1895,9 @@ SDVersion ModelLoader::get_sd_version() {
        if (is_ip2p) {
            return VERSION_SD1_PIX2PIX;
        }
+       if (!has_middle_block_1) {
+           return VERSION_SD1_TINY_UNET;
+       }
        return VERSION_SD1;
    } else if (token_embedding_weight.ne[0] == 1024) {
        if (is_inpaint) {
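One point worth spelling out about the detection changes above: SSD-1B checkpoints (and the tiny SD1 U-Nets) are recognized purely by the absence of any middle_block.1 / mid_block.resnets.1 tensor. A rough standalone illustration of that heuristic follows; has_full_middle_block is a made-up helper for this sketch, not part of the loader.

#include <string>
#include <vector>

// Hypothetical: classify a checkpoint's U-Net from its tensor names alone.
static bool has_full_middle_block(const std::vector<std::string>& tensor_names) {
    for (const auto& name : tensor_names) {
        if (name.find("model.diffusion_model.middle_block.1.") != std::string::npos ||
            name.find("unet.mid_block.resnets.1.") != std::string::npos) {
            return true;  // second middle-block entry present -> regular SD1/SDXL U-Net
        }
    }
    return false;  // missing -> SSD-1B (SDXL weights) or tiny-sd (SD1 weights)
}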

model.h

@@ -23,17 +23,20 @@ enum SDVersion {
    VERSION_SD1,
    VERSION_SD1_INPAINT,
    VERSION_SD1_PIX2PIX,
+   VERSION_SD1_TINY_UNET,
    VERSION_SD2,
    VERSION_SD2_INPAINT,
    VERSION_SDXL,
    VERSION_SDXL_INPAINT,
    VERSION_SDXL_PIX2PIX,
+   VERSION_SDXL_SSD1B,
    VERSION_SVD,
    VERSION_SD3,
    VERSION_FLUX,
    VERSION_FLUX_FILL,
    VERSION_FLUX_CONTROLS,
    VERSION_FLEX_2,
+   VERSION_CHROMA_RADIANCE,
    VERSION_WAN2,
    VERSION_WAN2_2_I2V,
    VERSION_WAN2_2_TI2V,
@@ -42,7 +45,7 @@ enum SDVersion {
};
static inline bool sd_version_is_sd1(SDVersion version) {
-   if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX) {
+   if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET) {
        return true;
    }
    return false;
@@ -56,7 +59,7 @@ static inline bool sd_version_is_sd2(SDVersion version) {
}
static inline bool sd_version_is_sdxl(SDVersion version) {
-   if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX) {
+   if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX || version == VERSION_SDXL_SSD1B) {
        return true;
    }
    return false;
@@ -70,7 +73,7 @@ static inline bool sd_version_is_sd3(SDVersion version) {
}
static inline bool sd_version_is_flux(SDVersion version) {
-   if (version == VERSION_FLUX || version == VERSION_FLUX_FILL || version == VERSION_FLUX_CONTROLS || version == VERSION_FLEX_2) {
+   if (version == VERSION_FLUX ||
+       version == VERSION_FLUX_FILL ||
+       version == VERSION_FLUX_CONTROLS ||
+       version == VERSION_FLEX_2 ||
+       version == VERSION_CHROMA_RADIANCE) {
        return true;
    }
    return false;


@@ -649,7 +649,7 @@ namespace Qwen {
    static void load_from_file_and_test(const std::string& file_path) {
        // cuda q8: pass
-       // cuda q8 fa: nan
+       // cuda q8 fa: pass
        // ggml_backend_t backend = ggml_backend_cuda_init(0);
        ggml_backend_t backend    = ggml_backend_cpu_init();
        ggml_type model_data_type = GGML_TYPE_Q8_0;


@@ -28,17 +28,20 @@ const char* model_version_to_str[] = {
    "SD 1.x",
    "SD 1.x Inpaint",
    "Instruct-Pix2Pix",
+   "SD 1.x Tiny UNet",
    "SD 2.x",
    "SD 2.x Inpaint",
    "SDXL",
    "SDXL Inpaint",
    "SDXL Instruct-Pix2Pix",
+   "SDXL (SSD1B)",
    "SVD",
    "SD3.x",
    "Flux",
    "Flux Fill",
    "Flux Control",
    "Flex.2",
+   "Chroma Radiance",
    "Wan 2.x",
    "Wan 2.2 I2V",
    "Wan 2.2 TI2V",
@@ -492,6 +495,9 @@ public:
                version);
            first_stage_model->alloc_params_buffer();
            first_stage_model->get_param_tensors(tensors, "first_stage_model");
+       } else if (version == VERSION_CHROMA_RADIANCE) {
+           first_stage_model = std::make_shared<FakeVAE>(vae_backend,
+                                                         offload_params_to_cpu);
        } else if (!use_tiny_autoencoder) {
            first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend,
                                                                offload_params_to_cpu,
@@ -1039,7 +1045,7 @@ public:
        struct ggml_tensor* c_concat = nullptr;
        {
            if (zero_out_masked) {
-               c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 4, 1);
+               c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / get_vae_scale_factor(), height / get_vae_scale_factor(), 4, 1);
                ggml_set_f32(c_concat, 0.f);
            } else {
                ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
@@ -1373,6 +1379,53 @@ public:
        return x;
    }
int get_vae_scale_factor() {
int vae_scale_factor = 8;
if (version == VERSION_WAN2_2_TI2V) {
vae_scale_factor = 16;
} else if (version == VERSION_CHROMA_RADIANCE) {
vae_scale_factor = 1;
}
return vae_scale_factor;
}
int get_latent_channel() {
int latent_channel = 4;
if (sd_version_is_dit(version)) {
if (version == VERSION_WAN2_2_TI2V) {
latent_channel = 48;
} else if (version == VERSION_CHROMA_RADIANCE) {
latent_channel = 3;
} else {
latent_channel = 16;
}
}
return latent_channel;
}
ggml_tensor* generate_init_latent(ggml_context* work_ctx,
int width,
int height,
int frames = 1,
bool video = false) {
int vae_scale_factor = get_vae_scale_factor();
int W = width / vae_scale_factor;
int H = height / vae_scale_factor;
int T = frames;
if (sd_version_is_wan(version)) {
T = ((T - 1) / 4) + 1;
}
int C = get_latent_channel();
ggml_tensor* init_latent;
if (video) {
init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, T, C);
} else {
init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
}
ggml_set_f32(init_latent, shift_factor);
return init_latent;
}
    void process_latent_in(ggml_tensor* latent) {
        if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
            GGML_ASSERT(latent->ne[3] == 16 || latent->ne[3] == 48);
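The new get_vae_scale_factor() / get_latent_channel() helpers above replace per-version branches that used to be repeated throughout the generation code. The sketch below mirrors their logic for a handful of versions to show how the init-latent shape falls out of them; the Ver enum and the covered cases are a simplified stand-in for SDVersion, not the project's actual API.

#include <cstdio>
#include <initializer_list>

enum class Ver { SD1, SDXL, Flux, ChromaRadiance, Wan22TI2V };

static int vae_scale_factor(Ver v) {
    if (v == Ver::Wan22TI2V) return 16;
    if (v == Ver::ChromaRadiance) return 1;  // Chroma1-Radiance works directly in pixel space
    return 8;
}

static int latent_channels(Ver v) {
    switch (v) {
        case Ver::Wan22TI2V: return 48;
        case Ver::ChromaRadiance: return 3;  // plain RGB, no VAE latent
        case Ver::Flux: return 16;
        default: return 4;                   // SD1/SDXL
    }
}

int main() {
    const int width = 1024, height = 1024;
    for (Ver v : {Ver::SD1, Ver::Flux, Ver::ChromaRadiance}) {
        int s = vae_scale_factor(v);
        printf("W=%d H=%d C=%d\n", width / s, height / s, latent_channels(v));
    }
    return 0;
}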
@@ -1408,6 +1461,8 @@ public:
                }
            }
        }
+   } else if (version == VERSION_CHROMA_RADIANCE) {
+       // pass
    } else {
        ggml_tensor_iter(latent, [&](ggml_tensor* latent, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
            float value = ggml_tensor_get_f32(latent, i0, i1, i2, i3);
@@ -1452,6 +1507,8 @@ public:
                }
            }
        }
+   } else if (version == VERSION_CHROMA_RADIANCE) {
+       // pass
    } else {
        ggml_tensor_iter(latent, [&](ggml_tensor* latent, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
            float value = ggml_tensor_get_f32(latent, i0, i1, i2, i3);
@@ -1493,11 +1550,11 @@ public:
    ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) {
        int64_t t0          = ggml_time_ms();
        ggml_tensor* result = nullptr;
-       int W = x->ne[0] / 8;
-       int H = x->ne[1] / 8;
+       int W = x->ne[0] / get_vae_scale_factor();
+       int H = x->ne[1] / get_vae_scale_factor();
+       int C = get_latent_channel();
        if (vae_tiling_params.enabled && !encode_video) {
            // TODO wan2.2 vae support?
-           int C = sd_version_is_dit(version) ? 16 : 4;
            int ne2;
            int ne3;
            if (sd_version_is_qwen_image(version)) {
@@ -1584,7 +1641,10 @@ public:
    ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* vae_output) {
        ggml_tensor* latent;
-       if (use_tiny_autoencoder || sd_version_is_qwen_image(version) || sd_version_is_wan(version)) {
+       if (use_tiny_autoencoder ||
+           sd_version_is_qwen_image(version) ||
+           sd_version_is_wan(version) ||
+           version == VERSION_CHROMA_RADIANCE) {
            latent = vae_output;
        } else if (version == VERSION_SD1_PIX2PIX) {
            latent = ggml_view_3d(work_ctx,
@@ -1611,18 +1671,14 @@ public:
    }
    ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode_video = false) {
-       int64_t W = x->ne[0] * 8;
-       int64_t H = x->ne[1] * 8;
+       int64_t W = x->ne[0] * get_vae_scale_factor();
+       int64_t H = x->ne[1] * get_vae_scale_factor();
        int64_t C           = 3;
        ggml_tensor* result = nullptr;
        if (decode_video) {
            int T = x->ne[2];
            if (sd_version_is_wan(version)) {
                T = ((T - 1) * 4) + 1;
-               if (version == VERSION_WAN2_2_TI2V) {
-                   W = x->ne[0] * 16;
-                   H = x->ne[1] * 16;
-               }
            }
            result = ggml_new_tensor_4d(work_ctx,
                                        GGML_TYPE_F32,
@@ -2233,16 +2289,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
    // Sample
    std::vector<struct ggml_tensor*> final_latents;  // collect latents to decode
-   int C = 4;
-   if (sd_version_is_sd3(sd_ctx->sd->version)) {
-       C = 16;
-   } else if (sd_version_is_flux(sd_ctx->sd->version)) {
-       C = 16;
-   } else if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
-       C = 16;
-   }
-   int W = width / 8;
-   int H = height / 8;
+   int C = sd_ctx->sd->get_latent_channel();
+   int W = width / sd_ctx->sd->get_vae_scale_factor();
+   int H = height / sd_ctx->sd->get_vae_scale_factor();
    LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
    struct ggml_tensor* control_latent = nullptr;
@@ -2420,51 +2469,11 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
    return result_images;
}
-ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx,
-                                  ggml_context* work_ctx,
-                                  int width,
-                                  int height,
-                                  int frames = 1,
-                                  bool video = false) {
-    int C = 4;
-    int T = frames;
-    int W = width / 8;
-    int H = height / 8;
-    if (sd_version_is_sd3(sd_ctx->sd->version)) {
-        C = 16;
-    } else if (sd_version_is_flux(sd_ctx->sd->version)) {
-        C = 16;
-    } else if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
-        C = 16;
-    } else if (sd_version_is_wan(sd_ctx->sd->version)) {
-        C = 16;
-        T = ((T - 1) / 4) + 1;
-        if (sd_ctx->sd->version == VERSION_WAN2_2_TI2V) {
-            C = 48;
-            W = width / 16;
-            H = height / 16;
-        }
-    }
-    ggml_tensor* init_latent;
-    if (video) {
-        init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, T, C);
-    } else {
-        init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
-    }
-    if (sd_version_is_sd3(sd_ctx->sd->version)) {
-        ggml_set_f32(init_latent, 0.0609f);
-    } else if (sd_version_is_flux(sd_ctx->sd->version)) {
-        ggml_set_f32(init_latent, 0.1159f);
-    } else {
-        ggml_set_f32(init_latent, 0.f);
-    }
-    return init_latent;
-}
sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
    sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params;
    int width                     = sd_img_gen_params->width;
    int height                    = sd_img_gen_params->height;
+   int vae_scale_factor          = sd_ctx->sd->get_vae_scale_factor();
    if (sd_version_is_dit(sd_ctx->sd->version)) {
        if (width % 16 || height % 16) {
            LOG_ERROR("Image dimensions must be must be a multiple of 16 on each axis for %s models. (Got %dx%d)",
@@ -2560,20 +2569,20 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                                           1);
            for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
                for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
-                   int mx = ix * 8;
-                   int my = iy * 8;
+                   int mx = ix * vae_scale_factor;
+                   int my = iy * vae_scale_factor;
                    if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
                        for (int k = 0; k < masked_latent->ne[2]; k++) {
                            float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
                            ggml_tensor_set_f32(concat_latent, v, ix, iy, k);
                        }
                        // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image
-                       for (int x = 0; x < 8; x++) {
-                           for (int y = 0; y < 8; y++) {
+                       for (int x = 0; x < vae_scale_factor; x++) {
+                           for (int y = 0; y < vae_scale_factor; y++) {
                                float m = ggml_tensor_get_f32(mask_img, mx + x, my + y);
-                               // TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?)
-                               // python code was using "b (h 8) (w 8) -> b (8 8) h w"
-                               ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y);
+                               // TODO: check if the way the mask is flattened is correct (is it supposed to be x*vae_scale_factor+y or x+vae_scale_factor*y?)
+                               // python code was using "b (h vae_scale_factor) (w vae_scale_factor) -> b (vae_scale_factor vae_scale_factor) h w"
+                               ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * vae_scale_factor + y);
                            }
                        }
                    } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
@@ -2596,11 +2605,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
        {
            // LOG_WARN("Inpainting with a base model is not great");
-           denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
+           denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / vae_scale_factor, height / vae_scale_factor, 1, 1);
            for (int ix = 0; ix < denoise_mask->ne[0]; ix++) {
                for (int iy = 0; iy < denoise_mask->ne[1]; iy++) {
-                   int mx = ix * 8;
-                   int my = iy * 8;
+                   int mx = ix * vae_scale_factor;
+                   int my = iy * vae_scale_factor;
                    float m = ggml_tensor_get_f32(mask_img, mx, my);
                    ggml_tensor_set_f32(denoise_mask, m, ix, iy);
                }
@@ -2611,7 +2620,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
        if (sd_version_is_inpaint(sd_ctx->sd->version)) {
            LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask");
        }
-       init_latent = generate_init_latent(sd_ctx, work_ctx, width, height);
+       init_latent = sd_ctx->sd->generate_init_latent(work_ctx, width, height);
    }
    sd_guidance_params_t guidance = sd_img_gen_params->sample_params.guidance;
@@ -2739,6 +2748,8 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
    int sample_steps = sd_vid_gen_params->sample_params.sample_steps;
    LOG_INFO("generate_video %dx%dx%d", width, height, frames);
+   int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
    sd_ctx->sd->init_scheduler(sd_vid_gen_params->sample_params.scheduler);
    int high_noise_sample_steps = 0;
@@ -2836,7 +2847,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
            ggml_tensor_set_f32(image, value, i0, i1, i2, i3);
        });
-       concat_latent = sd_ctx->sd->encode_first_stage(work_ctx, image);  // [b*c, t, h/8, w/8]
+       concat_latent = sd_ctx->sd->encode_first_stage(work_ctx, image);  // [b*c, t, h/vae_scale_factor, w/vae_scale_factor]
        int64_t t2 = ggml_time_ms();
        LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);
@@ -2846,7 +2857,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
                                              concat_latent->ne[0],
                                              concat_latent->ne[1],
                                              concat_latent->ne[2],
-                                             4);  // [b*4, t, w/8, h/8]
+                                             4);  // [b*4, t, w/vae_scale_factor, h/vae_scale_factor]
        ggml_tensor_iter(concat_mask, [&](ggml_tensor* concat_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
            float value = 0.0f;
            if (i2 == 0 && sd_vid_gen_params->init_image.data) {  // start image
@@ -2857,7 +2868,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
            ggml_tensor_set_f32(concat_mask, value, i0, i1, i2, i3);
        });
-       concat_latent = ggml_tensor_concat(work_ctx, concat_mask, concat_latent, 3);  // [b*(c+4), t, h/8, w/8]
+       concat_latent = ggml_tensor_concat(work_ctx, concat_mask, concat_latent, 3);  // [b*(c+4), t, h/vae_scale_factor, w/vae_scale_factor]
    } else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-TI2V-5B" && sd_vid_gen_params->init_image.data) {
        LOG_INFO("IMG2VID");
@@ -2868,7 +2879,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
        auto init_image_latent = sd_ctx->sd->vae_encode(work_ctx, init_img);  // [b*c, 1, h/16, w/16]
-       init_latent = generate_init_latent(sd_ctx, work_ctx, width, height, frames, true);
+       init_latent = sd_ctx->sd->generate_init_latent(work_ctx, width, height, frames, true);
        denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], 1);
        ggml_set_f32(denoise_mask, 1.f);
@@ -2925,8 +2936,8 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
            ggml_tensor_set_f32(reactive, reactive_value, i0, i1, i2, i3);
        });
-       inactive = sd_ctx->sd->encode_first_stage(work_ctx, inactive);  // [b*c, t, h/8, w/8]
-       reactive = sd_ctx->sd->encode_first_stage(work_ctx, reactive);  // [b*c, t, h/8, w/8]
+       inactive = sd_ctx->sd->encode_first_stage(work_ctx, inactive);  // [b*c, t, h/vae_scale_factor, w/vae_scale_factor]
+       reactive = sd_ctx->sd->encode_first_stage(work_ctx, reactive);  // [b*c, t, h/vae_scale_factor, w/vae_scale_factor]
        int64_t length = inactive->ne[2];
        if (ref_image_latent) {
@@ -2934,7 +2945,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
            frames        = (length - 1) * 4 + 1;
            ref_image_num = 1;
        }
-       vace_context = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, inactive->ne[0], inactive->ne[1], length, 96);  // [b*96, t, h/8, w/8]
+       vace_context = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, inactive->ne[0], inactive->ne[1], length, 96);  // [b*96, t, h/vae_scale_factor, w/vae_scale_factor]
        ggml_tensor_iter(vace_context, [&](ggml_tensor* vace_context, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
            float value;
            if (i3 < 32) {
@@ -2951,7 +2962,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
                if (ref_image_latent && i2 == 0) {
                    value = 0.f;
                } else {
-                   int64_t vae_stride        = 8;
+                   int64_t vae_stride        = vae_scale_factor;
                    int64_t mask_height_index = i1 * vae_stride + (i3 - 32) / vae_stride;
                    int64_t mask_width_index  = i0 * vae_stride + (i3 - 32) % vae_stride;
                    value = ggml_tensor_get_f32(mask, mask_width_index, mask_height_index, i2 - ref_image_num, 0);
@@ -2964,7 +2975,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
    }
    if (init_latent == nullptr) {
-       init_latent = generate_init_latent(sd_ctx, work_ctx, width, height, frames, true);
+       init_latent = sd_ctx->sd->generate_init_latent(work_ctx, width, height, frames, true);
    }
    // Get learned condition
@@ -2995,16 +3006,10 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
        sd_ctx->sd->cond_stage_model->free_params_buffer();
    }
-   int W = width / 8;
-   int H = height / 8;
+   int W = width / vae_scale_factor;
+   int H = height / vae_scale_factor;
    int T = init_latent->ne[2];
-   int C = 16;
-   if (sd_ctx->sd->version == VERSION_WAN2_2_TI2V) {
-       W = width / 16;
-       H = height / 16;
-       C = 48;
-   }
+   int C = sd_ctx->sd->get_latent_channel();
    struct ggml_tensor* final_latent;
    struct ggml_tensor* x_t = init_latent;


@@ -204,6 +204,9 @@ public:
            adm_in_channels   = 768;
            num_head_channels = 64;
            num_heads         = -1;
+       } else if (version == VERSION_SD1_TINY_UNET) {
+           num_res_blocks = 1;
+           channel_mult   = {1, 2, 4};
        }
        if (sd_version_is_inpaint(version)) {
            in_channels = 9;
@@ -270,13 +273,22 @@ public:
                        n_head = ch / d_head;
                    }
                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
-                   blocks[name]     = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
-                                                                                     n_head,
-                                                                                     d_head,
-                                                                                     transformer_depth[i],
-                                                                                     context_dim));
+                   int td           = transformer_depth[i];
+                   if (version == VERSION_SDXL_SSD1B) {
+                       if (i == 2) {
+                           td = 4;
+                       }
+                   }
+                   blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
+                                                                                 n_head,
+                                                                                 d_head,
+                                                                                 td,
+                                                                                 context_dim));
                }
                input_block_chans.push_back(ch);
+               if (version == VERSION_SD1_TINY_UNET) {
+                   input_block_idx++;
+               }
            }
            if (i != len_mults - 1) {
                input_block_idx += 1;
@@ -295,14 +307,17 @@ public:
                d_head = num_head_channels;
                n_head = ch / d_head;
            }
-           blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
-           blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
-                                                                                     n_head,
-                                                                                     d_head,
-                                                                                     transformer_depth[transformer_depth.size() - 1],
-                                                                                     context_dim));
-           blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
+           if (version != VERSION_SD1_TINY_UNET) {
+               blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
+               if (version != VERSION_SDXL_SSD1B) {
+                   blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
+                                                                                             n_head,
+                                                                                             d_head,
+                                                                                             transformer_depth[transformer_depth.size() - 1],
+                                                                                             context_dim));
+                   blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
+               }
+           }
            // output_blocks
            int output_block_idx = 0;
            for (int i = (int)len_mults - 1; i >= 0; i--) {
@@ -324,12 +339,27 @@ public:
                        n_head = ch / d_head;
                    }
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";
-                   blocks[name]     = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, n_head, d_head, transformer_depth[i], context_dim));
+                   int td           = transformer_depth[i];
+                   if (version == VERSION_SDXL_SSD1B) {
+                       if (i == 2 && (j == 0 || j == 1)) {
+                           td = 4;
+                       }
+                       if (i == 1 && (j == 1 || j == 2)) {
+                           td = 1;
+                       }
+                   }
+                   blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, n_head, d_head, td, context_dim));
                    up_sample_idx++;
                }
                if (i > 0 && j == num_res_blocks) {
+                   if (version == VERSION_SD1_TINY_UNET) {
+                       output_block_idx++;
+                       if (output_block_idx == 2) {
+                           up_sample_idx = 1;
+                       }
+                   }
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
                    blocks[name]     = std::shared_ptr<GGMLBlock>(new UpSampleBlock(ch, ch));
@@ -463,6 +493,9 @@ public:
                }
                hs.push_back(h);
            }
+           if (version == VERSION_SD1_TINY_UNET) {
+               input_block_idx++;
+           }
            if (i != len_mults - 1) {
                ds *= 2;
                input_block_idx += 1;
@@ -477,10 +510,13 @@ public:
        // [N, 4*model_channels, h/8, w/8]
        // middle_block
-       h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames);                      // [N, 4*model_channels, h/8, w/8]
-       h = attention_layer_forward("middle_block.1", ctx, backend, h, context, num_video_frames);  // [N, 4*model_channels, h/8, w/8]
-       h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames);                      // [N, 4*model_channels, h/8, w/8]
+       if (version != VERSION_SD1_TINY_UNET) {
+           h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames);  // [N, 4*model_channels, h/8, w/8]
+           if (version != VERSION_SDXL_SSD1B) {
+               h = attention_layer_forward("middle_block.1", ctx, backend, h, context, num_video_frames);  // [N, 4*model_channels, h/8, w/8]
+               h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames);                      // [N, 4*model_channels, h/8, w/8]
+           }
+       }
        if (controls.size() > 0) {
            auto cs = ggml_scale_inplace(ctx, controls[controls.size() - 1], control_strength);
            h       = ggml_add(ctx, h, cs);  // middle control
@@ -516,6 +552,12 @@ public:
                }
                if (i > 0 && j == num_res_blocks) {
+                   if (version == VERSION_SD1_TINY_UNET) {
+                       output_block_idx++;
+                       if (output_block_idx == 2) {
+                           up_sample_idx = 1;
+                       }
+                   }
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
                    auto block       = std::dynamic_pointer_cast<UpSampleBlock>(blocks[name]);
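For orientation, the SSD-1B branches above thin the SDXL attention stacks: input blocks at the deepest level drop to 4 transformer layers, and particular output blocks drop to 4 or 1. The sketch below restates that selection rule as two hypothetical helpers; base_td stands for whatever transformer_depth[i] is for regular SDXL, and the {1, 2, 10} table in main is only an assumed example, not a value taken from the code.

#include <cstdio>

// i = resolution level, j = residual-block index within that level.
static int ssd1b_input_block_depth(int i, int base_td) {
    return (i == 2) ? 4 : base_td;  // deepest level capped at 4 layers
}

static int ssd1b_output_block_depth(int i, int j, int base_td) {
    if (i == 2 && (j == 0 || j == 1)) return 4;
    if (i == 1 && (j == 1 || j == 2)) return 1;
    return base_td;
}

int main() {
    int base[3] = {1, 2, 10};  // assumed per-level base depths, for illustration only
    for (int i = 0; i < 3; i++)
        for (int j = 0; j < 3; j++)
            printf("level %d, block %d: input td=%d, output td=%d\n", i, j,
                   ssd1b_input_block_depth(i, base[i]),
                   ssd1b_output_block_depth(i, j, base[i]));
    return 0;
}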

vae.hpp

@@ -533,6 +533,30 @@ struct VAE : public GGMLRunner {
    virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
};
struct FakeVAE : public VAE {
FakeVAE(ggml_backend_t backend, bool offload_params_to_cpu)
: VAE(backend, offload_params_to_cpu) {}
void compute(const int n_threads,
struct ggml_tensor* z,
bool decode_graph,
struct ggml_tensor** output,
struct ggml_context* output_ctx) override {
if (*output == nullptr && output_ctx != nullptr) {
*output = ggml_dup_tensor(output_ctx, z);
}
ggml_tensor_iter(z, [&](ggml_tensor* z, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
float value = ggml_tensor_get_f32(z, i0, i1, i2, i3);
ggml_tensor_set_f32(*output, value, i0, i1, i2, i3);
});
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) override {}
std::string get_desc() override {
return "fake_vae";
}
};
struct AutoEncoderKL : public VAE {
    bool decode_only = true;
    AutoencodingEngine ae;
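A closing note on FakeVAE above: because Chroma1-Radiance predicts RGB pixels directly, the first-stage "decode" is a pass-through copy, so producing an 8-bit image reduces to clamping and quantizing floats. A small sketch of that last step, assuming values already lie in [0, 1] (the real pipeline's post-processing may scale differently; to_rgb8 is a hypothetical helper, not repository code):

#include <algorithm>
#include <cstdint>
#include <vector>

// Float RGB buffer (w*h*3, values assumed in [0, 1]) -> 8-bit RGB.
std::vector<uint8_t> to_rgb8(const std::vector<float>& img, int w, int h) {
    std::vector<uint8_t> out(static_cast<size_t>(w) * h * 3);
    for (size_t i = 0; i < out.size(); ++i) {
        float v = std::min(1.0f, std::max(0.0f, img[i]));  // clamp
        out[i]  = static_cast<uint8_t>(v * 255.0f + 0.5f); // round to nearest byte
    }
    return out;
}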