Compare commits

...

6 Commits

| Author | SHA1 | Message | Date |
|--------|------|---------|------|
| leejet | 1ba30ce005 | sync: update ggml | 2025-09-25 00:38:38 +08:00 |
| leejet | 2abe9451c4 | fix: optimize the handling of CLIP embedding weight (#840) | 2025-09-25 00:28:20 +08:00 |
| Wagner Bruna | f3140eadbb | fix: tensor loading thread count (#854) | 2025-09-25 00:26:38 +08:00 |
| Stefan-Olt | 98ba155fc6 | docs: HipBLAS / ROCm build instruction fix (#843) | 2025-09-25 00:03:05 +08:00 |
| Wagner Bruna | 513f36d495 | docs: include Vulkan compatibility for LoRA quants (#845) | 2025-09-25 00:01:10 +08:00 |
| rmatif | 1e0d2821bb | fix: correct tensor deduplication logic (#844) | 2025-09-24 23:22:40 +08:00 |
13 changed files with 77 additions and 48 deletions

View File

@@ -125,13 +125,14 @@ cmake --build . --config Release
##### Using HipBLAS
This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture (replace the first command). This is also necessary if your GPU is not officially supported by ROCm; for example, set `$GFX_NAME` to `gfx1030` for consumer RDNA2 cards.
Windows users: refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
```
export GFX_NAME=$(rocminfo | grep -m 1 -E "gfx[^0]{1}" | sed -e 's/ *Name: *//' | awk '{$1=$1; print}' || echo "rocminfo missing")
echo $GFX_NAME
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
cmake --build . --config Release
```

View File

@@ -553,12 +553,13 @@ protected:
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
enum ggml_type token_wtype = GGML_TYPE_F32;
if (!force_clip_f32) {
auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
if (tensor_type != tensor_types.end())
auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
std::set<ggml_type> allow_types = {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0};
if (tensor_type != tensor_types.end() && allow_types.find(tensor_type->second) != allow_types.end()) {
token_wtype = tensor_type->second;
}
}
enum ggml_type position_wtype = GGML_TYPE_F32;
enum ggml_type position_wtype = GGML_TYPE_F32;
params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
}
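
For readers skimming the diff, here is a minimal standalone sketch of the type-selection rule introduced above: the CLIP token embedding keeps its stored type only when it appears in an explicit allow-list, and anything else falls back to F32. The `pick_token_wtype` helper and the `ggml_type_sketch` enum are illustrative stand-ins, not repository or ggml API.

```cpp
#include <set>
#include <string>
#include <unordered_map>

// Stand-ins for ggml's type enum and the repository's String2GGMLType map.
enum ggml_type_sketch { SK_F32, SK_F16, SK_Q8_0, SK_Q5_1, SK_Q5_0, SK_Q4_1, SK_Q4_0, SK_Q4_K };
using TypeMap = std::unordered_map<std::string, ggml_type_sketch>;

// Pick the weight type for "token_embedding.weight": keep the stored type only
// if it is in the allow-list; otherwise (K-quants, unknown types) use F32.
ggml_type_sketch pick_token_wtype(const TypeMap& tensor_types,
                                  const std::string& prefix,
                                  bool force_f32) {
    ggml_type_sketch wtype = SK_F32;  // safe default
    if (!force_f32) {
        static const std::set<ggml_type_sketch> allowed =
            {SK_F16, SK_Q8_0, SK_Q5_1, SK_Q5_0, SK_Q4_1, SK_Q4_0};
        auto it = tensor_types.find(prefix + "token_embedding.weight");
        if (it != tensor_types.end() && allowed.count(it->second)) {
            wtype = it->second;  // stored type is supported, keep it
        }
    }
    return wtype;
}
```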

View File

@@ -141,7 +141,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
}
return true;
};
model_loader.load_tensors(on_load);
model_loader.load_tensors(on_load, 1);
readed_embeddings.push_back(embd_name);
if (embd) {
int64_t hidden_size = text_model->model.hidden_size;

View File

@@ -445,7 +445,7 @@ struct ControlNet : public GGMLRunner {
guided_hint_cached = true;
}
bool load_from_file(const std::string& file_path) {
bool load_from_file(const std::string& file_path, int n_threads) {
LOG_INFO("loading control net from '%s'", file_path.c_str());
alloc_params_buffer();
std::map<std::string, ggml_tensor*> tensors;
@@ -458,7 +458,7 @@ struct ControlNet : public GGMLRunner {
return false;
}
bool success = model_loader.load_tensors(tensors, ignore_tensors);
bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
if (!success) {
LOG_ERROR("load control net tensors from model loader failed");

View File

@@ -20,20 +20,30 @@ Here's a simple example:
NOTE: The other backends may have different support.
| Quant / Type | CUDA |
|--------------|------|
| F32 | ✔️ |
| F16 | ✔️ |
| BF16 | ✔️ |
| I32 | ✔️ |
| Q4_0 | ✔️ |
| Q4_1 | ✔️ |
| Q5_0 | ✔️ |
| Q5_1 | ✔️ |
| Q8_0 | ✔️ |
| Q2_K | ❌ |
| Q3_K | ❌ |
| Q4_K | ❌ |
| Q5_K | ❌ |
| Q6_K | ❌ |
| Q8_K | ❌ |
| Quant / Type | CUDA | Vulkan |
|--------------|------|--------|
| F32 | ✔️ | ✔️ |
| F16 | ✔️ | ✔️ |
| BF16 | ✔️ | ✔️ |
| I32 | ✔️ | ❌ |
| Q4_0 | ✔️ | ✔️ |
| Q4_1 | ✔️ | ✔️ |
| Q5_0 | ✔️ | ✔️ |
| Q5_1 | ✔️ | ✔️ |
| Q8_0 | ✔️ | ✔️ |
| Q2_K | ❌ | ❌ |
| Q3_K | ❌ | ❌ |
| Q4_K | ❌ | ❌ |
| Q5_K | ❌ | ❌ |
| Q6_K | ❌ | ❌ |
| Q8_K | ❌ | ❌ |
| IQ1_S | ❌ | ✔️ |
| IQ1_M | ❌ | ✔️ |
| IQ2_XXS | ❌ | ✔️ |
| IQ2_XS | ❌ | ✔️ |
| IQ2_S | ❌ | ✔️ |
| IQ3_XXS | ❌ | ✔️ |
| IQ3_S | ❌ | ✔️ |
| IQ4_XS | ❌ | ✔️ |
| IQ4_NL | ❌ | ✔️ |
| MXFP4 | ❌ | ✔️ |

View File

@@ -164,7 +164,7 @@ struct ESRGAN : public GGMLRunner {
return "esrgan";
}
bool load_from_file(const std::string& file_path) {
bool load_from_file(const std::string& file_path, int n_threads) {
LOG_INFO("loading esrgan from '%s'", file_path.c_str());
alloc_params_buffer();
@@ -177,7 +177,7 @@ struct ESRGAN : public GGMLRunner {
return false;
}
bool success = model_loader.load_tensors(esrgan_tensors);
bool success = model_loader.load_tensors(esrgan_tensors, {}, n_threads);
if (!success) {
LOG_ERROR("load esrgan tensors from model loader failed");

ggml

@@ -1 +1 @@
Subproject commit 5fdc78fff274094e2a1b155928131983362d8a71
Subproject commit 553c44706c3cc6e4077f4ab214923fc4c20a013c

View File

@@ -116,7 +116,7 @@ struct LoraModel : public GGMLRunner {
return "lora";
}
bool load_from_file(bool filter_tensor = false, int n_threads = 0) {
bool load_from_file(bool filter_tensor, int n_threads) {
LOG_INFO("loading LoRA from '%s'", file_path.c_str());
if (load_failed) {

View File

@@ -1,4 +1,5 @@
#include <stdarg.h>
#include <algorithm>
#include <atomic>
#include <chrono>
#include <fstream>
@@ -1956,7 +1957,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
std::atomic<int64_t> copy_to_backend_time_ms(0);
std::atomic<int64_t> convert_time_ms(0);
int num_threads_to_use = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency();
int num_threads_to_use = n_threads_p > 0 ? n_threads_p : get_num_physical_cores();
LOG_DEBUG("using %d threads for model loading", num_threads_to_use);
int64_t start_time = ggml_time_ms();
std::vector<TensorStorage> processed_tensor_storages;
@@ -2006,13 +2008,25 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
w.join();
}
std::unordered_map<std::string, IndexedStorage> latest_map;
std::vector<IndexedStorage> deduplicated;
deduplicated.reserve(all_results.size());
std::unordered_map<std::string, size_t> name_to_pos;
for (auto& entry : all_results) {
latest_map[entry.ts.name] = entry;
auto it = name_to_pos.find(entry.ts.name);
if (it == name_to_pos.end()) {
name_to_pos.emplace(entry.ts.name, deduplicated.size());
deduplicated.push_back(entry);
} else if (deduplicated[it->second].index < entry.index) {
deduplicated[it->second] = entry;
}
}
processed_tensor_storages.reserve(latest_map.size());
for (auto& [name, entry] : latest_map) {
std::sort(deduplicated.begin(), deduplicated.end(), [](const IndexedStorage& a, const IndexedStorage& b) {
return a.index < b.index;
});
processed_tensor_storages.reserve(deduplicated.size());
for (auto& entry : deduplicated) {
processed_tensor_storages.push_back(entry.ts);
}
}
@@ -2408,6 +2422,8 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage
// Pass, do not convert. For MMDiT
} else if (contains(name, "time_embed.") || contains(name, "label_emb.")) {
// Pass, do not convert. For Unet
} else if (contains(name, "embedding")) {
// Pass, do not convert embedding
} else {
return true;
}
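
To make the new deduplication behavior easier to follow, here is a minimal standalone sketch of the same "latest duplicate wins, original order preserved" strategy, using a simplified `Entry` type in place of the repository's `IndexedStorage`; the names in this sketch are illustrative only.

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct Entry {
    std::string name;  // tensor name
    int index;         // position in the source; a larger index means a later duplicate
};

// Keep only the latest occurrence of each name, then restore the original order.
std::vector<Entry> deduplicate(const std::vector<Entry>& all) {
    std::vector<Entry> out;
    out.reserve(all.size());
    std::unordered_map<std::string, size_t> pos;  // name -> slot in `out`
    for (const auto& e : all) {
        auto it = pos.find(e.name);
        if (it == pos.end()) {
            pos.emplace(e.name, out.size());
            out.push_back(e);
        } else if (out[it->second].index < e.index) {
            out[it->second] = e;  // a later duplicate overrides the earlier one in place
        }
    }
    std::sort(out.begin(), out.end(),
              [](const Entry& a, const Entry& b) { return a.index < b.index; });
    return out;
}

int main() {
    std::vector<Entry> all = {{"alpha", 0}, {"beta", 1}, {"alpha", 2}};
    for (const auto& e : deduplicate(all))
        std::cout << e.name << " " << e.index << "\n";  // prints "beta 1" then "alpha 2"
}
```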

View File

@@ -591,7 +591,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
return "id_embeds";
}
bool load_from_file(bool filter_tensor = false) {
bool load_from_file(bool filter_tensor, int n_threads) {
LOG_INFO("loading PhotoMaker ID Embeds from '%s'", file_path.c_str());
if (load_failed) {
@@ -623,11 +623,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
return true;
};
model_loader->load_tensors(on_new_tensor_cb);
model_loader->load_tensors(on_new_tensor_cb, n_threads);
alloc_params_buffer();
dry_run = false;
model_loader->load_tensors(on_new_tensor_cb);
model_loader->load_tensors(on_new_tensor_cb, n_threads);
LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
return true;

View File

@@ -531,7 +531,7 @@ public:
}
if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) {
pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->photo_maker_path, "");
if (!pmid_lora->load_from_file(true)) {
if (!pmid_lora->load_from_file(true, n_threads)) {
LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->photo_maker_path);
return false;
}
@@ -599,14 +599,14 @@ public:
if (!use_tiny_autoencoder) {
vae_params_mem_size = first_stage_model->get_params_buffer_size();
} else {
if (!tae_first_stage->load_from_file(taesd_path)) {
if (!tae_first_stage->load_from_file(taesd_path, n_threads)) {
return false;
}
vae_params_mem_size = tae_first_stage->get_params_buffer_size();
}
size_t control_net_params_mem_size = 0;
if (control_net) {
if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path))) {
if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path), n_threads)) {
return false;
}
control_net_params_mem_size = control_net->get_params_buffer_size();
@@ -836,7 +836,7 @@ public:
return;
}
LoraModel lora(backend, file_path, is_high_noise ? "model.high_noise_" : "");
if (!lora.load_from_file()) {
if (!lora.load_from_file(false, n_threads)) {
LOG_WARN("load lora tensors from %s failed", file_path.c_str());
return;
}

View File

@@ -222,7 +222,7 @@ struct TinyAutoEncoder : public GGMLRunner {
return "taesd";
}
bool load_from_file(const std::string& file_path) {
bool load_from_file(const std::string& file_path, int n_threads) {
LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false");
alloc_params_buffer();
std::map<std::string, ggml_tensor*> taesd_tensors;
@@ -238,7 +238,7 @@ struct TinyAutoEncoder : public GGMLRunner {
return false;
}
bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors);
bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors, n_threads);
if (!success) {
LOG_ERROR("load tae tensors from model loader failed");

View File

@@ -18,7 +18,8 @@ struct UpscalerGGML {
}
bool load_from_file(const std::string& esrgan_path,
bool offload_params_to_cpu) {
bool offload_params_to_cpu,
int n_threads) {
ggml_log_set(ggml_log_callback_default, nullptr);
#ifdef SD_USE_CUDA
LOG_DEBUG("Using CUDA backend");
@@ -54,7 +55,7 @@ struct UpscalerGGML {
if (direct) {
esrgan_upscaler->enable_conv2d_direct();
}
if (!esrgan_upscaler->load_from_file(esrgan_path)) {
if (!esrgan_upscaler->load_from_file(esrgan_path, n_threads)) {
return false;
}
return true;
@@ -124,7 +125,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
return NULL;
}
if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu)) {
if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) {
delete upscaler_ctx->upscaler;
upscaler_ctx->upscaler = NULL;
free(upscaler_ctx);