Merge branch 'master' into qwen_image

commit a8d3aa0415
Author: leejet
Date:   2025-09-25 00:39:57 +08:00
13 changed files with 77 additions and 48 deletions

View File

@@ -125,13 +125,14 @@ cmake --build . --config Release
 ##### Using HipBLAS
 This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
+To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture (replacing the first command). This is also necessary if your GPU is not officially supported by ROCm; for example, you have to set `$GFX_NAME` manually to `gfx1030` for consumer RDNA2 cards.
 Windows User Refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
 ```
-export GFX_NAME=$(rocminfo | grep -m 1 -E "gfx[^0]{1}" | sed -e 's/ *Name: *//' | awk '{$1=$1; print}' || echo "rocminfo missing")
-echo $GFX_NAME
-cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
+if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
+if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
+cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
 cmake --build . --config Release
 ```
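For the manual-override case described in the added paragraph, a minimal sketch (assuming the same build directory as above; `gfx1030` is just the RDNA2 example the text mentions, substitute your own architecture):

```
# Hypothetical manual override for a card that rocminfo does not detect or report correctly.
export GFX_NAME=gfx1030
# Configure with the same flags as in the snippet above, now using the hand-set architecture.
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
cmake --build . --config Release
```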

View File

@@ -553,12 +553,13 @@ protected:
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
         if (!force_clip_f32) {
             auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
-            if (tensor_type != tensor_types.end())
+            std::set<ggml_type> allow_types = {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0};
+            if (tensor_type != tensor_types.end() && allow_types.find(tensor_type->second) != allow_types.end()) {
                 token_wtype = tensor_type->second;
+            }
         }
         enum ggml_type position_wtype = GGML_TYPE_F32;
         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
         params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
     }

View File

@@ -146,7 +146,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             }
             return true;
         };
-        model_loader.load_tensors(on_load);
+        model_loader.load_tensors(on_load, 1);
         readed_embeddings.push_back(embd_name);
         if (embd) {
             int64_t hidden_size = text_model->model.hidden_size;

View File

@@ -445,7 +445,7 @@ struct ControlNet : public GGMLRunner {
         guided_hint_cached = true;
     }
-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading control net from '%s'", file_path.c_str());
         alloc_params_buffer();
         std::map<std::string, ggml_tensor*> tensors;
@@ -458,7 +458,7 @@ struct ControlNet : public GGMLRunner {
             return false;
         }
-        bool success = model_loader.load_tensors(tensors, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
         if (!success) {
             LOG_ERROR("load control net tensors from model loader failed");

View File

@@ -20,20 +20,30 @@ Here's a simple example:
 NOTE: The other backends may have different support.
-| Quant / Type | CUDA |
-|--------------|------|
-| F32 | ✔️ |
-| F16 | ✔️ |
-| BF16 | ✔️ |
-| I32 | ✔️ |
-| Q4_0 | ✔️ |
-| Q4_1 | ✔️ |
-| Q5_0 | ✔️ |
-| Q5_1 | ✔️ |
-| Q8_0 | ✔️ |
-| Q2_K | ❌ |
-| Q3_K | ❌ |
-| Q4_K | ❌ |
-| Q5_K | ❌ |
-| Q6_K | ❌ |
-| Q8_K | ❌ |
+| Quant / Type | CUDA | Vulkan |
+|--------------|------|--------|
+| F32 | ✔️ | ✔️ |
+| F16 | ✔️ | ✔️ |
+| BF16 | ✔️ | ✔️ |
+| I32 | ✔️ | ❌ |
+| Q4_0 | ✔️ | ✔️ |
+| Q4_1 | ✔️ | ✔️ |
+| Q5_0 | ✔️ | ✔️ |
+| Q5_1 | ✔️ | ✔️ |
+| Q8_0 | ✔️ | ✔️ |
+| Q2_K | ❌ | ❌ |
+| Q3_K | ❌ | ❌ |
+| Q4_K | ❌ | ❌ |
+| Q5_K | ❌ | ❌ |
+| Q6_K | ❌ | ❌ |
+| Q8_K | ❌ | ❌ |
+| IQ1_S | ❌ | ✔️ |
+| IQ1_M | ❌ | ✔️ |
+| IQ2_XXS | ❌ | ✔️ |
+| IQ2_XS | ❌ | ✔️ |
+| IQ2_S | ❌ | ✔️ |
+| IQ3_XXS | ❌ | ✔️ |
+| IQ3_S | ❌ | ✔️ |
+| IQ4_XS | ❌ | ✔️ |
+| IQ4_NL | ❌ | ✔️ |
+| MXFP4 | ❌ | ✔️ |

View File

@@ -164,7 +164,7 @@ struct ESRGAN : public GGMLRunner {
         return "esrgan";
     }
-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading esrgan from '%s'", file_path.c_str());
         alloc_params_buffer();
@@ -177,7 +177,7 @@ struct ESRGAN : public GGMLRunner {
             return false;
         }
-        bool success = model_loader.load_tensors(esrgan_tensors);
+        bool success = model_loader.load_tensors(esrgan_tensors, {}, n_threads);
         if (!success) {
             LOG_ERROR("load esrgan tensors from model loader failed");

ggml

@@ -1 +1 @@
-Subproject commit 5fdc78fff274094e2a1b155928131983362d8a71
+Subproject commit 553c44706c3cc6e4077f4ab214923fc4c20a013c

View File

@@ -116,7 +116,7 @@ struct LoraModel : public GGMLRunner {
         return "lora";
     }
-    bool load_from_file(bool filter_tensor = false, int n_threads = 0) {
+    bool load_from_file(bool filter_tensor, int n_threads) {
         LOG_INFO("loading LoRA from '%s'", file_path.c_str());
         if (load_failed) {

View File

@@ -1,4 +1,5 @@
 #include <stdarg.h>
+#include <algorithm>
 #include <atomic>
 #include <chrono>
 #include <fstream>
@@ -1995,7 +1996,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
     std::atomic<int64_t> copy_to_backend_time_ms(0);
     std::atomic<int64_t> convert_time_ms(0);
-    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency();
+    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : get_num_physical_cores();
+    LOG_DEBUG("using %d threads for model loading", num_threads_to_use);
     int64_t start_time = ggml_time_ms();
     std::vector<TensorStorage> processed_tensor_storages;
@@ -2045,13 +2047,25 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             w.join();
         }
-        std::unordered_map<std::string, IndexedStorage> latest_map;
+        std::vector<IndexedStorage> deduplicated;
+        deduplicated.reserve(all_results.size());
+        std::unordered_map<std::string, size_t> name_to_pos;
         for (auto& entry : all_results) {
-            latest_map[entry.ts.name] = entry;
+            auto it = name_to_pos.find(entry.ts.name);
+            if (it == name_to_pos.end()) {
+                name_to_pos.emplace(entry.ts.name, deduplicated.size());
+                deduplicated.push_back(entry);
+            } else if (deduplicated[it->second].index < entry.index) {
+                deduplicated[it->second] = entry;
+            }
         }
-        processed_tensor_storages.reserve(latest_map.size());
-        for (auto& [name, entry] : latest_map) {
+        std::sort(deduplicated.begin(), deduplicated.end(), [](const IndexedStorage& a, const IndexedStorage& b) {
+            return a.index < b.index;
+        });
+        processed_tensor_storages.reserve(deduplicated.size());
+        for (auto& entry : deduplicated) {
             processed_tensor_storages.push_back(entry.ts);
         }
     }
@@ -2447,6 +2461,8 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage
         // Pass, do not convert. For MMDiT
     } else if (contains(name, "time_embed.") || contains(name, "label_emb.")) {
         // Pass, do not convert. For Unet
+    } else if (contains(name, "embedding")) {
+        // Pass, do not convert embedding
     } else {
         return true;
     }

View File

@@ -591,7 +591,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
         return "id_embeds";
     }
-    bool load_from_file(bool filter_tensor = false) {
+    bool load_from_file(bool filter_tensor, int n_threads) {
         LOG_INFO("loading PhotoMaker ID Embeds from '%s'", file_path.c_str());
         if (load_failed) {
@@ -623,11 +623,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
             return true;
         };
-        model_loader->load_tensors(on_new_tensor_cb);
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
         alloc_params_buffer();
         dry_run = false;
-        model_loader->load_tensors(on_new_tensor_cb);
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
         LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
         return true;

View File

@@ -549,7 +549,7 @@ public:
         }
         if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) {
             pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->photo_maker_path, "");
-            if (!pmid_lora->load_from_file(true)) {
+            if (!pmid_lora->load_from_file(true, n_threads)) {
                 LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->photo_maker_path);
                 return false;
             }
@@ -617,14 +617,14 @@ public:
         if (!use_tiny_autoencoder) {
             vae_params_mem_size = first_stage_model->get_params_buffer_size();
         } else {
-            if (!tae_first_stage->load_from_file(taesd_path)) {
+            if (!tae_first_stage->load_from_file(taesd_path, n_threads)) {
                 return false;
             }
             vae_params_mem_size = tae_first_stage->get_params_buffer_size();
         }
         size_t control_net_params_mem_size = 0;
         if (control_net) {
-            if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path))) {
+            if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path), n_threads)) {
                 return false;
             }
             control_net_params_mem_size = control_net->get_params_buffer_size();
@@ -861,7 +861,7 @@ public:
             return;
         }
         LoraModel lora(backend, file_path, is_high_noise ? "model.high_noise_" : "");
-        if (!lora.load_from_file()) {
+        if (!lora.load_from_file(false, n_threads)) {
             LOG_WARN("load lora tensors from %s failed", file_path.c_str());
             return;
         }

View File

@@ -222,7 +222,7 @@ struct TinyAutoEncoder : public GGMLRunner {
         return "taesd";
     }
-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false");
         alloc_params_buffer();
         std::map<std::string, ggml_tensor*> taesd_tensors;
@@ -238,7 +238,7 @@ struct TinyAutoEncoder : public GGMLRunner {
             return false;
         }
-        bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors);
+        bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors, n_threads);
         if (!success) {
             LOG_ERROR("load tae tensors from model loader failed");

View File

@@ -18,7 +18,8 @@ struct UpscalerGGML {
     }
     bool load_from_file(const std::string& esrgan_path,
-                        bool offload_params_to_cpu) {
+                        bool offload_params_to_cpu,
+                        int n_threads) {
         ggml_log_set(ggml_log_callback_default, nullptr);
 #ifdef SD_USE_CUDA
         LOG_DEBUG("Using CUDA backend");
@@ -54,7 +55,7 @@ struct UpscalerGGML {
         if (direct) {
             esrgan_upscaler->enable_conv2d_direct();
         }
-        if (!esrgan_upscaler->load_from_file(esrgan_path)) {
+        if (!esrgan_upscaler->load_from_file(esrgan_path, n_threads)) {
             return false;
         }
         return true;
@@ -124,7 +125,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
         return NULL;
     }
-    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu)) {
+    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) {
         delete upscaler_ctx->upscaler;
         upscaler_ctx->upscaler = NULL;
         free(upscaler_ctx);