diff --git a/README.md b/README.md
index 62b5979..c5c3eb1 100644
--- a/README.md
+++ b/README.md
@@ -125,13 +125,14 @@ cmake --build . --config Release
 ##### Using HipBLAS
 This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
+To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture (replacing the first command below). This is also necessary if your GPU is not officially supported by ROCm; for example, set `$GFX_NAME` to `gfx1030` for consumer RDNA2 cards.
 
 Windows User Refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
 
 ```
-export GFX_NAME=$(rocminfo | grep -m 1 -E "gfx[^0]{1}" | sed -e 's/ *Name: *//' | awk '{$1=$1; print}' || echo "rocminfo missing")
-echo $GFX_NAME
-cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
+if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
+if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
+cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
 cmake --build . --config Release
 ```
diff --git a/clip.hpp b/clip.hpp
index f8b0485..1cba921 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -553,12 +553,13 @@ protected:
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
         if (!force_clip_f32) {
-            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
-            if (tensor_type != tensor_types.end())
+            auto tensor_type                = tensor_types.find(prefix + "token_embedding.weight");
+            std::set<ggml_type> allow_types = {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0};
+            if (tensor_type != tensor_types.end() && allow_types.find(tensor_type->second) != allow_types.end()) {
                 token_wtype = tensor_type->second;
+            }
         }
-        enum ggml_type position_wtype = GGML_TYPE_F32;
-
+        enum ggml_type position_wtype = GGML_TYPE_F32;
         params["token_embedding.weight"]    = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
         params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
     }
diff --git a/conditioner.hpp b/conditioner.hpp
index 24066f3..b25ef84 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -146,7 +146,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             }
             return true;
         };
-        model_loader.load_tensors(on_load);
+        model_loader.load_tensors(on_load, 1);
         readed_embeddings.push_back(embd_name);
         if (embd) {
             int64_t hidden_size = text_model->model.hidden_size;
diff --git a/control.hpp b/control.hpp
index f9a4923..79b82a2 100644
--- a/control.hpp
+++ b/control.hpp
@@ -445,7 +445,7 @@ struct ControlNet : public GGMLRunner {
         guided_hint_cached = true;
     }
 
-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading control net from '%s'", file_path.c_str());
         alloc_params_buffer();
         std::map<std::string, ggml_tensor*> tensors;
@@ -458,7 +458,7 @@ struct ControlNet : public GGMLRunner {
             return false;
         }
 
-        bool success = model_loader.load_tensors(tensors, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
 
         if (!success) {
             LOG_ERROR("load control net tensors from model loader failed");
diff --git a/docs/lora.md b/docs/lora.md
index e2e1d82..9885ae5 100644
--- a/docs/lora.md
+++ b/docs/lora.md
@@ -20,20 +20,30 @@ Here's a simple example:
 
 NOTE: The other backends may have different support.
 
-| Quant / Type | CUDA |
-|--------------|------|
-| F32 | ✔️ |
-| F16 | ✔️ |
-| BF16 | ✔️ |
-| I32 | ✔️ |
-| Q4_0 | ✔️ |
-| Q4_1 | ✔️ |
-| Q5_0 | ✔️ |
-| Q5_1 | ✔️ |
-| Q8_0 | ✔️ |
-| Q2_K | ❌ |
-| Q3_K | ❌ |
-| Q4_K | ❌ |
-| Q5_K | ❌ |
-| Q6_K | ❌ |
-| Q8_K | ❌ |
+| Quant / Type | CUDA | Vulkan |
+|--------------|------|--------|
+| F32 | ✔️ | ✔️ |
+| F16 | ✔️ | ✔️ |
+| BF16 | ✔️ | ✔️ |
+| I32 | ✔️ | ❌ |
+| Q4_0 | ✔️ | ✔️ |
+| Q4_1 | ✔️ | ✔️ |
+| Q5_0 | ✔️ | ✔️ |
+| Q5_1 | ✔️ | ✔️ |
+| Q8_0 | ✔️ | ✔️ |
+| Q2_K | ❌ | ❌ |
+| Q3_K | ❌ | ❌ |
+| Q4_K | ❌ | ❌ |
+| Q5_K | ❌ | ❌ |
+| Q6_K | ❌ | ❌ |
+| Q8_K | ❌ | ❌ |
+| IQ1_S | ❌ | ✔️ |
+| IQ1_M | ❌ | ✔️ |
+| IQ2_XXS | ❌ | ✔️ |
+| IQ2_XS | ❌ | ✔️ |
+| IQ2_S | ❌ | ✔️ |
+| IQ3_XXS | ❌ | ✔️ |
+| IQ3_S | ❌ | ✔️ |
+| IQ4_XS | ❌ | ✔️ |
+| IQ4_NL | ❌ | ✔️ |
+| MXFP4 | ❌ | ✔️ |
diff --git a/esrgan.hpp b/esrgan.hpp
index e2003e4..7ede2e4 100644
--- a/esrgan.hpp
+++ b/esrgan.hpp
@@ -164,7 +164,7 @@ struct ESRGAN : public GGMLRunner {
         return "esrgan";
     }
 
-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading esrgan from '%s'", file_path.c_str());
 
         alloc_params_buffer();
@@ -177,7 +177,7 @@ struct ESRGAN : public GGMLRunner {
             return false;
         }
 
-        bool success = model_loader.load_tensors(esrgan_tensors);
+        bool success = model_loader.load_tensors(esrgan_tensors, {}, n_threads);
 
         if (!success) {
             LOG_ERROR("load esrgan tensors from model loader failed");
diff --git a/ggml b/ggml
index 5fdc78f..553c447 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 5fdc78fff274094e2a1b155928131983362d8a71
+Subproject commit 553c44706c3cc6e4077f4ab214923fc4c20a013c
diff --git a/lora.hpp b/lora.hpp
index 222f61b..1fce956 100644
--- a/lora.hpp
+++ b/lora.hpp
@@ -116,7 +116,7 @@ struct LoraModel : public GGMLRunner {
         return "lora";
     }
 
-    bool load_from_file(bool filter_tensor = false, int n_threads = 0) {
+    bool load_from_file(bool filter_tensor, int n_threads) {
         LOG_INFO("loading LoRA from '%s'", file_path.c_str());
 
         if (load_failed) {
diff --git a/model.cpp b/model.cpp
index 8568846..ede53b4 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1,4 +1,5 @@
 #include
+#include
 #include
 #include
 #include
@@ -1995,7 +1996,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
     std::atomic<int64_t> copy_to_backend_time_ms(0);
     std::atomic<int64_t> convert_time_ms(0);
 
-    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency();
+    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : get_num_physical_cores();
+    LOG_DEBUG("using %d threads for model loading", num_threads_to_use);
 
     int64_t start_time = ggml_time_ms();
     std::vector<TensorStorage> processed_tensor_storages;
@@ -2045,13 +2047,25 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             w.join();
         }
 
-        std::unordered_map<std::string, IndexedStorage> latest_map;
+        std::vector<IndexedStorage> deduplicated;
+        deduplicated.reserve(all_results.size());
+        std::unordered_map<std::string, size_t> name_to_pos;
         for (auto& entry : all_results) {
-            latest_map[entry.ts.name] = entry;
+            auto it = name_to_pos.find(entry.ts.name);
+            if (it == name_to_pos.end()) {
+                name_to_pos.emplace(entry.ts.name, deduplicated.size());
+                deduplicated.push_back(entry);
+            } else if (deduplicated[it->second].index < entry.index) {
+                deduplicated[it->second] = entry;
+            }
         }
-        processed_tensor_storages.reserve(latest_map.size());
-        for (auto& [name, entry] : latest_map) {
+        std::sort(deduplicated.begin(), deduplicated.end(), [](const IndexedStorage& a, const IndexedStorage& b) {
+            return a.index < b.index;
+        });
+
+        processed_tensor_storages.reserve(deduplicated.size());
+        for (auto& entry : deduplicated) {
             processed_tensor_storages.push_back(entry.ts);
         }
     }
@@ -2447,6 +2461,8 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage
         // Pass, do not convert. For MMDiT
     } else if (contains(name, "time_embed.") || contains(name, "label_emb.")) {
         // Pass, do not convert. For Unet
+    } else if (contains(name, "embedding")) {
+        // Pass, do not convert embedding
     } else {
         return true;
     }
diff --git a/pmid.hpp b/pmid.hpp
index 3bd59cd..63029cb 100644
--- a/pmid.hpp
+++ b/pmid.hpp
@@ -591,7 +591,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
         return "id_embeds";
     }
 
-    bool load_from_file(bool filter_tensor = false) {
+    bool load_from_file(bool filter_tensor, int n_threads) {
         LOG_INFO("loading PhotoMaker ID Embeds from '%s'", file_path.c_str());
 
         if (load_failed) {
@@ -623,11 +623,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
             return true;
         };
 
-        model_loader->load_tensors(on_new_tensor_cb);
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
         alloc_params_buffer();
 
         dry_run = false;
-        model_loader->load_tensors(on_new_tensor_cb);
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
 
         LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
         return true;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 1192b05..db06508 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -549,7 +549,7 @@ public:
         }
         if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) {
             pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->photo_maker_path, "");
-            if (!pmid_lora->load_from_file(true)) {
+            if (!pmid_lora->load_from_file(true, n_threads)) {
                 LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->photo_maker_path);
                 return false;
             }
@@ -617,14 +617,14 @@ public:
         if (!use_tiny_autoencoder) {
             vae_params_mem_size = first_stage_model->get_params_buffer_size();
         } else {
-            if (!tae_first_stage->load_from_file(taesd_path)) {
+            if (!tae_first_stage->load_from_file(taesd_path, n_threads)) {
                 return false;
             }
             vae_params_mem_size = tae_first_stage->get_params_buffer_size();
         }
         size_t control_net_params_mem_size = 0;
         if (control_net) {
-            if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path))) {
+            if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path), n_threads)) {
                 return false;
             }
             control_net_params_mem_size = control_net->get_params_buffer_size();
@@ -861,7 +861,7 @@ public:
             return;
         }
is_high_noise ? "model.high_noise_" : ""); - if (!lora.load_from_file()) { + if (!lora.load_from_file(false, n_threads)) { LOG_WARN("load lora tensors from %s failed", file_path.c_str()); return; } diff --git a/tae.hpp b/tae.hpp index 1ae1257..41bcbe2 100644 --- a/tae.hpp +++ b/tae.hpp @@ -222,7 +222,7 @@ struct TinyAutoEncoder : public GGMLRunner { return "taesd"; } - bool load_from_file(const std::string& file_path) { + bool load_from_file(const std::string& file_path, int n_threads) { LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false"); alloc_params_buffer(); std::map taesd_tensors; @@ -238,7 +238,7 @@ struct TinyAutoEncoder : public GGMLRunner { return false; } - bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors); + bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors, n_threads); if (!success) { LOG_ERROR("load tae tensors from model loader failed"); diff --git a/upscaler.cpp b/upscaler.cpp index 7e765d7..4c138ea 100644 --- a/upscaler.cpp +++ b/upscaler.cpp @@ -18,7 +18,8 @@ struct UpscalerGGML { } bool load_from_file(const std::string& esrgan_path, - bool offload_params_to_cpu) { + bool offload_params_to_cpu, + int n_threads) { ggml_log_set(ggml_log_callback_default, nullptr); #ifdef SD_USE_CUDA LOG_DEBUG("Using CUDA backend"); @@ -54,7 +55,7 @@ struct UpscalerGGML { if (direct) { esrgan_upscaler->enable_conv2d_direct(); } - if (!esrgan_upscaler->load_from_file(esrgan_path)) { + if (!esrgan_upscaler->load_from_file(esrgan_path, n_threads)) { return false; } return true; @@ -124,7 +125,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, return NULL; } - if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu)) { + if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) { delete upscaler_ctx->upscaler; upscaler_ctx->upscaler = NULL; free(upscaler_ctx);