Merge branch 'master' into qwen_image

commit a8d3aa0415
Author: leejet
Date:   2025-09-25 00:39:57 +08:00
13 changed files with 77 additions and 48 deletions

View File

@@ -125,13 +125,14 @@ cmake --build . --config Release
 ##### Using HipBLAS
 This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
+To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture (replacing the first command). This is also necessary if your GPU is not officially supported by ROCm; for example, you have to set `$GFX_NAME` manually to `gfx1030` for consumer RDNA2 cards.
 Windows User Refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
 ```
-export GFX_NAME=$(rocminfo | grep -m 1 -E "gfx[^0]{1}" | sed -e 's/ *Name: *//' | awk '{$1=$1; print}' || echo "rocminfo missing")
-echo $GFX_NAME
-cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
+if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
+if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
+cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
 cmake --build . --config Release
 ```
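For the manual-override case described in the added paragraph, a minimal sketch (assuming the same build directory as above; `gfx1030` is just the RDNA2 example the text mentions, substitute your own architecture):

```
# Hypothetical manual override for a card that rocminfo does not detect or report correctly.
export GFX_NAME=gfx1030
# Configure with the same flags as in the snippet above, now using the hand-set architecture.
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
cmake --build . --config Release
```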

View File

@@ -553,12 +553,13 @@ protected:
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
         if (!force_clip_f32) {
             auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
-            if (tensor_type != tensor_types.end())
+            std::set<ggml_type> allow_types = {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0};
+            if (tensor_type != tensor_types.end() && allow_types.find(tensor_type->second) != allow_types.end()) {
                 token_wtype = tensor_type->second;
+            }
         }
         enum ggml_type position_wtype = GGML_TYPE_F32;
         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
         params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
     }

View File

@@ -146,7 +146,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             }
             return true;
         };
-        model_loader.load_tensors(on_load);
+        model_loader.load_tensors(on_load, 1);
         readed_embeddings.push_back(embd_name);
         if (embd) {
             int64_t hidden_size = text_model->model.hidden_size;

View File

@@ -445,7 +445,7 @@ struct ControlNet : public GGMLRunner {
         guided_hint_cached = true;
     }
-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading control net from '%s'", file_path.c_str());
         alloc_params_buffer();
         std::map<std::string, ggml_tensor*> tensors;
@@ -458,7 +458,7 @@ struct ControlNet : public GGMLRunner {
             return false;
         }
-        bool success = model_loader.load_tensors(tensors, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
         if (!success) {
             LOG_ERROR("load control net tensors from model loader failed");

View File

@@ -20,20 +20,30 @@ Here's a simple example:
 NOTE: The other backends may have different support.
-| Quant / Type | CUDA |
-|--------------|------|
-| F32 | ✔️ |
-| F16 | ✔️ |
-| BF16 | ✔️ |
-| I32 | ✔️ |
-| Q4_0 | ✔️ |
-| Q4_1 | ✔️ |
-| Q5_0 | ✔️ |
-| Q5_1 | ✔️ |
-| Q8_0 | ✔️ |
-| Q2_K | ❌ |
-| Q3_K | ❌ |
-| Q4_K | ❌ |
-| Q5_K | ❌ |
-| Q6_K | ❌ |
-| Q8_K | ❌ |
+| Quant / Type | CUDA | Vulkan |
+|--------------|------|--------|
+| F32 | ✔️ | ✔️ |
+| F16 | ✔️ | ✔️ |
+| BF16 | ✔️ | ✔️ |
+| I32 | ✔️ | ❌ |
+| Q4_0 | ✔️ | ✔️ |
+| Q4_1 | ✔️ | ✔️ |
+| Q5_0 | ✔️ | ✔️ |
+| Q5_1 | ✔️ | ✔️ |
+| Q8_0 | ✔️ | ✔️ |
+| Q2_K | ❌ | ❌ |
+| Q3_K | ❌ | ❌ |
+| Q4_K | ❌ | ❌ |
+| Q5_K | ❌ | ❌ |
+| Q6_K | ❌ | ❌ |
+| Q8_K | ❌ | ❌ |
+| IQ1_S | ❌ | ✔️ |
+| IQ1_M | ❌ | ✔️ |
+| IQ2_XXS | ❌ | ✔️ |
+| IQ2_XS | ❌ | ✔️ |
+| IQ2_S | ❌ | ✔️ |
+| IQ3_XXS | ❌ | ✔️ |
+| IQ3_S | ❌ | ✔️ |
+| IQ4_XS | ❌ | ✔️ |
+| IQ4_NL | ❌ | ✔️ |
+| MXFP4 | ❌ | ✔️ |

View File

@@ -164,7 +164,7 @@ struct ESRGAN : public GGMLRunner {
         return "esrgan";
     }
-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading esrgan from '%s'", file_path.c_str());
         alloc_params_buffer();
@@ -177,7 +177,7 @@ struct ESRGAN : public GGMLRunner {
             return false;
         }
-        bool success = model_loader.load_tensors(esrgan_tensors);
+        bool success = model_loader.load_tensors(esrgan_tensors, {}, n_threads);
         if (!success) {
             LOG_ERROR("load esrgan tensors from model loader failed");

ggml

@@ -1 +1 @@
-Subproject commit 5fdc78fff274094e2a1b155928131983362d8a71
+Subproject commit 553c44706c3cc6e4077f4ab214923fc4c20a013c

View File

@@ -116,7 +116,7 @@ struct LoraModel : public GGMLRunner {
         return "lora";
     }
-    bool load_from_file(bool filter_tensor = false, int n_threads = 0) {
+    bool load_from_file(bool filter_tensor, int n_threads) {
         LOG_INFO("loading LoRA from '%s'", file_path.c_str());
         if (load_failed) {

View File

@@ -1,4 +1,5 @@
 #include <stdarg.h>
+#include <algorithm>
 #include <atomic>
 #include <chrono>
 #include <fstream>
@@ -1995,7 +1996,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
     std::atomic<int64_t> copy_to_backend_time_ms(0);
     std::atomic<int64_t> convert_time_ms(0);
-    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency();
+    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : get_num_physical_cores();
+    LOG_DEBUG("using %d threads for model loading", num_threads_to_use);
     int64_t start_time = ggml_time_ms();
     std::vector<TensorStorage> processed_tensor_storages;
@@ -2045,13 +2047,25 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             w.join();
         }
-        std::unordered_map<std::string, IndexedStorage> latest_map;
+        std::vector<IndexedStorage> deduplicated;
+        deduplicated.reserve(all_results.size());
+        std::unordered_map<std::string, size_t> name_to_pos;
         for (auto& entry : all_results) {
-            latest_map[entry.ts.name] = entry;
+            auto it = name_to_pos.find(entry.ts.name);
+            if (it == name_to_pos.end()) {
+                name_to_pos.emplace(entry.ts.name, deduplicated.size());
+                deduplicated.push_back(entry);
+            } else if (deduplicated[it->second].index < entry.index) {
+                deduplicated[it->second] = entry;
+            }
         }
-        processed_tensor_storages.reserve(latest_map.size());
-        for (auto& [name, entry] : latest_map) {
+        std::sort(deduplicated.begin(), deduplicated.end(), [](const IndexedStorage& a, const IndexedStorage& b) {
+            return a.index < b.index;
+        });
+        processed_tensor_storages.reserve(deduplicated.size());
+        for (auto& entry : deduplicated) {
             processed_tensor_storages.push_back(entry.ts);
         }
     }
@@ -2447,6 +2461,8 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage
         // Pass, do not convert. For MMDiT
     } else if (contains(name, "time_embed.") || contains(name, "label_emb.")) {
         // Pass, do not convert. For Unet
+    } else if (contains(name, "embedding")) {
+        // Pass, do not convert embedding
     } else {
         return true;
     }

View File

@@ -591,7 +591,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
         return "id_embeds";
     }
-    bool load_from_file(bool filter_tensor = false) {
+    bool load_from_file(bool filter_tensor, int n_threads) {
         LOG_INFO("loading PhotoMaker ID Embeds from '%s'", file_path.c_str());
         if (load_failed) {
@@ -623,11 +623,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
             return true;
         };
-        model_loader->load_tensors(on_new_tensor_cb);
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
         alloc_params_buffer();
         dry_run = false;
-        model_loader->load_tensors(on_new_tensor_cb);
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
         LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
         return true;

View File

@@ -549,7 +549,7 @@ public:
         }
         if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) {
             pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->photo_maker_path, "");
-            if (!pmid_lora->load_from_file(true)) {
+            if (!pmid_lora->load_from_file(true, n_threads)) {
                 LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->photo_maker_path);
                 return false;
             }
@@ -617,14 +617,14 @@ public:
         if (!use_tiny_autoencoder) {
             vae_params_mem_size = first_stage_model->get_params_buffer_size();
         } else {
-            if (!tae_first_stage->load_from_file(taesd_path)) {
+            if (!tae_first_stage->load_from_file(taesd_path, n_threads)) {
                 return false;
             }
             vae_params_mem_size = tae_first_stage->get_params_buffer_size();
         }
         size_t control_net_params_mem_size = 0;
         if (control_net) {
-            if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path))) {
+            if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path), n_threads)) {
                 return false;
             }
             control_net_params_mem_size = control_net->get_params_buffer_size();
@@ -861,7 +861,7 @@ public:
             return;
         }
         LoraModel lora(backend, file_path, is_high_noise ? "model.high_noise_" : "");
-        if (!lora.load_from_file()) {
+        if (!lora.load_from_file(false, n_threads)) {
             LOG_WARN("load lora tensors from %s failed", file_path.c_str());
             return;
         }

View File

@@ -222,7 +222,7 @@ struct TinyAutoEncoder : public GGMLRunner {
         return "taesd";
     }
-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false");
         alloc_params_buffer();
         std::map<std::string, ggml_tensor*> taesd_tensors;
@@ -238,7 +238,7 @@ struct TinyAutoEncoder : public GGMLRunner {
             return false;
         }
-        bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors);
+        bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors, n_threads);
         if (!success) {
             LOG_ERROR("load tae tensors from model loader failed");

View File

@@ -18,7 +18,8 @@ struct UpscalerGGML {
     }
     bool load_from_file(const std::string& esrgan_path,
-                        bool offload_params_to_cpu) {
+                        bool offload_params_to_cpu,
+                        int n_threads) {
         ggml_log_set(ggml_log_callback_default, nullptr);
 #ifdef SD_USE_CUDA
         LOG_DEBUG("Using CUDA backend");
@@ -54,7 +55,7 @@ struct UpscalerGGML {
         if (direct) {
             esrgan_upscaler->enable_conv2d_direct();
         }
-        if (!esrgan_upscaler->load_from_file(esrgan_path)) {
+        if (!esrgan_upscaler->load_from_file(esrgan_path, n_threads)) {
             return false;
         }
         return true;
@@ -124,7 +125,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
         return NULL;
     }
-    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu)) {
+    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) {
         delete upscaler_ctx->upscaler;
         upscaler_ctx->upscaler = NULL;
         free(upscaler_ctx);