diff --git a/README.md b/README.md
index 62b5979..c5c3eb1 100644
--- a/README.md
+++ b/README.md
@@ -125,13 +125,14 @@ cmake --build . --config Release
 ##### Using HipBLAS
 This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
+To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture (replacing the first command below). This is also necessary if your GPU is not officially supported by ROCm; for example, set `$GFX_NAME` to `gfx1030` for consumer RDNA2 cards.
 
 Windows User Refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
 
 ```
-export GFX_NAME=$(rocminfo | grep -m 1 -E "gfx[^0]{1}" | sed -e 's/ *Name: *//' | awk '{$1=$1; print}' || echo "rocminfo missing")
-echo $GFX_NAME
-cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
+if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
+if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
+cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
 cmake --build . --config Release
 ```
diff --git a/clip.hpp b/clip.hpp
index f8b0485..1cba921 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -553,12 +553,13 @@ protected:
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
         if (!force_clip_f32) {
-            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
-            if (tensor_type != tensor_types.end())
+            auto tensor_type                = tensor_types.find(prefix + "token_embedding.weight");
+            std::set<ggml_type> allow_types = {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0};
+            if (tensor_type != tensor_types.end() && allow_types.find(tensor_type->second) != allow_types.end()) {
                 token_wtype = tensor_type->second;
+            }
         }
-        enum ggml_type position_wtype = GGML_TYPE_F32;
-
+        enum ggml_type position_wtype = GGML_TYPE_F32;
         params["token_embedding.weight"]    = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
         params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
     }
diff --git a/conditioner.hpp b/conditioner.hpp
index 24066f3..b25ef84 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -146,7 +146,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             }
             return true;
         };
-        model_loader.load_tensors(on_load);
+        model_loader.load_tensors(on_load, 1);
         readed_embeddings.push_back(embd_name);
         if (embd) {
             int64_t hidden_size = text_model->model.hidden_size;
diff --git a/control.hpp b/control.hpp
index f9a4923..79b82a2 100644
--- a/control.hpp
+++ b/control.hpp
@@ -445,7 +445,7 @@ struct ControlNet : public GGMLRunner {
         guided_hint_cached = true;
     }
 
-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading control net from '%s'", file_path.c_str());
         alloc_params_buffer();
         std::map<std::string, ggml_tensor*> tensors;
@@ -458,7 +458,7 @@ struct ControlNet : public GGMLRunner {
             return false;
         }
 
-        bool success = model_loader.load_tensors(tensors, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
 
         if (!success) {
             LOG_ERROR("load control net tensors from model loader failed");
diff --git a/docs/lora.md b/docs/lora.md
index e2e1d82..9885ae5 100644
--- a/docs/lora.md
+++ b/docs/lora.md
@@ -20,20 +20,30 @@ Here's a simple example:
 
 NOTE: The other backends may have different support.
 
-| Quant / Type | CUDA |
-|--------------|------|
-| F32 | ✔️ |
-| F16 | ✔️ |
-| BF16 | ✔️ |
-| I32 | ✔️ |
-| Q4_0 | ✔️ |
-| Q4_1 | ✔️ |
-| Q5_0 | ✔️ |
-| Q5_1 | ✔️ |
-| Q8_0 | ✔️ |
-| Q2_K | ❌ |
-| Q3_K | ❌ |
-| Q4_K | ❌ |
-| Q5_K | ❌ |
-| Q6_K | ❌ |
-| Q8_K | ❌ |
+| Quant / Type | CUDA | Vulkan |
+|--------------|------|--------|
+| F32 | ✔️ | ✔️ |
+| F16 | ✔️ | ✔️ |
+| BF16 | ✔️ | ✔️ |
+| I32 | ✔️ | ❌ |
+| Q4_0 | ✔️ | ✔️ |
+| Q4_1 | ✔️ | ✔️ |
+| Q5_0 | ✔️ | ✔️ |
+| Q5_1 | ✔️ | ✔️ |
+| Q8_0 | ✔️ | ✔️ |
+| Q2_K | ❌ | ❌ |
+| Q3_K | ❌ | ❌ |
+| Q4_K | ❌ | ❌ |
+| Q5_K | ❌ | ❌ |
+| Q6_K | ❌ | ❌ |
+| Q8_K | ❌ | ❌ |
+| IQ1_S | ❌ | ✔️ |
+| IQ1_M | ❌ | ✔️ |
+| IQ2_XXS | ❌ | ✔️ |
+| IQ2_XS | ❌ | ✔️ |
+| IQ2_S | ❌ | ✔️ |
+| IQ3_XXS | ❌ | ✔️ |
+| IQ3_S | ❌ | ✔️ |
+| IQ4_XS | ❌ | ✔️ |
+| IQ4_NL | ❌ | ✔️ |
+| MXFP4 | ❌ | ✔️ |
diff --git a/esrgan.hpp b/esrgan.hpp
index e2003e4..7ede2e4 100644
--- a/esrgan.hpp
+++ b/esrgan.hpp
@@ -164,7 +164,7 @@ struct ESRGAN : public GGMLRunner {
         return "esrgan";
     }
 
-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading esrgan from '%s'", file_path.c_str());
 
         alloc_params_buffer();
@@ -177,7 +177,7 @@ struct ESRGAN : public GGMLRunner {
             return false;
         }
 
-        bool success = model_loader.load_tensors(esrgan_tensors);
+        bool success = model_loader.load_tensors(esrgan_tensors, {}, n_threads);
 
         if (!success) {
             LOG_ERROR("load esrgan tensors from model loader failed");
diff --git a/ggml b/ggml
index 5fdc78f..553c447 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 5fdc78fff274094e2a1b155928131983362d8a71
+Subproject commit 553c44706c3cc6e4077f4ab214923fc4c20a013c
diff --git a/lora.hpp b/lora.hpp
index 222f61b..1fce956 100644
--- a/lora.hpp
+++ b/lora.hpp
@@ -116,7 +116,7 @@ struct LoraModel : public GGMLRunner {
         return "lora";
     }
 
-    bool load_from_file(bool filter_tensor = false, int n_threads = 0) {
+    bool load_from_file(bool filter_tensor, int n_threads) {
         LOG_INFO("loading LoRA from '%s'", file_path.c_str());
 
         if (load_failed) {
diff --git a/model.cpp b/model.cpp
index 8568846..ede53b4 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1,4 +1,5 @@
 #include
+#include
 #include
 #include
 #include
@@ -1995,7 +1996,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
     std::atomic<int64_t> copy_to_backend_time_ms(0);
     std::atomic<int64_t> convert_time_ms(0);
 
-    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency();
+    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : get_num_physical_cores();
+    LOG_DEBUG("using %d threads for model loading", num_threads_to_use);
 
     int64_t start_time = ggml_time_ms();
     std::vector<TensorStorage> processed_tensor_storages;
@@ -2045,13 +2047,25 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             w.join();
         }
 
-        std::unordered_map<std::string, IndexedStorage> latest_map;
+        std::vector<IndexedStorage> deduplicated;
+        deduplicated.reserve(all_results.size());
+        std::unordered_map<std::string, size_t> name_to_pos;
         for (auto& entry : all_results) {
-            latest_map[entry.ts.name] = entry;
+            auto it = name_to_pos.find(entry.ts.name);
+            if (it == name_to_pos.end()) {
+                name_to_pos.emplace(entry.ts.name, deduplicated.size());
+                deduplicated.push_back(entry);
+            } else if (deduplicated[it->second].index < entry.index) {
+                deduplicated[it->second] = entry;
+            }
         }
-        processed_tensor_storages.reserve(latest_map.size());
-        for (auto& [name, entry] : latest_map) {
+        std::sort(deduplicated.begin(), deduplicated.end(), [](const IndexedStorage& a, const IndexedStorage& b) {
+            return a.index < b.index;
+        });
+
+        processed_tensor_storages.reserve(deduplicated.size());
+        for (auto& entry : deduplicated) {
             processed_tensor_storages.push_back(entry.ts);
         }
     }
@@ -2447,6 +2461,8 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage
         // Pass, do not convert. For MMDiT
     } else if (contains(name, "time_embed.") || contains(name, "label_emb.")) {
         // Pass, do not convert. For Unet
+    } else if (contains(name, "embedding")) {
+        // Pass, do not convert embedding
     } else {
         return true;
     }
diff --git a/pmid.hpp b/pmid.hpp
index 3bd59cd..63029cb 100644
--- a/pmid.hpp
+++ b/pmid.hpp
@@ -591,7 +591,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
         return "id_embeds";
     }
 
-    bool load_from_file(bool filter_tensor = false) {
+    bool load_from_file(bool filter_tensor, int n_threads) {
         LOG_INFO("loading PhotoMaker ID Embeds from '%s'", file_path.c_str());
 
         if (load_failed) {
@@ -623,11 +623,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
             return true;
         };
 
-        model_loader->load_tensors(on_new_tensor_cb);
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
         alloc_params_buffer();
 
         dry_run = false;
-        model_loader->load_tensors(on_new_tensor_cb);
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
 
         LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
         return true;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 1192b05..db06508 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -549,7 +549,7 @@ public:
         }
         if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) {
             pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->photo_maker_path, "");
-            if (!pmid_lora->load_from_file(true)) {
+            if (!pmid_lora->load_from_file(true, n_threads)) {
                 LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->photo_maker_path);
                 return false;
             }
@@ -617,14 +617,14 @@ public:
         if (!use_tiny_autoencoder) {
             vae_params_mem_size = first_stage_model->get_params_buffer_size();
         } else {
-            if (!tae_first_stage->load_from_file(taesd_path)) {
+            if (!tae_first_stage->load_from_file(taesd_path, n_threads)) {
                 return false;
             }
             vae_params_mem_size = tae_first_stage->get_params_buffer_size();
         }
         size_t control_net_params_mem_size = 0;
         if (control_net) {
-            if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path))) {
+            if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path), n_threads)) {
                 return false;
             }
             control_net_params_mem_size = control_net->get_params_buffer_size();
@@ -861,7 +861,7 @@ public:
             return;
         }
is_high_noise ? "model.high_noise_" : ""); - if (!lora.load_from_file()) { + if (!lora.load_from_file(false, n_threads)) { LOG_WARN("load lora tensors from %s failed", file_path.c_str()); return; } diff --git a/tae.hpp b/tae.hpp index 1ae1257..41bcbe2 100644 --- a/tae.hpp +++ b/tae.hpp @@ -222,7 +222,7 @@ struct TinyAutoEncoder : public GGMLRunner { return "taesd"; } - bool load_from_file(const std::string& file_path) { + bool load_from_file(const std::string& file_path, int n_threads) { LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false"); alloc_params_buffer(); std::map taesd_tensors; @@ -238,7 +238,7 @@ struct TinyAutoEncoder : public GGMLRunner { return false; } - bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors); + bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors, n_threads); if (!success) { LOG_ERROR("load tae tensors from model loader failed"); diff --git a/upscaler.cpp b/upscaler.cpp index 7e765d7..4c138ea 100644 --- a/upscaler.cpp +++ b/upscaler.cpp @@ -18,7 +18,8 @@ struct UpscalerGGML { } bool load_from_file(const std::string& esrgan_path, - bool offload_params_to_cpu) { + bool offload_params_to_cpu, + int n_threads) { ggml_log_set(ggml_log_callback_default, nullptr); #ifdef SD_USE_CUDA LOG_DEBUG("Using CUDA backend"); @@ -54,7 +55,7 @@ struct UpscalerGGML { if (direct) { esrgan_upscaler->enable_conv2d_direct(); } - if (!esrgan_upscaler->load_from_file(esrgan_path)) { + if (!esrgan_upscaler->load_from_file(esrgan_path, n_threads)) { return false; } return true; @@ -124,7 +125,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, return NULL; } - if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu)) { + if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) { delete upscaler_ctx->upscaler; upscaler_ctx->upscaler = NULL; free(upscaler_ctx);