Compare commits

...

3 Commits

Author  SHA1        Message                                                           Date
leejet  1798ec02ba  fix nan issue that occurs when using CUDA with k-quants weights  2025-11-30 22:54:13 +08:00
leejet  2fec01d2b3  add support for diffusers format lora                            2025-11-30 21:47:10 +08:00
leejet  c736364a28  add support for qwen3 4b gguf                                    2025-11-30 21:18:49 +08:00
2 changed files with 51 additions and 1 deletion

View File

@@ -133,6 +133,8 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
        {"attn_q.", "self_attn.q_proj."},
        {"attn_k.", "self_attn.k_proj."},
        {"attn_v.", "self_attn.v_proj."},
        {"attn_q_norm.", "self_attn.q_norm."},
        {"attn_k_norm.", "self_attn.k_norm."},
        {"attn_output.", "self_attn.o_proj."},
        {"attn_norm.", "input_layernorm."},
        {"ffn_down.", "mlp.down_proj."},
@@ -613,6 +615,44 @@ std::string convert_diffusers_dit_to_original_flux(std::string name) {
    return name;
}
std::string convert_diffusers_dit_to_original_lumina2(std::string name) {
    int num_layers = 30;
    int num_refiner_layers = 2;
    static std::unordered_map<std::string, std::string> z_image_name_map;
    if (z_image_name_map.empty()) {
        z_image_name_map["all_x_embedder.2-1."] = "x_embedder.";
        z_image_name_map["all_final_layer.2-1."] = "final_layer.";
        // --- transformer blocks ---
        auto add_attention_map = [&](const std::string& prefix, int num) {
            for (int i = 0; i < num; ++i) {
                std::string block_prefix = prefix + std::to_string(i) + ".";
                std::string dst_prefix = prefix + std::to_string(i) + ".";
                z_image_name_map[block_prefix + "attention.norm_q."] = dst_prefix + "attention.q_norm.";
                z_image_name_map[block_prefix + "attention.norm_k."] = dst_prefix + "attention.k_norm.";
                z_image_name_map[block_prefix + "attention.to_out.0."] = dst_prefix + "attention.out.";
                z_image_name_map[block_prefix + "attention.to_q.weight"] = dst_prefix + "attention.qkv.weight";
                z_image_name_map[block_prefix + "attention.to_q.bias"] = dst_prefix + "attention.qkv.bias";
                z_image_name_map[block_prefix + "attention.to_k.weight"] = dst_prefix + "attention.qkv.weight.1";
                z_image_name_map[block_prefix + "attention.to_k.bias"] = dst_prefix + "attention.qkv.bias.1";
                z_image_name_map[block_prefix + "attention.to_v.weight"] = dst_prefix + "attention.qkv.weight.2";
                z_image_name_map[block_prefix + "attention.to_v.bias"] = dst_prefix + "attention.qkv.bias.2";
            }
        };
        add_attention_map("noise_refiner.", num_refiner_layers);
        add_attention_map("context_refiner.", num_refiner_layers);
        add_attention_map("layers.", num_layers);
    }
    replace_with_prefix_map(name, z_image_name_map);
    return name;
}
std::string convert_diffusion_model_name(std::string name, std::string prefix, SDVersion version) {
    if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
        name = convert_diffusers_unet_to_original_sd1(name);
@@ -622,6 +662,8 @@ std::string convert_diffusion_model_name(std::string name, std::string prefix, SDVersion version) {
        name = convert_diffusers_dit_to_original_sd3(name);
    } else if (sd_version_is_flux(version) || sd_version_is_flux2(version)) {
        name = convert_diffusers_dit_to_original_flux(name);
    } else if (sd_version_is_z_image(version)) {
        name = convert_diffusers_dit_to_original_lumina2(name);
    }
    return name;
}
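
Note on the name conversion above: convert_diffusers_dit_to_original_lumina2 rewrites diffusers-style tensor names (e.g. layers.0.attention.norm_q.weight) into the internal naming by swapping a matching source prefix for its target. The sketch below is illustrative only; replace_prefix is a hypothetical stand-in, not the repository's replace_with_prefix_map, and the exact matching rules in model.cpp may differ.

// Illustrative only: a simplified prefix rewrite, not the repository's
// replace_with_prefix_map. It rewrites a tensor name by replacing the first
// map key that the name starts with.
#include <iostream>
#include <string>
#include <unordered_map>

static std::string replace_prefix(std::string name,
                                  const std::unordered_map<std::string, std::string>& prefix_map) {
    for (const auto& [src, dst] : prefix_map) {
        if (name.rfind(src, 0) == 0) {  // name starts with src
            return dst + name.substr(src.size());
        }
    }
    return name;  // no prefix matched, keep the name as-is
}

int main() {
    std::unordered_map<std::string, std::string> prefix_map = {
        {"layers.0.attention.norm_q.", "layers.0.attention.q_norm."},
        {"layers.0.attention.to_out.0.", "layers.0.attention.out."},
    };
    // diffusers name -> original/internal name
    std::cout << replace_prefix("layers.0.attention.norm_q.weight", prefix_map) << "\n";
    // expected output: layers.0.attention.q_norm.weight
}

The to_q/to_k/to_v entries map to qkv.weight, qkv.weight.1, and qkv.weight.2, which suggests the three separate diffusers projections are gathered and fused into a single qkv tensor later in the loading path.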

View File

@@ -85,7 +85,15 @@ namespace ZImage {
            }
            hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) / multiple_of);
            blocks["w1"] = std::make_shared<Linear>(dim, hidden_dim, false);
            blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false);
            bool force_prec_f32 = false;
            float scale = 1.f / 128.f;
#ifdef SD_USE_VULKAN
            force_prec_f32 = true;
#endif
            // The purpose of the scale here is to prevent NaN issues in certain situations,
            // for example when using CUDA with k-quant weights.
            blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false, false, force_prec_f32, scale);
            blocks["w3"] = std::make_shared<Linear>(dim, hidden_dim, false);
        }
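
Context for the w2 change: with k-quant weights on CUDA, intermediate values of this projection can apparently exceed the fp16 range (about 65504), turning into inf and then NaN; the Vulkan branch instead sets force_prec_f32, presumably forcing higher precision for the same matmul. Scaling by 1/128 keeps the intermediate product in range while the true value is recovered afterwards. The toy program below only illustrates that numeric effect; it assumes the Linear scale parameter pre-scales the matmul and is compensated later, which is a simplification, not the actual Linear implementation.

// Toy illustration of the 1/128 trick (an assumption about how the scale
// argument is used; this is not stable-diffusion.cpp's Linear).
#include <cmath>
#include <iostream>

// crude stand-in for half precision: values above fp16's max (~65504) overflow to inf
static float fp16_overflow(float v) {
    const float FP16_MAX = 65504.0f;
    return (std::fabs(v) > FP16_MAX) ? std::copysign(INFINITY, v) : v;
}

int main() {
    float x = 400.0f;  // a large activation
    float w = 300.0f;  // a large (dequantized) weight
    float scale = 1.0f / 128.0f;

    float naive = fp16_overflow(x * w);                       // overflows to inf; later ops yield NaN
    float rescaled = fp16_overflow((x * scale) * w) / scale;  // stays finite, same mathematical value

    std::cout << "naive:    " << naive << "\n";     // inf
    std::cout << "rescaled: " << rescaled << "\n";  // 120000
}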