Compare commits


2 Commits

Author        SHA1        Message                                                                Date
leejet        1c32fa03bc  fix: avoid generating black images when running T5 on the GPU (#882)  2025-10-13 00:01:06 +08:00
Wagner Bruna  9727c6bb98  fix: resolve VAE tiling problem in Qwen Image (#873)                   2025-10-12 23:45:53 +08:00
3 changed files with 43 additions and 36 deletions


@@ -483,12 +483,15 @@ __STATIC_INLINE__ void ggml_split_tensor_2d(struct ggml_tensor* input,
     int64_t width = output->ne[0];
     int64_t height = output->ne[1];
     int64_t channels = output->ne[2];
+    int64_t ne3 = output->ne[3];
     GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
     for (int iy = 0; iy < height; iy++) {
         for (int ix = 0; ix < width; ix++) {
             for (int k = 0; k < channels; k++) {
-                float value = ggml_tensor_get_f32(input, ix + x, iy + y, k);
-                ggml_tensor_set_f32(output, value, ix, iy, k);
+                for (int l = 0; l < ne3; l++) {
+                    float value = ggml_tensor_get_f32(input, ix + x, iy + y, k, l);
+                    ggml_tensor_set_f32(output, value, ix, iy, k, l);
+                }
             }
         }
     }
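Note on this hunk: before the change, ggml_split_tensor_2d copied only width x height x channels and ignored ne[3], so tiles cut from a tensor with ne[3] > 1 had every slice past the first left uninitialized; the added innermost l loop closes that gap. For reference, a self-contained sketch of the 4D stride addressing the ggml_tensor_get_f32 / ggml_tensor_set_f32 accessors are assumed to perform (stand-in struct and names, not the library code):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Minimal stand-in for ggml-style addressing: a tensor has extents ne[0..3]
// and byte strides nb[0..3]; element (i0, i1, i2, i3) lives at
// data + i0*nb[0] + i1*nb[1] + i2*nb[2] + i3*nb[3].
struct T4 {
    uint8_t* data;
    int64_t ne[4];
    size_t nb[4];
};

static float get_f32(const T4& t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
    float v;
    std::memcpy(&v, t.data + i0 * t.nb[0] + i1 * t.nb[1] + i2 * t.nb[2] + i3 * t.nb[3], sizeof(v));
    return v;
}

int main() {
    std::vector<float> buf(2 * 2 * 3 * 4, 1.f);  // 2x2 spatial, 3 channels, 4 slices in ne[3]
    T4 t = {reinterpret_cast<uint8_t*>(buf.data()),
            {2, 2, 3, 4},
            {sizeof(float), 2 * sizeof(float), 4 * sizeof(float), 12 * sizeof(float)}};
    buf[buf.size() - 1] = 7.f;                    // last element: logical position (1, 1, 2, 3)
    std::printf("%f\n", get_f32(t, 1, 1, 2, 3));  // 7.0, reachable only when the fourth index is looped
    return 0;
}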
@@ -511,6 +514,7 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
     int64_t width = input->ne[0];
     int64_t height = input->ne[1];
     int64_t channels = input->ne[2];
+    int64_t ne3 = input->ne[3];

     int64_t img_width = output->ne[0];
     int64_t img_height = output->ne[1];
@@ -519,24 +523,26 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
     for (int iy = y_skip; iy < height; iy++) {
         for (int ix = x_skip; ix < width; ix++) {
             for (int k = 0; k < channels; k++) {
-                float new_value = ggml_tensor_get_f32(input, ix, iy, k);
-                if (overlap_x > 0 || overlap_y > 0) {  // blend colors in overlapped area
-                    float old_value = ggml_tensor_get_f32(output, x + ix, y + iy, k);
-
-                    const float x_f_0 = (overlap_x > 0 && x > 0) ? (ix - x_skip) / float(overlap_x) : 1;
-                    const float x_f_1 = (overlap_x > 0 && x < (img_width - width)) ? (width - ix) / float(overlap_x) : 1;
-                    const float y_f_0 = (overlap_y > 0 && y > 0) ? (iy - y_skip) / float(overlap_y) : 1;
-                    const float y_f_1 = (overlap_y > 0 && y < (img_height - height)) ? (height - iy) / float(overlap_y) : 1;
-
-                    const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f);
-                    const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f);
-                    ggml_tensor_set_f32(
-                        output,
-                        old_value + new_value * ggml_smootherstep_f32(y_f) * ggml_smootherstep_f32(x_f),
-                        x + ix, y + iy, k);
-                } else {
-                    ggml_tensor_set_f32(output, new_value, x + ix, y + iy, k);
+                for (int l = 0; l < ne3; l++) {
+                    float new_value = ggml_tensor_get_f32(input, ix, iy, k, l);
+                    if (overlap_x > 0 || overlap_y > 0) {  // blend colors in overlapped area
+                        float old_value = ggml_tensor_get_f32(output, x + ix, y + iy, k, l);
+
+                        const float x_f_0 = (overlap_x > 0 && x > 0) ? (ix - x_skip) / float(overlap_x) : 1;
+                        const float x_f_1 = (overlap_x > 0 && x < (img_width - width)) ? (width - ix) / float(overlap_x) : 1;
+                        const float y_f_0 = (overlap_y > 0 && y > 0) ? (iy - y_skip) / float(overlap_y) : 1;
+                        const float y_f_1 = (overlap_y > 0 && y < (img_height - height)) ? (height - iy) / float(overlap_y) : 1;
+
+                        const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f);
+                        const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f);
+                        ggml_tensor_set_f32(
+                            output,
+                            old_value + new_value * ggml_smootherstep_f32(y_f) * ggml_smootherstep_f32(x_f),
+                            x + ix, y + iy, k, l);
+                    } else {
+                        ggml_tensor_set_f32(output, new_value, x + ix, y + iy, k, l);
+                    }
                 }
            }
        }
    }
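In the merge direction the same l loop wraps the seam blending. The x_f / y_f factors ramp linearly across the overlap band and are pushed through ggml_smootherstep_f32 so the cross-fade is flat at both ends. A self-contained sketch, under the assumption that ggml_smootherstep_f32 is Perlin's smootherstep 6t^5 - 15t^4 + 10t^3 (illustrative, not the library source):

#include <algorithm>
#include <cstdio>

// Assumed shape of ggml_smootherstep_f32: Perlin's smootherstep, with zero
// slope at t = 0 and t = 1, which keeps tile seams free of visible edges.
static float smootherstep(float t) {
    t = std::clamp(t, 0.f, 1.f);
    return t * t * t * (t * (t * 6.f - 15.f) + 10.f);
}

int main() {
    // Weight given to the incoming tile across a 32-pixel overlap band:
    // 0 at the seam, 1 once fully inside, mirroring x_f / y_f above.
    const int overlap = 32;
    for (int d = 0; d <= overlap; d += 8) {
        std::printf("d=%2d  w=%.3f\n", d, smootherstep(d / float(overlap)));
    }
    return 0;
}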
@@ -852,8 +858,8 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
     }

     struct ggml_init_params params = {};
-    params.mem_size += input_tile_size_x * input_tile_size_y * input->ne[2] * sizeof(float);  // input chunk
-    params.mem_size += output_tile_size_x * output_tile_size_y * output->ne[2] * sizeof(float);  // output chunk
+    params.mem_size += input_tile_size_x * input_tile_size_y * input->ne[2] * input->ne[3] * sizeof(float);  // input chunk
+    params.mem_size += output_tile_size_x * output_tile_size_y * output->ne[2] * output->ne[3] * sizeof(float);  // output chunk
     params.mem_size += 3 * ggml_tensor_overhead();
     params.mem_buffer = NULL;
     params.no_alloc = false;
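Matching sizing fix: the scratch context has to hold one full input tile plus one full output tile, and without the ne[3] factor it was sized for a single slice. Back-of-the-envelope check under assumed numbers (a hypothetical 512x512 tile, 16 channels, 4 slices in ne[3]):

#include <cstdio>

int main() {
    // Hypothetical tile: 512 x 512 spatial, 16 channels, ne[3] = 4.
    long long bytes = 512LL * 512 * 16 * 4 * sizeof(float);
    std::printf("%lld bytes = %.0f MiB\n", bytes, bytes / (1024.0 * 1024.0));  // 64 MiB, 4x the old ne[3]-agnostic sizing
    return 0;
}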
@@ -868,8 +874,8 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
     }

     // tiling
-    ggml_tensor* input_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], 1);
-    ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne[2], 1);
+    ggml_tensor* input_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], input->ne[3]);
+    ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne[2], output->ne[3]);
     int num_tiles = num_tiles_x * num_tiles_y;
     LOG_INFO("processing %i tiles", num_tiles);
     pretty_progress(0, num_tiles, 0.0f);
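With the tile tensors now allocated at the full input->ne[3] / output->ne[3] extent, the 4D copy loops above have storage for every slice; the hard-coded 1 they replace is what truncated multi-slice latents during VAE tiling (#873). The remaining hunks belong to the other two changed files: the pipeline source (its file name is not preserved in this view) and t5.hpp.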


@@ -338,17 +338,7 @@ public:
         bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;
         {
-            clip_backend   = backend;
-            bool use_t5xxl = false;
-            if (sd_version_is_dit(version) && !sd_version_is_qwen_image(version)) {
-                use_t5xxl = true;
-            }
-            if (!clip_on_cpu && !ggml_backend_is_cpu(backend) && use_t5xxl) {
-                LOG_WARN(
-                    "!!!It appears that you are using the T5 model. Some backends may encounter issues with it."
-                    "If you notice that the generated images are completely black,"
-                    "try running the T5 model on the CPU using the --clip-on-cpu parameter.");
-            }
+            clip_backend = backend;
             if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
                 LOG_INFO("CLIP: Using CPU backend");
                 clip_backend = ggml_backend_cpu_init();
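The deleted block was the workaround messaging for the black-image problem (#882): whenever a DiT model with T5 ran its text encoder on a non-CPU backend, the user was warned that black images might appear and advised to pass --clip-on-cpu. With the overflow fixed at its source in t5.hpp (last hunk below), the warning is obsolete, and the setup collapses to a single clip_backend assignment.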
@@ -1440,10 +1430,19 @@ public:
         if (vae_tiling_params.enabled && !encode_video) {
             // TODO wan2.2 vae support?
             int C = sd_version_is_dit(version) ? 16 : 4;
-            if (!use_tiny_autoencoder) {
-                C *= 2;
+            int ne2;
+            int ne3;
+            if (sd_version_is_qwen_image(version)) {
+                ne2 = 1;
+                ne3 = C * x->ne[3];
+            } else {
+                if (!use_tiny_autoencoder) {
+                    C *= 2;
+                }
+                ne2 = C;
+                ne3 = x->ne[3];
             }
-            result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, x->ne[3]);
+            result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3);
         }

         if (sd_version_is_qwen_image(version)) {
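This hunk picks a model-dependent shape for the tiled first-stage buffer. For Qwen Image, ne2 = 1 and ne3 = C * x->ne[3]: the latent channel axis evidently rides in ne[3] for that model, so the 4D merge loops above treat channels as slices. Other models keep the old shape, with C doubled when the full VAE is used, presumably because it returns concatenated mean and logvar. A compilable sketch of the same branch with stand-in values (all assumed, purely illustrative):

#include <cstdio>

int main() {
    // Stand-ins for the values in the hunk above: W/H in latent space,
    // a DiT model (C = 16), one item in x->ne[3].
    int W = 128, H = 128, C = 16, batch = 1;
    bool qwen = true, tiny_ae = false;
    int ne2, ne3;
    if (qwen) {
        ne2 = 1;
        ne3 = C * batch;  // channels folded into ne3
    } else {
        if (!tiny_ae) C *= 2;  // full VAE: mean + logvar
        ne2 = C;
        ne3 = batch;
    }
    std::printf("result: %d x %d x %d x %d\n", W, H, ne2, ne3);  // 128 x 128 x 1 x 16
    return 0;
}

Under these inputs the Qwen branch yields a 128 x 128 x 1 x 16 tensor, exactly the multi-slice layout the ne[3]-aware tiling in the first file can now fill.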

t5.hpp

@@ -504,7 +504,9 @@ public:
     T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) {
         blocks["wi_0"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
         blocks["wi_1"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
-        blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
+        float scale = 1.f / 32.f;
+        // The purpose of the scale here is to prevent NaN issues on some backends(CUDA, ...).
+        blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false, false, false, scale));
     }

     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
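This is the root-cause fix for the black images (#882). T5-XXL's gated FFN can produce activations large enough that the wo projection overflows fp16 on some GPU backends; the resulting NaNs propagate through sampling and decode to black. Passing scale = 1.f / 32.f into Linear (the trailing constructor arguments are presumed to be the existing bias/precision flags plus the new scale) shrinks the values on one side of the matmul and compensates on the other, a no-op in exact arithmetic: y = ((x * s) W) / s. A toy illustration of why the rescaled path survives (made-up numbers, fp16 range emulated):

#include <cmath>
#include <cstdio>

// Emulate fp16's finite range: anything past 65504 turns into NaN downstream.
static float fp16ish(float v) {
    return std::fabs(v) > 65504.f ? NAN : v;
}

int main() {
    float x = 9000.f, w = 30.f;            // large gated-GELU activation times a weight
    float naive = fp16ish(x * w);          // 270000 -> overflows "fp16" -> NaN
    float s = 1.f / 32.f;
    float safe = fp16ish(x * s * w) / s;   // 8437.5 stays in range, then rescaled
    std::printf("naive = %f, scaled = %f\n", naive, safe);
    return 0;
}

Note the compensation has to happen in f32 (or be folded into a later operation); multiplying back by 32 in fp16 would simply overflow again.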