Compare commits

2 Commits

Author        SHA1        Message                                                                Date
leejet        1c32fa03bc  fix: avoid generating black images when running T5 on the GPU (#882)  2025-10-13 00:01:06 +08:00
Wagner Bruna  9727c6bb98  fix: resolve VAE tiling problem in Qwen Image (#873)                   2025-10-12 23:45:53 +08:00
3 changed files with 43 additions and 36 deletions

ggml_extend.hpp

@@ -483,12 +483,15 @@ __STATIC_INLINE__ void ggml_split_tensor_2d(struct ggml_tensor* input,
     int64_t width    = output->ne[0];
     int64_t height   = output->ne[1];
     int64_t channels = output->ne[2];
+    int64_t ne3      = output->ne[3];
     GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
     for (int iy = 0; iy < height; iy++) {
         for (int ix = 0; ix < width; ix++) {
             for (int k = 0; k < channels; k++) {
-                float value = ggml_tensor_get_f32(input, ix + x, iy + y, k);
-                ggml_tensor_set_f32(output, value, ix, iy, k);
+                for (int l = 0; l < ne3; l++) {
+                    float value = ggml_tensor_get_f32(input, ix + x, iy + y, k, l);
+                    ggml_tensor_set_f32(output, value, ix, iy, k, l);
+                }
             }
         }
     }
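
Note: the split/merge helpers previously touched only ne[0..2], silently dropping everything past the first ne[3] slice during tiling. For reference, a minimal sketch of 4-D addressing in ggml (an assumed standalone helper following the same stride arithmetic as ggml_tensor_get_f32, not the project's exact code):

    #include "ggml.h"

    // Read one float from a 4-D F32 tensor via its byte strides nb[0..3];
    // omitting the i3 term is exactly how data beyond ne[3] == 1 got lost.
    static float tensor_get_f32_4d(const struct ggml_tensor* t,
                                   int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
        const char* p = (const char*)t->data +
                        i0 * t->nb[0] + i1 * t->nb[1] + i2 * t->nb[2] + i3 * t->nb[3];
        return *(const float*)p;
    }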
@@ -511,6 +514,7 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
     int64_t width    = input->ne[0];
     int64_t height   = input->ne[1];
     int64_t channels = input->ne[2];
+    int64_t ne3      = input->ne[3];
 
     int64_t img_width  = output->ne[0];
     int64_t img_height = output->ne[1];
@@ -519,9 +523,10 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
     for (int iy = y_skip; iy < height; iy++) {
         for (int ix = x_skip; ix < width; ix++) {
             for (int k = 0; k < channels; k++) {
-                float new_value = ggml_tensor_get_f32(input, ix, iy, k);
+                for (int l = 0; l < ne3; l++) {
+                    float new_value = ggml_tensor_get_f32(input, ix, iy, k, l);
 
                 if (overlap_x > 0 || overlap_y > 0) {  // blend colors in overlapped area
-                    float old_value = ggml_tensor_get_f32(output, x + ix, y + iy, k);
+                    float old_value = ggml_tensor_get_f32(output, x + ix, y + iy, k, l);
                     const float x_f_0 = (overlap_x > 0 && x > 0) ? (ix - x_skip) / float(overlap_x) : 1;
                     const float x_f_1 = (overlap_x > 0 && x < (img_width - width)) ? (width - ix) / float(overlap_x) : 1;
@@ -534,9 +539,10 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
                     ggml_tensor_set_f32(
                         output,
                         old_value + new_value * ggml_smootherstep_f32(y_f) * ggml_smootherstep_f32(x_f),
-                        x + ix, y + iy, k);
+                        x + ix, y + iy, k, l);
                 } else {
-                    ggml_tensor_set_f32(output, new_value, x + ix, y + iy, k);
+                    ggml_tensor_set_f32(output, new_value, x + ix, y + iy, k, l);
+                }
             }
         }
     }
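
In the overlap band the new tile is faded in with a smootherstep ramp per axis. Assuming ggml_smootherstep_f32 implements the standard smootherstep polynomial (only its call site appears in this diff), the weight function looks like this:

    // Standard smootherstep: 6x^5 - 15x^4 + 10x^3, clamped to [0, 1].
    // Zero first and second derivatives at both ends keep tile seams invisible.
    static float smootherstep(float x) {
        x = x < 0.0f ? 0.0f : (x > 1.0f ? 1.0f : x);
        return x * x * x * (x * (x * 6.0f - 15.0f) + 10.0f);
    }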
@@ -852,8 +858,8 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
     }
 
     struct ggml_init_params params = {};
-    params.mem_size += input_tile_size_x * input_tile_size_y * input->ne[2] * sizeof(float);                    // input chunk
-    params.mem_size += output_tile_size_x * output_tile_size_y * output->ne[2] * sizeof(float);                 // output chunk
+    params.mem_size += input_tile_size_x * input_tile_size_y * input->ne[2] * input->ne[3] * sizeof(float);     // input chunk
+    params.mem_size += output_tile_size_x * output_tile_size_y * output->ne[2] * output->ne[3] * sizeof(float); // output chunk
     params.mem_size += 3 * ggml_tensor_overhead();
     params.mem_buffer = NULL;
     params.no_alloc   = false;
@@ -868,8 +874,8 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
     }
 
     // tiling
-    ggml_tensor* input_tile  = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], 1);
-    ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne[2], 1);
+    ggml_tensor* input_tile  = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], input->ne[3]);
+    ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne[2], output->ne[3]);
     int num_tiles = num_tiles_x * num_tiles_y;
     LOG_INFO("processing %i tiles", num_tiles);
    pretty_progress(0, num_tiles, 0.0f);
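
The scratch pool has to grow by the same ne[3] factor, since a Qwen Image latent keeps its channels in the fourth dimension (see stable-diffusion.cpp below). A toy calculation with assumed numbers:

    #include <cstddef>
    #include <cstdio>

    int main() {
        // Assumed example: one 64x64 tile of a latent laid out as [64, 64, 1, 16].
        const size_t old_bytes = 64 * 64 * 1 * sizeof(float);       // ne[3] ignored
        const size_t new_bytes = 64 * 64 * 1 * 16 * sizeof(float);  // ne[3] included
        std::printf("old: %zu KiB, new: %zu KiB\n", old_bytes / 1024, new_bytes / 1024);
        // old: 16 KiB, new: 256 KiB -- the old pool was 16x too small for this layout.
    }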

stable-diffusion.cpp

@@ -339,16 +339,6 @@ public:
         {
             clip_backend = backend;
-            bool use_t5xxl = false;
-            if (sd_version_is_dit(version) && !sd_version_is_qwen_image(version)) {
-                use_t5xxl = true;
-            }
-            if (!clip_on_cpu && !ggml_backend_is_cpu(backend) && use_t5xxl) {
-                LOG_WARN(
-                    "!!!It appears that you are using the T5 model. Some backends may encounter issues with it."
-                    "If you notice that the generated images are completely black,"
-                    "try running the T5 model on the CPU using the --clip-on-cpu parameter.");
-            }
             if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
                 LOG_INFO("CLIP: Using CPU backend");
                 clip_backend = ggml_backend_cpu_init();
@@ -1440,10 +1430,19 @@ public:
         if (vae_tiling_params.enabled && !encode_video) {
             // TODO wan2.2 vae support?
             int C = sd_version_is_dit(version) ? 16 : 4;
-            if (!use_tiny_autoencoder) {
-                C *= 2;
-            }
-            result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, x->ne[3]);
+            int ne2;
+            int ne3;
+            if (sd_version_is_qwen_image(version)) {
+                ne2 = 1;
+                ne3 = C * x->ne[3];
+            } else {
+                if (!use_tiny_autoencoder) {
+                    C *= 2;
+                }
+                ne2 = C;
+                ne3 = x->ne[3];
+            }
+            result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3);
         }
         if (sd_version_is_qwen_image(version)) {
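
The tiled first-stage result tensor's last two dimensions now depend on the model family: Qwen Image keeps its latent channels in ne[3] (with ne[2] = 1), while other models keep them in ne[2] as before. A sketch of the new allocation logic with assumed example numbers (1024x1024 input, 8x-downscaling 16-channel VAE, batch of 1):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t W = 128, H = 128;  // latent size for a 1024x1024 image (8x VAE)
        const int64_t batch = 1;         // x->ne[3]
        int64_t C = 16;                  // DiT latent channels
        const bool qwen_image = true;
        const bool use_tiny_autoencoder = false;

        int64_t ne2, ne3;
        if (qwen_image) {
            ne2 = 1;          // channels are folded into the 4th dimension
            ne3 = C * batch;  // -> [128, 128, 1, 16]
        } else {
            if (!use_tiny_autoencoder) {
                C *= 2;       // presumably room for the VAE moments (mean + logvar)
            }
            ne2 = C;          // -> [128, 128, 32, 1]
            ne3 = batch;
        }
        std::printf("result: [%lld, %lld, %lld, %lld]\n",
                    (long long)W, (long long)H, (long long)ne2, (long long)ne3);
    }

With channels living in ne[3], the split/merge loops in ggml_extend.hpp above must iterate the fourth dimension too, which is exactly what the tiling fix adds.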

t5.hpp

@@ -504,7 +504,9 @@ public:
     T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) {
         blocks["wi_0"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
         blocks["wi_1"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
-        blocks["wo"]   = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
+        float scale = 1.f / 32.f;
+        // The purpose of the scale here is to prevent NaN issues on some backends(CUDA, ...).
+        blocks["wo"]   = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false, false, false, scale));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
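
This is the root cause behind the black-image warning removed above: the gated-act output feeding wo in T5-XXL can exceed the fp16 range on GPU backends, which turns into NaNs and an all-black decode. Scaling activations down by 1/32 before the projection and back up afterwards is an exact no-op in real arithmetic but keeps the fp16 intermediates finite. A sketch of one way such a scale can be applied (the forward below is an assumption for illustration, not the project's actual Linear implementation):

    #include "ggml.h"

    // Hypothetical wo projection with a compensating scale (scale = 1.f / 32.f):
    // (x * s) @ W * (1 / s) == x @ W exactly, but the scaled activations stay
    // inside fp16 range (max ~65504) on backends that compute in half precision.
    static struct ggml_tensor* wo_forward(struct ggml_context* ctx,
                                          struct ggml_tensor* w,  // [ff_dim, model_dim]
                                          struct ggml_tensor* x,  // [..., ff_dim]
                                          float scale) {
        x = ggml_scale(ctx, x, scale);         // shrink before the matmul
        x = ggml_mul_mat(ctx, w, x);           // ff_dim -> model_dim
        x = ggml_scale(ctx, x, 1.0f / scale);  // restore the original magnitude
        return x;
    }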