// Review notes (patch preamble: text placed before the first "diff --git"
// header; the patch content below is unchanged).
//
// This appears to be a whitespace-collapsed unified diff touching
// qwen_image.hpp, stable-diffusion.cpp, util.cpp and util.h.
//
// What the visible hunks change:
//  * util.h / util.cpp: clip_preprocess(image, size) becomes
//    clip_preprocess(image, target_width, target_height). The resize scale is
//    now the larger of target_width / image.width and
//    target_height / image.height, the crop offsets are clamped to >= 0 with
//    std::max, and the result buffer plus the crop and normalize loops use
//    target_width x target_height instead of size x size.
//  * stable-diffusion.cpp: both visible call sites pass the same value for the
//    width and the height (clip_vision->vision_model.image_size, and
//    clip_image_size), so they still request a square output.
//  * util.cpp: NOMINMAX is defined just before the #include inside the
//    "#ifdef _WIN32" branch -- presumably so the Windows min/max macros do not
//    clash with the newly added std::max calls; confirm against the full file.
//  * qwen_image.hpp: the removed and added lines carry the same tokens, so
//    these two hunks look like whitespace-only cleanups.
//
// NOTE(review): resized_width / resized_height are obtained by truncating
// scale * image.width and scale * image.height to int. If float rounding ever
// leaves a product just below the target, the resized image would be one pixel
// smaller than the crop window; the offset clamps to 0, but the crop loop
// would still index past the resized data. Consider rounding up or clamping
// the resized size to at least the target -- verify against the full function,
// which is only partially visible in these hunks.
// NOTE(review): neither malloc result is checked in the visible hunks.
// NOTE(review): some text seems to have been lost when this patch was
// extracted (the header name after "#include", and the element type of
// "std::vector processed_id_images"); recover it from the original commit
// before trying to apply this patch.
diff --git a/qwen_image.hpp b/qwen_image.hpp index 4fb0e47..68b481a 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -256,7 +256,7 @@ namespace Qwen { auto txt_gate1 = txt_mod_param_vec[2]; auto [img_attn_output, txt_attn_output] = attn->forward(ctx, backend, img_modulated, txt_modulated, pe); - + img = ggml_add(ctx, img, ggml_mul(ctx, img_attn_output, img_gate1)); txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_attn_output, txt_gate1)); @@ -564,7 +564,7 @@ namespace Qwen { timesteps, context, pe, - ref_latents); + ref_latents); ggml_build_forward_expand(gf, out); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 60966ae..c6b9419 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -952,7 +952,7 @@ public: ggml_set_f32(output, 0.f); } else { sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); - sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size); + sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size, clip_vision->vision_model.image_size); free(image.data); image.data = NULL; @@ -2029,7 +2029,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, std::vector processed_id_images; for (int i = 0; i < pm_params.id_images_count; i++) { sd_image_f32_t id_image = sd_image_t_to_sd_image_f32_t(pm_params.id_images[i]); - sd_image_f32_t processed_id_image = clip_preprocess(id_image, clip_image_size); + sd_image_f32_t processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size); free(id_image.data); id_image.data = NULL; processed_id_images.push_back(processed_id_image); diff --git a/util.cpp b/util.cpp index 5af6b1e..7b20950 100644 --- a/util.cpp +++ b/util.cpp @@ -84,6 +84,7 @@ int round_up_to(int value, int base) { } #ifdef _WIN32 // code for windows +#define NOMINMAX #include bool file_exists(const std::string& filename) { @@ -427,18 +428,21 @@ float means[3] = {0.48145466, 0.4578275, 0.40821073}; float stds[3] = {0.26862954, 0.26130258, 
0.27577711}; // Function to clip and preprocess sd_image_f32_t -sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) { - float scale = (float)size / fmin(image.width, image.height); +sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height) { + float width_scale = (float)target_width / image.width; + float height_scale = (float)target_height / image.height; + + float scale = std::fmax(width_scale, height_scale); // Interpolation - int new_width = (int)(scale * image.width); - int new_height = (int)(scale * image.height); - float* resized_data = (float*)malloc(new_width * new_height * image.channel * sizeof(float)); + int resized_width = (int)(scale * image.width); + int resized_height = (int)(scale * image.height); + float* resized_data = (float*)malloc(resized_width * resized_height * image.channel * sizeof(float)); - for (int y = 0; y < new_height; y++) { - for (int x = 0; x < new_width; x++) { - float original_x = (float)x * image.width / new_width; - float original_y = (float)y * image.height / new_height; + for (int y = 0; y < resized_height; y++) { + for (int x = 0; x < resized_width; x++) { + float original_x = (float)x * image.width / resized_width; + float original_y = (float)y * image.height / resized_height; int x1 = (int)original_x; int y1 = (int)original_y; @@ -456,26 +460,26 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) { float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio); - *(resized_data + y * new_width * image.channel + x * image.channel + k) = value; + *(resized_data + y * resized_width * image.channel + x * image.channel + k) = value; } } } // Clip and preprocess - int h = (new_height - size) / 2; - int w = (new_width - size) / 2; + int h_offset = std::max((int)(resized_height - target_height) / 2, 0); + int w_offset = std::max((int)(resized_width - target_width) / 2, 0); sd_image_f32_t result; - result.width = size; - result.height = size; + result.width = target_width; + 
result.height = target_height; result.channel = image.channel; - result.data = (float*)malloc(size * size * image.channel * sizeof(float)); + result.data = (float*)malloc(target_height * target_width * image.channel * sizeof(float)); for (int k = 0; k < image.channel; k++) { - for (int i = 0; i < size; i++) { - for (int j = 0; j < size; j++) { - *(result.data + i * size * image.channel + j * image.channel + k) = - fmin(fmax(*(resized_data + (i + h) * new_width * image.channel + (j + w) * image.channel + k), 0.0f), 255.0f) / 255.0f; + for (int i = 0; i < result.height; i++) { + for (int j = 0; j < result.width; j++) { + *(result.data + i * result.width * image.channel + j * image.channel + k) = + fmin(fmax(*(resized_data + (i + h_offset) * resized_width * image.channel + (j + w_offset) * image.channel + k), 0.0f), 255.0f) / 255.0f; } } } @@ -485,10 +489,10 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) { // Normalize for (int k = 0; k < image.channel; k++) { - for (int i = 0; i < size; i++) { - for (int j = 0; j < size; j++) { + for (int i = 0; i < result.height; i++) { + for (int j = 0; j < result.width; j++) { // *(result.data + i * size * image.channel + j * image.channel + k) = 0.5f; - int offset = i * size * image.channel + j * image.channel + k; + int offset = i * result.width * image.channel + j * image.channel + k; float value = *(result.data + offset); value = (value - means[k]) / stds[k]; // value = 0.5f; diff --git a/util.h b/util.h index 1e8db6e..17bcd1d 100644 --- a/util.h +++ b/util.h @@ -42,7 +42,7 @@ sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image); sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height); -sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size); +sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height); std::string path_join(const std::string& p1, const std::string& p2); std::vector split_string(const std::string& str, char 
delimiter);