diff --git a/qwen_image.hpp b/qwen_image.hpp
index 4fb0e47..68b481a 100644
--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@@ -256,7 +256,7 @@ namespace Qwen {
             auto txt_gate1     = txt_mod_param_vec[2];
 
             auto [img_attn_output, txt_attn_output] = attn->forward(ctx, backend, img_modulated, txt_modulated, pe);
-            
+
             img = ggml_add(ctx, img, ggml_mul(ctx, img_attn_output, img_gate1));
             txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_attn_output, txt_gate1));
 
@@ -564,7 +564,7 @@ namespace Qwen {
                                                          timesteps,
                                                          context,
                                                          pe,
-                                                        ref_latents);
+                                                         ref_latents);
 
             ggml_build_forward_expand(gf, out);
 
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 60966ae..c6b9419 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -952,7 +952,7 @@ public:
             ggml_set_f32(output, 0.f);
         } else {
             sd_image_f32_t image         = sd_image_t_to_sd_image_f32_t(init_image);
-            sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size);
+            sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size, clip_vision->vision_model.image_size);
             free(image.data);
             image.data = NULL;
 
@@ -2029,7 +2029,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
             std::vector<sd_image_f32_t> processed_id_images;
             for (int i = 0; i < pm_params.id_images_count; i++) {
                 sd_image_f32_t id_image           = sd_image_t_to_sd_image_f32_t(pm_params.id_images[i]);
-                sd_image_f32_t processed_id_image = clip_preprocess(id_image, clip_image_size);
+                sd_image_f32_t processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size);
                 free(id_image.data);
                 id_image.data = NULL;
                 processed_id_images.push_back(processed_id_image);
diff --git a/util.cpp b/util.cpp
index 5af6b1e..7b20950 100644
--- a/util.cpp
+++ b/util.cpp
@@ -84,6 +84,7 @@ int round_up_to(int value, int base) {
 }
 
 #ifdef _WIN32  // code for windows
+#define NOMINMAX  // defined before <windows.h> so it does not define min/max macros that would break std::max below
 #include <windows.h>
 
 bool file_exists(const std::string& filename) {
@@ -427,18 +428,21 @@ float means[3] = {0.48145466, 0.4578275, 0.40821073};
 float stds[3]  = {0.26862954, 0.26130258, 0.27577711};
 
 // Function to clip and preprocess sd_image_f32_t
-sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) {
-    float scale = (float)size / fmin(image.width, image.height);
+sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height) {
+    float width_scale  = (float)target_width / image.width;    // factor that maps the source width onto target_width
+    float height_scale = (float)target_height / image.height;  // factor that maps the source height onto target_height
+
+    float scale = std::fmax(width_scale, height_scale);  // one factor for both axes (aspect ratio kept); the larger one, so the resize aims to cover the target
 
     // Interpolation
-    int new_width       = (int)(scale * image.width);
-    int new_height      = (int)(scale * image.height);
-    float* resized_data = (float*)malloc(new_width * new_height * image.channel * sizeof(float));
+    int resized_width   = (int)(scale * image.width);   // NOTE(review): (int) truncates; float rounding may yield target_width - 1 — confirm the crop below stays in bounds
+    int resized_height  = (int)(scale * image.height);  // NOTE(review): same truncation concern for target_height
+    float* resized_data = (float*)malloc(resized_width * resized_height * image.channel * sizeof(float));  // one float per channel per resized pixel
 
-    for (int y = 0; y < new_height; y++) {
-        for (int x = 0; x < new_width; x++) {
-            float original_x = (float)x * image.width / new_width;
-            float original_y = (float)y * image.height / new_height;
+    for (int y = 0; y < resized_height; y++) {
+        for (int x = 0; x < resized_width; x++) {
+            float original_x = (float)x * image.width / resized_width;
+            float original_y = (float)y * image.height / resized_height;
 
             int x1 = (int)original_x;
             int y1 = (int)original_y;
@@ -456,26 +460,26 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) {
 
                 float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio);
 
-                *(resized_data + y * new_width * image.channel + x * image.channel + k) = value;
+                *(resized_data + y * resized_width * image.channel + x * image.channel + k) = value;
             }
         }
     }
 
     // Clip and preprocess
-    int h = (new_height - size) / 2;
-    int w = (new_width - size) / 2;
+    int h_offset = std::max((int)(resized_height - target_height) / 2, 0);  // first source row of the centered crop; clamped so it is never negative
+    int w_offset = std::max((int)(resized_width - target_width) / 2, 0);    // first source column of the centered crop; clamped so it is never negative
 
     sd_image_f32_t result;
-    result.width   = size;
-    result.height  = size;
+    result.width   = target_width;
+    result.height  = target_height;
     result.channel = image.channel;
-    result.data    = (float*)malloc(size * size * image.channel * sizeof(float));
+    result.data    = (float*)malloc(target_height * target_width * image.channel * sizeof(float));
 
     for (int k = 0; k < image.channel; k++) {
-        for (int i = 0; i < size; i++) {
-            for (int j = 0; j < size; j++) {
-                *(result.data + i * size * image.channel + j * image.channel + k) =
-                    fmin(fmax(*(resized_data + (i + h) * new_width * image.channel + (j + w) * image.channel + k), 0.0f), 255.0f) / 255.0f;
+        for (int i = 0; i < result.height; i++) {  // i: output row
+            for (int j = 0; j < result.width; j++) {  // j: output column
+                *(result.data + i * result.width * image.channel + j * image.channel + k) =
+                    fmin(fmax(*(resized_data + (i + h_offset) * resized_width * image.channel + (j + w_offset) * image.channel + k), 0.0f), 255.0f) / 255.0f;  // read the crop at (h_offset, w_offset), clamp to [0, 255], scale to [0, 1]
             }
         }
     }
@@ -485,10 +489,10 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) {
 
     // Normalize
     for (int k = 0; k < image.channel; k++) {
-        for (int i = 0; i < size; i++) {
-            for (int j = 0; j < size; j++) {
+        for (int i = 0; i < result.height; i++) {
+            for (int j = 0; j < result.width; j++) {
                 // *(result.data + i * size * image.channel + j * image.channel + k) = 0.5f;
-                int offset  = i * size * image.channel + j * image.channel + k;
+                int offset  = i * result.width * image.channel + j * image.channel + k;
                 float value = *(result.data + offset);
                 value       = (value - means[k]) / stds[k];
                 // value = 0.5f;
diff --git a/util.h b/util.h
index 1e8db6e..17bcd1d 100644
--- a/util.h
+++ b/util.h
@@ -42,7 +42,7 @@ sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image);
 
 sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height);
 
-sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size);
+sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height);
 
 std::string path_join(const std::string& p1, const std::string& p2);
 std::vector<std::string> split_string(const std::string& str, char delimiter);