optimize clip_preprocess and fix get_first_stage_encoding

This commit is contained in:
leejet 2025-09-24 20:21:55 +08:00
parent 4e48e6b82b
commit 95cae28465
4 changed files with 31 additions and 27 deletions

View File

@ -256,7 +256,7 @@ namespace Qwen {
auto txt_gate1 = txt_mod_param_vec[2]; auto txt_gate1 = txt_mod_param_vec[2];
auto [img_attn_output, txt_attn_output] = attn->forward(ctx, backend, img_modulated, txt_modulated, pe); auto [img_attn_output, txt_attn_output] = attn->forward(ctx, backend, img_modulated, txt_modulated, pe);
img = ggml_add(ctx, img, ggml_mul(ctx, img_attn_output, img_gate1)); img = ggml_add(ctx, img, ggml_mul(ctx, img_attn_output, img_gate1));
txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_attn_output, txt_gate1)); txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_attn_output, txt_gate1));
@ -564,7 +564,7 @@ namespace Qwen {
timesteps, timesteps,
context, context,
pe, pe,
ref_latents); ref_latents);
ggml_build_forward_expand(gf, out); ggml_build_forward_expand(gf, out);

View File

@ -952,7 +952,7 @@ public:
ggml_set_f32(output, 0.f); ggml_set_f32(output, 0.f);
} else { } else {
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image);
sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size); sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size, clip_vision->vision_model.image_size);
free(image.data); free(image.data);
image.data = NULL; image.data = NULL;
@ -2029,7 +2029,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
std::vector<sd_image_f32_t> processed_id_images; std::vector<sd_image_f32_t> processed_id_images;
for (int i = 0; i < pm_params.id_images_count; i++) { for (int i = 0; i < pm_params.id_images_count; i++) {
sd_image_f32_t id_image = sd_image_t_to_sd_image_f32_t(pm_params.id_images[i]); sd_image_f32_t id_image = sd_image_t_to_sd_image_f32_t(pm_params.id_images[i]);
sd_image_f32_t processed_id_image = clip_preprocess(id_image, clip_image_size); sd_image_f32_t processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size);
free(id_image.data); free(id_image.data);
id_image.data = NULL; id_image.data = NULL;
processed_id_images.push_back(processed_id_image); processed_id_images.push_back(processed_id_image);

View File

@ -84,6 +84,7 @@ int round_up_to(int value, int base) {
} }
#ifdef _WIN32 // code for windows #ifdef _WIN32 // code for windows
#define NOMINMAX
#include <windows.h> #include <windows.h>
bool file_exists(const std::string& filename) { bool file_exists(const std::string& filename) {
@ -427,18 +428,21 @@ float means[3] = {0.48145466, 0.4578275, 0.40821073};
float stds[3] = {0.26862954, 0.26130258, 0.27577711}; float stds[3] = {0.26862954, 0.26130258, 0.27577711};
// Function to clip and preprocess sd_image_f32_t // Function to clip and preprocess sd_image_f32_t
sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) { sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height) {
float scale = (float)size / fmin(image.width, image.height); float width_scale = (float)target_width / image.width;
float height_scale = (float)target_height / image.height;
float scale = std::fmax(width_scale, height_scale);
// Interpolation // Interpolation
int new_width = (int)(scale * image.width); int resized_width = (int)(scale * image.width);
int new_height = (int)(scale * image.height); int resized_height = (int)(scale * image.height);
float* resized_data = (float*)malloc(new_width * new_height * image.channel * sizeof(float)); float* resized_data = (float*)malloc(resized_width * resized_height * image.channel * sizeof(float));
for (int y = 0; y < new_height; y++) { for (int y = 0; y < resized_height; y++) {
for (int x = 0; x < new_width; x++) { for (int x = 0; x < resized_width; x++) {
float original_x = (float)x * image.width / new_width; float original_x = (float)x * image.width / resized_width;
float original_y = (float)y * image.height / new_height; float original_y = (float)y * image.height / resized_height;
int x1 = (int)original_x; int x1 = (int)original_x;
int y1 = (int)original_y; int y1 = (int)original_y;
@ -456,26 +460,26 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) {
float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio); float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio);
*(resized_data + y * new_width * image.channel + x * image.channel + k) = value; *(resized_data + y * resized_width * image.channel + x * image.channel + k) = value;
} }
} }
} }
// Clip and preprocess // Clip and preprocess
int h = (new_height - size) / 2; int h_offset = std::max((int)(resized_height - target_height) / 2, 0);
int w = (new_width - size) / 2; int w_offset = std::max((int)(resized_width - target_width) / 2, 0);
sd_image_f32_t result; sd_image_f32_t result;
result.width = size; result.width = target_width;
result.height = size; result.height = target_height;
result.channel = image.channel; result.channel = image.channel;
result.data = (float*)malloc(size * size * image.channel * sizeof(float)); result.data = (float*)malloc(target_height * target_width * image.channel * sizeof(float));
for (int k = 0; k < image.channel; k++) { for (int k = 0; k < image.channel; k++) {
for (int i = 0; i < size; i++) { for (int i = 0; i < result.height; i++) {
for (int j = 0; j < size; j++) { for (int j = 0; j < result.width; j++) {
*(result.data + i * size * image.channel + j * image.channel + k) = *(result.data + i * result.width * image.channel + j * image.channel + k) =
fmin(fmax(*(resized_data + (i + h) * new_width * image.channel + (j + w) * image.channel + k), 0.0f), 255.0f) / 255.0f; fmin(fmax(*(resized_data + (i + h_offset) * resized_width * image.channel + (j + w_offset) * image.channel + k), 0.0f), 255.0f) / 255.0f;
} }
} }
} }
@ -485,10 +489,10 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) {
// Normalize // Normalize
for (int k = 0; k < image.channel; k++) { for (int k = 0; k < image.channel; k++) {
for (int i = 0; i < size; i++) { for (int i = 0; i < result.height; i++) {
for (int j = 0; j < size; j++) { for (int j = 0; j < result.width; j++) {
// *(result.data + i * size * image.channel + j * image.channel + k) = 0.5f; // *(result.data + i * size * image.channel + j * image.channel + k) = 0.5f;
int offset = i * size * image.channel + j * image.channel + k; int offset = i * result.width * image.channel + j * image.channel + k;
float value = *(result.data + offset); float value = *(result.data + offset);
value = (value - means[k]) / stds[k]; value = (value - means[k]) / stds[k];
// value = 0.5f; // value = 0.5f;

2
util.h
View File

@ -42,7 +42,7 @@ sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image);
sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height); sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height);
sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size); sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height);
std::string path_join(const std::string& p1, const std::string& p2); std::string path_join(const std::string& p1, const std::string& p2);
std::vector<std::string> split_string(const std::string& str, char delimiter); std::vector<std::string> split_string(const std::string& str, char delimiter);