fix wan2.1 i2v

This commit is contained in:
leejet 2025-08-24 14:07:38 +08:00
parent 9fcc85688d
commit cf48441345
4 changed files with 25 additions and 14 deletions

View File

@@ -774,7 +774,10 @@ public:
blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values, bool return_pooled = true) {
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* pixel_values,
bool return_pooled = true,
int clip_skip = -1) {
// pixel_values: [N, num_channels, image_size, image_size]
auto embeddings = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
auto pre_layernorm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
@@ -783,7 +786,8 @@ public:
auto x = embeddings->forward(ctx, pixel_values); // [N, num_positions, embed_dim]
x = pre_layernorm->forward(ctx, x);
x = encoder->forward(ctx, x, -1, false);
LOG_DEBUG("clip_vison skip %d", clip_skip);
x = encoder->forward(ctx, x, clip_skip, false);
// print_ggml_tensor(x, true, "ClipVisionModel x: ");
auto last_hidden_state = x;
x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size]
@@ -853,13 +857,14 @@ public:
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* pixel_values,
bool return_pooled = true) {
bool return_pooled = true,
int clip_skip = -1) {
// pixel_values: [N, num_channels, image_size, image_size]
// return: [N, projection_dim] if return_pooled else [N, n_token, hidden_size]
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
auto x = vision_model->forward(ctx, pixel_values, return_pooled); // [N, hidden_size] or [N, n_token, hidden_size]
auto x = vision_model->forward(ctx, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size]
if (return_pooled) {
x = visual_projection->forward(ctx, x); // [N, projection_dim]

View File

@@ -634,12 +634,12 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
vision_model.get_param_tensors(tensors, "cond_stage_model.transformer");
}
struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values, bool return_pooled) {
struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values, bool return_pooled, int clip_skip) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
pixel_values = to_backend(pixel_values);
struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, pixel_values, return_pooled);
struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, pixel_values, return_pooled, clip_skip);
ggml_build_forward_expand(gf, hidden_states);
@@ -649,10 +649,11 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
// Wraps build_graph in a callable and hands it to GGMLRunner::compute together
// with the thread count and the output destination.
//
// n_threads     - forwarded to GGMLRunner::compute.
// pixel_values  - input tensor passed on to build_graph.
// return_pooled - passed on to build_graph.
// clip_skip     - passed on to build_graph by the three-argument call below.
// output        - forwarded to GGMLRunner::compute.
// output_ctx    - forwarded to GGMLRunner::compute.
void compute(const int n_threads,
ggml_tensor* pixel_values,
bool return_pooled,
int clip_skip,
ggml_tensor** output,
ggml_context* output_ctx) {
// The builder is passed as a callable capturing the arguments by reference;
// when it is invoked is decided by GGMLRunner::compute (not shown here).
auto get_graph = [&]() -> struct ggml_cgraph* {
// NOTE(review): this listing is a diff whose +/- markers were stripped.
// The two-argument call is presumably the removed pre-change line and the
// three-argument call its replacement - confirm against the commit.
// Read literally, the second return would be unreachable.
return build_graph(pixel_values, return_pooled);
return build_graph(pixel_values, return_pooled, clip_skip);
};
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
}

View File

@@ -168,6 +168,7 @@ struct LoraModel : public GGMLRunner {
auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a));
out = ggml_get_rows(ctx, out, zero_index);
out = ggml_reshape(ctx, out, a);
// auto out = ggml_cast(ctx, a, GGML_TYPE_F32);
return out;
}
@@ -246,6 +247,8 @@ struct LoraModel : public GGMLRunner {
set_backend_tensor_data(zero_index, zero_index_vec.data());
ggml_build_forward_expand(gf, zero_index);
original_tensor_to_final_tensor.clear();
std::set<std::string> applied_lora_tensors;
for (auto it : model_tensors) {
std::string model_tensor_name = it.first;
@@ -812,7 +815,7 @@ struct LoraModel : public GGMLRunner {
}
scale_value *= multiplier;
ggml_tensor* original_tensor = model_tensor;
if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(model_tensor->buffer)) {
if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
model_tensor = ggml_dup_tensor(compute_ctx, model_tensor);
set_backend_tensor_data(model_tensor, original_tensor->data);
}

View File

@@ -830,6 +830,7 @@ public:
ggml_tensor* get_clip_vision_output(ggml_context* work_ctx,
sd_image_t init_image,
bool return_pooled = true,
int clip_skip = -1,
bool zero_out_masked = false) {
ggml_tensor* output = NULL;
if (zero_out_masked) {
@@ -857,7 +858,7 @@ public:
resized_image.data = NULL;
// print_ggml_tensor(pixel_values);
clip_vision->compute(n_threads, pixel_values, return_pooled, &output, work_ctx);
clip_vision->compute(n_threads, pixel_values, return_pooled, clip_skip, &output, work_ctx);
// print_ggml_tensor(c_crossattn);
}
return output;
@@ -873,7 +874,7 @@ public:
bool zero_out_masked = false) {
// c_crossattn
int64_t t0 = ggml_time_ms();
struct ggml_tensor* c_crossattn = get_clip_vision_output(work_ctx, init_image, true, zero_out_masked);
struct ggml_tensor* c_crossattn = get_clip_vision_output(work_ctx, init_image, true, -1, zero_out_masked);
// c_concat
struct ggml_tensor* c_concat = NULL;
@@ -2250,15 +2251,18 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
int64_t t0 = ggml_time_ms();
// Apply lora
prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);
ggml_tensor* clip_vision_output = NULL;
ggml_tensor* concat_latent = NULL;
if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B") {
LOG_INFO("IMG2VID");
if (sd_vid_gen_params->init_image.data) {
clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false);
clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, -2);
} else {
clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, true);
clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, -2, true);
}
int64_t t1 = ggml_time_ms();
@@ -2312,8 +2316,6 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
ggml_tensor* init_latent = generate_init_latent(sd_ctx, work_ctx, width, height, frames, true);
int sample_steps = sigmas.size() - 1;
// Apply lora
prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);
// Get learned condition
bool zero_out_masked = true;