diff --git a/clip.hpp b/clip.hpp
index ce22863..1f64eee 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -774,7 +774,10 @@ public:
         blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values, bool return_pooled = true) {
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                struct ggml_tensor* pixel_values,
+                                bool return_pooled = true,
+                                int clip_skip      = -1) {
         // pixel_values: [N, num_channels, image_size, image_size]
         auto embeddings    = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
         auto pre_layernorm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
@@ -783,7 +786,8 @@ public:
 
         auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
         x      = pre_layernorm->forward(ctx, x);
-        x      = encoder->forward(ctx, x, -1, false);
+        LOG_DEBUG("clip_vision skip %d", clip_skip);
+        x      = encoder->forward(ctx, x, clip_skip, false);
         // print_ggml_tensor(x, true, "ClipVisionModel x: ");
         auto last_hidden_state = x;
         x                      = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]
@@ -853,13 +857,14 @@ public:
 
     struct ggml_tensor* forward(struct ggml_context* ctx,
                                 struct ggml_tensor* pixel_values,
-                                bool return_pooled = true) {
+                                bool return_pooled = true,
+                                int clip_skip      = -1) {
         // pixel_values: [N, num_channels, image_size, image_size]
         // return: [N, projection_dim] if return_pooled else [N, n_token, hidden_size]
         auto vision_model      = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
         auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
 
-        auto x = vision_model->forward(ctx, pixel_values, return_pooled);  // [N, hidden_size] or [N, n_token, hidden_size]
+        auto x = vision_model->forward(ctx, pixel_values, return_pooled, clip_skip);  // [N, hidden_size] or [N, n_token, hidden_size]
 
         if (return_pooled) {
             x = visual_projection->forward(ctx, x);  // [N, projection_dim]
diff --git a/conditioner.hpp b/conditioner.hpp
index da7a08d..d01b1c6 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -634,12 +634,12 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
         vision_model.get_param_tensors(tensors, "cond_stage_model.transformer");
     }
 
-    struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values, bool return_pooled) {
+    struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values, bool return_pooled, int clip_skip) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
         pixel_values = to_backend(pixel_values);
 
-        struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, pixel_values, return_pooled);
+        struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, pixel_values, return_pooled, clip_skip);
 
         ggml_build_forward_expand(gf, hidden_states);
 
@@ -649,10 +649,11 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
     void compute(const int n_threads,
                  ggml_tensor* pixel_values,
                  bool return_pooled,
+                 int clip_skip,
                  ggml_tensor** output,
                  ggml_context* output_ctx) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(pixel_values, return_pooled);
+            return build_graph(pixel_values, return_pooled, clip_skip);
         };
         GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
    }
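The clip.hpp/conditioner.hpp changes above thread a clip_skip argument through the vision tower (FrozenCLIPVisionEmbedder::compute -> CLIPVisionModelProjection::forward -> CLIPVisionModel::forward -> CLIPEncoder::forward), so callers can take hidden states from an earlier encoder layer instead of the final one; the Wan2.1 I2V path below passes -2. The sketch below illustrates one plausible mapping from clip_skip to the last encoder layer actually executed. The helper name and the handling of negative values are assumptions for illustration only; the authoritative logic lives in CLIPEncoder::forward.

#include <cstdio>

// Hypothetical helper (not from the repo): maps a clip_skip value to the
// 0-based index of the last encoder layer whose output is used.
// Assumed convention: clip_skip > 0 counts back from the end (2 -> penultimate
// layer), clip_skip < -1 is a Python-style index from the end, and -1 keeps
// the default (last layer).
static int clip_skip_to_layer_idx(int n_layer, int clip_skip) {
    int layer_idx = n_layer - 1;          // default: use the last layer
    if (clip_skip > 0) {
        layer_idx = n_layer - clip_skip;  // n_layer=32, clip_skip=2  -> 30
    } else if (clip_skip < -1) {
        layer_idx = n_layer + clip_skip;  // n_layer=32, clip_skip=-2 -> 30
    }
    return layer_idx;
}

int main() {
    // Both conventions select the penultimate layer of a 32-layer encoder:
    printf("%d %d\n", clip_skip_to_layer_idx(32, 2), clip_skip_to_layer_idx(32, -2));
    return 0;
}

Under either convention the -2 passed by generate_video below selects the penultimate hidden states, which matches how CLIP vision features are commonly taken for image conditioning.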
diff --git a/lora.hpp b/lora.hpp
index 2e1e490..84c8be5 100644
--- a/lora.hpp
+++ b/lora.hpp
@@ -168,6 +168,7 @@ struct LoraModel : public GGMLRunner {
         auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a));
         out      = ggml_get_rows(ctx, out, zero_index);
         out      = ggml_reshape(ctx, out, a);
+        // auto out = ggml_cast(ctx, a, GGML_TYPE_F32);
         return out;
     }
 
@@ -246,6 +247,8 @@ struct LoraModel : public GGMLRunner {
         set_backend_tensor_data(zero_index, zero_index_vec.data());
         ggml_build_forward_expand(gf, zero_index);
 
+        original_tensor_to_final_tensor.clear();
+
         std::set<std::string> applied_lora_tensors;
         for (auto it : model_tensors) {
             std::string model_tensor_name = it.first;
@@ -812,7 +815,7 @@ struct LoraModel : public GGMLRunner {
             }
             scale_value *= multiplier;
             ggml_tensor* original_tensor = model_tensor;
-            if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(model_tensor->buffer)) {
+            if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
                 model_tensor = ggml_dup_tensor(compute_ctx, model_tensor);
                 set_backend_tensor_data(model_tensor, original_tensor->data);
             }
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 94c7fab..a06bfdc 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -830,6 +830,7 @@ public:
     ggml_tensor* get_clip_vision_output(ggml_context* work_ctx,
                                         sd_image_t init_image,
                                         bool return_pooled   = true,
+                                        int clip_skip        = -1,
                                         bool zero_out_masked = false) {
         ggml_tensor* output = NULL;
         if (zero_out_masked) {
@@ -857,7 +858,7 @@ public:
             resized_image.data = NULL;
 
             // print_ggml_tensor(pixel_values);
-            clip_vision->compute(n_threads, pixel_values, return_pooled, &output, work_ctx);
+            clip_vision->compute(n_threads, pixel_values, return_pooled, clip_skip, &output, work_ctx);
             // print_ggml_tensor(c_crossattn);
         }
         return output;
@@ -873,7 +874,7 @@ public:
                                  bool zero_out_masked = false) {
         // c_crossattn
         int64_t t0 = ggml_time_ms();
-        struct ggml_tensor* c_crossattn = get_clip_vision_output(work_ctx, init_image, true, zero_out_masked);
+        struct ggml_tensor* c_crossattn = get_clip_vision_output(work_ctx, init_image, true, -1, zero_out_masked);
 
         // c_concat
         struct ggml_tensor* c_concat = NULL;
@@ -2250,15 +2251,18 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
 
     int64_t t0 = ggml_time_ms();
 
+    // Apply lora
+    prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);
+
     ggml_tensor* clip_vision_output = NULL;
     ggml_tensor* concat_latent      = NULL;
     if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B") {
         LOG_INFO("IMG2VID");
 
         if (sd_vid_gen_params->init_image.data) {
-            clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false);
+            clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, -2);
         } else {
-            clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, true);
+            clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, -2, true);
         }
 
         int64_t t1 = ggml_time_ms();
@@ -2312,8 +2316,6 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
     ggml_tensor* init_latent = generate_init_latent(sd_ctx, work_ctx, width, height, frames, true);
 
     int sample_steps = sigmas.size() - 1;
-    // Apply lora
-    prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);
 
     // Get learned condition
     bool zero_out_masked = true;
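Two of the smaller changes deserve a note. In lora.hpp, switching the guard to original_tensor->buffer is purely a readability fix: at that point original_tensor still aliases model_tensor, so behavior is unchanged. In generate_video, applying LoRAs moves ahead of condition computation, so the patched weights are already in place when the conditioners run. The lora.hpp guard itself is a common ggml pattern: if the graph will run on a non-CPU backend but the weight still lives in a host buffer, duplicate the tensor into the compute context and rebind its data so the backend can read it. Below is a minimal sketch of that pattern, assuming set_backend_tensor_data (a GGMLRunner helper in this repo; signature assumed here) is available as shown.

#include "ggml.h"
#include "ggml-backend.h"
// Depending on the ggml version, ggml_backend_is_cpu may live in "ggml-cpu.h".

// Provided by GGMLRunner in this repo; binds host data to a compute-ctx tensor
// so the runner can upload it before graph execution. Signature assumed.
void set_backend_tensor_data(struct ggml_tensor* tensor, void* data);

// Sketch of the host-buffer guard from lora.hpp: returns a tensor that is safe
// to reference in a graph executed on runtime_backend.
struct ggml_tensor* ensure_backend_visible(struct ggml_context* compute_ctx,
                                           ggml_backend_t runtime_backend,
                                           struct ggml_tensor* model_tensor) {
    struct ggml_tensor* original_tensor = model_tensor;
    if (!ggml_backend_is_cpu(runtime_backend) &&
        ggml_backend_buffer_is_host(original_tensor->buffer)) {
        // Allocate a same-shape tensor in compute_ctx and point it at the
        // original host data; the backend then reads from its own copy.
        model_tensor = ggml_dup_tensor(compute_ctx, model_tensor);
        set_backend_tensor_data(model_tensor, original_tensor->data);
    }
    return model_tensor;
}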