mirror of https://github.com/leejet/stable-diffusion.cpp.git (synced 2025-12-13 05:48:56 +00:00)
fix wan2.1 i2v
commit cf48441345 (parent 9fcc85688d)
clip.hpp (13 lines changed)
@@ -774,7 +774,10 @@ public:
         blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values, bool return_pooled = true) {
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                struct ggml_tensor* pixel_values,
+                                bool return_pooled = true,
+                                int clip_skip = -1) {
         // pixel_values: [N, num_channels, image_size, image_size]
         auto embeddings = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
         auto pre_layernorm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
@@ -783,7 +786,8 @@ public:
 
         auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
         x = pre_layernorm->forward(ctx, x);
-        x = encoder->forward(ctx, x, -1, false);
+        LOG_DEBUG("clip_vison skip %d", clip_skip);
+        x = encoder->forward(ctx, x, clip_skip, false);
         // print_ggml_tensor(x, true, "ClipVisionModel x: ");
         auto last_hidden_state = x;
         x = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]
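The encoder already accepted a clip_skip argument; this hunk stops hard-coding -1 in the vision path and forwards the caller's value. For readers new to the knob, clip_skip picks which hidden state of the transformer stack to return. Below is a minimal sketch of a convention consistent with the values seen in this diff (-1 = last hidden state, -2 = penultimate); the real mapping lives in CLIPEncoder in clip.hpp and may differ in detail.

#include <cstdio>

// Hypothetical illustration only, not the repo's CLIPEncoder. Negative
// clip_skip values index the hidden states from the end, Python-style;
// positive values follow the common UI convention where 1 means "no skip"
// and 2 means "penultimate layer".
static int hidden_state_index(int n_layer, int clip_skip) {
    int idx = (clip_skip < 0) ? n_layer + clip_skip   // -1 -> last, -2 -> penultimate
                              : n_layer - clip_skip;  //  1 -> last,  2 -> penultimate
    return idx < 0 ? 0 : idx;
}

int main(void) {
    printf("clip_skip=-1 -> hidden state %d of 0..31\n", hidden_state_index(32, -1));  // 31
    printf("clip_skip=-2 -> hidden state %d of 0..31\n", hidden_state_index(32, -2));  // 30
    return 0;
}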
@@ -853,13 +857,14 @@ public:
 
     struct ggml_tensor* forward(struct ggml_context* ctx,
                                 struct ggml_tensor* pixel_values,
-                                bool return_pooled = true) {
+                                bool return_pooled = true,
+                                int clip_skip = -1) {
         // pixel_values: [N, num_channels, image_size, image_size]
         // return: [N, projection_dim] if return_pooled else [N, n_token, hidden_size]
         auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
         auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
 
-        auto x = vision_model->forward(ctx, pixel_values, return_pooled);  // [N, hidden_size] or [N, n_token, hidden_size]
+        auto x = vision_model->forward(ctx, pixel_values, return_pooled, clip_skip);  // [N, hidden_size] or [N, n_token, hidden_size]
 
         if (return_pooled) {
             x = visual_projection->forward(ctx, x);  // [N, projection_dim]
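Taken together, the three clip.hpp hunks thread one new parameter down the vision stack: CLIPVisionModelProjection::forward passes clip_skip to CLIPVisionModel::forward, which hands it to the encoder in place of the previously hard-coded -1. Defaulting the parameter to -1 keeps any caller that omits it on the old behavior.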
conditioner.hpp

@@ -634,12 +634,12 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
         vision_model.get_param_tensors(tensors, "cond_stage_model.transformer");
     }
 
-    struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values, bool return_pooled) {
+    struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values, bool return_pooled, int clip_skip) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
         pixel_values = to_backend(pixel_values);
 
-        struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, pixel_values, return_pooled);
+        struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, pixel_values, return_pooled, clip_skip);
 
         ggml_build_forward_expand(gf, hidden_states);
 
@@ -649,10 +649,11 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
     void compute(const int n_threads,
                  ggml_tensor* pixel_values,
                  bool return_pooled,
+                 int clip_skip,
                  ggml_tensor** output,
                  ggml_context* output_ctx) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(pixel_values, return_pooled);
+            return build_graph(pixel_values, return_pooled, clip_skip);
         };
         GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
     }
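FrozenCLIPVisionEmbedder builds its graph lazily through a lambda, so a new parameter has to be captured and forwarded by hand at the build_graph call site as well as in the signatures. Here is a generic, self-contained sketch of that capture-and-forward pattern (plain C++ stand-ins, not the GGMLRunner API):

#include <cstdio>
#include <functional>

struct Graph { int param; };  // stand-in for ggml_cgraph*

// Stand-in for GGMLRunner::compute: takes a graph-builder callback so the
// graph can be rebuilt with whatever parameters the caller captured.
static void run(const std::function<Graph()>& get_graph) {
    Graph g = get_graph();
    printf("running graph built with param=%d\n", g.param);
}

static Graph build_graph(bool return_pooled, int clip_skip) {
    // A real build_graph would wire up tensors; here we only record the knobs.
    printf("build_graph(return_pooled=%d, clip_skip=%d)\n", return_pooled, clip_skip);
    return Graph{clip_skip};
}

int main(void) {
    bool return_pooled = false;
    int clip_skip = -2;
    // The lambda must capture *and* forward every parameter; a stale call
    // like build_graph(return_pooled) would silently drop clip_skip, which
    // is exactly the mismatch the hunk above fixes.
    run([&]() { return build_graph(return_pooled, clip_skip); });
    return 0;
}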
lora.hpp (5 lines changed)
@@ -168,6 +168,7 @@ struct LoraModel : public GGMLRunner {
         auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a));
         out = ggml_get_rows(ctx, out, zero_index);
         out = ggml_reshape(ctx, out, a);
+        // auto out = ggml_cast(ctx, a, GGML_TYPE_F32);
         return out;
     }
 
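Context for the commented-out alternative: reshaping a tensor to 1D, gathering its single row with ggml_get_rows, and reshaping back materializes a copy inside the graph, and since ggml_get_rows in effect produces F32 output it doubles as a dequantize/cast (hence the ggml_cast line kept as a reference). A plain-C++ analogy of gather-as-copy, with hypothetical names:

#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical analogy, not the ggml API: gathering "row 0" of a tensor
// flattened to one row touches every element, so the gather output is a
// fresh buffer with a fixed element type -- a copy that is also a cast.
static std::vector<float> gather_row0_as_f32(const std::vector<uint8_t>& flat) {
    std::vector<float> out(flat.size());
    for (size_t i = 0; i < flat.size(); i++) {
        out[i] = static_cast<float>(flat[i]);  // copy + widen per element
    }
    return out;
}

int main(void) {
    std::vector<uint8_t> packed = {1, 2, 3};
    std::vector<float> copied = gather_row0_as_f32(packed);
    printf("%.1f %.1f %.1f\n", copied[0], copied[1], copied[2]);
    return 0;
}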
@@ -246,6 +247,8 @@ struct LoraModel : public GGMLRunner {
         set_backend_tensor_data(zero_index, zero_index_vec.data());
         ggml_build_forward_expand(gf, zero_index);
 
+        original_tensor_to_final_tensor.clear();
+
         std::set<std::string> applied_lora_tensors;
         for (auto it : model_tensors) {
             std::string model_tensor_name = it.first;
@@ -812,7 +815,7 @@ struct LoraModel : public GGMLRunner {
             }
             scale_value *= multiplier;
             ggml_tensor* original_tensor = model_tensor;
-            if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(model_tensor->buffer)) {
+            if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
                 model_tensor = ggml_dup_tensor(compute_ctx, model_tensor);
                 set_backend_tensor_data(model_tensor, original_tensor->data);
             }
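The guard above handles model weights that still live in a host buffer while LoRA math runs on a GPU backend: the tensor is duplicated into the compute context and its data pointer re-bound so the runner uploads it, while original_tensor keeps addressing the real weights. A generic sketch of that copy-on-mismatch pattern (plain structs standing in for ggml types):

#include <cstdio>

// Hypothetical stand-ins; the real code uses ggml_tensor,
// ggml_backend_is_cpu(), ggml_backend_buffer_is_host(),
// ggml_dup_tensor() and set_backend_tensor_data().
struct Tensor {
    const char* name;
    void* data;
    bool host_buffer;
};

static Tensor dup_into_compute_ctx(const Tensor& t) {
    return Tensor{t.name, nullptr, /*host_buffer=*/false};  // shape only, no data
}

static void maybe_move_to_backend(Tensor& model_tensor, bool backend_is_cpu) {
    Tensor original_tensor = model_tensor;
    if (!backend_is_cpu && original_tensor.host_buffer) {
        model_tensor = dup_into_compute_ctx(model_tensor);
        model_tensor.data = original_tensor.data;  // bind host data for upload
    }
}

int main(void) {
    float w = 1.0f;
    Tensor t{"blk.0.weight", &w, /*host_buffer=*/true};
    maybe_move_to_backend(t, /*backend_is_cpu=*/false);
    printf("%s now host_buffer=%d\n", t.name, t.host_buffer ? 1 : 0);
    return 0;
}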
stable-diffusion.cpp

@@ -830,6 +830,7 @@ public:
     ggml_tensor* get_clip_vision_output(ggml_context* work_ctx,
                                         sd_image_t init_image,
                                         bool return_pooled = true,
+                                        int clip_skip = -1,
                                         bool zero_out_masked = false) {
         ggml_tensor* output = NULL;
         if (zero_out_masked) {
@@ -857,7 +858,7 @@ public:
             resized_image.data = NULL;
 
             // print_ggml_tensor(pixel_values);
-            clip_vision->compute(n_threads, pixel_values, return_pooled, &output, work_ctx);
+            clip_vision->compute(n_threads, pixel_values, return_pooled, clip_skip, &output, work_ctx);
             // print_ggml_tensor(c_crossattn);
         }
         return output;
@@ -873,7 +874,7 @@ public:
                                    bool zero_out_masked = false) {
         // c_crossattn
         int64_t t0 = ggml_time_ms();
-        struct ggml_tensor* c_crossattn = get_clip_vision_output(work_ctx, init_image, true, zero_out_masked);
+        struct ggml_tensor* c_crossattn = get_clip_vision_output(work_ctx, init_image, true, -1, zero_out_masked);
 
         // c_concat
         struct ggml_tensor* c_concat = NULL;
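Worth noting why every call site had to be touched: the new int clip_skip parameter sits before the trailing bool zero_out_masked, so a stale call such as get_clip_vision_output(work_ctx, init_image, false, true) would still compile, with true silently converting to clip_skip = 1 and zero_out_masked falling back to false. A minimal demonstration of the hazard (hypothetical signatures mirroring the change):

#include <cstdio>

static void get_output_new(bool return_pooled, int clip_skip = -1,
                           bool zero_out_masked = false) {
    printf("pooled=%d clip_skip=%d masked=%d\n",
           return_pooled, clip_skip, zero_out_masked);
}

int main(void) {
    // A caller left unchanged from the old (bool, bool) signature still
    // compiles: `true` converts to int, so this reads clip_skip = 1 and
    // zero_out_masked = false -- not what the caller meant.
    get_output_new(false, true);
    get_output_new(false, -2, true);  // the corrected, explicit form
    return 0;
}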
@@ -2250,15 +2251,18 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
 
     int64_t t0 = ggml_time_ms();
 
+    // Apply lora
+    prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);
+
     ggml_tensor* clip_vision_output = NULL;
     ggml_tensor* concat_latent = NULL;
     if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B") {
         LOG_INFO("IMG2VID");
 
         if (sd_vid_gen_params->init_image.data) {
-            clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false);
+            clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, -2);
         } else {
-            clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, true);
+            clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, -2, true);
         }
 
         int64_t t1 = ggml_time_ms();
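This is the substantive Wan2.1 I2V fix: the image-conditioning branch now asks for clip_skip = -2 rather than the default last layer, i.e. (under the negative-indexing convention sketched earlier) the penultimate hidden state of the CLIP vision tower, which is the feature the Wan2.1 I2V reference pipeline is generally understood to condition on. LoRA application also moves up here so adapters are in place before any conditioning model runs; the hunk below removes the old, later call site.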
@@ -2312,8 +2316,6 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
 
     ggml_tensor* init_latent = generate_init_latent(sd_ctx, work_ctx, width, height, frames, true);
     int sample_steps = sigmas.size() - 1;
-    // Apply lora
-    prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);
 
     // Get learned condition
     bool zero_out_masked = true;