mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2025-12-13 05:48:56 +00:00
add vace i2v support
This commit is contained in:
parent
53aeb555bd
commit
64c5d8ea8f
2
ggml
2
ggml
@ -1 +1 @@
|
|||||||
Subproject commit 5fdc78fff274094e2a1b155928131983362d8a71
|
Subproject commit c46da318b9b6730806196ef7fff67c8160a74c8e
|
||||||
@ -1543,6 +1543,7 @@ protected:
|
|||||||
ggml_backend_tensor_copy(t, offload_t);
|
ggml_backend_tensor_copy(t, offload_t);
|
||||||
std::swap(t->buffer, offload_t->buffer);
|
std::swap(t->buffer, offload_t->buffer);
|
||||||
std::swap(t->data, offload_t->data);
|
std::swap(t->data, offload_t->data);
|
||||||
|
std::swap(t->extra, offload_t->extra);
|
||||||
|
|
||||||
t = ggml_get_next_tensor(params_ctx, t);
|
t = ggml_get_next_tensor(params_ctx, t);
|
||||||
offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
|
offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
|
||||||
@ -1573,8 +1574,10 @@ protected:
|
|||||||
while (t != NULL && offload_t != NULL) {
|
while (t != NULL && offload_t != NULL) {
|
||||||
t->buffer = offload_t->buffer;
|
t->buffer = offload_t->buffer;
|
||||||
t->data = offload_t->data;
|
t->data = offload_t->data;
|
||||||
|
t->extra = offload_t->extra;
|
||||||
offload_t->buffer = NULL;
|
offload_t->buffer = NULL;
|
||||||
offload_t->data = NULL;
|
offload_t->data = NULL;
|
||||||
|
offload_t->extra = NULL;
|
||||||
|
|
||||||
t = ggml_get_next_tensor(params_ctx, t);
|
t = ggml_get_next_tensor(params_ctx, t);
|
||||||
offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
|
offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
|
||||||
|
|||||||
@ -2402,7 +2402,6 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
|
|||||||
|
|
||||||
struct ggml_init_params params;
|
struct ggml_init_params params;
|
||||||
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1G
|
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1G
|
||||||
params.mem_size += width * height * frames * 3 * sizeof(float) * 2;
|
|
||||||
params.mem_buffer = NULL;
|
params.mem_buffer = NULL;
|
||||||
params.no_alloc = false;
|
params.no_alloc = false;
|
||||||
// LOG_DEBUG("mem_size %u ", params.mem_size);
|
// LOG_DEBUG("mem_size %u ", params.mem_size);
|
||||||
@ -2430,6 +2429,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
|
|||||||
ggml_tensor* concat_latent = NULL;
|
ggml_tensor* concat_latent = NULL;
|
||||||
ggml_tensor* denoise_mask = NULL;
|
ggml_tensor* denoise_mask = NULL;
|
||||||
ggml_tensor* vace_context = NULL;
|
ggml_tensor* vace_context = NULL;
|
||||||
|
int64_t ref_image_num = 0; // for vace
|
||||||
if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B" ||
|
if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B" ||
|
||||||
sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-I2V-14B" ||
|
sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-I2V-14B" ||
|
||||||
sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-FLF2V-14B") {
|
sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-FLF2V-14B") {
|
||||||
@ -2526,6 +2526,20 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
|
|||||||
} else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-VACE-1.3B" ||
|
} else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-VACE-1.3B" ||
|
||||||
sd_ctx->sd->diffusion_model->get_desc() == "Wan2.x-VACE-14B") {
|
sd_ctx->sd->diffusion_model->get_desc() == "Wan2.x-VACE-14B") {
|
||||||
LOG_INFO("VACE");
|
LOG_INFO("VACE");
|
||||||
|
int64_t t1 = ggml_time_ms();
|
||||||
|
ggml_tensor* ref_image_latent = NULL;
|
||||||
|
if (sd_vid_gen_params->init_image.data) {
|
||||||
|
ggml_tensor* ref_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
|
||||||
|
sd_image_to_tensor(sd_vid_gen_params->init_image.data, ref_img);
|
||||||
|
ref_img = ggml_reshape_4d(work_ctx, ref_img, width, height, 1, 3);
|
||||||
|
|
||||||
|
ref_image_latent = sd_ctx->sd->encode_first_stage(work_ctx, ref_img); // [b*c, 1, h/16, w/16]
|
||||||
|
sd_ctx->sd->process_latent_in(ref_image_latent);
|
||||||
|
auto zero_latent = ggml_dup_tensor(work_ctx, ref_image_latent);
|
||||||
|
ggml_set_f32(zero_latent, 0.f);
|
||||||
|
ref_image_latent = ggml_tensor_concat(work_ctx, ref_image_latent, zero_latent, 3); // [b*2*c, 1, h/16, w/16]
|
||||||
|
}
|
||||||
|
|
||||||
ggml_tensor* control_video = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 3);
|
ggml_tensor* control_video = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 3);
|
||||||
ggml_set_f32(control_video, 0.5f);
|
ggml_set_f32(control_video, 0.5f);
|
||||||
ggml_tensor* mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 1);
|
ggml_tensor* mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 1);
|
||||||
@ -2549,23 +2563,43 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
|
|||||||
sd_ctx->sd->process_latent_in(inactive);
|
sd_ctx->sd->process_latent_in(inactive);
|
||||||
sd_ctx->sd->process_latent_in(reactive);
|
sd_ctx->sd->process_latent_in(reactive);
|
||||||
|
|
||||||
vace_context = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, inactive->ne[0], inactive->ne[1], inactive->ne[2], 96); // [b*96, t, h/8, w/8]
|
int64_t length = inactive->ne[2];
|
||||||
|
if (ref_image_latent) {
|
||||||
|
length += 1;
|
||||||
|
frames = (length - 1) * 4 + 1;
|
||||||
|
ref_image_num = 1;
|
||||||
|
}
|
||||||
|
vace_context = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, inactive->ne[0], inactive->ne[1], length, 96); // [b*96, t, h/8, w/8]
|
||||||
ggml_tensor_iter(vace_context, [&](ggml_tensor* vace_context, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
|
ggml_tensor_iter(vace_context, [&](ggml_tensor* vace_context, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
|
||||||
float value;
|
float value;
|
||||||
if (i3 < 16) {
|
if (i3 < 32) {
|
||||||
value = ggml_tensor_get_f32(inactive, i0, i1, i2, i3);
|
if (ref_image_latent && i2 == 0) {
|
||||||
} else if (i3 >= 16 && i3 < 32) {
|
value = ggml_tensor_get_f32(ref_image_latent, i0, i1, 0, i3);
|
||||||
value = ggml_tensor_get_f32(reactive, i0, i1, i2, i3);
|
} else {
|
||||||
|
if (i3 < 16) {
|
||||||
|
value = ggml_tensor_get_f32(inactive, i0, i1, i2 - ref_image_num, i3);
|
||||||
|
} else {
|
||||||
|
value = ggml_tensor_get_f32(reactive, i0, i1, i2 - ref_image_num, i3);
|
||||||
|
}
|
||||||
|
}
|
||||||
} else { // mask
|
} else { // mask
|
||||||
int64_t vae_stride = 8;
|
if (ref_image_latent && i2 == 0) {
|
||||||
int64_t mask_height_index = i1 * vae_stride + (i3 - 32) / vae_stride;
|
value = 0.f;
|
||||||
int64_t mask_width_index = i0 * vae_stride + (i3 - 32) % vae_stride;
|
} else {
|
||||||
value = ggml_tensor_get_f32(mask, mask_width_index, mask_height_index, i2, 0);
|
int64_t vae_stride = 8;
|
||||||
|
int64_t mask_height_index = i1 * vae_stride + (i3 - 32) / vae_stride;
|
||||||
|
int64_t mask_width_index = i0 * vae_stride + (i3 - 32) % vae_stride;
|
||||||
|
value = ggml_tensor_get_f32(mask, mask_width_index, mask_height_index, i2 - ref_image_num, 0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
ggml_tensor_set_f32(vace_context, value, i0, i1, i2, i3);
|
ggml_tensor_set_f32(vace_context, value, i0, i1, i2, i3);
|
||||||
});
|
});
|
||||||
|
int64_t t2 = ggml_time_ms();
|
||||||
|
LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
print_ggml_tensor(vace_context);
|
||||||
|
|
||||||
if (init_latent == NULL) {
|
if (init_latent == NULL) {
|
||||||
init_latent = generate_init_latent(sd_ctx, work_ctx, width, height, frames, true);
|
init_latent = generate_init_latent(sd_ctx, work_ctx, width, height, frames, true);
|
||||||
}
|
}
|
||||||
@ -2690,6 +2724,20 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ref_image_num > 0) {
|
||||||
|
ggml_tensor* trim_latent = ggml_new_tensor_4d(work_ctx,
|
||||||
|
GGML_TYPE_F32,
|
||||||
|
final_latent->ne[0],
|
||||||
|
final_latent->ne[1],
|
||||||
|
final_latent->ne[2] - ref_image_num,
|
||||||
|
final_latent->ne[3]);
|
||||||
|
ggml_tensor_iter(trim_latent, [&](ggml_tensor* trim_latent, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
|
||||||
|
float value = ggml_tensor_get_f32(final_latent, i0, i1, i2 + ref_image_num, i3);
|
||||||
|
ggml_tensor_set_f32(trim_latent, value, i0, i1, i2, i3);
|
||||||
|
});
|
||||||
|
final_latent = trim_latent;
|
||||||
|
}
|
||||||
|
|
||||||
int64_t t4 = ggml_time_ms();
|
int64_t t4 = ggml_time_ms();
|
||||||
LOG_INFO("generating latent video completed, taking %.2fs", (t4 - t2) * 1.0f / 1000);
|
LOG_INFO("generating latent video completed, taking %.2fs", (t4 - t2) * 1.0f / 1000);
|
||||||
struct ggml_tensor* vid = sd_ctx->sd->decode_first_stage(work_ctx, final_latent, true);
|
struct ggml_tensor* vid = sd_ctx->sd->decode_first_stage(work_ctx, final_latent, true);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user