From f68ce0582abac6f6d41c81fc77965ca48efef9fd Mon Sep 17 00:00:00 2001 From: leejet Date: Sat, 13 Sep 2025 16:08:56 +0800 Subject: [PATCH] add vace v2v support --- examples/cli/main.cpp | 75 +++++++++++++++++++++++++++++++++++++------ ggml_extend.hpp | 57 ++++++++++++-------------------- preprocessing.hpp | 17 +++++----- stable-diffusion.cpp | 24 ++++++++------ stable-diffusion.h | 16 ++++----- upscaler.cpp | 2 +- 6 files changed, 118 insertions(+), 73 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 7779db2..97dd010 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -35,6 +35,8 @@ #define SAFE_STR(s) ((s) ? (s) : "") #define BOOL_STR(b) ((b) ? "true" : "false") +namespace fs = std::filesystem; + const char* modes_str[] = { "img_gen", "vid_gen", @@ -75,6 +77,7 @@ struct SDParams { std::string mask_image_path; std::string control_image_path; std::vector ref_image_paths; + std::string control_video_path; bool increase_ref_index = false; std::string prompt; @@ -158,6 +161,7 @@ void print_params(SDParams params) { for (auto& path : params.ref_image_paths) { printf(" %s\n", path.c_str()); }; + printf(" control_video_path: %s\n", params.control_video_path.c_str()); printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false"); printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false"); printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false"); @@ -178,7 +182,7 @@ void print_params(SDParams params) { printf(" flow_shift: %.2f\n", params.flow_shift); printf(" strength(img2img): %.2f\n", params.strength); printf(" rng: %s\n", sd_rng_type_name(params.rng_type)); - printf(" seed: %ld\n", params.seed); + printf(" seed: %zd\n", params.seed); printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling ? 
"true" : "false");
 printf(" upscale_repeats: %d\n", params.upscale_repeats);
@@ -226,6 +230,9 @@ void print_usage(int argc, const char* argv[]) {
 printf(" -i, --end-img [IMAGE] path to the end image, required by flf2v\n");
 printf(" --control-image [IMAGE] path to image condition, control net\n");
 printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n");
+ printf(" --control-video [PATH] path to control video frames; it must be a directory path.\n");
+ printf(" The video frames inside should be stored as images in lexicographical (character) order\n");
+ printf(" For example, if the control video path is `frames`, the directory contains images such as 00.png, 01.png, ... etc.\n");
 printf(" --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).\n");
 printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
 printf(" -p, --prompt [PROMPT] the prompt to render\n");
@@ -484,6 +491,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
 {"", "--input-id-images-dir", "", &params.input_id_images_path},
 {"", "--mask", "", &params.mask_image_path},
 {"", "--control-image", "", &params.control_image_path},
+ {"", "--control-video", "", &params.control_video_path},
 {"-o", "--output", "", &params.output_path},
 {"-p", "--prompt", "", &params.prompt},
 {"-n", "--negative-prompt", "", &params.negative_prompt},
@@ -1062,6 +1070,7 @@ int main(int argc, const char* argv[]) {
 sd_image_t control_image = {(uint32_t)params.width, (uint32_t)params.height, 3, NULL};
 sd_image_t mask_image = {(uint32_t)params.width, (uint32_t)params.height, 1, NULL};
 std::vector<sd_image_t> ref_images;
+ std::vector<sd_image_t> control_frames;
 auto release_all_resources = [&]() {
 free(init_image.data);
@@ -1073,6 +1082,11 @@ int main(int argc, const char* argv[]) {
 for (auto& ref_image : ref_images) {
 free(ref_image.data);
 ref_image.data = NULL;
 }
 ref_images.clear();
+ for (auto frame : control_frames) {
+ free(frame.data);
+ frame.data = NULL;
+ }
+
control_frames.clear();
 };

 if (params.init_image_path.size() > 0) {
@@ -1131,14 +1145,12 @@ int main(int argc, const char* argv[]) {
 return 1;
 }
 if (params.canny_preprocess) { // apply preprocessor
- control_image.data = preprocess_canny(control_image.data,
- control_image.width,
- control_image.height,
- 0.08f,
- 0.08f,
- 0.8f,
- 1.0f,
- false);
+ preprocess_canny(control_image,
+ 0.08f,
+ 0.08f,
+ 0.8f,
+ 1.0f,
+ false);
 }
 }
@@ -1160,6 +1172,55 @@ int main(int argc, const char* argv[]) {
 }
 }

+ if (!params.control_video_path.empty()) {
+ std::string dir = params.control_video_path;
+
+ if (!fs::exists(dir) || !fs::is_directory(dir)) {
+ fprintf(stderr, "'%s' is not a valid directory\n", dir.c_str());
+ release_all_resources();
+ return 1;
+ }
+
+ // fs::directory_iterator visits entries in an unspecified order, so
+ // collect the matching paths first and sort them lexicographically to
+ // honor the frame order promised by the --control-video help text.
+ std::vector<std::string> frame_paths;
+ for (const auto& entry : fs::directory_iterator(dir)) {
+ if (!entry.is_regular_file())
+ continue;
+
+ std::string ext = entry.path().extension().string();
+ std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
+ if (ext == ".jpg" || ext == ".jpeg" || ext == ".png" || ext == ".bmp") {
+ frame_paths.push_back(entry.path().string());
+ }
+ }
+ std::sort(frame_paths.begin(), frame_paths.end());
+
+ for (const std::string& path : frame_paths) {
+ if (params.verbose) {
+ printf("load control frame %zu from '%s'\n", control_frames.size(), path.c_str());
+ }
+ int width = 0;
+ int height = 0;
+ uint8_t* image_buffer = load_image(path.c_str(), width, height, params.width, params.height);
+ if (image_buffer == NULL) {
+ fprintf(stderr, "load image from '%s' failed\n", path.c_str());
+ release_all_resources();
+ return 1;
+ }
+
+ control_frames.push_back({(uint32_t)params.width,
+ (uint32_t)params.height,
+ 3,
+ image_buffer});
+
+ if ((int)control_frames.size() >= params.video_frames) {
+ break;
+ }
+ }
+ }
+
 if (params.mode == VID_GEN) {
 vae_decode_only = false;
 }
@@ -1239,6 +1300,8 @@ int main(int argc, const char* argv[]) {
 params.clip_skip,
 init_image,
 end_image,
+ control_frames.data(),
+ (int)control_frames.size(),
 params.width,
 params.height,
 params.sample_params,
@@ -1290,7 +1353,6 @@ int main(int argc, const char* argv[])
{ // create directory if not exists { - namespace fs = std::filesystem; const fs::path out_path = params.output_path; if (const fs::path out_dir = out_path.parent_path(); !out_dir.empty()) { std::error_code ec; diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 390fc49..a2a7435 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -173,6 +173,14 @@ __STATIC_INLINE__ ggml_fp16_t ggml_tensor_get_f16(const ggml_tensor* tensor, int return *(ggml_fp16_t*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]); } +__STATIC_INLINE__ float sd_image_get_f32(sd_image_t image, int iw, int ih, int ic, bool scale = true) { + float value = *(image.data + ih * image.width * image.channel + iw * image.channel + ic); + if (scale) { + value /= 255.f; + } + return value; +} + static struct ggml_tensor* get_tensor_from_graph(struct ggml_cgraph* gf, const char* name) { struct ggml_tensor* res = NULL; for (int i = 0; i < ggml_graph_n_nodes(gf); i++) { @@ -255,13 +263,12 @@ __STATIC_INLINE__ void ggml_tensor_iter( } } - __STATIC_INLINE__ void ggml_tensor_diff( ggml_tensor* a, ggml_tensor* b, float gap = 0.1f) { GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); - ggml_tensor_iter(a, [&] (ggml_tensor* a, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + ggml_tensor_iter(a, [&](ggml_tensor* a, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { float a_value = ggml_tensor_get_f32(a, i0, i1, i2, i3); float b_value = ggml_tensor_get_f32(b, i0, i1, i2, i3); if (abs(a_value - b_value) > gap) { @@ -401,42 +408,18 @@ __STATIC_INLINE__ uint8_t* sd_tensor_to_image(struct ggml_tensor* input, int idx return image_data; } -__STATIC_INLINE__ void sd_image_to_tensor(const uint8_t* image_data, - struct ggml_tensor* output, +__STATIC_INLINE__ void sd_image_to_tensor(sd_image_t image, + ggml_tensor* tensor, bool scale = true) { - int64_t width = output->ne[0]; - int64_t height = output->ne[1]; - int64_t channels = output->ne[2]; - GGML_ASSERT(channels == 
3 && output->type == GGML_TYPE_F32); - for (int iy = 0; iy < height; iy++) { - for (int ix = 0; ix < width; ix++) { - for (int k = 0; k < channels; k++) { - float value = *(image_data + iy * width * channels + ix * channels + k); - if (scale) { - value /= 255.f; - } - ggml_tensor_set_f32(output, value, ix, iy, k); - } - } - } -} - -__STATIC_INLINE__ void sd_mask_to_tensor(const uint8_t* image_data, - struct ggml_tensor* output, - bool scale = true) { - int64_t width = output->ne[0]; - int64_t height = output->ne[1]; - int64_t channels = output->ne[2]; - GGML_ASSERT(channels == 1 && output->type == GGML_TYPE_F32); - for (int iy = 0; iy < height; iy++) { - for (int ix = 0; ix < width; ix++) { - float value = *(image_data + iy * width * channels + ix); - if (scale) { - value /= 255.f; - } - ggml_tensor_set_f32(output, value, ix, iy); - } - } + GGML_ASSERT(image.width == tensor->ne[0]); + GGML_ASSERT(image.height == tensor->ne[1]); + GGML_ASSERT(image.channel == tensor->ne[2]); + GGML_ASSERT(1 == tensor->ne[3]); + GGML_ASSERT(tensor->type == GGML_TYPE_F32); + ggml_tensor_iter(tensor, [&](ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + float value = sd_image_get_f32(image, i0, i1, i2, scale); + ggml_tensor_set_f32(tensor, value, i0, i1, i2, i3); + }); } __STATIC_INLINE__ void sd_apply_mask(struct ggml_tensor* image_data, diff --git a/preprocessing.hpp b/preprocessing.hpp index 4ea1dba..08df4a7 100644 --- a/preprocessing.hpp +++ b/preprocessing.hpp @@ -162,7 +162,7 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo } } -uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_threshold, float low_threshold, float weak, float strong, bool inverse) { +bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) { struct ggml_init_params params; params.mem_size = static_cast(10 * 1024 * 1024); // 10 params.mem_buffer = NULL; @@ -171,7 +171,7 
@@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh if (!work_ctx) { LOG_ERROR("ggml_init() failed"); - return NULL; + return false; } float kX[9] = { @@ -192,8 +192,8 @@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh struct ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1); memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky)); gaussian_kernel(gkernel); - struct ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1); + struct ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 3, 1); + struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 1, 1); struct ggml_tensor* iX = ggml_dup_tensor(work_ctx, image_gray); struct ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray); struct ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray); @@ -209,8 +209,8 @@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh non_max_supression(image_gray, G, tetha); threshold_hystersis(image_gray, high_threshold, low_threshold, weak, strong); // to RGB channels - for (int iy = 0; iy < height; iy++) { - for (int ix = 0; ix < width; ix++) { + for (int iy = 0; iy < img.height; iy++) { + for (int ix = 0; ix < img.width; ix++) { float gray = ggml_tensor_get_f32(image_gray, ix, iy); gray = inverse ? 
1.0f - gray : gray;
 ggml_tensor_set_f32(image, gray, ix, iy);
@@ -218,10 +218,14 @@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh
 ggml_tensor_set_f32(image, gray, ix, iy, 2);
 }
 }
- free(img);
 uint8_t* output = sd_tensor_to_image(image);
+ // NOTE: `img` is passed by value, so assigning to `img.data` would only
+ // update this function's local copy and leave the caller holding a
+ // freed pointer. Copy the result into the caller's buffer instead.
+ memcpy(img.data, output, (size_t)img.width * img.height * img.channel);
+ free(output);
 ggml_free(work_ctx);
- return output;
+ return true;
 }

 #endif // __PREPROCESSING_HPP__
\ No newline at end of file
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index e269508..807b624 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -952,7 +952,7 @@ public:
 free(resized_image.data);
 resized_image.data = NULL;
 } else {
- sd_image_to_tensor(init_image.data, init_img);
+ sd_image_to_tensor(init_image, init_img);
 }
 if (augmentation_level > 0.f) {
 struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_img);
@@ -1947,7 +1947,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
 struct ggml_tensor* image_hint = NULL;
 if (control_image.data != NULL) {
 image_hint = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
- sd_image_to_tensor(control_image.data, image_hint);
+ sd_image_to_tensor(control_image, image_hint);
 }

 // Sample
@@ -2208,8 +2208,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
 ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
 ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
- sd_mask_to_tensor(sd_img_gen_params->mask_image.data, mask_img);
- sd_image_to_tensor(sd_img_gen_params->init_image.data, init_img);
+ sd_image_to_tensor(sd_img_gen_params->mask_image, mask_img);
+ sd_image_to_tensor(sd_img_gen_params->init_image, init_img);
 if (sd_version_is_inpaint(sd_ctx->sd->version)) {
 int64_t mask_channels = 1;
@@ -2300,7 +2300,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
 sd_img_gen_params->ref_images[i].height,
 3,
 1);
-
sd_image_to_tensor(sd_img_gen_params->ref_images[i].data, img); + sd_image_to_tensor(sd_img_gen_params->ref_images[i], img); ggml_tensor* latent = NULL; if (sd_ctx->sd->use_tiny_autoencoder) { @@ -2401,7 +2401,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } struct ggml_init_params params; - params.mem_size = static_cast(1024 * 1024) * 1024; // 1G + params.mem_size = static_cast(1024 * 1024) * 1024; // 1G params.mem_buffer = NULL; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); @@ -2500,7 +2500,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s int64_t t1 = ggml_time_ms(); ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - sd_image_to_tensor(sd_vid_gen_params->init_image.data, init_img); + sd_image_to_tensor(sd_vid_gen_params->init_image, init_img); init_img = ggml_reshape_4d(work_ctx, init_img, width, height, 1, 3); auto init_image_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); // [b*c, 1, h/16, w/16] @@ -2530,7 +2530,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s ggml_tensor* ref_image_latent = NULL; if (sd_vid_gen_params->init_image.data) { ggml_tensor* ref_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - sd_image_to_tensor(sd_vid_gen_params->init_image.data, ref_img); + sd_image_to_tensor(sd_vid_gen_params->init_image, ref_img); ref_img = ggml_reshape_4d(work_ctx, ref_img, width, height, 1, 3); ref_image_latent = sd_ctx->sd->encode_first_stage(work_ctx, ref_img); // [b*c, 1, h/16, w/16] @@ -2541,7 +2541,13 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } ggml_tensor* control_video = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 3); - ggml_set_f32(control_video, 0.5f); + ggml_tensor_iter(control_video, [&](ggml_tensor* control_video, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + float 
value = 0.5f; + if (i2 < sd_vid_gen_params->control_frames_size) { + value = sd_image_get_f32(sd_vid_gen_params->control_frames[i2], i0, i1, i3); + } + ggml_tensor_set_f32(control_video, value, i0, i1, i2, i3); + }); ggml_tensor* mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 1); ggml_set_f32(mask, 1.0f); ggml_tensor* inactive = ggml_dup_tensor(work_ctx, control_video); diff --git a/stable-diffusion.h b/stable-diffusion.h index 57aad81..3abe195 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -203,6 +203,8 @@ typedef struct { int clip_skip; sd_image_t init_image; sd_image_t end_image; + sd_image_t* control_frames; + int control_frames_size; int width; int height; sd_sample_params_t sample_params; @@ -267,14 +269,12 @@ SD_API bool convert(const char* input_path, enum sd_type_t output_type, const char* tensor_type_rules); -SD_API uint8_t* preprocess_canny(uint8_t* img, - int width, - int height, - float high_threshold, - float low_threshold, - float weak, - float strong, - bool inverse); +SD_API bool preprocess_canny(sd_image_t image, + float high_threshold, + float low_threshold, + float weak, + float strong, + bool inverse); #ifdef __cplusplus } diff --git a/upscaler.cpp b/upscaler.cpp index 4ab0b73..652453a 100644 --- a/upscaler.cpp +++ b/upscaler.cpp @@ -82,7 +82,7 @@ struct UpscalerGGML { } LOG_DEBUG("upscale work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f); ggml_tensor* input_image_tensor = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, input_image.width, input_image.height, 3, 1); - sd_image_to_tensor(input_image.data, input_image_tensor); + sd_image_to_tensor(input_image, input_image_tensor); ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1); auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {