From f68ce0582abac6f6d41c81fc77965ca48efef9fd Mon Sep 17 00:00:00 2001 From: leejet Date: Sat, 13 Sep 2025 16:08:56 +0800 Subject: [PATCH] add vace v2v support --- examples/cli/main.cpp | 75 +++++++++++++++++++++++++++++++++++++------ ggml_extend.hpp | 57 ++++++++++++-------------------- preprocessing.hpp | 17 +++++----- stable-diffusion.cpp | 24 ++++++++------ stable-diffusion.h | 16 ++++----- upscaler.cpp | 2 +- 6 files changed, 118 insertions(+), 73 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 7779db2..97dd010 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -35,6 +35,8 @@ #define SAFE_STR(s) ((s) ? (s) : "") #define BOOL_STR(b) ((b) ? "true" : "false") +namespace fs = std::filesystem; + const char* modes_str[] = { "img_gen", "vid_gen", @@ -75,6 +77,7 @@ struct SDParams { std::string mask_image_path; std::string control_image_path; std::vector ref_image_paths; + std::string control_video_path; bool increase_ref_index = false; std::string prompt; @@ -158,6 +161,7 @@ void print_params(SDParams params) { for (auto& path : params.ref_image_paths) { printf(" %s\n", path.c_str()); }; + printf(" control_video_path: %s\n", params.control_video_path.c_str()); printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false"); printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false"); printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false"); @@ -178,7 +182,7 @@ void print_params(SDParams params) { printf(" flow_shift: %.2f\n", params.flow_shift); printf(" strength(img2img): %.2f\n", params.strength); printf(" rng: %s\n", sd_rng_type_name(params.rng_type)); - printf(" seed: %ld\n", params.seed); + printf(" seed: %zd\n", params.seed); printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling ? 
"true" : "false");
 printf(" upscale_repeats: %d\n", params.upscale_repeats);
@@ -226,6 +230,9 @@ void print_usage(int argc, const char* argv[]) {
 printf(" -i, --end-img [IMAGE] path to the end image, required by flf2v\n");
 printf(" --control-image [IMAGE] path to image condition, control net\n");
 printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n");
+ printf(" --control-video [PATH] path to control video frames; it must be a directory path.\n");
+ printf(" The video frames inside should be stored as images in lexicographical (character) order\n");
+ printf(" For example, if the control video path is `frames`, the directory contains images such as 00.png, 01.png, ... etc.\n");
 printf(" --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).\n");
 printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
 printf(" -p, --prompt [PROMPT] the prompt to render\n");
@@ -484,6 +491,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
 {"", "--input-id-images-dir", "", &params.input_id_images_path},
 {"", "--mask", "", &params.mask_image_path},
 {"", "--control-image", "", &params.control_image_path},
+ {"", "--control-video", "", &params.control_video_path},
 {"-o", "--output", "", &params.output_path},
 {"-p", "--prompt", "", &params.prompt},
 {"-n", "--negative-prompt", "", &params.negative_prompt},
@@ -1062,6 +1070,7 @@ int main(int argc, const char* argv[]) {
 sd_image_t control_image = {(uint32_t)params.width, (uint32_t)params.height, 3, NULL};
 sd_image_t mask_image = {(uint32_t)params.width, (uint32_t)params.height, 1, NULL};
 std::vector<sd_image_t> ref_images;
+ std::vector<sd_image_t> control_frames;
 auto release_all_resources = [&]() {
 free(init_image.data);
@@ -1073,6 +1082,11 @@ int main(int argc, const char* argv[]) {
 for (auto& ref_image : ref_images) {
 free(ref_image.data);
 ref_image.data = NULL;
 }
 ref_images.clear();
+ for (auto frame : control_frames) {
+ free(frame.data);
+ frame.data = NULL;
+ }
+
control_frames.clear();
 };

 if (params.init_image_path.size() > 0) {
@@ -1131,14 +1145,12 @@ int main(int argc, const char* argv[]) {
 return 1;
 }
 if (params.canny_preprocess) { // apply preprocessor
- control_image.data = preprocess_canny(control_image.data,
- control_image.width,
- control_image.height,
- 0.08f,
- 0.08f,
- 0.8f,
- 1.0f,
- false);
+ preprocess_canny(control_image,
+ 0.08f,
+ 0.08f,
+ 0.8f,
+ 1.0f,
+ false);
 }
 }
@@ -1160,6 +1172,55 @@ int main(int argc, const char* argv[]) {
 }
 }

+ if (!params.control_video_path.empty()) {
+ std::string dir = params.control_video_path;
+
+ if (!fs::exists(dir) || !fs::is_directory(dir)) {
+ fprintf(stderr, "'%s' is not a valid directory\n", dir.c_str());
+ release_all_resources();
+ return 1;
+ }
+
+ // fs::directory_iterator visits entries in an unspecified order, so
+ // collect the matching paths first and sort them lexicographically to
+ // honor the frame order promised by the --control-video help text.
+ std::vector<std::string> frame_paths;
+ for (const auto& entry : fs::directory_iterator(dir)) {
+ if (!entry.is_regular_file())
+ continue;
+
+ std::string ext = entry.path().extension().string();
+ std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
+ if (ext == ".jpg" || ext == ".jpeg" || ext == ".png" || ext == ".bmp") {
+ frame_paths.push_back(entry.path().string());
+ }
+ }
+ std::sort(frame_paths.begin(), frame_paths.end());
+
+ for (const std::string& path : frame_paths) {
+ if (params.verbose) {
+ printf("load control frame %zu from '%s'\n", control_frames.size(), path.c_str());
+ }
+ int width = 0;
+ int height = 0;
+ uint8_t* image_buffer = load_image(path.c_str(), width, height, params.width, params.height);
+ if (image_buffer == NULL) {
+ fprintf(stderr, "load image from '%s' failed\n", path.c_str());
+ release_all_resources();
+ return 1;
+ }
+
+ control_frames.push_back({(uint32_t)params.width,
+ (uint32_t)params.height,
+ 3,
+ image_buffer});
+
+ if ((int)control_frames.size() >= params.video_frames) {
+ break;
+ }
+ }
+ }
+
 if (params.mode == VID_GEN) {
 vae_decode_only = false;
 }
@@ -1239,6 +1300,8 @@ int main(int argc, const char* argv[]) {
 params.clip_skip,
 init_image,
 end_image,
+ control_frames.data(),
+ (int)control_frames.size(),
 params.width,
 params.height,
 params.sample_params,
@@ -1290,7 +1353,6 @@ int main(int argc, const char* argv[])
{ // create directory if not exists { - namespace fs = std::filesystem; const fs::path out_path = params.output_path; if (const fs::path out_dir = out_path.parent_path(); !out_dir.empty()) { std::error_code ec; diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 390fc49..a2a7435 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -173,6 +173,14 @@ __STATIC_INLINE__ ggml_fp16_t ggml_tensor_get_f16(const ggml_tensor* tensor, int return *(ggml_fp16_t*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]); } +__STATIC_INLINE__ float sd_image_get_f32(sd_image_t image, int iw, int ih, int ic, bool scale = true) { + float value = *(image.data + ih * image.width * image.channel + iw * image.channel + ic); + if (scale) { + value /= 255.f; + } + return value; +} + static struct ggml_tensor* get_tensor_from_graph(struct ggml_cgraph* gf, const char* name) { struct ggml_tensor* res = NULL; for (int i = 0; i < ggml_graph_n_nodes(gf); i++) { @@ -255,13 +263,12 @@ __STATIC_INLINE__ void ggml_tensor_iter( } } - __STATIC_INLINE__ void ggml_tensor_diff( ggml_tensor* a, ggml_tensor* b, float gap = 0.1f) { GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); - ggml_tensor_iter(a, [&] (ggml_tensor* a, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + ggml_tensor_iter(a, [&](ggml_tensor* a, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { float a_value = ggml_tensor_get_f32(a, i0, i1, i2, i3); float b_value = ggml_tensor_get_f32(b, i0, i1, i2, i3); if (abs(a_value - b_value) > gap) { @@ -401,42 +408,18 @@ __STATIC_INLINE__ uint8_t* sd_tensor_to_image(struct ggml_tensor* input, int idx return image_data; } -__STATIC_INLINE__ void sd_image_to_tensor(const uint8_t* image_data, - struct ggml_tensor* output, +__STATIC_INLINE__ void sd_image_to_tensor(sd_image_t image, + ggml_tensor* tensor, bool scale = true) { - int64_t width = output->ne[0]; - int64_t height = output->ne[1]; - int64_t channels = output->ne[2]; - GGML_ASSERT(channels == 
3 && output->type == GGML_TYPE_F32); - for (int iy = 0; iy < height; iy++) { - for (int ix = 0; ix < width; ix++) { - for (int k = 0; k < channels; k++) { - float value = *(image_data + iy * width * channels + ix * channels + k); - if (scale) { - value /= 255.f; - } - ggml_tensor_set_f32(output, value, ix, iy, k); - } - } - } -} - -__STATIC_INLINE__ void sd_mask_to_tensor(const uint8_t* image_data, - struct ggml_tensor* output, - bool scale = true) { - int64_t width = output->ne[0]; - int64_t height = output->ne[1]; - int64_t channels = output->ne[2]; - GGML_ASSERT(channels == 1 && output->type == GGML_TYPE_F32); - for (int iy = 0; iy < height; iy++) { - for (int ix = 0; ix < width; ix++) { - float value = *(image_data + iy * width * channels + ix); - if (scale) { - value /= 255.f; - } - ggml_tensor_set_f32(output, value, ix, iy); - } - } + GGML_ASSERT(image.width == tensor->ne[0]); + GGML_ASSERT(image.height == tensor->ne[1]); + GGML_ASSERT(image.channel == tensor->ne[2]); + GGML_ASSERT(1 == tensor->ne[3]); + GGML_ASSERT(tensor->type == GGML_TYPE_F32); + ggml_tensor_iter(tensor, [&](ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + float value = sd_image_get_f32(image, i0, i1, i2, scale); + ggml_tensor_set_f32(tensor, value, i0, i1, i2, i3); + }); } __STATIC_INLINE__ void sd_apply_mask(struct ggml_tensor* image_data, diff --git a/preprocessing.hpp b/preprocessing.hpp index 4ea1dba..08df4a7 100644 --- a/preprocessing.hpp +++ b/preprocessing.hpp @@ -162,7 +162,7 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo } } -uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_threshold, float low_threshold, float weak, float strong, bool inverse) { +bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) { struct ggml_init_params params; params.mem_size = static_cast(10 * 1024 * 1024); // 10 params.mem_buffer = NULL; @@ -171,7 +171,7 
@@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh if (!work_ctx) { LOG_ERROR("ggml_init() failed"); - return NULL; + return false; } float kX[9] = { @@ -192,8 +192,8 @@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh struct ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1); memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky)); gaussian_kernel(gkernel); - struct ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1); + struct ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 3, 1); + struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 1, 1); struct ggml_tensor* iX = ggml_dup_tensor(work_ctx, image_gray); struct ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray); struct ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray); @@ -209,8 +209,8 @@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh non_max_supression(image_gray, G, tetha); threshold_hystersis(image_gray, high_threshold, low_threshold, weak, strong); // to RGB channels - for (int iy = 0; iy < height; iy++) { - for (int ix = 0; ix < width; ix++) { + for (int iy = 0; iy < img.height; iy++) { + for (int ix = 0; ix < img.width; ix++) { float gray = ggml_tensor_get_f32(image_gray, ix, iy); gray = inverse ? 
1.0f - gray : gray;
 ggml_tensor_set_f32(image, gray, ix, iy);
@@ -218,10 +218,14 @@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh
 ggml_tensor_set_f32(image, gray, ix, iy, 2);
 }
 }
- free(img);
 uint8_t* output = sd_tensor_to_image(image);
+ // NOTE: `img` is passed by value, so assigning to `img.data` would only
+ // update this function's local copy and leave the caller holding a
+ // freed pointer. Copy the result into the caller's buffer instead.
+ memcpy(img.data, output, (size_t)img.width * img.height * img.channel);
+ free(output);
 ggml_free(work_ctx);
- return output;
+ return true;
 }

 #endif // __PREPROCESSING_HPP__
\ No newline at end of file
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index e269508..807b624 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -952,7 +952,7 @@ public:
 free(resized_image.data);
 resized_image.data = NULL;
 } else {
- sd_image_to_tensor(init_image.data, init_img);
+ sd_image_to_tensor(init_image, init_img);
 }
 if (augmentation_level > 0.f) {
 struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_img);
@@ -1947,7 +1947,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
 struct ggml_tensor* image_hint = NULL;
 if (control_image.data != NULL) {
 image_hint = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
- sd_image_to_tensor(control_image.data, image_hint);
+ sd_image_to_tensor(control_image, image_hint);
 }

 // Sample
@@ -2208,8 +2208,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
 ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
 ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
- sd_mask_to_tensor(sd_img_gen_params->mask_image.data, mask_img);
- sd_image_to_tensor(sd_img_gen_params->init_image.data, init_img);
+ sd_image_to_tensor(sd_img_gen_params->mask_image, mask_img);
+ sd_image_to_tensor(sd_img_gen_params->init_image, init_img);
 if (sd_version_is_inpaint(sd_ctx->sd->version)) {
 int64_t mask_channels = 1;
@@ -2300,7 +2300,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
 sd_img_gen_params->ref_images[i].height,
 3,
 1);
-
sd_image_to_tensor(sd_img_gen_params->ref_images[i].data, img); + sd_image_to_tensor(sd_img_gen_params->ref_images[i], img); ggml_tensor* latent = NULL; if (sd_ctx->sd->use_tiny_autoencoder) { @@ -2401,7 +2401,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } struct ggml_init_params params; - params.mem_size = static_cast(1024 * 1024) * 1024; // 1G + params.mem_size = static_cast(1024 * 1024) * 1024; // 1G params.mem_buffer = NULL; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); @@ -2500,7 +2500,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s int64_t t1 = ggml_time_ms(); ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - sd_image_to_tensor(sd_vid_gen_params->init_image.data, init_img); + sd_image_to_tensor(sd_vid_gen_params->init_image, init_img); init_img = ggml_reshape_4d(work_ctx, init_img, width, height, 1, 3); auto init_image_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); // [b*c, 1, h/16, w/16] @@ -2530,7 +2530,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s ggml_tensor* ref_image_latent = NULL; if (sd_vid_gen_params->init_image.data) { ggml_tensor* ref_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - sd_image_to_tensor(sd_vid_gen_params->init_image.data, ref_img); + sd_image_to_tensor(sd_vid_gen_params->init_image, ref_img); ref_img = ggml_reshape_4d(work_ctx, ref_img, width, height, 1, 3); ref_image_latent = sd_ctx->sd->encode_first_stage(work_ctx, ref_img); // [b*c, 1, h/16, w/16] @@ -2541,7 +2541,13 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } ggml_tensor* control_video = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 3); - ggml_set_f32(control_video, 0.5f); + ggml_tensor_iter(control_video, [&](ggml_tensor* control_video, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + float 
value = 0.5f; + if (i2 < sd_vid_gen_params->control_frames_size) { + value = sd_image_get_f32(sd_vid_gen_params->control_frames[i2], i0, i1, i3); + } + ggml_tensor_set_f32(control_video, value, i0, i1, i2, i3); + }); ggml_tensor* mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 1); ggml_set_f32(mask, 1.0f); ggml_tensor* inactive = ggml_dup_tensor(work_ctx, control_video); diff --git a/stable-diffusion.h b/stable-diffusion.h index 57aad81..3abe195 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -203,6 +203,8 @@ typedef struct { int clip_skip; sd_image_t init_image; sd_image_t end_image; + sd_image_t* control_frames; + int control_frames_size; int width; int height; sd_sample_params_t sample_params; @@ -267,14 +269,12 @@ SD_API bool convert(const char* input_path, enum sd_type_t output_type, const char* tensor_type_rules); -SD_API uint8_t* preprocess_canny(uint8_t* img, - int width, - int height, - float high_threshold, - float low_threshold, - float weak, - float strong, - bool inverse); +SD_API bool preprocess_canny(sd_image_t image, + float high_threshold, + float low_threshold, + float weak, + float strong, + bool inverse); #ifdef __cplusplus } diff --git a/upscaler.cpp b/upscaler.cpp index 4ab0b73..652453a 100644 --- a/upscaler.cpp +++ b/upscaler.cpp @@ -82,7 +82,7 @@ struct UpscalerGGML { } LOG_DEBUG("upscale work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f); ggml_tensor* input_image_tensor = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, input_image.width, input_image.height, 3, 1); - sd_image_to_tensor(input_image.data, input_image_tensor); + sd_image_to_tensor(input_image, input_image_tensor); ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1); auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {