add vace v2v support

2025-12-13 05:48:56 +00:00 · 2025-09-13 16:08:56 +08:00 · 2025-09-13 16:08:56 +08:00 · f68ce0582a
commit f68ce0582a
parent e751ae6d6f
6 changed files with 118 additions and 73 deletions
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -35,6 +35,8 @@
 #define SAFE_STR(s) ((s) ? (s) : "")
 #define BOOL_STR(b) ((b) ? "true" : "false")
 namespace fs = std::filesystem;
 const char* modes_str[] = {
    "img_gen",
    "vid_gen",
@ -75,6 +77,7 @@ struct SDParams {
    std::string mask_image_path;
    std::string control_image_path;
    std::vector<std::string> ref_image_paths;
    std::string control_video_path;
    bool increase_ref_index = false;
    std::string prompt;
@ -158,6 +161,7 @@ void print_params(SDParams params) {
    for (auto& path : params.ref_image_paths) {
        printf("        %s\n", path.c_str());
    };
    printf("    control_video_path:                %s\n", params.control_video_path.c_str());
    printf("    increase_ref_index:                %s\n", params.increase_ref_index ? "true" : "false");
    printf("    offload_params_to_cpu:             %s\n", params.offload_params_to_cpu ? "true" : "false");
    printf("    clip_on_cpu:                       %s\n", params.clip_on_cpu ? "true" : "false");
@ -178,7 +182,7 @@ void print_params(SDParams params) {
    printf("    flow_shift:                        %.2f\n", params.flow_shift);
    printf("    strength(img2img):                 %.2f\n", params.strength);
    printf("    rng:                               %s\n", sd_rng_type_name(params.rng_type));
-    printf("    seed:                              %ld\n", params.seed);
+    // %zd is the ssize_t conversion; print the seed through an explicit long long so the format is correct on every target.
+    printf("    seed:                              %lld\n", static_cast<long long>(params.seed));
    printf("    batch_count:                       %d\n", params.batch_count);
    printf("    vae_tiling:                        %s\n", params.vae_tiling ? "true" : "false");
    printf("    upscale_repeats:                   %d\n", params.upscale_repeats);
@ -226,6 +230,9 @@ void print_usage(int argc, const char* argv[]) {
    printf("  -i, --end-img [IMAGE]              path to the end image, required by flf2v\n");
    printf("  --control-image [IMAGE]            path to image condition, control net\n");
    printf("  -r, --ref-image [PATH]             reference image for Flux Kontext models (can be used multiple times) \n");
    // Terminate the option line with '\n' so the indented continuation lines that follow start on their own rows.
    printf("  --control-video [PATH]             path to control video frames, It must be a directory path.\n");
    printf("                                     The video frames inside should be stored as images in lexicographical (character) order\n");
    printf("                                     For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, … etc.\n");
    printf("  --increase-ref-index               automatically increase the indices of references images based on the order they are listed (starting with 1).\n");
    printf("  -o, --output OUTPUT                path to write result image to (default: ./output.png)\n");
    printf("  -p, --prompt [PROMPT]              the prompt to render\n");
@ -484,6 +491,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
        {"", "--input-id-images-dir", "", &params.input_id_images_path},
        {"", "--mask", "", &params.mask_image_path},
        {"", "--control-image", "", &params.control_image_path},
        {"", "--control-video", "", &params.control_video_path},
        {"-o", "--output", "", &params.output_path},
        {"-p", "--prompt", "", &params.prompt},
        {"-n", "--negative-prompt", "", &params.negative_prompt},
@ -1062,6 +1070,7 @@ int main(int argc, const char* argv[]) {
    sd_image_t control_image = {(uint32_t)params.width, (uint32_t)params.height, 3, NULL};
    sd_image_t mask_image    = {(uint32_t)params.width, (uint32_t)params.height, 1, NULL};
    std::vector<sd_image_t> ref_images;
    std::vector<sd_image_t> control_frames;
    auto release_all_resources = [&]() {
        free(init_image.data);
@ -1073,6 +1082,11 @@ int main(int argc, const char* argv[]) {
            ref_image.data = NULL;
        }
        ref_images.clear();
        // Release every loaded control frame. Iterate by reference so that
        // resetting `data` clears the stored element itself, not a copy of it.
        for (auto& frame : control_frames) {
            free(frame.data);
            frame.data = NULL;
        }
        control_frames.clear();
    };
    if (params.init_image_path.size() > 0) {
@ -1131,14 +1145,12 @@ int main(int argc, const char* argv[]) {
            return 1;
        }
        if (params.canny_preprocess) {  // apply preprocessor
-            control_image.data = preprocess_canny(control_image.data,
+            preprocess_canny(control_image,
-                                                  control_image.width,
+                             0.08f,
-                                                  control_image.height,
+                             0.08f,
-                                                  0.08f,
+                             0.8f,
-                                                  0.08f,
+                             1.0f,
-                                                  0.8f,
+                             false);
                                                  1.0f,
                                                  false);
        }
    }
@ -1160,6 +1172,48 @@ int main(int argc, const char* argv[]) {
        }
    }
    // Load the control video: every .jpg/.jpeg/.png/.bmp file found directly in
    // the given directory is one frame, consumed in lexicographical path order.
    if (!params.control_video_path.empty()) {
        const std::string dir = params.control_video_path;
        if (!fs::exists(dir) || !fs::is_directory(dir)) {
            fprintf(stderr, "'%s' is not a valid directory\n", dir.c_str());
            release_all_resources();
            return 1;
        }

        // fs::directory_iterator visits entries in an unspecified order, while
        // the usage text promises lexicographical order. Collect the candidate
        // paths first and sort them before any frame is loaded.
        std::vector<std::string> frame_paths;
        for (const auto& entry : fs::directory_iterator(dir)) {
            if (!entry.is_regular_file())
                continue;
            std::string ext = entry.path().extension().string();
            // tolower() is only defined for values representable as unsigned
            // char (or EOF), so route each character through unsigned char.
            std::transform(ext.begin(), ext.end(), ext.begin(),
                           [](unsigned char c) { return static_cast<char>(::tolower(c)); });
            if (ext == ".jpg" || ext == ".jpeg" || ext == ".png" || ext == ".bmp") {
                frame_paths.push_back(entry.path().string());
            }
        }
        // All paths share the same parent directory, so sorting the full path
        // strings orders the frames by file name.
        std::sort(frame_paths.begin(), frame_paths.end());

        for (const std::string& path : frame_paths) {
            if (params.verbose) {
                printf("load control frame %zu from '%s'\n", control_frames.size(), path.c_str());
            }
            int width             = 0;
            int height            = 0;
            uint8_t* image_buffer = load_image(path.c_str(), width, height, params.width, params.height);
            if (image_buffer == NULL) {
                fprintf(stderr, "load image from '%s' failed\n", path.c_str());
                release_all_resources();
                return 1;
            }
            control_frames.push_back({(uint32_t)params.width,
                                      (uint32_t)params.height,
                                      3,
                                      image_buffer});
            // Stop once enough frames for the requested video length are loaded.
            if (control_frames.size() >= params.video_frames) {
                break;
            }
        }
    }
    if (params.mode == VID_GEN) {
        vae_decode_only = false;
    }
@ -1239,6 +1293,8 @@ int main(int argc, const char* argv[]) {
            params.clip_skip,
            init_image,
            end_image,
            control_frames.data(),
            (int)control_frames.size(),
            params.width,
            params.height,
            params.sample_params,
@ -1290,7 +1346,6 @@ int main(int argc, const char* argv[]) {
    // create directory if not exists
    {
        namespace fs            = std::filesystem;
        const fs::path out_path = params.output_path;
        if (const fs::path out_dir = out_path.parent_path(); !out_dir.empty()) {
            std::error_code ec;
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -173,6 +173,14 @@ __STATIC_INLINE__ ggml_fp16_t ggml_tensor_get_f16(const ggml_tensor* tensor, int
    return *(ggml_fp16_t*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
 }
 // Read one sample of an interleaved, row-major image as a float.
 //   iw, ih - pixel column and row
 //   ic     - channel index within the pixel
 //   scale  - when true the raw sample is divided by 255
 // No bounds checking is performed on any of the indices.
 // NOTE(review): presumably image.data holds 8-bit samples - confirm against sd_image_t.
 __STATIC_INLINE__ float sd_image_get_f32(sd_image_t image, int iw, int ih, int ic, bool scale = true) {
    const auto* row   = image.data + ih * image.width * image.channel;
    const auto* pixel = row + iw * image.channel;
    const float raw   = pixel[ic];
    return scale ? raw / 255.f : raw;
 }
 static struct ggml_tensor* get_tensor_from_graph(struct ggml_cgraph* gf, const char* name) {
    struct ggml_tensor* res = NULL;
    for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
@ -255,13 +263,12 @@ __STATIC_INLINE__ void ggml_tensor_iter(
    }
 }
 __STATIC_INLINE__ void ggml_tensor_diff(
    ggml_tensor* a,
    ggml_tensor* b,
    float gap = 0.1f) {
    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
-    ggml_tensor_iter(a, [&] (ggml_tensor* a, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+    ggml_tensor_iter(a, [&](ggml_tensor* a, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
        float a_value = ggml_tensor_get_f32(a, i0, i1, i2, i3);
        float b_value = ggml_tensor_get_f32(b, i0, i1, i2, i3);
        if (abs(a_value - b_value) > gap) {
@ -401,42 +408,18 @@ __STATIC_INLINE__ uint8_t* sd_tensor_to_image(struct ggml_tensor* input, int idx
    return image_data;
 }
-__STATIC_INLINE__ void sd_image_to_tensor(const uint8_t* image_data,
+__STATIC_INLINE__ void sd_image_to_tensor(sd_image_t image,
-                                          struct ggml_tensor* output,
+                                          ggml_tensor* tensor,
                                          bool scale = true) {
-    int64_t width    = output->ne[0];
+    GGML_ASSERT(image.width == tensor->ne[0]);
-    int64_t height   = output->ne[1];
+    GGML_ASSERT(image.height == tensor->ne[1]);
-    int64_t channels = output->ne[2];
+    GGML_ASSERT(image.channel == tensor->ne[2]);
-    GGML_ASSERT(channels == 3 && output->type == GGML_TYPE_F32);
+    GGML_ASSERT(1 == tensor->ne[3]);
-    for (int iy = 0; iy < height; iy++) {
+    GGML_ASSERT(tensor->type == GGML_TYPE_F32);
-        for (int ix = 0; ix < width; ix++) {
+    ggml_tensor_iter(tensor, [&](ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-            for (int k = 0; k < channels; k++) {
+        float value = sd_image_get_f32(image, i0, i1, i2, scale);
-                float value = *(image_data + iy * width * channels + ix * channels + k);
+        ggml_tensor_set_f32(tensor, value, i0, i1, i2, i3);
-                if (scale) {
+    });
                    value /= 255.f;
                }
                ggml_tensor_set_f32(output, value, ix, iy, k);
            }
        }
    }
 }
 // Copy a single-channel 8-bit mask into `output`, one tensor element per pixel.
 //   image_data - row-major mask samples, output->ne[0] * output->ne[1] bytes are read
 //   output     - destination tensor; must be GGML_TYPE_F32 with ne[2] == 1
 //   scale      - when true each sample is divided by 255 before being stored
 __STATIC_INLINE__ void sd_mask_to_tensor(const uint8_t* image_data,
                                         struct ggml_tensor* output,
                                         bool scale = true) {
    const int64_t n_cols   = output->ne[0];
    const int64_t n_rows   = output->ne[1];
    const int64_t channels = output->ne[2];
    GGML_ASSERT(channels == 1 && output->type == GGML_TYPE_F32);
    for (int iy = 0; iy < n_rows; iy++) {
        // Start of the current mask row inside the source buffer.
        const uint8_t* row = image_data + iy * n_cols * channels;
        for (int ix = 0; ix < n_cols; ix++) {
            const float raw = row[ix];
            ggml_tensor_set_f32(output, scale ? raw / 255.f : raw, ix, iy);
        }
    }
 }
 __STATIC_INLINE__ void sd_apply_mask(struct ggml_tensor* image_data,
--- a/preprocessing.hpp
+++ b/preprocessing.hpp
@ -162,7 +162,7 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo
    }
 }
-uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
+bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
    struct ggml_init_params params;
    params.mem_size               = static_cast<size_t>(10 * 1024 * 1024);  // 10
    params.mem_buffer             = NULL;
@ -171,7 +171,7 @@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh
    if (!work_ctx) {
        LOG_ERROR("ggml_init() failed");
-        return NULL;
+        return false;
    }
    float kX[9] = {
@ -192,8 +192,8 @@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh
    struct ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
    memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky));
    gaussian_kernel(gkernel);
-    struct ggml_tensor* image      = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
+    struct ggml_tensor* image      = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 3, 1);
-    struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
+    struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 1, 1);
    struct ggml_tensor* iX         = ggml_dup_tensor(work_ctx, image_gray);
    struct ggml_tensor* iY         = ggml_dup_tensor(work_ctx, image_gray);
    struct ggml_tensor* G          = ggml_dup_tensor(work_ctx, image_gray);
@ -209,8 +209,8 @@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh
    non_max_supression(image_gray, G, tetha);
    threshold_hystersis(image_gray, high_threshold, low_threshold, weak, strong);
    // to RGB channels
-    for (int iy = 0; iy < height; iy++) {
+    for (int iy = 0; iy < img.height; iy++) {
-        for (int ix = 0; ix < width; ix++) {
+        for (int ix = 0; ix < img.width; ix++) {
            float gray = ggml_tensor_get_f32(image_gray, ix, iy);
            gray       = inverse ? 1.0f - gray : gray;
            ggml_tensor_set_f32(image, gray, ix, iy);
@ -218,10 +218,11 @@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh
            ggml_tensor_set_f32(image, gray, ix, iy, 2);
        }
    }
    free(img);
    uint8_t* output = sd_tensor_to_image(image);
    free(img.data);
    img.data = output;
    ggml_free(work_ctx);
-    return output;
+    return true;
 }
 #endif  // __PREPROCESSING_HPP__
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@ -952,7 +952,7 @@ public:
                    free(resized_image.data);
                    resized_image.data = NULL;
                } else {
-                    sd_image_to_tensor(init_image.data, init_img);
+                    sd_image_to_tensor(init_image, init_img);
                }
                if (augmentation_level > 0.f) {
                    struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_img);
@ -1947,7 +1947,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
    struct ggml_tensor* image_hint = NULL;
    if (control_image.data != NULL) {
        image_hint = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-        sd_image_to_tensor(control_image.data, image_hint);
+        sd_image_to_tensor(control_image, image_hint);
    }
    // Sample
@ -2208,8 +2208,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
        ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
        ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
-        sd_mask_to_tensor(sd_img_gen_params->mask_image.data, mask_img);
+        sd_image_to_tensor(sd_img_gen_params->mask_image, mask_img);
-        sd_image_to_tensor(sd_img_gen_params->init_image.data, init_img);
+        sd_image_to_tensor(sd_img_gen_params->init_image, init_img);
        if (sd_version_is_inpaint(sd_ctx->sd->version)) {
            int64_t mask_channels = 1;
@ -2300,7 +2300,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                                              sd_img_gen_params->ref_images[i].height,
                                              3,
                                              1);
-        sd_image_to_tensor(sd_img_gen_params->ref_images[i].data, img);
+        sd_image_to_tensor(sd_img_gen_params->ref_images[i], img);
        ggml_tensor* latent = NULL;
        if (sd_ctx->sd->use_tiny_autoencoder) {
@ -2401,7 +2401,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
    }
    struct ggml_init_params params;
-    params.mem_size = static_cast<size_t>(1024 * 1024) * 1024;  // 1G
+    params.mem_size   = static_cast<size_t>(1024 * 1024) * 1024;  // 1G
    params.mem_buffer = NULL;
    params.no_alloc   = false;
    // LOG_DEBUG("mem_size %u ", params.mem_size);
@ -2500,7 +2500,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
        int64_t t1            = ggml_time_ms();
        ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-        sd_image_to_tensor(sd_vid_gen_params->init_image.data, init_img);
+        sd_image_to_tensor(sd_vid_gen_params->init_image, init_img);
        init_img = ggml_reshape_4d(work_ctx, init_img, width, height, 1, 3);
        auto init_image_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);  // [b*c, 1, h/16, w/16]
@ -2530,7 +2530,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
        ggml_tensor* ref_image_latent = NULL;
        if (sd_vid_gen_params->init_image.data) {
            ggml_tensor* ref_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-            sd_image_to_tensor(sd_vid_gen_params->init_image.data, ref_img);
+            sd_image_to_tensor(sd_vid_gen_params->init_image, ref_img);
            ref_img = ggml_reshape_4d(work_ctx, ref_img, width, height, 1, 3);
            ref_image_latent = sd_ctx->sd->encode_first_stage(work_ctx, ref_img);  // [b*c, 1, h/16, w/16]
@ -2541,7 +2541,13 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
        }
        ggml_tensor* control_video = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 3);
-        ggml_set_f32(control_video, 0.5f);
+        ggml_tensor_iter(control_video, [&](ggml_tensor* control_video, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
            float value = 0.5f;
            if (i2 < sd_vid_gen_params->control_frames_size) {
                value = sd_image_get_f32(sd_vid_gen_params->control_frames[i2], i0, i1, i3);
            }
            ggml_tensor_set_f32(control_video, value, i0, i1, i2, i3);
        });
        ggml_tensor* mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 1);
        ggml_set_f32(mask, 1.0f);
        ggml_tensor* inactive = ggml_dup_tensor(work_ctx, control_video);
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@ -203,6 +203,8 @@ typedef struct {
    int clip_skip;
    sd_image_t init_image;
    sd_image_t end_image;
    sd_image_t* control_frames;
    int control_frames_size;
    int width;
    int height;
    sd_sample_params_t sample_params;
@ -267,14 +269,12 @@ SD_API bool convert(const char* input_path,
                    enum sd_type_t output_type,
                    const char* tensor_type_rules);
-SD_API uint8_t* preprocess_canny(uint8_t* img,
+SD_API bool preprocess_canny(sd_image_t image,
-                                 int width,
+                             float high_threshold,
-                                 int height,
+                             float low_threshold,
-                                 float high_threshold,
+                             float weak,
-                                 float low_threshold,
+                             float strong,
-                                 float weak,
+                             bool inverse);
                                 float strong,
                                 bool inverse);
 #ifdef __cplusplus
 }
--- a/upscaler.cpp
+++ b/upscaler.cpp
@ -82,7 +82,7 @@ struct UpscalerGGML {
        }
        LOG_DEBUG("upscale work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);
        ggml_tensor* input_image_tensor = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, input_image.width, input_image.height, 3, 1);
-        sd_image_to_tensor(input_image.data, input_image_tensor);
+        sd_image_to_tensor(input_image, input_image_tensor);
        ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1);
        auto on_tiling        = [&](ggml_tensor* in, ggml_tensor* out, bool init) {