Compare commits

...

2 Commits

Author SHA1 Message Date
leejet
c648001030
feat: add detailed tensor loading time stat (#793) 2025-09-07 22:51:44 +08:00
stduhpf
c587a43c99
feat: support incrementing ref image index (omni-kontext) (#755)
* kontext: support ref image indices

* lora: support x_embedder

* update help message

* Support for negative indices

* support for OmniControl (offsets at index 0)

* c++11 compat

* add --increase-ref-index option

* simplify the logic and fix some issues

* update README.md

* remove unused variable

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-09-07 22:35:16 +08:00
9 changed files with 98 additions and 18 deletions

View File

@@ -319,6 +319,7 @@ arguments:
-i, --end-img [IMAGE] path to the end image, required by flf2v
--control-image [IMAGE] path to image condition, control net
-r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
--increase-ref-index automatically increase the indices of reference images based on the order they are listed (starting with 1).
-o, --output OUTPUT path to write result image to (default: ./output.png)
-p, --prompt [PROMPT] the prompt to render
-n, --negative-prompt PROMPT the negative prompt (default: "")
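
For reference, a typical invocation combining these flags might look as follows (the binary name `sd` and the file names are illustrative; model-loading flags are omitted):

./sd -r ref1.png -r ref2.png --increase-ref-index -p "a prompt describing the edit" -o output.png

Without --increase-ref-index, all reference images share position index 1; with it, they are numbered 1, 2, ... in the order given.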

View File

@@ -16,6 +16,7 @@ struct DiffusionModel {
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
@@ -77,6 +78,7 @@ struct UNetModel : public DiffusionModel {
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
@@ -133,6 +135,7 @@ struct MMDiTModel : public DiffusionModel {
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
@@ -191,13 +194,14 @@ struct FluxModel : public DiffusionModel {
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, output, output_ctx, skip_layers);
return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, output, output_ctx, skip_layers);
}
};
@@ -250,6 +254,7 @@ struct WanModel : public DiffusionModel {
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
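
Each subclass repeats the same change: a `bool increase_ref_index = false` parameter is inserted ahead of `num_video_frames`, so existing call sites relying on the defaults keep compiling, while only `FluxModel` actually forwards the flag down to `flux.compute`. A minimal, self-contained sketch of that pattern (simplified signatures, not the real interface):

#include <cstdio>

struct DiffusionModelSketch {
    virtual ~DiffusionModelSketch() = default;
    // Defaulted parameter keeps old call sites source-compatible.
    // Note: default arguments on virtuals bind to the caller's static type.
    virtual void compute(bool increase_ref_index = false) = 0;
};

struct FluxModelSketch : public DiffusionModelSketch {
    void compute(bool increase_ref_index = false) override {
        // Only the Flux path consumes the flag; other models ignore it.
        printf("increase_ref_index = %s\n", increase_ref_index ? "true" : "false");
    }
};

int main() {
    FluxModelSketch model;
    model.compute();      // legacy behavior: refs share index 1, offset spatially
    model.compute(true);  // per-reference indices 1, 2, 3, ...
}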

View File

@@ -74,6 +74,7 @@ struct SDParams {
std::string mask_image_path;
std::string control_image_path;
std::vector<std::string> ref_image_paths;
bool increase_ref_index = false;
std::string prompt;
std::string negative_prompt;
@@ -156,6 +157,7 @@ void print_params(SDParams params) {
for (auto& path : params.ref_image_paths) {
printf(" %s\n", path.c_str());
};
printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false");
printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false");
printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false");
printf(" control_net_cpu: %s\n", params.control_net_cpu ? "true" : "false");
@@ -222,6 +224,7 @@ void print_usage(int argc, const char* argv[]) {
printf(" -i, --end-img [IMAGE] path to the end image, required by flf2v\n");
printf(" --control-image [IMAGE] path to image condition, control net\n");
printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n");
printf(" --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).\n");
printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
printf(" -p, --prompt [PROMPT] the prompt to render\n");
printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
@@ -536,6 +539,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
{"", "--color", "", true, &params.color},
{"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask},
{"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask},
{"", "--increase-ref-index", "", true, &params.increase_ref_index},
};
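
The flag itself is registered as one more row in the option table above. Judging by the neighboring rows, the fields appear to be: short name, long name, value placeholder (empty for valueless switches), the boolean stored when the switch is present (note `--chroma-disable-dit-mask` stores `false`), and the destination; the row struct's definition is not part of this diff. A hedged sketch of consuming such a table:

#include <cstddef>
#include <cstring>

// Hypothetical mirror of the rows above; field meanings are inferred.
struct BoolOption {
    const char* short_name;    // "" when there is no short form
    const char* long_name;
    const char* placeholder;   // unused for valueless switches
    bool value_when_present;   // permits inverted switches
    bool* target;
};

bool try_parse_switch(const BoolOption* options, size_t count, const char* arg) {
    for (size_t i = 0; i < count; i++) {
        if (strcmp(arg, options[i].long_name) == 0 ||
            (options[i].short_name[0] != '\0' && strcmp(arg, options[i].short_name) == 0)) {
            *options[i].target = options[i].value_when_present;
            return true;
        }
    }
    return false;
}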
auto on_mode_arg = [&](int argc, const char** argv, int index) {
@@ -1207,6 +1211,7 @@ int main(int argc, const char* argv[]) {
init_image,
ref_images.data(),
(int)ref_images.size(),
params.increase_ref_index,
mask_image,
params.width,
params.height,

View File

@@ -960,6 +960,7 @@ namespace Flux {
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
std::vector<int> skip_layers = {}) {
GGML_ASSERT(x->ne[3] == 1);
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
@@ -999,6 +1000,7 @@
x->ne[3],
context->ne[1],
ref_latents,
increase_ref_index,
flux_params.theta,
flux_params.axes_dim);
int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
@@ -1035,6 +1037,7 @@
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
@@ -1044,7 +1047,7 @@
// y: [N, adm_in_channels] or [1, adm_in_channels]
// guidance: [N, ]
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, skip_layers);
return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers);
};
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
@@ -1084,7 +1087,7 @@
struct ggml_tensor* out = NULL;
int t0 = ggml_time_ms();
compute(8, x, timesteps, context, NULL, y, guidance, {}, &out, work_ctx);
compute(8, x, timesteps, context, NULL, y, guidance, {}, false, &out, work_ctx);
int t1 = ggml_time_ms();
print_ggml_tensor(out);

View File

@@ -58,6 +58,7 @@ struct LoraModel : public GGMLRunner {
{"x_block.attn.proj", "attn.to_out.0"},
{"x_block.attn2.proj", "attn2.to_out.0"},
// flux
{"img_in", "x_embedder"},
// singlestream
{"linear2", "proj_out"},
{"modulation.lin", "norm.linear"},

View File

@@ -1966,6 +1966,16 @@ std::vector<TensorStorage> remove_duplicates(const std::vector<TensorStorage>& v
}
bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
int64_t process_time_ms = 0;
int64_t read_time_ms = 0;
int64_t memcpy_time_ms = 0;
int64_t copy_to_backend_time_ms = 0;
int64_t convert_time_ms = 0;
int64_t prev_time_ms = 0;
int64_t curr_time_ms = 0;
int64_t start_time = ggml_time_ms();
prev_time_ms = start_time;
std::vector<TensorStorage> processed_tensor_storages;
for (auto& tensor_storage : tensor_storages) {
// LOG_DEBUG("%s", name.c_str());
@@ -1978,6 +1988,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
}
std::vector<TensorStorage> dedup = remove_duplicates(processed_tensor_storages);
processed_tensor_storages = dedup;
curr_time_ms = ggml_time_ms();
process_time_ms = curr_time_ms - prev_time_ms;
prev_time_ms = curr_time_ms;
bool success = true;
for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) {
@@ -2019,15 +2032,27 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
size_t entry_size = zip_entry_size(zip);
if (entry_size != n) {
read_buffer.resize(entry_size);
prev_time_ms = ggml_time_ms();
zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size);
curr_time_ms = ggml_time_ms();
read_time_ms += curr_time_ms - prev_time_ms;
prev_time_ms = curr_time_ms;
memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n);
curr_time_ms = ggml_time_ms();
memcpy_time_ms += curr_time_ms - prev_time_ms;
} else {
prev_time_ms = ggml_time_ms();
zip_entry_noallocread(zip, (void*)buf, n);
curr_time_ms = ggml_time_ms();
read_time_ms += curr_time_ms - prev_time_ms;
}
zip_entry_close(zip);
} else {
prev_time_ms = ggml_time_ms();
file.seekg(tensor_storage.offset);
file.read(buf, n);
curr_time_ms = ggml_time_ms();
read_time_ms += curr_time_ms - prev_time_ms;
if (!file) {
LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
return false;
@@ -2072,6 +2097,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read);
}
prev_time_ms = ggml_time_ms();
if (tensor_storage.is_bf16) {
// inplace op
bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements());
@@ -2086,10 +2112,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
} else if (tensor_storage.is_i64) {
i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements());
}
curr_time_ms = ggml_time_ms();
convert_time_ms += curr_time_ms - prev_time_ms;
} else {
read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
prev_time_ms = ggml_time_ms();
if (tensor_storage.is_bf16) {
// inplace op
bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
@@ -2109,11 +2138,14 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data,
dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
curr_time_ms = ggml_time_ms();
convert_time_ms += curr_time_ms - prev_time_ms;
}
} else {
read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
prev_time_ms = ggml_time_ms();
if (tensor_storage.is_bf16) {
// inplace op
bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
@@ -2133,14 +2165,24 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
if (tensor_storage.type == dst_tensor->type) {
// copy to device memory
curr_time_ms = ggml_time_ms();
convert_time_ms += curr_time_ms - prev_time_ms;
prev_time_ms = curr_time_ms;
ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor));
curr_time_ms = ggml_time_ms();
copy_to_backend_time_ms += curr_time_ms - prev_time_ms;
} else {
// convert first, then copy to device memory
convert_buffer.resize(ggml_nbytes(dst_tensor));
convert_tensor((void*)read_buffer.data(), tensor_storage.type,
(void*)convert_buffer.data(), dst_tensor->type,
(int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
curr_time_ms = ggml_time_ms();
convert_time_ms += curr_time_ms - prev_time_ms;
prev_time_ms = curr_time_ms;
ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
curr_time_ms = ggml_time_ms();
copy_to_backend_time_ms += curr_time_ms - prev_time_ms;
}
}
++tensor_count;
@@ -2170,6 +2212,14 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
break;
}
}
int64_t end_time = ggml_time_ms();
LOG_INFO("loading tensors completed, taking %.2fs (process: %.2fs, read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)",
(end_time - start_time) / 1000.f,
process_time_ms / 1000.f,
read_time_ms / 1000.f,
memcpy_time_ms / 1000.f,
convert_time_ms / 1000.f,
copy_to_backend_time_ms / 1000.f);
return success;
}
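
The instrumentation above repeats one idiom: snapshot `prev_time_ms`, do the work, snapshot `curr_time_ms`, and add the delta to a per-phase accumulator that is reported once at the end. The same bookkeeping can be packaged in a small RAII helper; a sketch, not what model.cpp actually uses:

#include <cstdint>

extern "C" int64_t ggml_time_ms(void);  // declared in ggml.h

struct PhaseTimer {
    int64_t& accum_ms;
    int64_t start_ms;
    explicit PhaseTimer(int64_t& accum) : accum_ms(accum), start_ms(ggml_time_ms()) {}
    ~PhaseTimer() { accum_ms += ggml_time_ms() - start_ms; }
};

// Usage:
//   int64_t read_time_ms = 0;
//   { PhaseTimer t(read_time_ms); file.read(buf, n); }  // delta added on scope exit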

View File

@@ -156,25 +156,33 @@ struct Rope {
int patch_size,
int bs,
int context_len,
std::vector<ggml_tensor*> ref_latents) {
std::vector<ggml_tensor*> ref_latents,
bool increase_ref_index) {
auto txt_ids = gen_txt_ids(bs, context_len);
auto img_ids = gen_img_ids(h, w, patch_size, bs);
auto ids = concat_ids(txt_ids, img_ids, bs);
uint64_t curr_h_offset = 0;
uint64_t curr_w_offset = 0;
int index = 1;
for (ggml_tensor* ref : ref_latents) {
uint64_t h_offset = 0;
uint64_t w_offset = 0;
if (ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) {
w_offset = curr_w_offset;
} else {
h_offset = curr_h_offset;
if (!increase_ref_index) {
if (ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) {
w_offset = curr_w_offset;
} else {
h_offset = curr_h_offset;
}
}
auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, 1, h_offset, w_offset);
auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, index, h_offset, w_offset);
ids = concat_ids(ids, ref_ids, bs);
if (increase_ref_index) {
index++;
}
curr_h_offset = std::max(curr_h_offset, ref->ne[1] + h_offset);
curr_w_offset = std::max(curr_w_offset, ref->ne[0] + w_offset);
}
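
This loop now supports two addressing modes for reference latents. Without the flag, every reference keeps index 1 and the references are separated spatially, each offset along whichever axis keeps the combined position grid more compact. With the flag, offsets stay zero and each reference instead gets its own index 1, 2, 3, ... (negative indices and OmniControl's offsets at index 0 are handled by the callers, per the commit messages above). A standalone sketch of the per-reference (index, h_offset, w_offset) assignment, with illustrative dimensions:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct RefDims { uint64_t w, h; };  // stand-ins for ggml ne[0], ne[1]

int main() {
    std::vector<RefDims> refs = {{64, 64}, {64, 32}};
    for (bool increase_ref_index : {false, true}) {
        uint64_t curr_h_offset = 0, curr_w_offset = 0;
        int index = 1;
        printf("increase_ref_index=%d\n", (int)increase_ref_index);
        for (const RefDims& ref : refs) {
            uint64_t h_offset = 0, w_offset = 0;
            if (!increase_ref_index) {
                // legacy mode: shared index, compact spatial packing
                if (ref.h + curr_h_offset > ref.w + curr_w_offset) {
                    w_offset = curr_w_offset;
                } else {
                    h_offset = curr_h_offset;
                }
            }
            printf("  index=%d h_offset=%llu w_offset=%llu\n", index,
                   (unsigned long long)h_offset, (unsigned long long)w_offset);
            if (increase_ref_index) {
                index++;  // omni-kontext: each reference gets its own index
            }
            curr_h_offset = std::max(curr_h_offset, ref.h + h_offset);
            curr_w_offset = std::max(curr_w_offset, ref.w + w_offset);
        }
    }
}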
@@ -188,9 +196,10 @@ struct Rope {
int bs,
int context_len,
std::vector<ggml_tensor*> ref_latents,
bool increase_ref_index,
int theta,
const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_flux_ids(h, w, patch_size, bs, context_len, ref_latents);
std::vector<std::vector<float>> ids = gen_flux_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index);
return embed_nd(ids, bs, theta, axes_dim);
}

View File

@@ -557,8 +557,6 @@ public:
// load weights
LOG_DEBUG("loading weights");
int64_t t0 = ggml_time_ms();
std::set<std::string> ignore_tensors;
tensors["alphas_cumprod"] = alphas_cumprod_tensor;
if (use_tiny_autoencoder) {
@@ -656,11 +654,7 @@ public:
ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM");
}
int64_t t1 = ggml_time_ms();
LOG_INFO("loading model from '%s' completed, taking %.2fs", SAFE_STR(sd_ctx_params->model_path), (t1 - t0) * 1.0f / 1000);
// check is_using_v_parameterization_for_sd2
if (sd_version_is_sd2(version)) {
if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) {
is_using_v_parameterization = true;
@@ -775,7 +769,7 @@ public:
int64_t t0 = ggml_time_ms();
struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, {}, -1, {}, 0.f, &out);
diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, {}, false, -1, {}, 0.f, &out);
diffusion_model->free_compute_buffer();
double result = 0.f;
@@ -1032,6 +1026,7 @@ public:
int start_merge_step,
SDCondition id_cond,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
ggml_tensor* denoise_mask = nullptr) {
std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
@@ -1126,6 +1121,7 @@ public:
cond.c_vector,
guidance_tensor,
ref_latents,
increase_ref_index,
-1,
controls,
control_strength,
@@ -1139,6 +1135,7 @@ public:
id_cond.c_vector,
guidance_tensor,
ref_latents,
increase_ref_index,
-1,
controls,
control_strength,
@@ -1160,6 +1157,7 @@ public:
uncond.c_vector,
guidance_tensor,
ref_latents,
increase_ref_index,
-1,
controls,
control_strength,
@@ -1177,6 +1175,7 @@ public:
img_cond.c_vector,
guidance_tensor,
ref_latents,
increase_ref_index,
-1,
controls,
control_strength,
@@ -1198,6 +1197,7 @@ public:
cond.c_vector,
guidance_tensor,
ref_latents,
increase_ref_index,
-1,
controls,
control_strength,
@@ -1710,6 +1710,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
"\n"
"batch_count: %d\n"
"ref_images_count: %d\n"
"increase_ref_index: %s\n"
"control_strength: %.2f\n"
"style_strength: %.2f\n"
"normalize_input: %s\n"
@@ -1724,6 +1725,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
sd_img_gen_params->seed,
sd_img_gen_params->batch_count,
sd_img_gen_params->ref_images_count,
BOOL_STR(sd_img_gen_params->increase_ref_index),
sd_img_gen_params->control_strength,
sd_img_gen_params->style_strength,
BOOL_STR(sd_img_gen_params->normalize_input),
@@ -1797,6 +1799,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
bool normalize_input,
std::string input_id_images_path,
std::vector<ggml_tensor*> ref_latents,
bool increase_ref_index,
ggml_tensor* concat_latent = NULL,
ggml_tensor* denoise_mask = NULL) {
if (seed < 0) {
@@ -2054,6 +2057,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
start_merge_step,
id_cond,
ref_latents,
increase_ref_index,
denoise_mask);
// print_ggml_tensor(x_0);
int64_t sampling_end = ggml_time_ms();
@@ -2304,7 +2308,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
LOG_INFO("EDIT mode");
}
std::vector<struct ggml_tensor*> ref_latents;
std::vector<ggml_tensor*> ref_latents;
for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) {
ggml_tensor* img = ggml_new_tensor_4d(work_ctx,
GGML_TYPE_F32,
@@ -2359,6 +2363,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
sd_img_gen_params->normalize_input,
sd_img_gen_params->input_id_images_path,
ref_latents,
sd_img_gen_params->increase_ref_index,
concat_latent,
denoise_mask);

View File

@@ -182,6 +182,7 @@ typedef struct {
sd_image_t init_image;
sd_image_t* ref_images;
int ref_images_count;
bool increase_ref_index;
sd_image_t mask_image;
int width;
int height;
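
On the C API side, a caller opts in by setting the new field next to the existing reference-image fields; a sketch, with the remaining parameter setup omitted:

sd_img_gen_params_t params = {0};
params.ref_images = ref_images;     // sd_image_t array prepared by the caller
params.ref_images_count = 2;
params.increase_ref_index = true;   // number the references 1, 2, ... in order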