From f16a110f8776398ef23a2a6b7b57522c2471637a Mon Sep 17 00:00:00 2001 From: leejet Date: Mon, 30 Mar 2026 00:19:25 +0800 Subject: [PATCH] refactor: migrate generation pipeline to sd::Tensor (#1373) --- examples/cli/main.cpp | 2 +- src/anima.hpp | 37 +- src/auto_encoder_kl.hpp | 261 +-- src/cache_dit.hpp | 52 +- src/clip.hpp | 29 +- src/common_dit.hpp | 58 +- src/condition_cache_utils.hpp | 64 + src/conditioner.hpp | 951 ++++------ src/control.hpp | 98 +- src/denoiser.hpp | 1329 ++++--------- src/diffusion_model.hpp | 204 +- src/easycache.hpp | 72 +- src/esrgan.hpp | 19 +- src/flux.hpp | 119 +- src/ggml_extend.hpp | 469 +++-- src/latent-preview.h | 66 + src/llm.hpp | 156 +- src/lora.hpp | 2 +- src/mmdit.hpp | 68 +- src/pmid.hpp | 43 +- src/preprocessing.hpp | 362 ++-- src/qwen_image.hpp | 78 +- src/sample-cache.cpp | 361 ++++ src/sample-cache.h | 61 + src/spectrum.hpp | 20 +- src/stable-diffusion.cpp | 3297 ++++++++++++++------------------- src/t5.hpp | 2074 +++++++++++---------- src/tae.hpp | 56 +- src/tensor.hpp | 1249 +++++++++++++ src/tensor_ggml.hpp | 127 ++ src/tokenize_util.cpp | 1986 ++++++++++---------- src/ucache.hpp | 67 +- src/unet.hpp | 97 +- src/upscaler.cpp | 73 +- src/util.cpp | 194 +- src/util.h | 19 +- src/vae.hpp | 260 +-- src/wan.hpp | 308 ++- src/z_image.hpp | 78 +- 39 files changed, 7768 insertions(+), 7098 deletions(-) create mode 100644 src/condition_cache_utils.hpp create mode 100644 src/sample-cache.cpp create mode 100644 src/sample-cache.h create mode 100644 src/tensor.hpp create mode 100644 src/tensor_ggml.hpp diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index f9e4928..ddb88c9 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -601,7 +601,7 @@ int main(int argc, const char* argv[]) { if (gen_params.end_image_path.size() > 0) { vae_decode_only = false; - if (!load_image_and_update_size(gen_params.init_image_path, end_image)) { + if (!load_image_and_update_size(gen_params.end_image_path, end_image)) { return 1; 
} } diff --git a/src/anima.hpp b/src/anima.hpp index 81dbefe..5850cc3 100644 --- a/src/anima.hpp +++ b/src/anima.hpp @@ -602,20 +602,19 @@ namespace Anima { return Rope::embed_nd(ids, bs, axis_thetas, axes_dim); } - ggml_cgraph* build_graph(ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* t5_ids = nullptr, - ggml_tensor* t5_weights = nullptr) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor = {}, + const sd::Tensor& t5_ids_tensor = {}, + const sd::Tensor& t5_weights_tensor = {}) { + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); + ggml_tensor* context = make_optional_input(context_tensor); + ggml_tensor* t5_ids = make_optional_input(t5_ids_tensor); + ggml_tensor* t5_weights = make_optional_input(t5_weights_tensor); GGML_ASSERT(x->ne[3] == 1); ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE); - x = to_backend(x); - timesteps = to_backend(timesteps); - context = to_backend(context); - t5_ids = to_backend(t5_ids); - t5_weights = to_backend(t5_weights); - int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size; int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size; int64_t h_pad = x->ne[1] + pad_h; @@ -667,18 +666,16 @@ namespace Anima { return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* t5_ids = nullptr, - ggml_tensor* t5_weights = nullptr, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) { + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context = {}, + const sd::Tensor& t5_ids = {}, + const sd::Tensor& t5_weights = {}) { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, t5_ids, t5_weights); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, 
output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); } }; } // namespace Anima diff --git a/src/auto_encoder_kl.hpp b/src/auto_encoder_kl.hpp index 6efdb41..039fb9d 100644 --- a/src/auto_encoder_kl.hpp +++ b/src/auto_encoder_kl.hpp @@ -1,4 +1,4 @@ -#ifndef __AUTO_ENCODER_KL_HPP__ +#ifndef __AUTO_ENCODER_KL_HPP__ #define __AUTO_ENCODER_KL_HPP__ #include "vae.hpp" @@ -685,10 +685,9 @@ struct AutoEncoderKL : public VAE { ae.get_param_tensors(tensors, prefix); } - ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) { + ggml_cgraph* build_graph(const sd::Tensor& z_tensor, bool decode_graph) { ggml_cgraph* gf = ggml_new_graph(compute_ctx); - - z = to_backend(z); + ggml_tensor* z = make_input(z_tensor); auto runner_ctx = get_context(); @@ -699,184 +698,100 @@ struct AutoEncoderKL : public VAE { return gf; } - bool _compute(const int n_threads, - ggml_tensor* z, - bool decode_graph, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) override { + sd::Tensor _compute(const int n_threads, + const sd::Tensor& z, + bool decode_graph) override { GGML_ASSERT(!decode_only || decode_graph); auto get_graph = [&]() -> ggml_cgraph* { return build_graph(z, decode_graph); }; - // ggml_set_f32(z, 0.5f); - // print_ggml_tensor(z); - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), z.dim()); } - ggml_tensor* gaussian_latent_sample(ggml_context* work_ctx, ggml_tensor* moments, std::shared_ptr rng) { + sd::Tensor gaussian_latent_sample(const sd::Tensor& moments, std::shared_ptr rng) { // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample - ggml_tensor* latents = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); - ggml_tensor* noise = ggml_dup_tensor(work_ctx, latents); - ggml_ext_im_set_randn_f32(noise, rng); 
- { - float mean = 0; - float logvar = 0; - float value = 0; - float std_ = 0; - for (int i = 0; i < latents->ne[3]; i++) { - for (int j = 0; j < latents->ne[2]; j++) { - for (int k = 0; k < latents->ne[1]; k++) { - for (int l = 0; l < latents->ne[0]; l++) { - mean = ggml_ext_tensor_get_f32(moments, l, k, j, i); - logvar = ggml_ext_tensor_get_f32(moments, l, k, j + (int)latents->ne[2], i); - logvar = std::max(-30.0f, std::min(logvar, 20.0f)); - std_ = std::exp(0.5f * logvar); - value = mean + std_ * ggml_ext_tensor_get_f32(noise, l, k, j, i); - // printf("%d %d %d %d -> %f\n", i, j, k, l, value); - ggml_ext_tensor_set_f32(latents, value, l, k, j, i); - } - } - } - } - } + auto chunks = sd::ops::chunk(moments, 2, 2); + const auto& mean = chunks[0]; + const auto& logvar = chunks[1]; + sd::Tensor stddev = sd::ops::exp(0.5f * sd::ops::clamp(logvar, -30.0f, 20.0f)); + sd::Tensor noise = sd::Tensor::randn_like(mean, rng); + sd::Tensor latents = mean + stddev * noise; return latents; } - ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) { + sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override { if (sd_version_is_flux2(version)) { return vae_output; } else if (version == VERSION_SD1_PIX2PIX) { - return ggml_view_3d(work_ctx, - vae_output, - vae_output->ne[0], - vae_output->ne[1], - vae_output->ne[2] / 2, - vae_output->nb[1], - vae_output->nb[2], - 0); + return sd::ops::chunk(vae_output, 2, 2)[0]; } else { - return gaussian_latent_sample(work_ctx, vae_output, rng); + return gaussian_latent_sample(vae_output, rng); } } - void get_latents_mean_std_vec(ggml_tensor* latents, int channel_dim, std::vector& latents_mean_vec, std::vector& latents_std_vec) { - // flux2 + std::pair, sd::Tensor> get_latents_mean_std(const sd::Tensor& latents, int channel_dim) { + GGML_ASSERT(channel_dim >= 0 && static_cast(channel_dim) < static_cast(latents.dim())); if (sd_version_is_flux2(version)) { - 
GGML_ASSERT(latents->ne[channel_dim] == 128); - latents_mean_vec = {-0.0676f, -0.0715f, -0.0753f, -0.0745f, 0.0223f, 0.0180f, 0.0142f, 0.0184f, - -0.0001f, -0.0063f, -0.0002f, -0.0031f, -0.0272f, -0.0281f, -0.0276f, -0.0290f, - -0.0769f, -0.0672f, -0.0902f, -0.0892f, 0.0168f, 0.0152f, 0.0079f, 0.0086f, - 0.0083f, 0.0015f, 0.0003f, -0.0043f, -0.0439f, -0.0419f, -0.0438f, -0.0431f, - -0.0102f, -0.0132f, -0.0066f, -0.0048f, -0.0311f, -0.0306f, -0.0279f, -0.0180f, - 0.0030f, 0.0015f, 0.0126f, 0.0145f, 0.0347f, 0.0338f, 0.0337f, 0.0283f, - 0.0020f, 0.0047f, 0.0047f, 0.0050f, 0.0123f, 0.0081f, 0.0081f, 0.0146f, - 0.0681f, 0.0679f, 0.0767f, 0.0732f, -0.0462f, -0.0474f, -0.0392f, -0.0511f, - -0.0528f, -0.0477f, -0.0470f, -0.0517f, -0.0317f, -0.0316f, -0.0345f, -0.0283f, - 0.0510f, 0.0445f, 0.0578f, 0.0458f, -0.0412f, -0.0458f, -0.0487f, -0.0467f, - -0.0088f, -0.0106f, -0.0088f, -0.0046f, -0.0376f, -0.0432f, -0.0436f, -0.0499f, - 0.0118f, 0.0166f, 0.0203f, 0.0279f, 0.0113f, 0.0129f, 0.0016f, 0.0072f, - -0.0118f, -0.0018f, -0.0141f, -0.0054f, -0.0091f, -0.0138f, -0.0145f, -0.0187f, - 0.0323f, 0.0305f, 0.0259f, 0.0300f, 0.0540f, 0.0614f, 0.0495f, 0.0590f, - -0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f, - -0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 0.0054f, 0.0163f, 0.0104f}; - latents_std_vec = { - 1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f, - 1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f, - 1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f, - 1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f, - 1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f, - 1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f, - 1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f, - 1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f, - 1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 
1.8023f, 1.8033f, - 1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f, - 1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f, - 1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f, - 1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f, - 1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f, - 1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f, - 1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f}; + GGML_ASSERT(latents.shape()[channel_dim] == 128); + std::vector stats_shape(static_cast(latents.dim()), 1); + stats_shape[static_cast(channel_dim)] = latents.shape()[channel_dim]; + + auto mean_tensor = sd::Tensor::from_vector({-0.0676f, -0.0715f, -0.0753f, -0.0745f, 0.0223f, 0.0180f, 0.0142f, 0.0184f, + -0.0001f, -0.0063f, -0.0002f, -0.0031f, -0.0272f, -0.0281f, -0.0276f, -0.0290f, + -0.0769f, -0.0672f, -0.0902f, -0.0892f, 0.0168f, 0.0152f, 0.0079f, 0.0086f, + 0.0083f, 0.0015f, 0.0003f, -0.0043f, -0.0439f, -0.0419f, -0.0438f, -0.0431f, + -0.0102f, -0.0132f, -0.0066f, -0.0048f, -0.0311f, -0.0306f, -0.0279f, -0.0180f, + 0.0030f, 0.0015f, 0.0126f, 0.0145f, 0.0347f, 0.0338f, 0.0337f, 0.0283f, + 0.0020f, 0.0047f, 0.0047f, 0.0050f, 0.0123f, 0.0081f, 0.0081f, 0.0146f, + 0.0681f, 0.0679f, 0.0767f, 0.0732f, -0.0462f, -0.0474f, -0.0392f, -0.0511f, + -0.0528f, -0.0477f, -0.0470f, -0.0517f, -0.0317f, -0.0316f, -0.0345f, -0.0283f, + 0.0510f, 0.0445f, 0.0578f, 0.0458f, -0.0412f, -0.0458f, -0.0487f, -0.0467f, + -0.0088f, -0.0106f, -0.0088f, -0.0046f, -0.0376f, -0.0432f, -0.0436f, -0.0499f, + 0.0118f, 0.0166f, 0.0203f, 0.0279f, 0.0113f, 0.0129f, 0.0016f, 0.0072f, + -0.0118f, -0.0018f, -0.0141f, -0.0054f, -0.0091f, -0.0138f, -0.0145f, -0.0187f, + 0.0323f, 0.0305f, 0.0259f, 0.0300f, 0.0540f, 0.0614f, 0.0495f, 0.0590f, + -0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f, + -0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 
0.0054f, 0.0163f, 0.0104f}); + mean_tensor.reshape_(stats_shape); + auto std_tensor = sd::Tensor::from_vector({1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f, + 1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f, + 1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f, + 1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f, + 1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f, + 1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f, + 1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f, + 1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f, + 1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f, + 1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f, + 1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f, + 1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f, + 1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f, + 1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f, + 1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f, + 1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f}); + std_tensor.reshape_(stats_shape); + return {std::move(mean_tensor), std::move(std_tensor)}; } else { GGML_ABORT("unknown version %d", version); } } - ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) { - ggml_tensor* vae_latents = ggml_dup(work_ctx, latents); + sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) override { if (sd_version_is_flux2(version)) { - int channel_dim = 2; - std::vector latents_mean_vec; - std::vector latents_std_vec; - get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec); - - float mean; - float std_; - for (int i = 0; i < latents->ne[3]; i++) { - if (channel_dim == 3) { - mean = 
latents_mean_vec[i]; - std_ = latents_std_vec[i]; - } - for (int j = 0; j < latents->ne[2]; j++) { - if (channel_dim == 2) { - mean = latents_mean_vec[j]; - std_ = latents_std_vec[j]; - } - for (int k = 0; k < latents->ne[1]; k++) { - for (int l = 0; l < latents->ne[0]; l++) { - float value = ggml_ext_tensor_get_f32(latents, l, k, j, i); - value = value * std_ / scale_factor + mean; - ggml_ext_tensor_set_f32(vae_latents, value, l, k, j, i); - } - } - } - } - } else { - ggml_ext_tensor_iter(latents, [&](ggml_tensor* latents, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = ggml_ext_tensor_get_f32(latents, i0, i1, i2, i3); - value = (value / scale_factor) + shift_factor; - ggml_ext_tensor_set_f32(vae_latents, value, i0, i1, i2, i3); - }); + int channel_dim = 2; + auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim); + return (latents * std_tensor) / scale_factor + mean_tensor; } - return vae_latents; + return (latents / scale_factor) + shift_factor; } - ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) { - ggml_tensor* diffusion_latents = ggml_dup(work_ctx, latents); + sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) override { if (sd_version_is_flux2(version)) { - int channel_dim = 2; - std::vector latents_mean_vec; - std::vector latents_std_vec; - get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec); - - float mean; - float std_; - for (int i = 0; i < latents->ne[3]; i++) { - if (channel_dim == 3) { - mean = latents_mean_vec[i]; - std_ = latents_std_vec[i]; - } - for (int j = 0; j < latents->ne[2]; j++) { - if (channel_dim == 2) { - mean = latents_mean_vec[j]; - std_ = latents_std_vec[j]; - } - for (int k = 0; k < latents->ne[1]; k++) { - for (int l = 0; l < latents->ne[0]; l++) { - float value = ggml_ext_tensor_get_f32(latents, l, k, j, i); - value = (value - mean) * scale_factor / std_; - ggml_ext_tensor_set_f32(diffusion_latents, value, l, k, j, 
i); - } - } - } - } - } else { - ggml_ext_tensor_iter(latents, [&](ggml_tensor* latents, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = ggml_ext_tensor_get_f32(latents, i0, i1, i2, i3); - value = (value - shift_factor) * scale_factor; - ggml_ext_tensor_set_f32(diffusion_latents, value, i0, i1, i2, i3); - }); + int channel_dim = 2; + auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim); + return ((latents - mean_tensor) * scale_factor) / std_tensor; } - return diffusion_latents; + return (latents - shift_factor) * scale_factor; } int get_encoder_output_channels(int input_channels) { @@ -889,24 +804,26 @@ struct AutoEncoderKL : public VAE { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); { // CPU, x{1, 3, 64, 64}: Pass // CUDA, x{1, 3, 64, 64}: Pass, but sill get wrong result for some image, may be due to interlnal nan // CPU, x{2, 3, 64, 64}: Wrong result // CUDA, x{2, 3, 64, 64}: Wrong result, and different from CPU result - auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 64, 64, 3, 2); - ggml_set_f32(x, 0.5f); - print_ggml_tensor(x); - ggml_tensor* out = nullptr; + sd::Tensor x({64, 64, 3, 2}); + x.fill_(0.5f); + print_sd_tensor(x); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - _compute(8, x, false, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = _compute(8, x, false); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("encode test done in %lldms", t1 - t0); } @@ -915,16 +832,18 @@ struct AutoEncoderKL : public VAE { // CUDA, z{1, 4, 8, 8}: Pass // CPU, z{3, 4, 8, 8}: Wrong result // CUDA, z{3, 4, 8, 8}: Wrong result, and different from CPU result - auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 
1); - ggml_set_f32(z, 0.5f); - print_ggml_tensor(z); - ggml_tensor* out = nullptr; + sd::Tensor z({8, 8, 4, 1}); + z.fill_(0.5f); + print_sd_tensor(z); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - _compute(8, z, true, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = _compute(8, z, true); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("decode test done in %lldms", t1 - t0); } }; diff --git a/src/cache_dit.hpp b/src/cache_dit.hpp index 9af627f..dad67d4 100644 --- a/src/cache_dit.hpp +++ b/src/cache_dit.hpp @@ -8,7 +8,9 @@ #include #include +#include "condition_cache_utils.hpp" #include "ggml_extend.hpp" +#include "tensor.hpp" struct DBCacheConfig { bool enabled = false; @@ -771,35 +773,37 @@ struct CacheDitConditionState { return it != cache_diffs.end() && !it->second.diff.empty(); } - void update_cache(const void* cond, const float* input, const float* output, size_t size) { + void update_cache(const void* cond, const sd::Tensor& input, const sd::Tensor& output) { CacheEntry& entry = cache_diffs[cond]; - entry.diff.resize(size); - for (size_t i = 0; i < size; i++) { - entry.diff[i] = output[i] - input[i]; + if (!sd::store_condition_cache_diff(&entry.diff, input, output)) { + entry.prev_input.clear(); + entry.prev_output.clear(); + entry.has_prev = false; + return; } + size_t size = static_cast(output.numel()); + const float* input_data = input.data(); + const float* output_data = output.data(); entry.prev_input.resize(size); entry.prev_output.resize(size); for (size_t i = 0; i < size; i++) { - entry.prev_input[i] = input[i]; - entry.prev_output[i] = output[i]; + entry.prev_input[i] = input_data[i]; + entry.prev_output[i] = output_data[i]; } entry.has_prev = true; } - void apply_cache(const void* cond, const float* input, float* output, size_t size) { + void apply_cache(const void* cond, + const sd::Tensor& 
input, + sd::Tensor* output) { auto it = cache_diffs.find(cond); if (it == cache_diffs.end() || it->second.diff.empty()) return; - if (it->second.diff.size() != size) - return; - - for (size_t i = 0; i < size; i++) { - output[i] = input[i] + it->second.diff[i]; - } + sd::apply_condition_cache_diff(it->second.diff, input, output); } - bool before_condition(const void* cond, ggml_tensor* input, ggml_tensor* output, float sigma, int step_index) { + bool before_condition(const void* cond, const sd::Tensor& input, sd::Tensor* output, float sigma, int step_index) { if (!enabled() || step_index < 0) return false; @@ -819,8 +823,7 @@ struct CacheDitConditionState { if (skip_current_step) { if (has_cache(cond)) { - apply_cache(cond, (float*)input->data, (float*)output->data, - static_cast(ggml_nelements(output))); + apply_cache(cond, input, output); return true; } return false; @@ -833,13 +836,13 @@ struct CacheDitConditionState { if (it == cache_diffs.end() || !it->second.has_prev) return false; - size_t ne = static_cast(ggml_nelements(input)); + size_t ne = static_cast(input.numel()); if (it->second.prev_input.size() != ne) return false; - float* input_data = (float*)input->data; - float diff = CacheDitState::calculate_residual_diff( - it->second.prev_input.data(), input_data, ne); + const float* input_data = input.data(); + float diff = CacheDitState::calculate_residual_diff( + it->second.prev_input.data(), input_data, ne); float effective_threshold = config.residual_diff_threshold; if (config.Fn_compute_blocks > 0) { @@ -859,7 +862,7 @@ struct CacheDitConditionState { cached_steps.push_back(current_step_index); continuous_cached_steps++; accumulated_residual_diff += diff; - apply_cache(cond, input_data, (float*)output->data, ne); + apply_cache(cond, input, output); return true; } @@ -867,15 +870,14 @@ struct CacheDitConditionState { return false; } - void after_condition(const void* cond, ggml_tensor* input, ggml_tensor* output) { + void after_condition(const void* 
cond, const sd::Tensor& input, const sd::Tensor& output) { if (!step_is_active()) return; - size_t ne = static_cast(ggml_nelements(output)); - update_cache(cond, (float*)input->data, (float*)output->data, ne); + update_cache(cond, input, output); if (cond == anchor_condition && taylor_config.enabled) { - taylor_state.update_derivatives((float*)output->data, ne, current_step_index); + taylor_state.update_derivatives(output.data(), static_cast(output.numel()), current_step_index); } } diff --git a/src/clip.hpp b/src/clip.hpp index f4e5ef7..8f2ac06 100644 --- a/src/clip.hpp +++ b/src/clip.hpp @@ -957,15 +957,14 @@ struct CLIPTextModelRunner : public GGMLRunner { return model.forward(ctx, input_ids, embeddings, mask, max_token_idx, return_pooled, clip_skip); } - ggml_cgraph* build_graph(ggml_tensor* input_ids, + ggml_cgraph* build_graph(const sd::Tensor& input_ids_tensor, int num_custom_embeddings = 0, void* custom_embeddings_data = nullptr, size_t max_token_idx = 0, bool return_pooled = false, int clip_skip = -1) { - ggml_cgraph* gf = new_graph_custom(2048); - - input_ids = to_backend(input_ids); + ggml_cgraph* gf = new_graph_custom(2048); + ggml_tensor* input_ids = make_input(input_ids_tensor); ggml_tensor* embeddings = nullptr; @@ -1004,19 +1003,21 @@ struct CLIPTextModelRunner : public GGMLRunner { return gf; } - bool compute(const int n_threads, - ggml_tensor* input_ids, - int num_custom_embeddings, - void* custom_embeddings_data, - size_t max_token_idx, - bool return_pooled, - int clip_skip, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { + sd::Tensor compute(const int n_threads, + const sd::Tensor& input_ids, + int num_custom_embeddings, + void* custom_embeddings_data, + size_t max_token_idx, + bool return_pooled, + int clip_skip) { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip); }; - return GGMLRunner::compute(get_graph, n_threads, 
true, output, output_ctx); + auto result = GGMLRunner::compute(get_graph, n_threads, true); + if (return_pooled) { + return take_or_empty(std::move(result)); + } + return restore_trailing_singleton_dims(std::move(result), 3); } }; diff --git a/src/common_dit.hpp b/src/common_dit.hpp index 0e6f0f0..30141d4 100644 --- a/src/common_dit.hpp +++ b/src/common_dit.hpp @@ -4,11 +4,11 @@ #include "ggml_extend.hpp" namespace DiT { - ggml_tensor* patchify(ggml_context* ctx, - ggml_tensor* x, - int pw, - int ph, - bool patch_last = true) { + inline ggml_tensor* patchify(ggml_context* ctx, + ggml_tensor* x, + int pw, + int ph, + bool patch_last = true) { // x: [N, C, H, W] // return: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C] int64_t N = x->ne[3]; @@ -33,13 +33,13 @@ namespace DiT { return x; } - ggml_tensor* unpatchify(ggml_context* ctx, - ggml_tensor* x, - int64_t h, - int64_t w, - int ph, - int pw, - bool patch_last = true) { + inline ggml_tensor* unpatchify(ggml_context* ctx, + ggml_tensor* x, + int64_t h, + int64_t w, + int ph, + int pw, + bool patch_last = true) { // x: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C] // return: [N, C, H, W] int64_t N = x->ne[2]; @@ -64,10 +64,10 @@ namespace DiT { return x; } - ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, - ggml_tensor* x, - int ph, - int pw) { + inline ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, + ggml_tensor* x, + int ph, + int pw) { int64_t W = x->ne[0]; int64_t H = x->ne[1]; @@ -77,23 +77,23 @@ namespace DiT { return x; } - ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx, - ggml_tensor* x, - int ph, - int pw, - bool patch_last = true) { + inline ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx, + ggml_tensor* x, + int ph, + int pw, + bool patch_last = true) { x = pad_to_patch_size(ctx, x, ph, pw); x = patchify(ctx->ggml_ctx, x, ph, pw, patch_last); return x; } - ggml_tensor* unpatchify_and_crop(ggml_context* ctx, - ggml_tensor* x, - int64_t H, - int64_t W, - int ph, - 
int pw, - bool patch_last = true) { + inline ggml_tensor* unpatchify_and_crop(ggml_context* ctx, + ggml_tensor* x, + int64_t H, + int64_t W, + int ph, + int pw, + bool patch_last = true) { int pad_h = (ph - H % ph) % ph; int pad_w = (pw - W % pw) % pw; int64_t h = ((H + pad_h) / ph); @@ -105,4 +105,4 @@ namespace DiT { } } // namespace DiT -#endif // __COMMON_DIT_HPP__ \ No newline at end of file +#endif // __COMMON_DIT_HPP__ diff --git a/src/condition_cache_utils.hpp b/src/condition_cache_utils.hpp new file mode 100644 index 0000000..903d64e --- /dev/null +++ b/src/condition_cache_utils.hpp @@ -0,0 +1,64 @@ +#ifndef __CONDITION_CACHE_UTILS_HPP__ +#define __CONDITION_CACHE_UTILS_HPP__ + +#include + +#include "tensor.hpp" + +namespace sd { + + inline bool store_condition_cache_diff(std::vector* diff, + const sd::Tensor& input, + const sd::Tensor& output) { + if (diff == nullptr || input.empty() || output.empty()) { + return false; + } + + size_t input_size = static_cast(input.numel()); + size_t output_size = static_cast(output.numel()); + if (input_size == 0 || input_size != output_size) { + diff->clear(); + return false; + } + + const float* input_data = input.data(); + const float* output_data = output.data(); + if (input_data == nullptr || output_data == nullptr) { + diff->clear(); + return false; + } + + diff->resize(output_size); + for (size_t i = 0; i < output_size; ++i) { + (*diff)[i] = output_data[i] - input_data[i]; + } + return true; + } + + inline bool apply_condition_cache_diff(const std::vector& diff, + const sd::Tensor& input, + sd::Tensor* output) { + if (output == nullptr || input.empty() || diff.empty()) { + return false; + } + + size_t input_size = static_cast(input.numel()); + if (input_size == 0 || diff.size() != input_size) { + return false; + } + + *output = input; + float* output_data = output->data(); + if (output_data == nullptr) { + return false; + } + + for (size_t i = 0; i < input_size; ++i) { + output_data[i] += diff[i]; + } + return 
true; + } + +} // namespace sd + +#endif // __CONDITION_CACHE_UTILS_HPP__ diff --git a/src/conditioner.hpp b/src/conditioner.hpp index 534a2f1..05167cf 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -1,39 +1,85 @@ #ifndef __CONDITIONER_HPP__ #define __CONDITIONER_HPP__ +#include + #include "clip.hpp" #include "llm.hpp" #include "t5.hpp" +#include "tensor_ggml.hpp" struct SDCondition { - ggml_tensor* c_crossattn = nullptr; // aka context - ggml_tensor* c_vector = nullptr; // aka y - ggml_tensor* c_concat = nullptr; + sd::Tensor c_crossattn; + sd::Tensor c_vector; + sd::Tensor c_concat; + sd::Tensor c_t5_ids; + sd::Tensor c_t5_weights; - std::vector extra_c_crossattns; + std::vector> extra_c_crossattns; SDCondition() = default; - SDCondition(ggml_tensor* c_crossattn, - ggml_tensor* c_vector, - ggml_tensor* c_concat, - const std::vector& extra_c_crossattns = {}) - : c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat), extra_c_crossattns(extra_c_crossattns) {} + + SDCondition(sd::Tensor c_crossattn, + sd::Tensor c_vector, + sd::Tensor c_concat) + : c_crossattn(std::move(c_crossattn)), c_vector(std::move(c_vector)), c_concat(std::move(c_concat)) {} + + bool empty() const { + if (!c_crossattn.empty() || !c_vector.empty() || !c_concat.empty() || + !c_t5_ids.empty() || !c_t5_weights.empty()) { + return false; + } + + for (const auto& tensor : extra_c_crossattns) { + if (!tensor.empty()) { + return false; + } + } + + return true; + } }; +static inline sd::Tensor apply_token_weights(sd::Tensor hidden_states, + const std::vector& weights) { + if (hidden_states.empty()) { + return hidden_states; + } + + if (hidden_states.dim() == 1) { + hidden_states.unsqueeze_(1); + } + + GGML_ASSERT(static_cast(hidden_states.shape()[1]) == weights.size()); + + float original_mean = hidden_states.mean(); + auto chunk_weights = sd::Tensor::from_vector(weights); + chunk_weights.reshape_({1, static_cast(weights.size())}); + hidden_states *= chunk_weights; + float 
new_mean = hidden_states.mean(); + if (new_mean != 0.0f) { + hidden_states *= (original_mean / new_mean); + } + + return hidden_states; +} + struct ConditionerParams { std::string text; - int clip_skip = -1; - int width = -1; - int height = -1; - int adm_in_channels = -1; - bool zero_out_masked = false; - int num_input_imgs = 0; // for photomaker - std::vector ref_images = {}; // for qwen image edit + int clip_skip = -1; + int width = -1; + int height = -1; + int adm_in_channels = -1; + bool zero_out_masked = false; + int num_input_imgs = 0; // for photomaker + const std::vector>* ref_images = nullptr; // for qwen image edit }; struct Conditioner { - virtual SDCondition get_learned_condition(ggml_context* work_ctx, - int n_threads, + virtual ~Conditioner() = default; + +public: + virtual SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) = 0; virtual void alloc_params_buffer() = 0; virtual void free_params_buffer() = 0; @@ -41,13 +87,11 @@ struct Conditioner { virtual size_t get_params_buffer_size() = 0; virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_weight_adapter(const std::shared_ptr& adapter) {} - virtual std::tuple> get_learned_condition_with_trigger(ggml_context* work_ctx, - int n_threads, + virtual std::tuple> get_learned_condition_with_trigger(int n_threads, const ConditionerParams& conditioner_params) { GGML_ABORT("Not implemented yet!"); } - virtual std::string remove_trigger_from_prompt(ggml_context* work_ctx, - const std::string& prompt) { + virtual std::string remove_trigger_from_prompt(const std::string& prompt) { GGML_ABORT("Not implemented yet!"); } }; @@ -426,8 +470,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { return {tokens, weights}; } - SDCondition get_learned_condition_common(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition_common(int n_threads, std::vector& tokens, std::vector& weights, int clip_skip, @@ -435,13 +478,9 
@@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { int height, int adm_in_channels = -1, bool zero_out_masked = false) { - int64_t t0 = ggml_time_ms(); - ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size] - ggml_tensor* chunk_hidden_states = nullptr; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2] - ggml_tensor* chunk_hidden_states1 = nullptr; // [n_token, hidden_size] - ggml_tensor* chunk_hidden_states2 = nullptr; // [n_token, hidden_size2] - ggml_tensor* pooled = nullptr; - std::vector hidden_states_vec; + int64_t t0 = ggml_time_ms(); + sd::Tensor hidden_states; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2] + sd::Tensor pooled; if (clip_skip <= 0) { clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1; @@ -455,9 +494,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::vector chunk_weights(weights.begin() + chunk_idx * chunk_len, weights.begin() + (chunk_idx + 1) * chunk_len); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); - ggml_tensor* input_ids2 = nullptr; - size_t max_token_idx = 0; + sd::Tensor input_ids({static_cast(chunk_tokens.size())}, chunk_tokens); + sd::Tensor input_ids2; + size_t max_token_idx = 0; if (sd_version_is_sdxl(version)) { auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID); if (it != chunk_tokens.end()) { @@ -466,7 +505,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1); - input_ids2 = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); + input_ids2 = sd::Tensor({static_cast(chunk_tokens.size())}, chunk_tokens); // for (int i = 0; i < chunk_tokens.size(); i++) { // printf("%d ", chunk_tokens[i]); @@ -475,118 +514,87 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } { - text_model->compute(n_threads, - input_ids, - num_custom_embeddings, - 
token_embed_custom.data(), - max_token_idx, - false, - clip_skip, - &chunk_hidden_states1, - work_ctx); + auto chunk_hidden_states = text_model->compute(n_threads, + input_ids, + num_custom_embeddings, + token_embed_custom.data(), + max_token_idx, + false, + clip_skip); + GGML_ASSERT(!chunk_hidden_states.empty()); if (sd_version_is_sdxl(version)) { - text_model2->compute(n_threads, - input_ids2, - num_custom_embeddings, - token_embed_custom.data(), - max_token_idx, - false, - clip_skip, - &chunk_hidden_states2, work_ctx); - // concat - chunk_hidden_states = ggml_ext_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0); + auto chunk_hidden_states2 = text_model2->compute(n_threads, + input_ids2, + num_custom_embeddings, + token_embed_custom.data(), + max_token_idx, + false, + clip_skip); + GGML_ASSERT(!chunk_hidden_states2.empty()); + chunk_hidden_states = sd::ops::concat(chunk_hidden_states, chunk_hidden_states2, 0); if (chunk_idx == 0) { - text_model2->compute(n_threads, - input_ids2, - num_custom_embeddings, - token_embed_custom.data(), - max_token_idx, - true, - clip_skip, - &pooled, - work_ctx); + pooled = text_model2->compute(n_threads, + input_ids2, + num_custom_embeddings, + token_embed_custom.data(), + max_token_idx, + true, + clip_skip); + GGML_ASSERT(!pooled.empty()); } - } else { - chunk_hidden_states = chunk_hidden_states1; } - } + int64_t t1 = ggml_time_ms(); + LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - int64_t t1 = ggml_time_ms(); - LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - ggml_tensor* result = ggml_dup_tensor(work_ctx, chunk_hidden_states); - { - float original_mean = ggml_ext_tensor_mean(chunk_hidden_states); - for (int i2 = 0; i2 < chunk_hidden_states->ne[2]; i2++) { - for (int i1 = 0; i1 < chunk_hidden_states->ne[1]; i1++) { - for (int i0 = 0; i0 < chunk_hidden_states->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(chunk_hidden_states, i0, 
i1, i2); - value *= chunk_weights[i1]; - ggml_ext_tensor_set_f32(result, value, i0, i1, i2); - } - } + chunk_hidden_states = apply_token_weights(std::move(chunk_hidden_states), chunk_weights); + + if (zero_out_masked) { + chunk_hidden_states.fill_(0.0f); } - float new_mean = ggml_ext_tensor_mean(result); - ggml_ext_tensor_scale_inplace(result, (original_mean / new_mean)); - } - if (zero_out_masked) { - float* vec = (float*)result->data; - for (int i = 0; i < ggml_nelements(result); i++) { - vec[i] = 0; + if (!hidden_states.empty()) { + hidden_states = sd::ops::concat(hidden_states, chunk_hidden_states, 1); + } else { + hidden_states = std::move(chunk_hidden_states); } } - hidden_states_vec.insert(hidden_states_vec.end(), (float*)result->data, ((float*)result->data) + ggml_nelements(result)); } - hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); - hidden_states = ggml_reshape_2d(work_ctx, - hidden_states, - chunk_hidden_states->ne[0], - ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); - - ggml_tensor* vec = nullptr; + sd::Tensor vec; if (sd_version_is_sdxl(version)) { int out_dim = 256; - vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels); - // [0:1280] + GGML_ASSERT(!pooled.empty()); + vec = sd::Tensor({adm_in_channels}); + vec.fill_(0.0f); size_t offset = 0; - memcpy(vec->data, pooled->data, ggml_nbytes(pooled)); - offset += ggml_nbytes(pooled); + std::copy(pooled.values().begin(), pooled.values().end(), vec.values().begin()); + offset += pooled.values().size(); - // original_size_as_tuple - float orig_width = (float)width; - float orig_height = (float)height; - std::vector timesteps = {orig_height, orig_width}; + auto append_embedding = [&](const std::vector& timesteps) { + sd::Tensor embedding; + set_timestep_embedding(timesteps, &embedding, out_dim); + std::copy(embedding.values().begin(), embedding.values().end(), vec.values().begin() + static_cast(offset)); + offset += embedding.values().size(); + }; - 
ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); - offset += ggml_nbytes(embed_view); - set_timestep_embedding(timesteps, embed_view, out_dim); - // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); - // crop_coords_top_left - float crop_coord_top = 0.f; - float crop_coord_left = 0.f; - timesteps = {crop_coord_top, crop_coord_left}; - embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); - offset += ggml_nbytes(embed_view); - set_timestep_embedding(timesteps, embed_view, out_dim); - // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); - // target_size_as_tuple - float target_width = (float)width; - float target_height = (float)height; - timesteps = {target_height, target_width}; - embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); - offset += ggml_nbytes(embed_view); - set_timestep_embedding(timesteps, embed_view, out_dim); - // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); - GGML_ASSERT(offset == ggml_nbytes(vec)); + append_embedding({static_cast(height), static_cast(width)}); + append_embedding({0.0f, 0.0f}); + append_embedding({static_cast(height), static_cast(width)}); + GGML_ASSERT(offset == vec.values().size()); } - // print_ggml_tensor(result); - return {hidden_states, vec, nullptr}; + SDCondition result; + if (!hidden_states.empty()) { + result.c_crossattn = std::move(hidden_states); + } + + if (!vec.empty()) { + result.c_vector = std::move(vec); + } + return result; } std::tuple> - get_learned_condition_with_trigger(ggml_context* work_ctx, - int n_threads, + get_learned_condition_with_trigger(int n_threads, const ConditionerParams& conditioner_params) override { auto image_tokens = convert_token_to_id(trigger_word); // if(image_tokens.size() == 1){ @@ -608,8 +616,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public 
Conditioner { // for(int i = 0; i < clsm.size(); ++i) // printf("%d ", clsm[i]?1:0); // printf("\n"); - auto cond = get_learned_condition_common(work_ctx, - n_threads, + auto cond = get_learned_condition_common(n_threads, tokens, weights, conditioner_params.clip_skip, @@ -620,8 +627,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { return std::make_tuple(cond, clsm); } - std::string remove_trigger_from_prompt(ggml_context* work_ctx, - const std::string& prompt) override { + std::string remove_trigger_from_prompt(const std::string& prompt) override { auto image_tokens = convert_token_to_id(trigger_word); GGML_ASSERT(image_tokens.size() == 1); auto tokens_and_weights = tokenize(prompt, false); @@ -632,14 +638,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { return decode(tokens); } - SDCondition get_learned_condition(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { auto tokens_and_weights = tokenize(conditioner_params.text, true); std::vector& tokens = tokens_and_weights.first; std::vector& weights = tokens_and_weights.second; - return get_learned_condition_common(work_ctx, - n_threads, + return get_learned_condition_common(n_threads, tokens, weights, conditioner_params.clip_skip, @@ -680,10 +684,9 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner { vision_model.get_param_tensors(tensors, "cond_stage_model.transformer"); } - ggml_cgraph* build_graph(ggml_tensor* pixel_values, bool return_pooled, int clip_skip) { - ggml_cgraph* gf = ggml_new_graph(compute_ctx); - - pixel_values = to_backend(pixel_values); + ggml_cgraph* build_graph(const sd::Tensor& pixel_values_tensor, bool return_pooled, int clip_skip) { + ggml_cgraph* gf = ggml_new_graph(compute_ctx); + ggml_tensor* pixel_values = make_input(pixel_values_tensor); auto runner_ctx = get_context(); @@ -694,16 +697,14 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner { 
return gf; } - bool compute(const int n_threads, - ggml_tensor* pixel_values, - bool return_pooled, - int clip_skip, - ggml_tensor** output, - ggml_context* output_ctx) { + sd::Tensor compute(const int n_threads, + const sd::Tensor& pixel_values, + bool return_pooled, + int clip_skip) { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(pixel_values, return_pooled, clip_skip); }; - return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); + return take_or_empty(GGMLRunner::compute(get_graph, n_threads, true)); } }; @@ -893,8 +894,7 @@ struct SD3CLIPEmbedder : public Conditioner { return {{clip_l_tokens, clip_l_weights}, {clip_g_tokens, clip_g_weights}, {t5_tokens, t5_weights}}; } - SDCondition get_learned_condition_common(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition_common(int n_threads, std::vector, std::vector>> token_and_weights, int clip_skip, bool zero_out_masked = false) { @@ -909,232 +909,155 @@ struct SD3CLIPEmbedder : public Conditioner { clip_skip = 2; } - int64_t t0 = ggml_time_ms(); - ggml_tensor* hidden_states = nullptr; // [N, n_token*2, 4096] - ggml_tensor* chunk_hidden_states = nullptr; // [n_token*2, 4096] - ggml_tensor* chunk_hidden_states_l = nullptr; // [n_token, hidden_size_l] - ggml_tensor* chunk_hidden_states_g = nullptr; // [n_token, hidden_size_g] - ggml_tensor* chunk_hidden_states_t5 = nullptr; // [n_token, hidden_size_t5] - ggml_tensor* pooled = nullptr; - ggml_tensor* pooled_l = nullptr; // [768,] - ggml_tensor* pooled_g = nullptr; // [1280,] - std::vector hidden_states_vec; + size_t chunk_len = 77; + int64_t t0 = ggml_time_ms(); + sd::Tensor hidden_states; + sd::Tensor pooled; - size_t chunk_len = 77; size_t chunk_count = std::max(std::max(clip_l_tokens.size(), clip_g_tokens.size()), t5_tokens.size()) / chunk_len; + for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) { // clip_l + sd::Tensor chunk_hidden_states_l; + sd::Tensor pooled_l; if (clip_l) { std::vector 
chunk_tokens(clip_l_tokens.begin() + chunk_idx * chunk_len, clip_l_tokens.begin() + (chunk_idx + 1) * chunk_len); std::vector chunk_weights(clip_l_weights.begin() + chunk_idx * chunk_len, clip_l_weights.begin() + (chunk_idx + 1) * chunk_len); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); + sd::Tensor input_ids({static_cast(chunk_tokens.size())}, chunk_tokens); size_t max_token_idx = 0; - clip_l->compute(n_threads, - input_ids, - 0, - nullptr, - max_token_idx, - false, - clip_skip, - &chunk_hidden_states_l, - work_ctx); - { - auto tensor = chunk_hidden_states_l; - float original_mean = ggml_ext_tensor_mean(tensor); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2); - value *= chunk_weights[i1]; - ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2); - } - } - } - float new_mean = ggml_ext_tensor_mean(tensor); - ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); - } + chunk_hidden_states_l = clip_l->compute(n_threads, + input_ids, + 0, + nullptr, + max_token_idx, + false, + clip_skip); + GGML_ASSERT(!chunk_hidden_states_l.empty()); + chunk_hidden_states_l = ::apply_token_weights(std::move(chunk_hidden_states_l), chunk_weights); if (chunk_idx == 0) { auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID); max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1); - clip_l->compute(n_threads, - input_ids, - 0, - nullptr, - max_token_idx, - true, - clip_skip, - &pooled_l, - work_ctx); + pooled_l = clip_l->compute(n_threads, + input_ids, + 0, + nullptr, + max_token_idx, + true, + clip_skip); + GGML_ASSERT(!pooled_l.empty()); } } else { - chunk_hidden_states_l = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, chunk_len); - ggml_set_f32(chunk_hidden_states_l, 0.f); + chunk_hidden_states_l = sd::Tensor::zeros({768, 
static_cast(chunk_len), 1}); if (chunk_idx == 0) { - pooled_l = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768); - ggml_set_f32(pooled_l, 0.f); + pooled = sd::Tensor::zeros({768, 1}); } } // clip_g + sd::Tensor chunk_hidden_states_g; + sd::Tensor pooled_g; if (clip_g) { std::vector chunk_tokens(clip_g_tokens.begin() + chunk_idx * chunk_len, clip_g_tokens.begin() + (chunk_idx + 1) * chunk_len); std::vector chunk_weights(clip_g_weights.begin() + chunk_idx * chunk_len, clip_g_weights.begin() + (chunk_idx + 1) * chunk_len); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); + sd::Tensor input_ids({static_cast(chunk_tokens.size())}, chunk_tokens); size_t max_token_idx = 0; - clip_g->compute(n_threads, - input_ids, - 0, - nullptr, - max_token_idx, - false, - clip_skip, - &chunk_hidden_states_g, - work_ctx); - - { - auto tensor = chunk_hidden_states_g; - float original_mean = ggml_ext_tensor_mean(tensor); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2); - value *= chunk_weights[i1]; - ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2); - } - } - } - float new_mean = ggml_ext_tensor_mean(tensor); - ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); - } + chunk_hidden_states_g = clip_g->compute(n_threads, + input_ids, + 0, + nullptr, + max_token_idx, + false, + clip_skip); + GGML_ASSERT(!chunk_hidden_states_g.empty()); + chunk_hidden_states_g = ::apply_token_weights(std::move(chunk_hidden_states_g), chunk_weights); if (chunk_idx == 0) { auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_g_tokenizer.EOS_TOKEN_ID); max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1); - clip_g->compute(n_threads, - input_ids, - 0, - nullptr, - max_token_idx, - true, - clip_skip, - &pooled_g, - work_ctx); + pooled_g = clip_g->compute(n_threads, + 
input_ids, + 0, + nullptr, + max_token_idx, + true, + clip_skip); + GGML_ASSERT(!pooled_g.empty()); } } else { - chunk_hidden_states_g = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 1280, chunk_len); - ggml_set_f32(chunk_hidden_states_g, 0.f); + chunk_hidden_states_g = sd::Tensor::zeros({1280, static_cast(chunk_len), 1}); if (chunk_idx == 0) { - pooled_g = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1280); - ggml_set_f32(pooled_g, 0.f); + pooled_g = sd::Tensor::zeros({1280, 1}); } } // t5 + sd::Tensor chunk_hidden_states_t5; if (t5) { std::vector chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len, t5_tokens.begin() + (chunk_idx + 1) * chunk_len); std::vector chunk_weights(t5_weights.begin() + chunk_idx * chunk_len, t5_weights.begin() + (chunk_idx + 1) * chunk_len); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); + sd::Tensor input_ids({static_cast(chunk_tokens.size())}, chunk_tokens); - t5->compute(n_threads, - input_ids, - nullptr, - &chunk_hidden_states_t5, - work_ctx); - { - auto tensor = chunk_hidden_states_t5; - float original_mean = ggml_ext_tensor_mean(tensor); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2); - value *= chunk_weights[i1]; - ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2); - } - } - } - float new_mean = ggml_ext_tensor_mean(tensor); - ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); - } + chunk_hidden_states_t5 = t5->compute(n_threads, + input_ids, + sd::Tensor()); + GGML_ASSERT(!chunk_hidden_states_t5.empty()); + chunk_hidden_states_t5 = ::apply_token_weights(std::move(chunk_hidden_states_t5), chunk_weights); } else { - chunk_hidden_states_t5 = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len); - ggml_set_f32(chunk_hidden_states_t5, 0.f); + chunk_hidden_states_t5 = sd::Tensor::zeros({4096, static_cast(chunk_len), 1}); } - auto 
chunk_hidden_states_lg_pad = ggml_new_tensor_3d(work_ctx, - chunk_hidden_states_l->type, - 4096, - chunk_hidden_states_l->ne[1], - chunk_hidden_states_l->ne[2]); // [n_token, 4096] - - for (int i2 = 0; i2 < chunk_hidden_states_lg_pad->ne[2]; i2++) { - for (int i1 = 0; i1 < chunk_hidden_states_lg_pad->ne[1]; i1++) { - for (int i0 = 0; i0 < chunk_hidden_states_lg_pad->ne[0]; i0++) { - float value = 0.f; - if (i0 < chunk_hidden_states_l->ne[0]) { - value = ggml_ext_tensor_get_f32(chunk_hidden_states_l, i0, i1, i2); - } else if (i0 < chunk_hidden_states_l->ne[0] + chunk_hidden_states_g->ne[0]) { - value = ggml_ext_tensor_get_f32(chunk_hidden_states_g, i0 - chunk_hidden_states_l->ne[0], i1, i2); - } - ggml_ext_tensor_set_f32(chunk_hidden_states_lg_pad, value, i0, i1, i2); - } - } + sd::Tensor chunk_hidden_states_lg = sd::ops::concat(chunk_hidden_states_l, chunk_hidden_states_g, 0); + if (chunk_hidden_states_lg.shape()[0] < 4096) { + auto pad_shape = chunk_hidden_states_lg.shape(); + pad_shape[0] = 4096 - chunk_hidden_states_lg.shape()[0]; + chunk_hidden_states_lg = sd::ops::concat(chunk_hidden_states_lg, + sd::Tensor::zeros(pad_shape), + 0); } - chunk_hidden_states = ggml_ext_tensor_concat(work_ctx, chunk_hidden_states_lg_pad, chunk_hidden_states_t5, 1); // [n_token*2, 4096] + sd::Tensor chunk_hidden_states = sd::ops::concat(chunk_hidden_states_lg, + chunk_hidden_states_t5, + 1); // [n_token*2, 4096] if (chunk_idx == 0) { - pooled = ggml_ext_tensor_concat(work_ctx, pooled_l, pooled_g, 0); // [768 + 1280] + pooled = sd::ops::concat(pooled_l, pooled_g, 0); // [768 + 1280] } int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); if (zero_out_masked) { - float* vec = (float*)chunk_hidden_states->data; - for (int i = 0; i < ggml_nelements(chunk_hidden_states); i++) { - vec[i] = 0; - } + chunk_hidden_states.fill_(0.0f); } - hidden_states_vec.insert(hidden_states_vec.end(), - (float*)chunk_hidden_states->data, - 
((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states)); + if (!hidden_states.empty()) { + hidden_states = sd::ops::concat(hidden_states, chunk_hidden_states, 1); + } else { + hidden_states = std::move(chunk_hidden_states); + } } - if (hidden_states_vec.size() > 0) { - hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); - hidden_states = ggml_reshape_2d(work_ctx, - hidden_states, - chunk_hidden_states->ne[0], - ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); - } else { - hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256); - ggml_set_f32(hidden_states, 0.f); - } - if (pooled == nullptr) { - pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2048); - ggml_set_f32(pooled, 0.f); - } - return {hidden_states, pooled, nullptr}; + SDCondition result; + result.c_crossattn = std::move(hidden_states); + result.c_vector = std::move(pooled); + return result; } - SDCondition get_learned_condition(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { auto tokens_and_weights = tokenize(conditioner_params.text, 77, true); - return get_learned_condition_common(work_ctx, - n_threads, + return get_learned_condition_common(n_threads, tokens_and_weights, conditioner_params.clip_skip, conditioner_params.zero_out_masked); @@ -1292,8 +1215,7 @@ struct FluxCLIPEmbedder : public Conditioner { return {{clip_l_tokens, clip_l_weights}, {t5_tokens, t5_weights}}; } - SDCondition get_learned_condition_common(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition_common(int n_threads, std::vector, std::vector>> token_and_weights, int clip_skip, bool zero_out_masked = false) { @@ -1306,11 +1228,9 @@ struct FluxCLIPEmbedder : public Conditioner { clip_skip = 2; } - int64_t t0 = ggml_time_ms(); - ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096] - ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096] 
- ggml_tensor* pooled = nullptr; // [768,] - std::vector hidden_states_vec; + int64_t t0 = ggml_time_ms(); + sd::Tensor hidden_states; // [N, n_token, 4096] + sd::Tensor pooled; // [768,] size_t chunk_count = std::max(clip_l_tokens.size() > 0 ? chunk_len : 0, t5_tokens.size()) / chunk_len; for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) { @@ -1323,95 +1243,65 @@ struct FluxCLIPEmbedder : public Conditioner { std::vector chunk_weights(clip_l_weights.begin(), clip_l_weights.begin() + chunk_len_l); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); + sd::Tensor input_ids({static_cast(chunk_tokens.size())}, chunk_tokens); size_t max_token_idx = 0; auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID); max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1); - clip_l->compute(n_threads, - input_ids, - 0, - nullptr, - max_token_idx, - true, - clip_skip, - &pooled, - work_ctx); + pooled = clip_l->compute(n_threads, + input_ids, + 0, + nullptr, + max_token_idx, + true, + clip_skip); + GGML_ASSERT(!pooled.empty()); + } else { + pooled = sd::Tensor::zeros({768}); } } // t5 + sd::Tensor chunk_hidden_states; if (t5) { std::vector chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len, t5_tokens.begin() + (chunk_idx + 1) * chunk_len); std::vector chunk_weights(t5_weights.begin() + chunk_idx * chunk_len, t5_weights.begin() + (chunk_idx + 1) * chunk_len); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); - - t5->compute(n_threads, - input_ids, - nullptr, - &chunk_hidden_states, - work_ctx); - { - auto tensor = chunk_hidden_states; - float original_mean = ggml_ext_tensor_mean(tensor); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2); - value *= chunk_weights[i1]; - ggml_ext_tensor_set_f32(tensor, value, i0, 
i1, i2); - } - } - } - float new_mean = ggml_ext_tensor_mean(tensor); - ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); + sd::Tensor input_ids({static_cast(chunk_tokens.size())}, chunk_tokens); + chunk_hidden_states = t5->compute(n_threads, + input_ids, + sd::Tensor()); + GGML_ASSERT(!chunk_hidden_states.empty()); + chunk_hidden_states = ::apply_token_weights(std::move(chunk_hidden_states), chunk_weights); + if (zero_out_masked) { + chunk_hidden_states.fill_(0.0f); } } else { - chunk_hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len); - ggml_set_f32(chunk_hidden_states, 0.f); + chunk_hidden_states = sd::Tensor::zeros({4096, static_cast(chunk_len)}); } int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - if (zero_out_masked) { - float* vec = (float*)chunk_hidden_states->data; - for (int i = 0; i < ggml_nelements(chunk_hidden_states); i++) { - vec[i] = 0; - } + if (!hidden_states.empty()) { + hidden_states = sd::ops::concat(hidden_states, chunk_hidden_states, 1); + } else { + hidden_states = std::move(chunk_hidden_states); } - - hidden_states_vec.insert(hidden_states_vec.end(), - (float*)chunk_hidden_states->data, - ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states)); } - if (hidden_states_vec.size() > 0) { - hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); - hidden_states = ggml_reshape_2d(work_ctx, - hidden_states, - chunk_hidden_states->ne[0], - ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); - } else { - hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256); - ggml_set_f32(hidden_states, 0.f); - } - if (pooled == nullptr) { - pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768); - ggml_set_f32(pooled, 0.f); - } - return {hidden_states, pooled, nullptr}; + SDCondition result; + result.c_crossattn = std::move(hidden_states); + result.c_vector = std::move(pooled); + return result; } - 
SDCondition get_learned_condition(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true); - return get_learned_condition_common(work_ctx, - n_threads, + return get_learned_condition_common(n_threads, tokens_and_weights, conditioner_params.clip_skip, conditioner_params.zero_out_masked); @@ -1523,8 +1413,9 @@ struct T5CLIPEmbedder : public Conditioner { return {t5_tokens, t5_weights, t5_mask}; } - void modify_mask_to_attend_padding(ggml_tensor* mask, int max_seq_length, int num_extra_padding = 8) { - float* mask_data = (float*)mask->data; + void modify_mask_to_attend_padding(sd::Tensor* mask, int max_seq_length, int num_extra_padding = 8) { + GGML_ASSERT(mask != nullptr); + float* mask_data = mask->data(); int num_pad = 0; for (int64_t i = 0; i < max_seq_length; i++) { if (num_pad >= num_extra_padding) { @@ -1538,29 +1429,23 @@ struct T5CLIPEmbedder : public Conditioner { // LOG_DEBUG("PAD: %d", num_pad); } - SDCondition get_learned_condition_common(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition_common(int n_threads, std::tuple, std::vector, std::vector> token_and_weights, int clip_skip, bool zero_out_masked = false) { if (!t5) { - auto hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256); - ggml_set_f32(hidden_states, 0.f); - auto t5_attn_mask = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 256); - ggml_set_f32(t5_attn_mask, -HUGE_VALF); - return {hidden_states, t5_attn_mask, nullptr}; + SDCondition result; + result.c_crossattn = sd::Tensor::zeros({4096, 256}); + result.c_vector = sd::Tensor::full({256}, -HUGE_VALF); + return result; } auto& t5_tokens = std::get<0>(token_and_weights); auto& t5_weights = std::get<1>(token_and_weights); auto& t5_attn_mask_vec = std::get<2>(token_and_weights); - int64_t t0 = ggml_time_ms(); - ggml_tensor* hidden_states = 
nullptr; // [N, n_token, 4096] - ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096] - ggml_tensor* pooled = nullptr; - ggml_tensor* t5_attn_mask = vector_to_ggml_tensor(work_ctx, t5_attn_mask_vec); // [n_token] - - std::vector hidden_states_vec; + int64_t t0 = ggml_time_ms(); + sd::Tensor t5_attn_mask = sd::Tensor::from_vector(t5_attn_mask_vec); + sd::Tensor hidden_states; size_t chunk_count = t5_tokens.size() / chunk_len; @@ -1573,68 +1458,46 @@ struct T5CLIPEmbedder : public Conditioner { std::vector chunk_mask(t5_attn_mask_vec.begin() + chunk_idx * chunk_len, t5_attn_mask_vec.begin() + (chunk_idx + 1) * chunk_len); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); - auto t5_attn_mask_chunk = use_mask ? vector_to_ggml_tensor(work_ctx, chunk_mask) : nullptr; + sd::Tensor input_ids({static_cast(chunk_tokens.size())}, chunk_tokens); + sd::Tensor t5_attn_mask_chunk; + if (use_mask) { + t5_attn_mask_chunk = sd::Tensor({static_cast(chunk_mask.size())}, chunk_mask); + } - t5->compute(n_threads, - input_ids, - t5_attn_mask_chunk, - &chunk_hidden_states, - work_ctx); - { - auto tensor = chunk_hidden_states; - float original_mean = ggml_ext_tensor_mean(tensor); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2); - value *= chunk_weights[i1]; - ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2); - } - } - } - float new_mean = ggml_ext_tensor_mean(tensor); - ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); + auto chunk_hidden_states = t5->compute(n_threads, + input_ids, + t5_attn_mask_chunk); + GGML_ASSERT(!chunk_hidden_states.empty()); + chunk_hidden_states = apply_token_weights(std::move(chunk_hidden_states), chunk_weights); + + if (zero_out_masked) { + auto chunk_mask_tensor = sd::Tensor::from_vector(chunk_mask) + .reshape_({1, static_cast(chunk_mask.size())}); + 
chunk_hidden_states.masked_fill_(chunk_mask_tensor < 0.0f, 0.0f); } int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - if (zero_out_masked) { - auto tensor = chunk_hidden_states; - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - if (chunk_mask[i1] < 0.f) { - ggml_ext_tensor_set_f32(tensor, 0.f, i0, i1, i2); - } - } - } - } - } - hidden_states_vec.insert(hidden_states_vec.end(), - (float*)chunk_hidden_states->data, - ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states)); + if (!hidden_states.empty()) { + hidden_states = sd::ops::concat(hidden_states, chunk_hidden_states, 1); + } else { + hidden_states = std::move(chunk_hidden_states); + } } - GGML_ASSERT(hidden_states_vec.size() > 0); - hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); - hidden_states = ggml_reshape_2d(work_ctx, - hidden_states, - chunk_hidden_states->ne[0], - ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); + modify_mask_to_attend_padding(&t5_attn_mask, static_cast(t5_attn_mask.numel()), mask_pad); - modify_mask_to_attend_padding(t5_attn_mask, static_cast(ggml_nelements(t5_attn_mask)), mask_pad); - - return {hidden_states, t5_attn_mask, nullptr}; + SDCondition result; + result.c_crossattn = std::move(hidden_states); + result.c_vector = std::move(t5_attn_mask); + return result; } - SDCondition get_learned_condition(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true); - return get_learned_condition_common(work_ctx, - n_threads, + return get_learned_condition_common(n_threads, tokens_and_weights, conditioner_params.clip_skip, conditioner_params.zero_out_masked); @@ -1723,8 +1586,7 @@ struct AnimaConditioner : public Conditioner { 
return {qwen_tokens, qwen_weights, t5_tokens, t5_weights}; } - SDCondition get_learned_condition(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { int64_t t0 = ggml_time_ms(); @@ -1734,46 +1596,25 @@ struct AnimaConditioner : public Conditioner { auto& t5_tokens = std::get<2>(tokenized); auto& t5_weights = std::get<3>(tokenized); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, qwen_tokens); - - ggml_tensor* hidden_states = nullptr; // [N, n_token, 1024] - llm->compute(n_threads, - input_ids, - nullptr, - {}, - {}, - &hidden_states, - work_ctx); - - { - auto tensor = hidden_states; - float original_mean = ggml_ext_tensor_mean(tensor); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2); - value *= qwen_weights[i1]; - ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2); - } - } - } - float new_mean = ggml_ext_tensor_mean(tensor); - if (new_mean != 0.f) { - ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); - } - } - - ggml_tensor* t5_ids_tensor = nullptr; - ggml_tensor* t5_weight_tensor = nullptr; - if (!t5_tokens.empty()) { - t5_ids_tensor = vector_to_ggml_tensor_i32(work_ctx, t5_tokens); - t5_weight_tensor = vector_to_ggml_tensor(work_ctx, t5_weights); - } + sd::Tensor input_ids({static_cast(qwen_tokens.size()), 1}, qwen_tokens); + auto hidden_states = llm->compute(n_threads, + input_ids, + sd::Tensor(), + {}, + {}); + GGML_ASSERT(!hidden_states.empty()); + hidden_states = apply_token_weights(std::move(hidden_states), qwen_weights); + auto t5_ids_tensor = sd::Tensor::from_vector(t5_tokens); + auto t5_weight_tensor = sd::Tensor::from_vector(t5_weights); int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - return {hidden_states, 
t5_weight_tensor, t5_ids_tensor}; + SDCondition result; + result.c_crossattn = std::move(hidden_states); + result.c_t5_ids = std::move(t5_ids_tensor); + result.c_t5_weights = std::move(t5_weight_tensor); + return result; } }; @@ -1884,15 +1725,14 @@ struct LLMEmbedder : public Conditioner { return {tokens, weights}; } - ggml_tensor* encode_prompt(ggml_context* work_ctx, - int n_threads, - const std::string prompt, - const std::pair& prompt_attn_range, - int max_length, - int min_length, - std::vector> image_embeds, - const std::set& out_layers, - int prompt_template_encode_start_idx) { + sd::Tensor encode_prompt(int n_threads, + const std::string prompt, + const std::pair& prompt_attn_range, + int max_length, + int min_length, + const std::vector>>& image_embeds, + const std::set& out_layers, + int prompt_template_encode_start_idx) { auto tokens_and_weights = tokenize(prompt, prompt_attn_range); auto& tokens = std::get<0>(tokens_and_weights); auto& weights = std::get<1>(tokens_and_weights); @@ -1904,81 +1744,59 @@ struct LLMEmbedder : public Conditioner { tokenizer->pad_tokens(tokens, weights, max_length, true); } - ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size] - - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - - ggml_tensor* attention_mask = nullptr; + sd::Tensor input_ids({static_cast(tokens.size())}, tokens); + sd::Tensor attention_mask; if (!mask.empty()) { - attention_mask = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, mask.size(), mask.size()); - ggml_ext_tensor_iter(attention_mask, [&](ggml_tensor* attention_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = 0.f; - if (mask[i0] == 0.f) { - value = -INFINITY; - } else if (i0 > i1) { - value = -INFINITY; - } - ggml_ext_tensor_set_f32(attention_mask, value, i0, i1, i2, i3); - }); - } - - llm->compute(n_threads, - input_ids, - attention_mask, - image_embeds, - out_layers, - &hidden_states, - work_ctx); - { - auto tensor = hidden_states; - float 
original_mean = ggml_ext_tensor_mean(tensor); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2); - value *= weights[i1]; - ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2); + attention_mask = sd::Tensor({static_cast(mask.size()), static_cast(mask.size())}); + for (size_t i1 = 0; i1 < mask.size(); ++i1) { + for (size_t i0 = 0; i0 < mask.size(); ++i0) { + float value = 0.0f; + if (mask[i0] == 0.0f || i0 > i1) { + value = -INFINITY; } + attention_mask[static_cast(i0 + mask.size() * i1)] = value; } } - float new_mean = ggml_ext_tensor_mean(tensor); - ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); } - GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx); + auto hidden_states = llm->compute(n_threads, + input_ids, + attention_mask, + image_embeds, + out_layers); + GGML_ASSERT(!hidden_states.empty()); + hidden_states = apply_token_weights(std::move(hidden_states), weights); + GGML_ASSERT(hidden_states.shape()[1] > prompt_template_encode_start_idx); int64_t zero_pad_len = 0; if (min_length > 0) { - if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) { - zero_pad_len = min_length - hidden_states->ne[1] + prompt_template_encode_start_idx; + if (hidden_states.shape()[1] - prompt_template_encode_start_idx < min_length) { + zero_pad_len = min_length - hidden_states.shape()[1] + prompt_template_encode_start_idx; } } - ggml_tensor* new_hidden_states = ggml_new_tensor_3d(work_ctx, - GGML_TYPE_F32, - hidden_states->ne[0], - hidden_states->ne[1] - prompt_template_encode_start_idx + zero_pad_len, - hidden_states->ne[2]); - - ggml_ext_tensor_iter(new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = 0.f; - if (i1 + prompt_template_encode_start_idx < hidden_states->ne[1]) { - value = 
ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3); - } - ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3); - }); + sd::Tensor new_hidden_states = sd::ops::slice(hidden_states, + 1, + prompt_template_encode_start_idx, + hidden_states.shape()[1]); + if (zero_pad_len > 0) { + auto pad_shape = new_hidden_states.shape(); + pad_shape[1] = zero_pad_len; + new_hidden_states = sd::ops::concat(new_hidden_states, + sd::Tensor::zeros(std::move(pad_shape)), + 1); + } return new_hidden_states; } - SDCondition get_learned_condition(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { std::string prompt; std::pair prompt_attn_range; std::vector extra_prompts; std::vector> extra_prompts_attn_range; - std::vector> image_embeds; + std::vector>> image_embeds; int prompt_template_encode_start_idx = 34; int max_length = 0; // pad tokens int min_length = 0; // zero pad hidden_states @@ -1987,7 +1805,7 @@ struct LLMEmbedder : public Conditioner { int64_t t0 = ggml_time_ms(); if (sd_version_is_qwen_image(version)) { - if (llm->enable_vision && !conditioner_params.ref_images.empty()) { + if (llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty()) { LOG_INFO("QwenImageEditPlusPipeline"); prompt_template_encode_start_idx = 64; int image_embed_idx = 64 + 6; @@ -1997,13 +1815,13 @@ struct LLMEmbedder : public Conditioner { std::string placeholder = "<|image_pad|>"; std::string img_prompt; - for (int i = 0; i < conditioner_params.ref_images.size(); i++) { - sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]); - double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size; - int height = image.height; - int width = image.width; - int h_bar = static_cast(std::round(height / factor) * factor); - int w_bar = static_cast(std::round(width / 
factor) * factor); + for (int i = 0; i < conditioner_params.ref_images->size(); i++) { + const auto& image = (*conditioner_params.ref_images)[i]; + double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size; + int height = static_cast(image.shape()[1]); + int width = static_cast(image.shape()[0]); + int h_bar = static_cast(std::round(height / factor) * factor); + int w_bar = static_cast(std::round(width / factor) * factor); if (static_cast(h_bar) * w_bar > max_pixels) { double beta = std::sqrt((height * width) / static_cast(max_pixels)); @@ -2017,24 +1835,17 @@ struct LLMEmbedder : public Conditioner { w_bar = static_cast(std::ceil(width * beta / factor)) * static_cast(factor); } - LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, image.height, image.width, h_bar, w_bar); + LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar); - sd_image_f32_t resized_image = clip_preprocess(image, w_bar, h_bar); - free(image.data); - image.data = nullptr; + auto resized_image = clip_preprocess(image, w_bar, h_bar); - ggml_tensor* image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1); - sd_image_f32_to_ggml_tensor(resized_image, image_tensor, false); - free(resized_image.data); - resized_image.data = nullptr; - - ggml_tensor* image_embed = nullptr; - llm->encode_image(n_threads, image_tensor, &image_embed, work_ctx); + auto image_embed = llm->encode_image(n_threads, resized_image); + GGML_ASSERT(!image_embed.empty()); image_embeds.emplace_back(image_embed_idx, image_embed); - image_embed_idx += 1 + static_cast(image_embed->ne[1]) + 6; + image_embed_idx += 1 + static_cast(image_embed.shape()[1]) + 6; img_prompt += "Picture " + std::to_string(i + 1) + ": <|vision_start|>"; // [24669, 220, index, 25, 220, 151652] - int64_t num_image_tokens = image_embed->ne[1]; + int64_t num_image_tokens = image_embed.shape()[1]; 
img_prompt.reserve(num_image_tokens * placeholder.size()); for (int j = 0; j < num_image_tokens; j++) { img_prompt += placeholder; @@ -2077,10 +1888,10 @@ struct LLMEmbedder : public Conditioner { prompt_template_encode_start_idx = 0; out_layers = {35}; // -2 - if (!conditioner_params.ref_images.empty()) { + if (conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty()) { LOG_INFO("ZImageOmniPipeline"); prompt = "<|im_start|>user\n<|vision_start|>"; - for (int i = 0; i < conditioner_params.ref_images.size() - 1; i++) { + for (int i = 0; i < conditioner_params.ref_images->size() - 1; i++) { extra_prompts.push_back("<|vision_end|><|vision_start|>"); } extra_prompts.push_back("<|vision_end|>" + conditioner_params.text + "<|im_end|>\n<|im_start|>assistant\n<|vision_start|>"); @@ -2121,8 +1932,7 @@ struct LLMEmbedder : public Conditioner { GGML_ABORT("unknown version %d", version); } - auto hidden_states = encode_prompt(work_ctx, - n_threads, + auto hidden_states = encode_prompt(n_threads, prompt, prompt_attn_range, max_length, @@ -2130,11 +1940,9 @@ struct LLMEmbedder : public Conditioner { image_embeds, out_layers, prompt_template_encode_start_idx); - - std::vector extra_hidden_states_vec; + std::vector> extra_hidden_states_vec; for (int i = 0; i < extra_prompts.size(); i++) { - auto extra_hidden_states = encode_prompt(work_ctx, - n_threads, + auto extra_hidden_states = encode_prompt(n_threads, extra_prompts[i], extra_prompts_attn_range[i], max_length, @@ -2142,12 +1950,15 @@ struct LLMEmbedder : public Conditioner { image_embeds, out_layers, prompt_template_encode_start_idx); - extra_hidden_states_vec.push_back(extra_hidden_states); + extra_hidden_states_vec.push_back(std::move(extra_hidden_states)); } int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - return {hidden_states, nullptr, nullptr, extra_hidden_states_vec}; + SDCondition result; + result.c_crossattn = 
std::move(hidden_states); + result.extra_c_crossattns = std::move(extra_hidden_states_vec); + return result; } }; diff --git a/src/control.hpp b/src/control.hpp index 93df10a..d227ec9 100644 --- a/src/control.hpp +++ b/src/control.hpp @@ -310,11 +310,13 @@ struct ControlNet : public GGMLRunner { SDVersion version = VERSION_SD1; ControlNetBlock control_net; - ggml_backend_buffer_t control_buffer = nullptr; // keep control output tensors in backend memory + ggml_backend_buffer_t control_buffer = nullptr; ggml_context* control_ctx = nullptr; - std::vector controls; // (12 input block outputs, 1 middle block output) SD 1.5 - ggml_tensor* guided_hint = nullptr; // guided_hint cache, for faster inference - bool guided_hint_cached = false; + std::vector control_outputs_ggml; + ggml_tensor* guided_hint_output_ggml = nullptr; + std::vector> controls; + sd::Tensor guided_hint; + bool guided_hint_cached = false; ControlNet(ggml_backend_t backend, bool offload_params_to_cpu, @@ -335,16 +337,16 @@ struct ControlNet : public GGMLRunner { params.no_alloc = true; control_ctx = ggml_init(params); - controls.resize(outs.size() - 1); + control_outputs_ggml.resize(outs.size() - 1); size_t control_buffer_size = 0; - guided_hint = ggml_dup_tensor(control_ctx, outs[0]); - control_buffer_size += ggml_nbytes(guided_hint); + guided_hint_output_ggml = ggml_dup_tensor(control_ctx, outs[0]); + control_buffer_size += ggml_nbytes(guided_hint_output_ggml); for (int i = 0; i < outs.size() - 1; i++) { - controls[i] = ggml_dup_tensor(control_ctx, outs[i + 1]); - control_buffer_size += ggml_nbytes(controls[i]); + control_outputs_ggml[i] = ggml_dup_tensor(control_ctx, outs[i + 1]); + control_buffer_size += ggml_nbytes(control_outputs_ggml[i]); } control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend); @@ -361,8 +363,10 @@ struct ControlNet : public GGMLRunner { ggml_free(control_ctx); control_ctx = nullptr; } - guided_hint = nullptr; - guided_hint_cached = false; + 
guided_hint_output_ggml = nullptr; + guided_hint_cached = false; + guided_hint = {}; + control_outputs_ggml.clear(); controls.clear(); } @@ -374,29 +378,33 @@ struct ControlNet : public GGMLRunner { control_net.get_param_tensors(tensors, prefix); } - ggml_cgraph* build_graph(ggml_tensor* x, - ggml_tensor* hint, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* y = nullptr) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& hint_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor = {}, + const sd::Tensor& y_tensor = {}) { ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE); - x = to_backend(x); - if (guided_hint_cached) { - hint = nullptr; + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* hint = nullptr; + ggml_tensor* timesteps = make_input(timesteps_tensor); + ggml_tensor* context = make_optional_input(context_tensor); + ggml_tensor* y = make_optional_input(y_tensor); + + ggml_tensor* guided_hint_input = nullptr; + if (guided_hint_cached && !guided_hint.empty()) { + guided_hint_input = make_input(guided_hint); + hint = nullptr; } else { - hint = to_backend(hint); + hint = make_input(hint_tensor); } - context = to_backend(context); - y = to_backend(y); - timesteps = to_backend(timesteps); auto runner_ctx = get_context(); auto outs = control_net.forward(&runner_ctx, x, hint, - guided_hint_cached ? 
guided_hint : nullptr, + guided_hint_input, timesteps, context, y); @@ -405,22 +413,20 @@ struct ControlNet : public GGMLRunner { alloc_control_ctx(outs); } - ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint)); + ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint_output_ggml)); for (int i = 0; i < outs.size() - 1; i++) { - ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], controls[i])); + ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], control_outputs_ggml[i])); } return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* hint, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* y, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) { + std::optional>> compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& hint, + const sd::Tensor& timesteps, + const sd::Tensor& context = {}, + const sd::Tensor& y = {}) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size] @@ -429,12 +435,24 @@ struct ControlNet : public GGMLRunner { return build_graph(x, hint, timesteps, context, y); }; - bool res = GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); - if (res) { - // cache guided_hint - guided_hint_cached = true; + auto compute_result = GGMLRunner::compute(get_graph, n_threads, false); + if (!compute_result.has_value()) { + return std::nullopt; } - return res; + + if (guided_hint_output_ggml != nullptr) { + guided_hint = restore_trailing_singleton_dims(sd::make_sd_tensor_from_ggml(guided_hint_output_ggml), + 4); + } + controls.clear(); + controls.reserve(control_outputs_ggml.size()); + for (ggml_tensor* control : control_outputs_ggml) { + auto control_host = restore_trailing_singleton_dims(sd::make_sd_tensor_from_ggml(control), 4); + GGML_ASSERT(!control_host.empty()); + controls.push_back(std::move(control_host)); + } + guided_hint_cached 
= true; + return controls; } bool load_from_file(const std::string& file_path, int n_threads) { @@ -462,4 +480,4 @@ struct ControlNet : public GGMLRunner { } }; -#endif // __CONTROL_HPP__ \ No newline at end of file +#endif // __CONTROL_HPP__ diff --git a/src/denoiser.hpp b/src/denoiser.hpp index b92ca4e..077a1b7 100644 --- a/src/denoiser.hpp +++ b/src/denoiser.hpp @@ -5,6 +5,7 @@ #include "ggml_extend.hpp" #include "gits_noise.inl" +#include "tensor.hpp" /*================================================= CompVisDenoiser ==================================================*/ @@ -73,9 +74,9 @@ constexpr double interp(double left, double right, double perc) noexcept { /* This will make the assumption that the reference x and y values are * already sorted in ascending order because they are being generated as * such in the calling function */ -std::vector linear_interp(std::vector new_x, - const std::vector ref_x, - const std::vector ref_y) { +inline std::vector linear_interp(std::vector new_x, + const std::vector ref_x, + const std::vector ref_y) { const size_t len_x = new_x.size(); size_t i = 0; size_t j = 0; @@ -109,7 +110,7 @@ std::vector linear_interp(std::vector new_x, return new_y; } -std::vector linear_space(const float start, const float end, const size_t num_points) { +inline std::vector linear_space(const float start, const float end, const size_t num_points) { std::vector result(num_points); const float inc = (end - start) / (static_cast(num_points - 1)); @@ -124,8 +125,8 @@ std::vector linear_space(const float start, const float end, const size_t return result; } -std::vector log_linear_interpolation(std::vector sigma_in, - const size_t new_len) { +inline std::vector log_linear_interpolation(std::vector sigma_in, + const size_t new_len) { const size_t s_len = sigma_in.size(); std::vector x_vals = linear_space(0.f, 1.f, s_len); std::vector y_vals(s_len); @@ -478,13 +479,16 @@ struct KLOptimalScheduler : SigmaScheduler { }; struct Denoiser { - virtual float 
sigma_min() = 0; - virtual float sigma_max() = 0; - virtual float sigma_to_t(float sigma) = 0; - virtual float t_to_sigma(float t) = 0; - virtual std::vector get_scalings(float sigma) = 0; - virtual ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) = 0; - virtual ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) = 0; + virtual float sigma_min() = 0; + virtual float sigma_max() = 0; + virtual float sigma_to_t(float sigma) = 0; + virtual float t_to_sigma(float t) = 0; + virtual std::vector get_scalings(float sigma) = 0; + virtual sd::Tensor noise_scaling(float sigma, + const sd::Tensor& noise, + const sd::Tensor& latent) = 0; + virtual sd::Tensor inverse_noise_scaling(float sigma, + const sd::Tensor& latent) = 0; virtual std::vector get_sigmas(uint32_t n, int /*image_seq_len*/, scheduler_t scheduler_type, SDVersion version) { auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1); @@ -598,14 +602,15 @@ struct CompVisDenoiser : public Denoiser { return {c_skip, c_out, c_in}; } - // this function will modify noise/latent - ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override { - ggml_ext_tensor_scale_inplace(noise, sigma); - ggml_ext_tensor_add_inplace(latent, noise); - return latent; + virtual sd::Tensor noise_scaling(float sigma, + const sd::Tensor& noise, + const sd::Tensor& latent) override { + GGML_ASSERT(noise.numel() == latent.numel()); + return latent + noise * sigma; } - ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override { + sd::Tensor inverse_noise_scaling(float sigma, const sd::Tensor& latent) override { + SD_UNUSED(sigma); return latent; } }; @@ -644,7 +649,7 @@ struct EDMVDenoiser : public CompVisVDenoiser { } }; -float time_snr_shift(float alpha, float t) { +inline float time_snr_shift(float alpha, float t) { if (alpha == 1.0f) { return t; } @@ -696,21 +701,18 @@ struct DiscreteFlowDenoiser : public Denoiser { 
return {c_skip, c_out, c_in}; } - // this function will modify noise/latent - ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override { - ggml_ext_tensor_scale_inplace(noise, sigma); - ggml_ext_tensor_scale_inplace(latent, 1.0f - sigma); - ggml_ext_tensor_add_inplace(latent, noise); - return latent; + sd::Tensor noise_scaling(float sigma, + const sd::Tensor& noise, + const sd::Tensor& latent) override { + GGML_ASSERT(noise.numel() == latent.numel()); + return latent * (1.0f - sigma) + noise * sigma; } - - ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override { - ggml_ext_tensor_scale_inplace(latent, 1.0f / (1.0f - sigma)); - return latent; + sd::Tensor inverse_noise_scaling(float sigma, const sd::Tensor& latent) override { + return latent * (1.0f / (1.0f - sigma)); } }; -float flux_time_shift(float mu, float sigma, float t) { +inline float flux_time_shift(float mu, float sigma, float t) { return ::expf(mu) / (::expf(mu) + ::powf((1.0f / t - 1.0f), sigma)); } @@ -759,938 +761,289 @@ struct Flux2FlowDenoiser : public FluxFlowDenoiser { } }; -typedef std::function denoise_cb_t; +typedef std::function(const sd::Tensor&, float, int)> denoise_cb_t; // k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t -static bool sample_k_diffusion(sample_method_t method, - denoise_cb_t model, - ggml_context* work_ctx, - ggml_tensor* x, - std::vector sigmas, - std::shared_ptr rng, - float eta) { +static sd::Tensor sample_k_diffusion(sample_method_t method, + denoise_cb_t model, + sd::Tensor x, + std::vector sigmas, + std::shared_ptr rng, + float eta) { size_t steps = sigmas.size() - 1; - // sample_euler_ancestral switch (method) { case EULER_A_SAMPLE_METHOD: { - ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - for (int i = 0; i < steps; i++) { - float sigma = sigmas[i]; - - // denoise - ggml_tensor* denoised = model(x, sigma, i + 1); - if (denoised == 
nullptr) { - return false; + float sigma = sigmas[i]; + auto denoised_opt = model(x, sigma, i + 1); + if (denoised_opt.empty()) { + return {}; } - - // d = (x - denoised) / sigma - { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - - for (int i = 0; i < ggml_nelements(d); i++) { - vec_d[i] = (vec_x[i] - vec_denoised[i]) / sigma; - } - } - - // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i]))); - float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); - - // Euler method - float dt = sigma_down - sigmas[i]; - // x = x + d * dt - { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - - for (int i = 0; i < ggml_nelements(x); i++) { - vec_x[i] = vec_x[i] + vec_d[i] * dt; - } - } - + sd::Tensor denoised = std::move(denoised_opt); + sd::Tensor d = (x - denoised) / sigma; + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i]))); + float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); + float dt = sigma_down - sigmas[i]; + x += d * dt; if (sigmas[i + 1] > 0) { - // x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up - ggml_ext_im_set_randn_f32(noise, rng); - // noise = load_tensor_from_file(work_ctx, "./rand" + std::to_string(i+1) + ".bin"); - { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; - - for (int i = 0; i < ggml_nelements(x); i++) { - vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; - } - } + x += sd::Tensor::randn_like(x, rng) * sigma_up; } } - } break; - case EULER_SAMPLE_METHOD: // Implemented without any sigma churn - { - ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - + return x; + } + case EULER_SAMPLE_METHOD: { for (int i = 
0; i < steps; i++) { - float sigma = sigmas[i]; - - // denoise - ggml_tensor* denoised = model(x, sigma, i + 1); - if (denoised == nullptr) { - return false; - } - - // d = (x - denoised) / sigma - { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - - for (int j = 0; j < ggml_nelements(d); j++) { - vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma; - } - } - - float dt = sigmas[i + 1] - sigma; - // x = x + d * dt - { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_x[j] + vec_d[j] * dt; - } + float sigma = sigmas[i]; + auto denoised_opt = model(x, sigma, i + 1); + if (denoised_opt.empty()) { + return {}; } + sd::Tensor denoised = std::move(denoised_opt); + sd::Tensor d = (x - denoised) / sigma; + float dt = sigmas[i + 1] - sigma; + x += d * dt; } - } break; + return x; + } case HEUN_SAMPLE_METHOD: { - ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); - for (int i = 0; i < steps; i++) { - // denoise - ggml_tensor* denoised = model(x, sigmas[i], -(i + 1)); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigmas[i], -(i + 1)); + if (denoised_opt.empty()) { + return {}; } - - // d = (x - denoised) / sigma - { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; - } - } - - float dt = sigmas[i + 1] - sigmas[i]; + sd::Tensor denoised = std::move(denoised_opt); + sd::Tensor d = (x - denoised) / sigmas[i]; + float dt = sigmas[i + 1] - sigmas[i]; if (sigmas[i + 1] == 0) { - // Euler step - // x = x + d * dt - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_x[j] + vec_d[j] * dt; - } + x += d * dt; } else 
{ - // Heun step - float* vec_d = (float*)d->data; - float* vec_d2 = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x2[j] = vec_x[j] + vec_d[j] * dt; - } - - ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1); - if (denoised == nullptr) { - return false; - } - float* vec_denoised = (float*)denoised->data; - for (int j = 0; j < ggml_nelements(x); j++) { - float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1]; - vec_d[j] = (vec_d[j] + d2) / 2; - vec_x[j] = vec_x[j] + vec_d[j] * dt; + sd::Tensor x2 = x + d * dt; + auto denoised2_opt = model(x2, sigmas[i + 1], i + 1); + if (denoised2_opt.empty()) { + return {}; } + sd::Tensor denoised2 = std::move(denoised2_opt); + d = (d + (x2 - denoised2) / sigmas[i + 1]) / 2.0f; + x += d * dt; } } - } break; + return x; + } case DPM2_SAMPLE_METHOD: { - ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); - for (int i = 0; i < steps; i++) { - // denoise - ggml_tensor* denoised = model(x, sigmas[i], -(i + 1)); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigmas[i], -(i + 1)); + if (denoised_opt.empty()) { + return {}; } - - // d = (x - denoised) / sigma - { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; - } - } - + sd::Tensor denoised = std::move(denoised_opt); + sd::Tensor d = (x - denoised) / sigmas[i]; if (sigmas[i + 1] == 0) { - // Euler step - // x = x + d * dt - float dt = sigmas[i + 1] - sigmas[i]; - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_x[j] + vec_d[j] * dt; - } + float dt = sigmas[i + 1] - sigmas[i]; + x += d * dt; } else { - // DPM-Solver-2 - float sigma_mid = exp(0.5f * 
(log(sigmas[i]) + log(sigmas[i + 1]))); - float dt_1 = sigma_mid - sigmas[i]; - float dt_2 = sigmas[i + 1] - sigmas[i]; - - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x2[j] = vec_x[j] + vec_d[j] * dt_1; - } - - ggml_tensor* denoised = model(x2, sigma_mid, i + 1); - if (denoised == nullptr) { - return false; - } - float* vec_denoised = (float*)denoised->data; - for (int j = 0; j < ggml_nelements(x); j++) { - float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid; - vec_x[j] = vec_x[j] + d2 * dt_2; + float sigma_mid = exp(0.5f * (log(sigmas[i]) + log(sigmas[i + 1]))); + float dt_1 = sigma_mid - sigmas[i]; + float dt_2 = sigmas[i + 1] - sigmas[i]; + sd::Tensor x2 = x + d * dt_1; + auto denoised2_opt = model(x2, sigma_mid, i + 1); + if (denoised2_opt.empty()) { + return {}; } + sd::Tensor denoised2 = std::move(denoised2_opt); + x += ((x2 - denoised2) / sigma_mid) * dt_2; } } - - } break; + return x; + } case DPMPP2S_A_SAMPLE_METHOD: { - ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); - for (int i = 0; i < steps; i++) { - // denoise - ggml_tensor* denoised = model(x, sigmas[i], -(i + 1)); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigmas[i], -(i + 1)); + if (denoised_opt.empty()) { + return {}; } - - // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i]))); - float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); - auto t_fn = [](float sigma) -> float { return -log(sigma); }; - auto sigma_fn = [](float t) -> float { return exp(-t); }; + sd::Tensor denoised = std::move(denoised_opt); + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * 
sigmas[i + 1]) / (sigmas[i] * sigmas[i]))); + float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); + auto t_fn = [](float sigma) -> float { return -log(sigma); }; + auto sigma_fn = [](float t) -> float { return exp(-t); }; if (sigma_down == 0) { - // d = (x - denoised) / sigmas[i]; - // dt = sigma_down - sigmas[i]; - // x += d * dt; - // => x = denoised - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_denoised[j]; - } + x = denoised; } else { - // DPM-Solver++(2S) - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigma_down); - float h = t_next - t; - float s = t + 0.5f * h; - - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; - float* vec_denoised = (float*)denoised->data; - - // First half-step - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x2[j] = (sigma_fn(s) / sigma_fn(t)) * vec_x[j] - (exp(-h * 0.5f) - 1) * vec_denoised[j]; - } - - ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1); - if (denoised == nullptr) { - return false; - } - - // Second half-step - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = (sigma_fn(t_next) / sigma_fn(t)) * vec_x[j] - (exp(-h) - 1) * vec_denoised[j]; + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigma_down); + float h = t_next - t; + float s = t + 0.5f * h; + sd::Tensor x2 = (sigma_fn(s) / sigma_fn(t)) * x - (exp(-h * 0.5f) - 1) * denoised; + auto denoised2_opt = model(x2, sigmas[i + 1], i + 1); + if (denoised2_opt.empty()) { + return {}; } + sd::Tensor denoised2 = std::move(denoised2_opt); + x = (sigma_fn(t_next) / sigma_fn(t)) * (x) - (exp(-h) - 1) * denoised2; } - // Noise addition if (sigmas[i + 1] > 0) { - ggml_ext_im_set_randn_f32(noise, rng); - { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; - - for (int i = 0; i < ggml_nelements(x); i++) { - vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; - } - } + x += 
sd::Tensor::randn_like(x, rng) * sigma_up; } } - } break; - case DPMPP2M_SAMPLE_METHOD: // DPM++ (2M) from Karras et al (2022) - { - ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); - - auto t_fn = [](float sigma) -> float { return -log(sigma); }; - + return x; + } + case DPMPP2M_SAMPLE_METHOD: { + sd::Tensor old_denoised = x; + auto t_fn = [](float sigma) -> float { return -log(sigma); }; for (int i = 0; i < steps; i++) { - // denoise - ggml_tensor* denoised = model(x, sigmas[i], i + 1); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigmas[i], i + 1); + if (denoised_opt.empty()) { + return {}; } - - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float b = exp(-h) - 1.f; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - float* vec_old_denoised = (float*)old_denoised->data; + sd::Tensor denoised = std::move(denoised_opt); + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float b = exp(-h) - 1.f; if (i == 0 || sigmas[i + 1] == 0) { - // Simpler step for the edge cases - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = a * vec_x[j] - b * vec_denoised[j]; - } + x = a * (x)-b * denoised; } else { - float h_last = t - t_fn(sigmas[i - 1]); - float r = h_last / h; - for (int j = 0; j < ggml_nelements(x); j++) { - float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; - vec_x[j] = a * vec_x[j] - b * denoised_d; - } - } - - // old_denoised = denoised - for (int j = 0; j < ggml_nelements(x); j++) { - vec_old_denoised[j] = vec_denoised[j]; + float h_last = t - t_fn(sigmas[i - 1]); + float r = h_last / h; + sd::Tensor denoised_d = (1.f + 1.f / (2.f * r)) * denoised - (1.f / (2.f * r)) * old_denoised; + x = a * (x)-b * denoised_d; } + old_denoised = denoised; } - } break; - case 
DPMPP2Mv2_SAMPLE_METHOD: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457 - { - ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); - - auto t_fn = [](float sigma) -> float { return -log(sigma); }; - + return x; + } + case DPMPP2Mv2_SAMPLE_METHOD: { + sd::Tensor old_denoised = x; + auto t_fn = [](float sigma) -> float { return -log(sigma); }; for (int i = 0; i < steps; i++) { - // denoise - ggml_tensor* denoised = model(x, sigmas[i], i + 1); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigmas[i], i + 1); + if (denoised_opt.empty()) { + return {}; } - - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - float* vec_old_denoised = (float*)old_denoised->data; - + sd::Tensor denoised = std::move(denoised_opt); + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; if (i == 0 || sigmas[i + 1] == 0) { - // Simpler step for the edge cases float b = exp(-h) - 1.f; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = a * vec_x[j] - b * vec_denoised[j]; - } + x = a * (x)-b * denoised; } else { - float h_last = t - t_fn(sigmas[i - 1]); - float h_min = std::min(h_last, h); - float h_max = std::max(h_last, h); - float r = h_max / h_min; - float h_d = (h_max + h_min) / 2.f; - float b = exp(-h_d) - 1.f; - for (int j = 0; j < ggml_nelements(x); j++) { - float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; - vec_x[j] = a * vec_x[j] - b * denoised_d; - } + float h_last = t - t_fn(sigmas[i - 1]); + float h_min = std::min(h_last, h); + float h_max = std::max(h_last, h); + float r = h_max / h_min; + float h_d = (h_max + h_min) / 2.f; + float b = exp(-h_d) - 1.f; + sd::Tensor denoised_d = (1.f + 1.f / (2.f * 
r)) * denoised - (1.f / (2.f * r)) * old_denoised; + x = a * (x)-b * denoised_d; } + old_denoised = denoised; + } + return x; + } + case LCM_SAMPLE_METHOD: { + for (int i = 0; i < steps; i++) { + auto denoised_opt = model(x, sigmas[i], i + 1); + if (denoised_opt.empty()) { + return {}; + } + sd::Tensor denoised = std::move(denoised_opt); - // old_denoised = denoised - for (int j = 0; j < ggml_nelements(x); j++) { - vec_old_denoised[j] = vec_denoised[j]; + x = denoised; + if (sigmas[i + 1] > 0) { + x += sd::Tensor::randn_like(x, rng) * sigmas[i + 1]; } } - } break; - case IPNDM_SAMPLE_METHOD: // iPNDM sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main - { - int max_order = 4; - ggml_tensor* x_next = x; - std::vector buffer_model; - + return x; + } + case IPNDM_SAMPLE_METHOD: { + int max_order = 4; + std::vector> hist = {}; for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; float sigma_next = sigmas[i + 1]; - ggml_tensor* x_cur = x_next; - float* vec_x_cur = (float*)x_cur->data; - float* vec_x_next = (float*)x_next->data; - - // Denoising step - ggml_tensor* denoised = model(x_cur, sigma, i + 1); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigma, i + 1); + if (denoised_opt.empty()) { + return {}; } - float* vec_denoised = (float*)denoised->data; - // d_cur = (x_cur - denoised) / sigma - ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x_cur); - float* vec_d_cur = (float*)d_cur->data; + sd::Tensor denoised = std::move(denoised_opt); - for (int j = 0; j < ggml_nelements(d_cur); j++) { - vec_d_cur[j] = (vec_x_cur[j] - vec_denoised[j]) / sigma; - } + sd::Tensor d_cur = (x - denoised) / sigma; + int order = std::min(max_order, i + 1); + float dt = sigma_next - sigma; - int order = std::min(max_order, i + 1); - - // Calculate vec_x_next based on the order switch (order) { - case 1: // First Euler step - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) 
* vec_d_cur[j]; - } + case 1: + x += d_cur * dt; + break; + case 2: + x += ((3.f * d_cur - hist.back()) / 2.f) * dt; + break; + case 3: + x += ((23.f * d_cur - 16.f * hist[hist.size() - 1] + 5.f * hist[hist.size() - 2]) / 12.f) * dt; + break; + case 4: + x += ((55.f * d_cur - 59.f * hist[hist.size() - 1] + 37.f * hist[hist.size() - 2] - 9.f * hist[hist.size() - 3]) / 24.f) * dt; break; - - case 2: // Use one history point - { - float* vec_d_prev1 = (float*)buffer_model.back()->data; - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * (3 * vec_d_cur[j] - vec_d_prev1[j]) / 2; - } - } break; - - case 3: // Use two history points - { - float* vec_d_prev1 = (float*)buffer_model.back()->data; - float* vec_d_prev2 = (float*)buffer_model[buffer_model.size() - 2]->data; - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * (23 * vec_d_cur[j] - 16 * vec_d_prev1[j] + 5 * vec_d_prev2[j]) / 12; - } - } break; - - case 4: // Use three history points - { - float* vec_d_prev1 = (float*)buffer_model.back()->data; - float* vec_d_prev2 = (float*)buffer_model[buffer_model.size() - 2]->data; - float* vec_d_prev3 = (float*)buffer_model[buffer_model.size() - 3]->data; - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * (55 * vec_d_cur[j] - 59 * vec_d_prev1[j] + 37 * vec_d_prev2[j] - 9 * vec_d_prev3[j]) / 24; - } - } break; } - // Manage buffer_model - if (buffer_model.size() == max_order - 1) { - // Shift elements to the left - for (int k = 0; k < max_order - 2; k++) { - buffer_model[k] = buffer_model[k + 1]; - } - buffer_model.back() = d_cur; // Replace the last element with d_cur - } else { - buffer_model.push_back(d_cur); + if (hist.size() == static_cast(max_order - 1)) { + hist.erase(hist.begin()); } + hist.push_back(std::move(d_cur)); } - } break; - case IPNDM_V_SAMPLE_METHOD: // iPNDM_v sampler from 
https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main - { - int max_order = 4; - std::vector buffer_model; - ggml_tensor* x_next = x; - + return x; + } + case IPNDM_V_SAMPLE_METHOD: { + int max_order = 4; + std::vector> hist = {}; for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; float t_next = sigmas[i + 1]; - // Denoising step - ggml_tensor* denoised = model(x, sigma, i + 1); - float* vec_denoised = (float*)denoised->data; - ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x); - float* vec_d_cur = (float*)d_cur->data; - float* vec_x = (float*)x->data; - - // d_cur = (x - denoised) / sigma - for (int j = 0; j < ggml_nelements(d_cur); j++) { - vec_d_cur[j] = (vec_x[j] - vec_denoised[j]) / sigma; + auto denoised_opt = model(x, sigma, i + 1); + if (denoised_opt.empty()) { + return {}; } + sd::Tensor denoised = std::move(denoised_opt); - int order = std::min(max_order, i + 1); - float h_n = t_next - sigma; - float h_n_1 = (i > 0) ? (sigma - sigmas[i - 1]) : h_n; + sd::Tensor d_cur = (x - denoised) / sigma; + int order = std::min(max_order, i + 1); + float h_n = t_next - sigma; + float h_n_1 = (i > 0) ? (sigma - sigmas[i - 1]) : h_n; switch (order) { - case 1: // First Euler step - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x[j] += vec_d_cur[j] * h_n; - } + case 1: + x += d_cur * h_n; break; - - case 2: { - float* vec_d_prev1 = (float*)buffer_model.back()->data; - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x[j] += h_n * ((2 + (h_n / h_n_1)) * vec_d_cur[j] - (h_n / h_n_1) * vec_d_prev1[j]) / 2; - } + case 2: + x += (((2.f + (h_n / h_n_1)) * d_cur - (h_n / h_n_1) * hist.back()) / 2.f) * h_n; break; - } - - case 3: { - float h_n_2 = (i > 1) ? (sigmas[i - 1] - sigmas[i - 2]) : h_n_1; - float* vec_d_prev1 = (float*)buffer_model.back()->data; - float* vec_d_prev2 = (buffer_model.size() > 1) ? 
(float*)buffer_model[buffer_model.size() - 2]->data : vec_d_prev1; - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x[j] += h_n * ((23 * vec_d_cur[j] - 16 * vec_d_prev1[j] + 5 * vec_d_prev2[j]) / 12); - } + case 3: + x += ((23.f * d_cur - 16.f * hist[hist.size() - 1] + 5.f * hist[hist.size() - 2]) / 12.f) * h_n; break; - } - - case 4: { - float h_n_2 = (i > 1) ? (sigmas[i - 1] - sigmas[i - 2]) : h_n_1; - float h_n_3 = (i > 2) ? (sigmas[i - 2] - sigmas[i - 3]) : h_n_2; - float* vec_d_prev1 = (float*)buffer_model.back()->data; - float* vec_d_prev2 = (buffer_model.size() > 1) ? (float*)buffer_model[buffer_model.size() - 2]->data : vec_d_prev1; - float* vec_d_prev3 = (buffer_model.size() > 2) ? (float*)buffer_model[buffer_model.size() - 3]->data : vec_d_prev2; - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x[j] += h_n * ((55 * vec_d_cur[j] - 59 * vec_d_prev1[j] + 37 * vec_d_prev2[j] - 9 * vec_d_prev3[j]) / 24); - } + case 4: + x += ((55.f * d_cur - 59.f * hist[hist.size() - 1] + 37.f * hist[hist.size() - 2] - 9.f * hist[hist.size() - 3]) / 24.f) * h_n; break; - } } - // Manage buffer_model - if (buffer_model.size() == max_order - 1) { - buffer_model.erase(buffer_model.begin()); + if (hist.size() == static_cast(max_order - 1)) { + hist.erase(hist.begin()); } - buffer_model.push_back(d_cur); - - // Prepare the next d tensor - d_cur = ggml_dup_tensor(work_ctx, x_next); + hist.push_back(std::move(d_cur)); } - } break; - case LCM_SAMPLE_METHOD: // Latent Consistency Models - { - ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - - for (int i = 0; i < steps; i++) { - float sigma = sigmas[i]; - - // denoise - ggml_tensor* denoised = model(x, sigma, i + 1); - if (denoised == nullptr) { - return false; - } - - // x = denoised - { - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_denoised[j]; - } - } - - if 
(sigmas[i + 1] > 0) { - // x += sigmas[i + 1] * noise_sampler(sigmas[i], sigmas[i + 1]) - ggml_ext_im_set_randn_f32(noise, rng); - // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin"); - { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_x[j] + sigmas[i + 1] * vec_noise[j]; - } - } - } - } - } break; - case DDIM_TRAILING_SAMPLE_METHOD: // Denoising Diffusion Implicit Models - // with the "trailing" timestep spacing - { - // See J. Song et al., "Denoising Diffusion Implicit - // Models", arXiv:2010.02502 [cs.LG] - // - // DDIM itself needs alphas_cumprod (DDPM, J. Ho et al., - // arXiv:2006.11239 [cs.LG] with k-diffusion's start and - // end beta) (which unfortunately k-diffusion's data - // structure hides from the denoiser), and the sigmas are - // also needed to invert the behavior of CompVisDenoiser - // (k-diffusion's LMSDiscreteSchedulerr) - float beta_start = 0.00085f; - float beta_end = 0.0120f; - std::vector alphas_cumprod; - std::vector compvis_sigmas; - - alphas_cumprod.reserve(TIMESTEPS); - compvis_sigmas.reserve(TIMESTEPS); - for (int i = 0; i < TIMESTEPS; i++) { - alphas_cumprod[i] = - (i == 0 ? 1.0f : alphas_cumprod[i - 1]) * - (1.0f - - std::pow(sqrtf(beta_start) + - (sqrtf(beta_end) - sqrtf(beta_start)) * - ((float)i / (TIMESTEPS - 1)), - 2)); - compvis_sigmas[i] = - std::sqrt((1 - alphas_cumprod[i]) / - alphas_cumprod[i]); - } - - ggml_tensor* pred_original_sample = - ggml_dup_tensor(work_ctx, x); - ggml_tensor* variance_noise = - ggml_dup_tensor(work_ctx, x); - - for (int i = 0; i < steps; i++) { - // The "trailing" DDIM timestep, see S. Lin et al., - // "Common Diffusion Noise Schedulers and Sample Steps - // are Flawed", arXiv:2305.08891 [cs], p. 4, Table - // 2. Most variables below follow Diffusers naming - // - // Diffuser naming vs. Song et al. (2010), p. 5, (12) - // and p. 
16, (16) ( -> ): - // - // - pred_noise_t -> epsilon_theta^(t)(x_t) - // - pred_original_sample -> f_theta^(t)(x_t) or x_0 - // - std_dev_t -> sigma_t (not the LMS sigma) - // - eta -> eta (set to 0 at the moment) - // - pred_sample_direction -> "direction pointing to - // x_t" - // - pred_prev_sample -> "x_t-1" - int timestep = static_cast(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1; - // 1. get previous step value (=t-1) - int prev_timestep = timestep - TIMESTEPS / static_cast(steps); - // The sigma here is chosen to cause the - // CompVisDenoiser to produce t = timestep - float sigma = static_cast(compvis_sigmas[timestep]); - if (i == 0) { - // The function add_noise intializes x to - // Diffusers' latents * sigma (as in Diffusers' - // pipeline) or sample * sigma (Diffusers' - // scheduler), where this sigma = init_noise_sigma - // in Diffusers. For DDPM and DDIM however, - // init_noise_sigma = 1. But the k-diffusion - // model() also evaluates F_theta(c_in(sigma) x; - // ...) instead of the bare U-net F_theta, with - // c_in = 1 / sqrt(sigma^2 + 1), as defined in - // T. Karras et al., "Elucidating the Design Space - // of Diffusion-Based Generative Models", - // arXiv:2206.00364 [cs.CV], p. 3, Table 1. Hence - // the first call has to be prescaled as x <- x / - // (c_in * sigma) with the k-diffusion pipeline - // and CompVisDenoiser. 
- float* vec_x = (float*)x->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] *= std::sqrt(sigma * sigma + 1) / - sigma; - } - } else { - // For the subsequent steps after the first one, - // at this point x = latents or x = sample, and - // needs to be prescaled with x <- sample / c_in - // to compensate for model() applying the scale - // c_in before the U-net F_theta - float* vec_x = (float*)x->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] *= std::sqrt(sigma * sigma + 1); - } - } - // Note (also noise_pred in Diffuser's pipeline) - // model_output = model() is the D(x, sigma) as - // defined in Karras et al. (2022), p. 3, Table 1 and - // p. 8 (7), compare also p. 38 (226) therein. - ggml_tensor* model_output = - model(x, sigma, i + 1); - // Here model_output is still the k-diffusion denoiser - // output, not the U-net output F_theta(c_in(sigma) x; - // ...) in Karras et al. (2022), whereas Diffusers' - // model_output is F_theta(...). Recover the actual - // model_output, which is also referred to as the - // "Karras ODE derivative" d or d_cur in several - // samplers above. - { - float* vec_x = (float*)x->data; - float* vec_model_output = - (float*)model_output->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_model_output[j] = - (vec_x[j] - vec_model_output[j]) * - (1 / sigma); - } - } - // 2. compute alphas, betas - float alpha_prod_t = static_cast(alphas_cumprod[timestep]); - // Note final_alpha_cumprod = alphas_cumprod[0] due to - // trailing timestep spacing - float alpha_prod_t_prev = static_cast(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]); - float beta_prod_t = 1 - alpha_prod_t; - // 3. 
compute predicted original sample from predicted - // noise also called "predicted x_0" of formula (12) - // from https://arxiv.org/pdf/2010.02502.pdf - { - float* vec_x = (float*)x->data; - float* vec_model_output = - (float*)model_output->data; - float* vec_pred_original_sample = - (float*)pred_original_sample->data; - // Note the substitution of latents or sample = x - // * c_in = x / sqrt(sigma^2 + 1) - for (int j = 0; j < ggml_nelements(x); j++) { - vec_pred_original_sample[j] = - (vec_x[j] / std::sqrt(sigma * sigma + 1) - - std::sqrt(beta_prod_t) * - vec_model_output[j]) * - (1 / std::sqrt(alpha_prod_t)); - } - } - // Assuming the "epsilon" prediction type, where below - // pred_epsilon = model_output is inserted, and is not - // defined/copied explicitly. - // - // 5. compute variance: "sigma_t(eta)" -> see formula - // (16) - // - // sigma_t = sqrt((1 - alpha_t-1)/(1 - alpha_t)) * - // sqrt(1 - alpha_t/alpha_t-1) - float beta_prod_t_prev = 1 - alpha_prod_t_prev; - float variance = (beta_prod_t_prev / beta_prod_t) * - (1 - alpha_prod_t / alpha_prod_t_prev); - float std_dev_t = eta * std::sqrt(variance); - // 6. compute "direction pointing to x_t" of formula - // (12) from https://arxiv.org/pdf/2010.02502.pdf - // 7. 
compute x_t without "random noise" of formula - // (12) from https://arxiv.org/pdf/2010.02502.pdf - { - float* vec_model_output = (float*)model_output->data; - float* vec_pred_original_sample = - (float*)pred_original_sample->data; - float* vec_x = (float*)x->data; - for (int j = 0; j < ggml_nelements(x); j++) { - // Two step inner loop without an explicit - // tensor - float pred_sample_direction = - ::sqrtf(1 - alpha_prod_t_prev - - ::powf(std_dev_t, 2)) * - vec_model_output[j]; - vec_x[j] = std::sqrt(alpha_prod_t_prev) * - vec_pred_original_sample[j] + - pred_sample_direction; - } - } - if (eta > 0) { - ggml_ext_im_set_randn_f32(variance_noise, rng); - float* vec_variance_noise = - (float*)variance_noise->data; - float* vec_x = (float*)x->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] += std_dev_t * vec_variance_noise[j]; - } - } - // See the note above: x = latents or sample here, and - // is not scaled by the c_in. For the final output - // this is correct, but for subsequent iterations, x - // needs to be prescaled again, since k-diffusion's - // model() differes from the bare U-net F_theta by the - // factor c_in. - } - } break; - case TCD_SAMPLE_METHOD: // Strategic Stochastic Sampling (Algorithm 4) in - // Trajectory Consistency Distillation - { - // See J. Zheng et al., "Trajectory Consistency - // Distillation: Improved Latent Consistency Distillation - // by Semi-Linear Consistency Function with Trajectory - // Mapping", arXiv:2402.19159 [cs.CV] - float beta_start = 0.00085f; - float beta_end = 0.0120f; - std::vector alphas_cumprod; - std::vector compvis_sigmas; - - alphas_cumprod.reserve(TIMESTEPS); - compvis_sigmas.reserve(TIMESTEPS); - for (int i = 0; i < TIMESTEPS; i++) { - alphas_cumprod[i] = - (i == 0 ? 
1.0f : alphas_cumprod[i - 1]) * - (1.0f - - std::pow(sqrtf(beta_start) + - (sqrtf(beta_end) - sqrtf(beta_start)) * - ((float)i / (TIMESTEPS - 1)), - 2)); - compvis_sigmas[i] = - std::sqrt((1 - alphas_cumprod[i]) / - alphas_cumprod[i]); - } - int original_steps = 50; - - ggml_tensor* pred_original_sample = - ggml_dup_tensor(work_ctx, x); - ggml_tensor* noise = - ggml_dup_tensor(work_ctx, x); - - for (int i = 0; i < steps; i++) { - // Analytic form for TCD timesteps - int timestep = TIMESTEPS - 1 - - (TIMESTEPS / original_steps) * - (int)floor(i * ((float)original_steps / steps)); - // 1. get previous step value - int prev_timestep = i >= steps - 1 ? 0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps)); - // Here timestep_s is tau_n' in Algorithm 4. The _s - // notation appears to be that from C. Lu, - // "DPM-Solver: A Fast ODE Solver for Diffusion - // Probabilistic Model Sampling in Around 10 Steps", - // arXiv:2206.00927 [cs.LG], but this notation is not - // continued in Algorithm 4, where _n' is used. - int timestep_s = - (int)floor((1 - eta) * prev_timestep); - // Begin k-diffusion specific workaround for - // evaluating F_theta(x; ...) from D(x, sigma), same - // as in DDIM (and see there for detailed comments) - float sigma = static_cast(compvis_sigmas[timestep]); - if (i == 0) { - float* vec_x = (float*)x->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] *= std::sqrt(sigma * sigma + 1) / - sigma; - } - } else { - float* vec_x = (float*)x->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] *= std::sqrt(sigma * sigma + 1); - } - } - ggml_tensor* model_output = - model(x, sigma, i + 1); - { - float* vec_x = (float*)x->data; - float* vec_model_output = - (float*)model_output->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_model_output[j] = - (vec_x[j] - vec_model_output[j]) * - (1 / sigma); - } - } - // 2. 
compute alphas, betas - // - // When comparing TCD with DDPM/DDIM note that Zheng - // et al. (2024) follows the DPM-Solver notation for - // alpha. One can find the following comment in the - // original DPM-Solver code - // (https://github.com/LuChengTHU/dpm-solver/): - // "**Important**: Please pay special attention for - // the args for `alphas_cumprod`: The `alphas_cumprod` - // is the \hat{alpha_n} arrays in the notations of - // DDPM. [...] Therefore, the notation \hat{alpha_n} - // is different from the notation alpha_t in - // DPM-Solver. In fact, we have alpha_{t_n} = - // \sqrt{\hat{alpha_n}}, [...]" - float alpha_prod_t = static_cast(alphas_cumprod[timestep]); - float beta_prod_t = 1 - alpha_prod_t; - // Note final_alpha_cumprod = alphas_cumprod[0] since - // TCD is always "trailing" - float alpha_prod_t_prev = static_cast(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]); - // The subscript _s are the only portion in this - // section (2) unique to TCD - float alpha_prod_s = static_cast(alphas_cumprod[timestep_s]); - float beta_prod_s = 1 - alpha_prod_s; - // 3. Compute the predicted noised sample x_s based on - // the model parameterization - // - // This section is also exactly the same as DDIM - { - float* vec_x = (float*)x->data; - float* vec_model_output = - (float*)model_output->data; - float* vec_pred_original_sample = - (float*)pred_original_sample->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_pred_original_sample[j] = - (vec_x[j] / std::sqrt(sigma * sigma + 1) - - std::sqrt(beta_prod_t) * - vec_model_output[j]) * - (1 / std::sqrt(alpha_prod_t)); - } - } - // This consistency function step can be difficult to - // decipher from Algorithm 4, as it is simply stated - // using a consistency function. This step is the - // modified DDIM, i.e. p. 8 (32) in Zheng et - // al. (2024), with eta set to 0 (see the paragraph - // immediately thereafter that states this somewhat - // obliquely). 
- { - float* vec_pred_original_sample = - (float*)pred_original_sample->data; - float* vec_model_output = - (float*)model_output->data; - float* vec_x = (float*)x->data; - for (int j = 0; j < ggml_nelements(x); j++) { - // Substituting x = pred_noised_sample and - // pred_epsilon = model_output - vec_x[j] = - std::sqrt(alpha_prod_s) * - vec_pred_original_sample[j] + - std::sqrt(beta_prod_s) * - vec_model_output[j]; - } - } - // 4. Sample and inject noise z ~ N(0, I) for - // MultiStep Inference Noise is not used on the final - // timestep of the timestep schedule. This also means - // that noise is not used for one-step sampling. Eta - // (referred to as "gamma" in the paper) was - // introduced to control the stochasticity in every - // step. When eta = 0, it represents deterministic - // sampling, whereas eta = 1 indicates full stochastic - // sampling. - if (eta > 0 && i != steps - 1) { - // In this case, x is still pred_noised_sample, - // continue in-place - ggml_ext_im_set_randn_f32(noise, rng); - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; - for (int j = 0; j < ggml_nelements(x); j++) { - // Corresponding to (35) in Zheng et - // al. 
(2024), substituting x = - // pred_noised_sample - vec_x[j] = - std::sqrt(alpha_prod_t_prev / - alpha_prod_s) * - vec_x[j] + - std::sqrt(1 - alpha_prod_t_prev / - alpha_prod_s) * - vec_noise[j]; - } - } - } - } break; - case RES_MULTISTEP_SAMPLE_METHOD: // Res Multistep sampler - { - ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); + return x; + } + case RES_MULTISTEP_SAMPLE_METHOD: { + sd::Tensor old_denoised = x; bool have_old_sigma = false; float old_sigma_down = 0.0f; @@ -1712,10 +1065,11 @@ static bool sample_k_diffusion(sample_method_t method, }; for (int i = 0; i < steps; i++) { - ggml_tensor* denoised = model(x, sigmas[i], i + 1); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigmas[i], i + 1); + if (denoised_opt.empty()) { + return {}; } + sd::Tensor denoised = std::move(denoised_opt); float sigma_from = sigmas[i]; float sigma_to = sigmas[i + 1]; @@ -1737,14 +1091,7 @@ static bool sample_k_diffusion(sample_method_t method, } if (sigma_down == 0.0f || !have_old_sigma) { - float dt = sigma_down - sigma_from; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - float d = (vec_x[j] - vec_denoised[j]) / sigma_from; - vec_x[j] = vec_x[j] + d * dt; - } + x += ((x - denoised) / sigma_from) * (sigma_down - sigma_from); } else { float t = t_fn(sigma_from); float t_old = t_fn(old_sigma_down); @@ -1765,42 +1112,20 @@ static bool sample_k_diffusion(sample_method_t method, b2 = 0.0f; } - float sigma_h = sigma_fn(h); - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - float* vec_old_denoised = (float*)old_denoised->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = sigma_h * vec_x[j] + h * (b1 * vec_denoised[j] + b2 * vec_old_denoised[j]); - } + x = sigma_fn(h) * (x) + h * (b1 * denoised + b2 * old_denoised); } if (sigmas[i + 1] > 0 && sigma_up > 0.0f) 
{ - ggml_ext_im_set_randn_f32(noise, rng); - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_x[j] + vec_noise[j] * sigma_up; - } - } - - float* vec_old_denoised = (float*)old_denoised->data; - float* vec_denoised = (float*)denoised->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_old_denoised[j] = vec_denoised[j]; + x += sd::Tensor::randn_like(x, rng) * sigma_up; } + old_denoised = denoised; old_sigma_down = sigma_down; have_old_sigma = true; } - } break; - case RES_2S_SAMPLE_METHOD: // Res 2s sampler - { - ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - ggml_tensor* x0 = ggml_dup_tensor(work_ctx, x); - ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); - + return x; + } + case RES_2S_SAMPLE_METHOD: { const float c2 = 0.5f; auto t_fn = [](float sigma) -> float { return -logf(sigma); }; auto phi1_fn = [](float t) -> float { @@ -1821,10 +1146,11 @@ static bool sample_k_diffusion(sample_method_t method, float sigma_from = sigmas[i]; float sigma_to = sigmas[i + 1]; - ggml_tensor* denoised = model(x, sigma_from, -(i + 1)); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigma_from, -(i + 1)); + if (denoised_opt.empty()) { + return {}; } + sd::Tensor denoised = std::move(denoised_opt); float sigma_up = 0.0f; float sigma_down = sigma_to; @@ -1842,17 +1168,9 @@ static bool sample_k_diffusion(sample_method_t method, sigma_down = sigma_down_sq > 0.0f ? 
std::sqrt(sigma_down_sq) : 0.0f; } - float* vec_x = (float*)x->data; - float* vec_x0 = (float*)x0->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x0[j] = vec_x[j]; - } - + sd::Tensor x0 = x; if (sigma_down == 0.0f || sigma_from == 0.0f) { - float* vec_denoised = (float*)denoised->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_denoised[j]; - } + x = denoised; } else { float t = t_fn(sigma_from); float t_next = t_fn(sigma_down); @@ -1864,45 +1182,140 @@ static bool sample_k_diffusion(sample_method_t method, float b2 = phi2_val / c2; float b1 = phi1_val - b2; - float sigma_c2 = expf(-(t + h * c2)); + float sigma_c2 = expf(-(t + h * c2)); + sd::Tensor eps1 = denoised - x0; + sd::Tensor x2 = x0 + eps1 * (h * a21); - float* vec_denoised = (float*)denoised->data; - float* vec_x2 = (float*)x2->data; - for (int j = 0; j < ggml_nelements(x); j++) { - float eps1 = vec_denoised[j] - vec_x0[j]; - vec_x2[j] = vec_x0[j] + h * a21 * eps1; - } - - ggml_tensor* denoised2 = model(x2, sigma_c2, i + 1); - if (denoised2 == nullptr) { - return false; - } - float* vec_denoised2 = (float*)denoised2->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - float eps1 = vec_denoised[j] - vec_x0[j]; - float eps2 = vec_denoised2[j] - vec_x0[j]; - vec_x[j] = vec_x0[j] + h * (b1 * eps1 + b2 * eps2); + auto denoised2_opt = model(x2, sigma_c2, i + 1); + if (denoised2_opt.empty()) { + return {}; } + sd::Tensor denoised2 = std::move(denoised2_opt); + sd::Tensor eps2 = denoised2 - x0; + x = x0 + h * (b1 * eps1 + b2 * eps2); } if (sigmas[i + 1] > 0 && sigma_up > 0.0f) { - ggml_ext_im_set_randn_f32(noise, rng); - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_x[j] + vec_noise[j] * sigma_up; - } + x += sd::Tensor::randn_like(x, rng) * sigma_up; } } - } break; + return x; + } + case DDIM_TRAILING_SAMPLE_METHOD: { + float beta_start = 0.00085f; + float beta_end = 0.0120f; + 
std::vector alphas_cumprod(TIMESTEPS); + std::vector compvis_sigmas(TIMESTEPS); + for (int i = 0; i < TIMESTEPS; i++) { + alphas_cumprod[i] = + (i == 0 ? 1.0f : alphas_cumprod[i - 1]) * + (1.0f - + std::pow(sqrtf(beta_start) + + (sqrtf(beta_end) - sqrtf(beta_start)) * + ((float)i / (TIMESTEPS - 1)), + 2)); + compvis_sigmas[i] = + std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]); + } + for (int i = 0; i < steps; i++) { + int timestep = static_cast(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1; + int prev_timestep = timestep - TIMESTEPS / static_cast(steps); + float sigma = static_cast(compvis_sigmas[timestep]); + if (i == 0) { + x *= std::sqrt(sigma * sigma + 1) / sigma; + } else { + x *= std::sqrt(sigma * sigma + 1); + } + + auto model_output_opt = model(x, sigma, i + 1); + if (model_output_opt.empty()) { + return {}; + } + sd::Tensor model_output = std::move(model_output_opt); + model_output = (x - model_output) * (1.0f / sigma); + + float alpha_prod_t = static_cast(alphas_cumprod[timestep]); + float alpha_prod_t_prev = static_cast(prev_timestep >= 0 ? 
alphas_cumprod[prev_timestep] : alphas_cumprod[0]); + float beta_prod_t = 1.0f - alpha_prod_t; + + sd::Tensor pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) - + std::sqrt(beta_prod_t) * model_output) * + (1.0f / std::sqrt(alpha_prod_t)); + + float beta_prod_t_prev = 1.0f - alpha_prod_t_prev; + float variance = (beta_prod_t_prev / beta_prod_t) * + (1.0f - alpha_prod_t / alpha_prod_t_prev); + float std_dev_t = eta * std::sqrt(variance); + + x = std::sqrt(alpha_prod_t_prev) * pred_original_sample + + std::sqrt(1.0f - alpha_prod_t_prev - std::pow(std_dev_t, 2)) * model_output; + + if (eta > 0) { + x += std_dev_t * sd::Tensor::randn_like(x, rng); + } + } + return x; + } + case TCD_SAMPLE_METHOD: { + float beta_start = 0.00085f; + float beta_end = 0.0120f; + std::vector alphas_cumprod(TIMESTEPS); + std::vector compvis_sigmas(TIMESTEPS); + for (int i = 0; i < TIMESTEPS; i++) { + alphas_cumprod[i] = + (i == 0 ? 1.0f : alphas_cumprod[i - 1]) * + (1.0f - + std::pow(sqrtf(beta_start) + + (sqrtf(beta_end) - sqrtf(beta_start)) * + ((float)i / (TIMESTEPS - 1)), + 2)); + compvis_sigmas[i] = + std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]); + } + int original_steps = 50; + for (int i = 0; i < steps; i++) { + int timestep = TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor(i * ((float)original_steps / steps)); + int prev_timestep = i >= steps - 1 ? 
0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps)); + int timestep_s = (int)floor((1 - eta) * prev_timestep); + float sigma = static_cast(compvis_sigmas[timestep]); + + if (i == 0) { + x *= std::sqrt(sigma * sigma + 1) / sigma; + } else { + x *= std::sqrt(sigma * sigma + 1); + } + + auto model_output_opt = model(x, sigma, i + 1); + if (model_output_opt.empty()) { + return {}; + } + sd::Tensor model_output = std::move(model_output_opt); + model_output = (x - model_output) * (1.0f / sigma); + + float alpha_prod_t = static_cast(alphas_cumprod[timestep]); + float beta_prod_t = 1.0f - alpha_prod_t; + float alpha_prod_t_prev = static_cast(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]); + float alpha_prod_s = static_cast(alphas_cumprod[timestep_s]); + float beta_prod_s = 1.0f - alpha_prod_s; + + sd::Tensor pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) - + std::sqrt(beta_prod_t) * model_output) * + (1.0f / std::sqrt(alpha_prod_t)); + + x = std::sqrt(alpha_prod_s) * pred_original_sample + + std::sqrt(beta_prod_s) * model_output; + + if (eta > 0 && i != steps - 1) { + x = std::sqrt(alpha_prod_t_prev / alpha_prod_s) * (x) + + std::sqrt(1.0f - alpha_prod_t_prev / alpha_prod_s) * sd::Tensor::randn_like(x, rng); + } + } + return x; + } default: - LOG_ERROR("Attempting to sample with nonexisting sample method %i", method); - return false; + return {}; } - return true; } #endif // __DENOISER_HPP__ diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index 07d9df8..eb0debf 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -1,37 +1,45 @@ #ifndef __DIFFUSION_MODEL_H__ #define __DIFFUSION_MODEL_H__ +#include #include "anima.hpp" #include "flux.hpp" #include "mmdit.hpp" #include "qwen_image.hpp" +#include "tensor_ggml.hpp" #include "unet.hpp" #include "wan.hpp" #include "z_image.hpp" struct DiffusionParams { - ggml_tensor* x = nullptr; - ggml_tensor* timesteps = 
nullptr; - ggml_tensor* context = nullptr; - ggml_tensor* c_concat = nullptr; - ggml_tensor* y = nullptr; - ggml_tensor* guidance = nullptr; - std::vector ref_latents = {}; - bool increase_ref_index = false; - int num_video_frames = -1; - std::vector controls = {}; - float control_strength = 0.f; - ggml_tensor* vace_context = nullptr; - float vace_strength = 1.f; - std::vector skip_layers = {}; + const sd::Tensor* x = nullptr; + const sd::Tensor* timesteps = nullptr; + const sd::Tensor* context = nullptr; + const sd::Tensor* c_concat = nullptr; + const sd::Tensor* y = nullptr; + const sd::Tensor* t5_ids = nullptr; + const sd::Tensor* t5_weights = nullptr; + const sd::Tensor* guidance = nullptr; + const std::vector>* ref_latents = nullptr; + bool increase_ref_index = false; + int num_video_frames = -1; + const std::vector>* controls = nullptr; + float control_strength = 0.f; + const sd::Tensor* vace_context = nullptr; + float vace_strength = 1.f; + const std::vector* skip_layers = nullptr; }; +template +static inline const sd::Tensor& tensor_or_empty(const sd::Tensor* tensor) { + static const sd::Tensor kEmpty; + return tensor != nullptr ? 
*tensor : kEmpty; +} + struct DiffusionModel { virtual std::string get_desc() = 0; - virtual bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) = 0; + virtual sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) = 0; virtual void alloc_params_buffer() = 0; virtual void free_params_buffer() = 0; virtual void free_compute_buffer() = 0; @@ -93,19 +101,20 @@ struct UNetModel : public DiffusionModel { unet.set_circular_axes(circular_x, circular_y); } - bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) override { + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + static const std::vector> empty_controls; return unet.compute(n_threads, - diffusion_params.x, - diffusion_params.timesteps, - diffusion_params.context, - diffusion_params.c_concat, - diffusion_params.y, + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context), + tensor_or_empty(diffusion_params.c_concat), + tensor_or_empty(diffusion_params.y), diffusion_params.num_video_frames, - diffusion_params.controls, - diffusion_params.control_strength, output, output_ctx); + diffusion_params.controls ? 
*diffusion_params.controls : empty_controls, + diffusion_params.control_strength); } }; @@ -158,18 +167,17 @@ struct MMDiTModel : public DiffusionModel { mmdit.set_circular_axes(circular_x, circular_y); } - bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) override { + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + static const std::vector empty_skip_layers; return mmdit.compute(n_threads, - diffusion_params.x, - diffusion_params.timesteps, - diffusion_params.context, - diffusion_params.y, - output, - output_ctx, - diffusion_params.skip_layers); + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context), + tensor_or_empty(diffusion_params.y), + diffusion_params.skip_layers ? *diffusion_params.skip_layers : empty_skip_layers); } }; @@ -224,22 +232,22 @@ struct FluxModel : public DiffusionModel { flux.set_circular_axes(circular_x, circular_y); } - bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) override { + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + static const std::vector> empty_ref_latents; + static const std::vector empty_skip_layers; return flux.compute(n_threads, - diffusion_params.x, - diffusion_params.timesteps, - diffusion_params.context, - diffusion_params.c_concat, - diffusion_params.y, - diffusion_params.guidance, - diffusion_params.ref_latents, + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context), + tensor_or_empty(diffusion_params.c_concat), + tensor_or_empty(diffusion_params.y), + 
tensor_or_empty(diffusion_params.guidance), + diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents, diffusion_params.increase_ref_index, - output, - output_ctx, - diffusion_params.skip_layers); + diffusion_params.skip_layers ? *diffusion_params.skip_layers : empty_skip_layers); } }; @@ -294,18 +302,16 @@ struct AnimaModel : public DiffusionModel { anima.set_circular_axes(circular_x, circular_y); } - bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) override { + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); return anima.compute(n_threads, - diffusion_params.x, - diffusion_params.timesteps, - diffusion_params.context, - diffusion_params.c_concat, - diffusion_params.y, - output, - output_ctx); + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context), + tensor_or_empty(diffusion_params.t5_ids), + tensor_or_empty(diffusion_params.t5_weights)); } }; @@ -361,21 +367,19 @@ struct WanModel : public DiffusionModel { wan.set_circular_axes(circular_x, circular_y); } - bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) override { + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); return wan.compute(n_threads, - diffusion_params.x, - diffusion_params.timesteps, - diffusion_params.context, - diffusion_params.y, - diffusion_params.c_concat, - nullptr, - diffusion_params.vace_context, - diffusion_params.vace_strength, - output, - output_ctx); + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context), + 
tensor_or_empty(diffusion_params.y), + tensor_or_empty(diffusion_params.c_concat), + sd::Tensor(), + tensor_or_empty(diffusion_params.vace_context), + diffusion_params.vace_strength); } }; @@ -432,18 +436,17 @@ struct QwenImageModel : public DiffusionModel { qwen_image.set_circular_axes(circular_x, circular_y); } - bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) override { + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + static const std::vector> empty_ref_latents; return qwen_image.compute(n_threads, - diffusion_params.x, - diffusion_params.timesteps, - diffusion_params.context, - diffusion_params.ref_latents, - true, // increase_ref_index - output, - output_ctx); + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context), + diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents, + true); } }; @@ -499,18 +502,17 @@ struct ZImageModel : public DiffusionModel { z_image.set_circular_axes(circular_x, circular_y); } - bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) override { + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + static const std::vector> empty_ref_latents; return z_image.compute(n_threads, - diffusion_params.x, - diffusion_params.timesteps, - diffusion_params.context, - diffusion_params.ref_latents, - true, // increase_ref_index - output, - output_ctx); + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context), + diffusion_params.ref_latents ? 
*diffusion_params.ref_latents : empty_ref_latents, + true); } }; diff --git a/src/easycache.hpp b/src/easycache.hpp index 3f0287a..409a464 100644 --- a/src/easycache.hpp +++ b/src/easycache.hpp @@ -1,10 +1,15 @@ +#ifndef __EASYCACHE_HPP__ +#define __EASYCACHE_HPP__ + #include #include #include #include +#include "condition_cache_utils.hpp" #include "denoiser.hpp" #include "ggml_extend.hpp" +#include "tensor.hpp" struct EasyCacheConfig { bool enabled = false; @@ -19,15 +24,15 @@ struct EasyCacheCacheEntry { struct EasyCacheState { EasyCacheConfig config; - Denoiser* denoiser = nullptr; - float start_sigma = std::numeric_limits::max(); - float end_sigma = 0.0f; - bool initialized = false; - bool initial_step = true; - bool skip_current_step = false; - bool step_active = false; - const SDCondition* anchor_condition = nullptr; - std::unordered_map cache_diffs; + Denoiser* denoiser = nullptr; + float start_sigma = std::numeric_limits::max(); + float end_sigma = 0.0f; + bool initialized = false; + bool initial_step = true; + bool skip_current_step = false; + bool step_active = false; + const void* anchor_condition = nullptr; + std::unordered_map cache_diffs; std::vector prev_input; std::vector prev_output; float output_prev_norm = 0.0f; @@ -120,41 +125,30 @@ struct EasyCacheState { return enabled() && step_active && skip_current_step; } - bool has_cache(const SDCondition* cond) const { + bool has_cache(const void* cond) const { auto it = cache_diffs.find(cond); return it != cache_diffs.end() && !it->second.diff.empty(); } - void update_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) { + void update_cache(const void* cond, const sd::Tensor& input, const sd::Tensor& output) { EasyCacheCacheEntry& entry = cache_diffs[cond]; - size_t ne = static_cast(ggml_nelements(output)); - entry.diff.resize(ne); - float* out_data = (float*)output->data; - float* in_data = (float*)input->data; - for (size_t i = 0; i < ne; ++i) { - entry.diff[i] = out_data[i] - 
in_data[i]; - } + sd::store_condition_cache_diff(&entry.diff, input, output); } - void apply_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) { + void apply_cache(const void* cond, const sd::Tensor& input, sd::Tensor* output) { auto it = cache_diffs.find(cond); if (it == cache_diffs.end() || it->second.diff.empty()) { return; } - copy_ggml_tensor(output, input); - float* out_data = (float*)output->data; - const std::vector& diff = it->second.diff; - for (size_t i = 0; i < diff.size(); ++i) { - out_data[i] += diff[i]; - } + sd::apply_condition_cache_diff(it->second.diff, input, output); } - bool before_condition(const SDCondition* cond, - ggml_tensor* input, - ggml_tensor* output, + bool before_condition(const void* cond, + const sd::Tensor& input, + sd::Tensor* output, float sigma, int step_index) { - if (!enabled() || step_index < 0) { + if (!enabled() || step_index < 0 || output == nullptr) { return false; } if (step_index != current_step_index) { @@ -181,12 +175,12 @@ struct EasyCacheState { if (!has_prev_input || !has_prev_output || !has_cache(cond)) { return false; } - size_t ne = static_cast(ggml_nelements(input)); + size_t ne = static_cast(input.numel()); if (prev_input.size() != ne) { return false; } - float* input_data = (float*)input->data; - last_input_change = 0.0f; + const float* input_data = input.data(); + last_input_change = 0.0f; for (size_t i = 0; i < ne; ++i) { last_input_change += std::fabs(input_data[i] - prev_input[i]); } @@ -211,7 +205,7 @@ struct EasyCacheState { return false; } - void after_condition(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) { + void after_condition(const void* cond, const sd::Tensor& input, const sd::Tensor& output) { if (!step_is_active()) { return; } @@ -220,16 +214,16 @@ struct EasyCacheState { return; } - size_t ne = static_cast(ggml_nelements(input)); - float* in_data = (float*)input->data; + size_t ne = static_cast(input.numel()); + const float* in_data = input.data(); 
prev_input.resize(ne); for (size_t i = 0; i < ne; ++i) { prev_input[i] = in_data[i]; } has_prev_input = true; - float* out_data = (float*)output->data; - float output_change = 0.0f; + const float* out_data = output.data(); + float output_change = 0.0f; if (has_prev_output && prev_output.size() == ne) { for (size_t i = 0; i < ne; ++i) { output_change += std::fabs(out_data[i] - prev_output[i]); @@ -262,4 +256,6 @@ struct EasyCacheState { cumulative_change_rate = 0.0f; has_last_input_change = false; } -}; \ No newline at end of file +}; + +#endif diff --git a/src/esrgan.hpp b/src/esrgan.hpp index efb3aed..26c46f5 100644 --- a/src/esrgan.hpp +++ b/src/esrgan.hpp @@ -341,12 +341,12 @@ struct ESRGAN : public GGMLRunner { return success; } - ggml_cgraph* build_graph(ggml_tensor* x) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor) { if (!rrdb_net) return nullptr; constexpr int kGraphNodes = 1 << 16; // 65k ggml_cgraph* gf = new_graph_custom(kGraphNodes); - x = to_backend(x); + ggml_tensor* x = make_input(x_tensor); auto runner_ctx = get_context(); ggml_tensor* out = rrdb_net->forward(&runner_ctx, x); @@ -354,15 +354,12 @@ struct ESRGAN : public GGMLRunner { return gf; } - bool compute(const int n_threads, - ggml_tensor* x, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { - auto get_graph = [&]() -> ggml_cgraph* { - return build_graph(x); - }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + sd::Tensor compute(const int n_threads, + const sd::Tensor& x) { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x); }; + auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + return result; } }; -#endif // __ESRGAN_HPP__ \ No newline at end of file +#endif // __ESRGAN_HPP__ diff --git a/src/flux.hpp b/src/flux.hpp index 93b9350..e6bf002 100644 --- a/src/flux.hpp +++ b/src/flux.hpp @@ -1178,6 +1178,7 @@ namespace Flux { std::vector pe_vec; std::vector 
mod_index_arange_vec; std::vector dct_vec; + sd::Tensor guidance_tensor; SDVersion version; bool use_mask = false; @@ -1353,29 +1354,42 @@ namespace Flux { return dct; } - ggml_cgraph* build_graph(ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* c_concat, - ggml_tensor* y, - ggml_tensor* guidance, - std::vector ref_latents = {}, - bool increase_ref_index = false, - std::vector skip_layers = {}) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor = {}, + const sd::Tensor& c_concat_tensor = {}, + const sd::Tensor& y_tensor = {}, + const sd::Tensor& guidance_tensor = {}, + const std::vector>& ref_latents_tensor = {}, + bool increase_ref_index = false, + std::vector skip_layers = {}) { + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); + ggml_tensor* context = make_optional_input(context_tensor); + ggml_tensor* c_concat = make_optional_input(c_concat_tensor); + ggml_tensor* y = make_optional_input(y_tensor); + if (flux_params.guidance_embed || flux_params.is_chroma) { + if (!guidance_tensor.empty()) { + this->guidance_tensor = guidance_tensor; + if (flux_params.is_chroma) { + this->guidance_tensor.fill_(0.f); + } + } + } + ggml_tensor* guidance = make_optional_input(this->guidance_tensor); + std::vector ref_latents; + ref_latents.reserve(ref_latents_tensor.size()); + for (const auto& ref_latent_tensor : ref_latents_tensor) { + ref_latents.push_back(make_input(ref_latent_tensor)); + } + GGML_ASSERT(x->ne[3] == 1); ggml_cgraph* gf = new_graph_custom(FLUX_GRAPH_SIZE); ggml_tensor* mod_index_arange = nullptr; ggml_tensor* dct = nullptr; // for chroma radiance - x = to_backend(x); - context = to_backend(context); - if (c_concat != nullptr) { - c_concat = to_backend(c_concat); - } if (flux_params.is_chroma) { - guidance = ggml_set_f32(guidance, 0); - if (!use_mask) { y = nullptr; } @@ -1385,16 +1399,6 @@ namespace Flux { 
mod_index_arange = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, mod_index_arange_vec.size()); set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data()); } - y = to_backend(y); - - timesteps = to_backend(timesteps); - if (flux_params.guidance_embed || flux_params.is_chroma) { - guidance = to_backend(guidance); - } - for (int i = 0; i < ref_latents.size(); i++) { - ref_latents[i] = to_backend(ref_latents[i]); - } - std::set txt_arange_dims; if (sd_version_is_flux2(version)) { txt_arange_dims = {3}; @@ -1455,18 +1459,16 @@ namespace Flux { return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* c_concat, - ggml_tensor* y, - ggml_tensor* guidance, - std::vector ref_latents = {}, - bool increase_ref_index = false, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr, - std::vector skip_layers = std::vector()) { + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context = {}, + const sd::Tensor& c_concat = {}, + const sd::Tensor& y = {}, + const sd::Tensor& guidance = {}, + const std::vector>& ref_latents = {}, + bool increase_ref_index = false, + std::vector skip_layers = std::vector()) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size] @@ -1476,7 +1478,8 @@ namespace Flux { return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + return result; } void test() { @@ -1485,41 +1488,51 @@ namespace Flux { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); { // cpu f16: // cuda 
f16: nan // cuda q8_0: pass - auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 128, 1); + sd::Tensor x({16, 16, 128, 1}); // ggml_set_f32(x, 0.01f); - // auto x = load_tensor_from_file(work_ctx, "chroma_x.bin"); + // auto x = load_tensor_from_file(ctx, "chroma_x.bin"); // print_ggml_tensor(x); std::vector timesteps_vec(1, 1.f); - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); + auto timesteps = sd::Tensor::from_vector(timesteps_vec); std::vector guidance_vec(1, 0.f); - auto guidance = vector_to_ggml_tensor(work_ctx, guidance_vec); + auto guidance = sd::Tensor::from_vector(guidance_vec); - auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 15360, 256, 1); + sd::Tensor context({15360, 256, 1}); // ggml_set_f32(context, 0.01f); - // auto context = load_tensor_from_file(work_ctx, "chroma_context.bin"); + // auto context = load_tensor_from_file(ctx, "chroma_context.bin"); // print_ggml_tensor(context); - // auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, 1); + // auto y = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 768, 1); // ggml_set_f32(y, 0.01f); auto y = nullptr; // print_ggml_tensor(y); - ggml_tensor* out = nullptr; + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - compute(8, x, timesteps, context, nullptr, y, guidance, {}, false, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = compute(8, + x, + timesteps, + context, + {}, + {}, + guidance, + {}, + false); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("flux test done in %lldms", t1 - t0); } } diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index e6b27cc..859270c 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include "ggml.h" #include "model.h" +#include "tensor.hpp" #ifdef SD_USE_CUDA #include 
"ggml-cuda.h" @@ -49,6 +51,7 @@ #endif #include "rng.hpp" +#include "tensor_ggml.hpp" #include "util.h" #define EPS 1e-05f @@ -205,14 +208,6 @@ __STATIC_INLINE__ float sd_image_get_f32(sd_image_t image, int64_t iw, int64_t i return value; } -__STATIC_INLINE__ float sd_image_get_f32(sd_image_f32_t image, int64_t iw, int64_t ih, int64_t ic, bool scale = true) { - float value = *(image.data + ih * image.width * image.channel + iw * image.channel + ic); - if (scale) { - value /= 255.f; - } - return value; -} - __STATIC_INLINE__ void print_ggml_tensor(ggml_tensor* tensor, bool shape_only = false, const char* mark = "") { printf("%s (%s): shape(%zu, %zu, %zu, %zu)\n", mark, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); fflush(stdout); @@ -250,6 +245,56 @@ __STATIC_INLINE__ void print_ggml_tensor(ggml_tensor* tensor, bool shape_only = } } +template +__STATIC_INLINE__ void print_sd_tensor(const sd::Tensor& tensor, bool shape_only = false, const char* mark = "") { + printf("%s: shape(", mark); + for (size_t i = 0; i < static_cast(tensor.dim()); ++i) { + printf("%s%lld", i == 0 ? 
"" : ", ", static_cast(tensor.shape()[i])); + } + printf(")\n"); + fflush(stdout); + if (shape_only) { + return; + } + int range = 3; + std::vector shape = tensor.shape(); + while (shape.size() < 4) { + shape.push_back(1); + } + for (int64_t i3 = 0; i3 < shape[3]; i3++) { + if (i3 >= range && i3 + range < shape[3]) { + continue; + } + for (int64_t i2 = 0; i2 < shape[2]; i2++) { + if (i2 >= range && i2 + range < shape[2]) { + continue; + } + for (int64_t i1 = 0; i1 < shape[1]; i1++) { + if (i1 >= range && i1 + range < shape[1]) { + continue; + } + for (int64_t i0 = 0; i0 < shape[0]; i0++) { + if (i0 >= range && i0 + range < shape[0]) { + continue; + } + size_t offset = static_cast(i0 + shape[0] * (i1 + shape[1] * (i2 + shape[2] * i3))); + printf(" [%lld, %lld, %lld, %lld] = ", static_cast(i3), static_cast(i2), static_cast(i1), static_cast(i0)); + if constexpr (std::is_same_v) { + printf("%f\n", tensor[static_cast(offset)]); + } else if constexpr (std::is_same_v) { + printf("%f\n", ggml_fp16_to_fp32(tensor[static_cast(offset)])); + } else if constexpr (std::is_same_v) { + printf("%d\n", tensor[static_cast(offset)]); + } else if constexpr (std::is_same_v) { + printf("%lld\n", static_cast(tensor[static_cast(offset)])); + } + fflush(stdout); + } + } + } + } +} + __STATIC_INLINE__ void ggml_ext_tensor_iter( ggml_tensor* tensor, const std::function& fn) { @@ -475,99 +520,6 @@ __STATIC_INLINE__ void ggml_ext_tensor_apply_mask(ggml_tensor* image_data, } } -__STATIC_INLINE__ void sd_image_f32_to_ggml_tensor(sd_image_f32_t image, - ggml_tensor* tensor, - bool scale = true) { - GGML_ASSERT(image.width == tensor->ne[0]); - GGML_ASSERT(image.height == tensor->ne[1]); - GGML_ASSERT(image.channel == tensor->ne[2]); - GGML_ASSERT(1 == tensor->ne[3]); - GGML_ASSERT(tensor->type == GGML_TYPE_F32); - ggml_ext_tensor_iter(tensor, [&](ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = sd_image_get_f32(image, i0, i1, i2, scale); - 
ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2, i3); - }); -} - -__STATIC_INLINE__ void ggml_ext_tensor_split_2d(ggml_tensor* input, - ggml_tensor* output, - int x, - int y) { - int64_t width = output->ne[0]; - int64_t height = output->ne[1]; - int64_t channels = output->ne[2]; - int64_t ne3 = output->ne[3]; - - int64_t input_width = input->ne[0]; - int64_t input_height = input->ne[1]; - - GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32); - for (int iy = 0; iy < height; iy++) { - for (int ix = 0; ix < width; ix++) { - for (int k = 0; k < channels; k++) { - for (int l = 0; l < ne3; l++) { - float value = ggml_ext_tensor_get_f32(input, (ix + x) % input_width, (iy + y) % input_height, k, l); - ggml_ext_tensor_set_f32(output, value, ix, iy, k, l); - } - } - } - } -} - -// unclamped -> expects x in the range [0-1] -__STATIC_INLINE__ float smootherstep_f32(const float x) { - GGML_ASSERT(x >= 0.f && x <= 1.f); - return x * x * x * (x * (6.0f * x - 15.0f) + 10.0f); -} - -__STATIC_INLINE__ void ggml_ext_tensor_merge_2d(ggml_tensor* input, - ggml_tensor* output, - int x, - int y, - int overlap_x, - int overlap_y, - bool circular_x, - bool circular_y, - int x_skip = 0, - int y_skip = 0) { - int64_t width = input->ne[0]; - int64_t height = input->ne[1]; - int64_t channels = input->ne[2]; - int64_t ne3 = input->ne[3]; - - int64_t img_width = output->ne[0]; - int64_t img_height = output->ne[1]; - - GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32); - for (int iy = y_skip; iy < height; iy++) { - for (int ix = x_skip; ix < width; ix++) { - for (int k = 0; k < channels; k++) { - for (int l = 0; l < ne3; l++) { - float new_value = ggml_ext_tensor_get_f32(input, ix, iy, k, l); - if (overlap_x > 0 || overlap_y > 0) { // blend colors in overlapped area - float old_value = ggml_ext_tensor_get_f32(output, (x + ix) % img_width, (y + iy) % img_height, k, l); - - const float x_f_0 = (circular_x || (overlap_x > 0 && x > 0)) ? 
(ix - x_skip) / float(overlap_x) : 1; - const float x_f_1 = (circular_x || (overlap_x > 0 && x < (img_width - width))) ? (width - ix) / float(overlap_x) : 1; - const float y_f_0 = (circular_y || (overlap_y > 0 && y > 0)) ? (iy - y_skip) / float(overlap_y) : 1; - const float y_f_1 = (circular_y || (overlap_y > 0 && y < (img_height - height))) ? (height - iy) / float(overlap_y) : 1; - - const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f); - const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f); - - ggml_ext_tensor_set_f32( - output, - old_value + new_value * smootherstep_f32(y_f) * smootherstep_f32(x_f), - (x + ix) % img_width, (y + iy) % img_height, k, l); - } else { - ggml_ext_tensor_set_f32(output, new_value, (x + ix) % img_width, (y + iy) % img_height, k, l); - } - } - } - } - } -} - __STATIC_INLINE__ float ggml_ext_tensor_mean(ggml_tensor* src) { float mean = 0.0f; int64_t nelements = ggml_nelements(src); @@ -832,22 +784,102 @@ __STATIC_INLINE__ void sd_tiling_calc_tiles(int& num_tiles_dim, } // Tiling -__STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, - ggml_tensor* output, - const int scale, - const int p_tile_size_x, - const int p_tile_size_y, - const float tile_overlap_factor, - const bool circular_x, - const bool circular_y, - on_tile_process on_processing, - bool slient = false) { - output = ggml_set_f32(output, 0); - int input_width = (int)input->ne[0]; - int input_height = (int)input->ne[1]; - int output_width = (int)output->ne[0]; - int output_height = (int)output->ne[1]; +__STATIC_INLINE__ int64_t sd_tensor_plane_size(const sd::Tensor& tensor) { + GGML_ASSERT(tensor.dim() >= 2); + return tensor.shape()[0] * tensor.shape()[1]; +} + +__STATIC_INLINE__ sd::Tensor sd_tensor_split_2d(const sd::Tensor& input, int width, int height, int x, int y) { + GGML_ASSERT(input.dim() >= 4); + std::vector output_shape = input.shape(); + output_shape[0] = width; + output_shape[1] = height; + sd::Tensor output(std::move(output_shape)); + int64_t 
input_width = input.shape()[0]; + int64_t input_height = input.shape()[1]; + int64_t input_plane = sd_tensor_plane_size(input); + int64_t output_plane = sd_tensor_plane_size(output); + int64_t plane_count = input.numel() / input_plane; + for (int iy = 0; iy < height; iy++) { + for (int ix = 0; ix < width; ix++) { + int64_t src_xy = (ix + x) % input_width + input_width * ((iy + y) % input_height); + int64_t dst_xy = ix + width * iy; + for (int64_t plane = 0; plane < plane_count; ++plane) { + output[plane * output_plane + dst_xy] = input[plane * input_plane + src_xy]; + } + } + } + return output; +} + +__STATIC_INLINE__ void sd_tensor_merge_2d(const sd::Tensor& input, + sd::Tensor* output, + int x, + int y, + int overlap_x, + int overlap_y, + bool circular_x, + bool circular_y, + int x_skip = 0, + int y_skip = 0) { + GGML_ASSERT(output != nullptr); + int64_t width = input.shape()[0]; + int64_t height = input.shape()[1]; + int64_t img_width = output->shape()[0]; + int64_t img_height = output->shape()[1]; + int64_t input_plane = sd_tensor_plane_size(input); + int64_t output_plane = sd_tensor_plane_size(*output); + int64_t plane_count = input.numel() / input_plane; + GGML_ASSERT(output->numel() / output_plane == plane_count); + + // unclamped -> expects x in the range [0-1] + auto smootherstep_f32 = [](const float x) -> float { + GGML_ASSERT(x >= 0.f && x <= 1.f); + return x * x * x * (x * (6.0f * x - 15.0f) + 10.0f); + }; + + for (int iy = y_skip; iy < height; iy++) { + for (int ix = x_skip; ix < width; ix++) { + int64_t src_xy = ix + width * iy; + int64_t ox = (x + ix) % img_width; + int64_t oy = (y + iy) % img_height; + int64_t dst_xy = ox + img_width * oy; + for (int64_t plane = 0; plane < plane_count; ++plane) { + float new_value = input[plane * input_plane + src_xy]; + if (overlap_x > 0 || overlap_y > 0) { + float old_value = (*output)[plane * output_plane + dst_xy]; + const float x_f_0 = (circular_x || (overlap_x > 0 && x > 0)) ? 
(ix - x_skip) / float(overlap_x) : 1.f; + const float x_f_1 = (circular_x || (overlap_x > 0 && x < (img_width - width))) ? (width - ix) / float(overlap_x) : 1.f; + const float y_f_0 = (circular_y || (overlap_y > 0 && y > 0)) ? (iy - y_skip) / float(overlap_y) : 1.f; + const float y_f_1 = (circular_y || (overlap_y > 0 && y < (img_height - height))) ? (height - iy) / float(overlap_y) : 1.f; + const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f); + const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f); + (*output)[plane * output_plane + dst_xy] = + old_value + new_value * smootherstep_f32(y_f) * smootherstep_f32(x_f); + } else { + (*output)[plane * output_plane + dst_xy] = new_value; + } + } + } + } +} + +template +__STATIC_INLINE__ sd::Tensor process_tiles_2d(const sd::Tensor& input, + int output_width, + int output_height, + int scale, + int p_tile_size_x, + int p_tile_size_y, + float tile_overlap_factor, + bool circular_x, + bool circular_y, + Fn&& on_processing, + bool silent = false) { + sd::Tensor output; + int input_width = static_cast(input.shape()[0]); + int input_height = static_cast(input.shape()[1]); GGML_ASSERT(((input_width / output_width) == (input_height / output_height)) && ((output_width / input_width) == (output_height / input_height))); @@ -856,8 +888,7 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, int small_width = output_width; int small_height = output_height; - - bool decode = output_width > input_width; + bool decode = output_width > input_width; if (decode) { small_width = input_width; small_height = input_height; @@ -871,25 +902,16 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, float tile_overlap_factor_y; sd_tiling_calc_tiles(num_tiles_y, tile_overlap_factor_y, small_height, p_tile_size_y, tile_overlap_factor, circular_y); - if (!slient) { - LOG_DEBUG("num tiles : %d, %d ", num_tiles_x, num_tiles_y); - LOG_DEBUG("optimal overlap : %f, %f (targeting %f)", tile_overlap_factor_x, 
tile_overlap_factor_y, tile_overlap_factor); - } - - int tile_overlap_x = (int32_t)(p_tile_size_x * tile_overlap_factor_x); + int tile_overlap_x = static_cast(p_tile_size_x * tile_overlap_factor_x); int non_tile_overlap_x = p_tile_size_x - tile_overlap_x; - - int tile_overlap_y = (int32_t)(p_tile_size_y * tile_overlap_factor_y); + int tile_overlap_y = static_cast(p_tile_size_y * tile_overlap_factor_y); int non_tile_overlap_y = p_tile_size_y - tile_overlap_y; - - int tile_size_x = p_tile_size_x < small_width ? p_tile_size_x : small_width; - int tile_size_y = p_tile_size_y < small_height ? p_tile_size_y : small_height; - + int tile_size_x = p_tile_size_x < small_width ? p_tile_size_x : small_width; + int tile_size_y = p_tile_size_y < small_height ? p_tile_size_y : small_height; int input_tile_size_x = tile_size_x; int input_tile_size_y = tile_size_y; int output_tile_size_x = tile_size_x; int output_tile_size_y = tile_size_y; - if (decode) { output_tile_size_x *= scale; output_tile_size_y *= scale; @@ -898,41 +920,23 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, input_tile_size_y *= scale; } - ggml_init_params params = {}; - params.mem_size += input_tile_size_x * input_tile_size_y * input->ne[2] * input->ne[3] * sizeof(float); // input chunk - params.mem_size += output_tile_size_x * output_tile_size_y * output->ne[2] * output->ne[3] * sizeof(float); // output chunk - params.mem_size += 3 * ggml_tensor_overhead(); - params.mem_buffer = nullptr; - params.no_alloc = false; - - if (!slient) { - LOG_DEBUG("tile work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f); - } - - // draft context - ggml_context* tiles_ctx = ggml_init(params); - if (!tiles_ctx) { - LOG_ERROR("ggml_init() failed"); - return; - } - - // tiling - ggml_tensor* input_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], input->ne[3]); - ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, 
output_tile_size_x, output_tile_size_y, output->ne[2], output->ne[3]); - int num_tiles = num_tiles_x * num_tiles_y; - if (!slient) { + int num_tiles = num_tiles_x * num_tiles_y; + int tile_count = 1; + bool last_y = false; + bool last_x = false; + float last_time = 0.0f; + if (!silent) { + LOG_DEBUG("num tiles : %d, %d ", num_tiles_x, num_tiles_y); + LOG_DEBUG("optimal overlap : %f, %f (targeting %f)", tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor); LOG_DEBUG("processing %i tiles", num_tiles); pretty_progress(0, num_tiles, 0.0f); } - int tile_count = 1; - bool last_y = false, last_x = false; - float last_time = 0.0f; for (int y = 0; y < small_height && !last_y; y += non_tile_overlap_y) { int dy = 0; if (!circular_y && y + tile_size_y >= small_height) { - int _y = y; - y = small_height - tile_size_y; - dy = _y - y; + int original_y = y; + y = small_height - tile_size_y; + dy = original_y - y; if (decode) { dy *= scale; } @@ -941,9 +945,9 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, for (int x = 0; x < small_width && !last_x; x += non_tile_overlap_x) { int dx = 0; if (!circular_x && x + tile_size_x >= small_width) { - int _x = x; - x = small_width - tile_size_x; - dx = _x - x; + int original_x = x; + x = small_width - tile_size_x; + dx = original_x - x; if (decode) { dx *= scale; } @@ -958,38 +962,37 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, int overlap_x_out = decode ? tile_overlap_x * scale : tile_overlap_x; int overlap_y_out = decode ? 
tile_overlap_y * scale : tile_overlap_y; - int64_t t1 = ggml_time_ms(); - ggml_ext_tensor_split_2d(input, input_tile, x_in, y_in); - if (on_processing(input_tile, output_tile, false)) { - ggml_ext_tensor_merge_2d(output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, circular_x, circular_y, dx, dy); + int64_t t1 = ggml_time_ms(); + auto input_tile = sd_tensor_split_2d(input, input_tile_size_x, input_tile_size_y, x_in, y_in); + auto output_tile = on_processing(input_tile); + if (output_tile.empty()) { + return {}; + } + GGML_ASSERT(output_tile.shape()[0] == output_tile_size_x && output_tile.shape()[1] == output_tile_size_y); + if (output.empty()) { + std::vector output_shape = output_tile.shape(); + output_shape[0] = output_width; + output_shape[1] = output_height; + output = sd::Tensor::zeros(std::move(output_shape)); + } + sd_tensor_merge_2d(output_tile, &output, x_out, y_out, overlap_x_out, overlap_y_out, circular_x, circular_y, dx, dy); + if (!silent) { int64_t t2 = ggml_time_ms(); last_time = (t2 - t1) / 1000.0f; pretty_progress(tile_count, num_tiles, last_time); - } else { - LOG_ERROR("Failed to process patch %d at (%d, %d)", tile_count, x, y); } tile_count++; } last_x = false; } - if (!slient) { - if (tile_count < num_tiles) { - pretty_progress(num_tiles, num_tiles, last_time); - } + if (!silent && tile_count < num_tiles) { + pretty_progress(num_tiles, num_tiles, last_time); } - ggml_free(tiles_ctx); -} - -__STATIC_INLINE__ void sd_tiling(ggml_tensor* input, - ggml_tensor* output, - const int scale, - const int tile_size, - const float tile_overlap_factor, - const bool circular_x, - const bool circular_y, - on_tile_process on_processing) { - sd_tiling_non_square(input, output, scale, tile_size, tile_size, tile_overlap_factor, circular_x, circular_y, on_processing); + if (output.empty()) { + return {}; + } + return output; } __STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm_32(ggml_context* ctx, @@ -1588,6 +1591,18 @@ __STATIC_INLINE__ void 
set_timestep_embedding(std::vector timesteps, memcpy(((char*)embedding->data), ((char*)embedding_vec.data()), ggml_nbytes(embedding)); } +__STATIC_INLINE__ void set_timestep_embedding(std::vector timesteps, + sd::Tensor* embedding, + int dim, + int max_period = 10000) { + GGML_ASSERT(embedding != nullptr); + std::vector embedding_vec = timestep_embedding(timesteps, dim, max_period); + if (embedding->numel() != static_cast(embedding_vec.size())) { + embedding->resize({dim, static_cast(timesteps.size())}); + } + std::copy(embedding_vec.begin(), embedding_vec.end(), embedding->values().begin()); +} + __STATIC_INLINE__ ggml_tensor* new_timestep_embedding(ggml_context* ctx, std::vector timesteps, int dim, @@ -1705,6 +1720,32 @@ protected: bool circular_x_enabled = false; bool circular_y_enabled = false; + template + static sd::Tensor take_or_empty(std::optional> tensor) { + if (!tensor.has_value()) { + return {}; + } + return std::move(*tensor); + } + + template + static sd::Tensor restore_trailing_singleton_dims(std::optional> tensor, + size_t expected_dim) { + return restore_trailing_singleton_dims(take_or_empty(std::move(tensor)), expected_dim); + } + + template + static sd::Tensor restore_trailing_singleton_dims(sd::Tensor tensor, + size_t expected_dim) { + if (tensor.empty()) { + return tensor; + } + while (static_cast(tensor.dim()) < expected_dim) { + tensor.unsqueeze_(tensor.dim()); + } + return tensor; + } + void alloc_params_ctx() { ggml_init_params params; params.mem_size = static_cast(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead()); @@ -2042,6 +2083,29 @@ public: backend_tensor_data_map[tensor] = data; } + template + ggml_tensor* make_input(const sd::Tensor& tensor) { + ggml_tensor* input = sd::make_ggml_tensor(compute_ctx, tensor, false); + set_backend_tensor_data(input, tensor.data()); + return input; + } + + template + ggml_tensor* make_optional_input(const sd::Tensor& tensor) { + if (tensor.empty()) { + return nullptr; + } + return make_input(tensor); + } 
+ + template + ggml_tensor* make_optional_input(const sd::Tensor* tensor) { + if (tensor == nullptr) { + return nullptr; + } + return make_input(*tensor); + } + ggml_tensor* to_backend(ggml_tensor* tensor) { GGML_ASSERT(compute_ctx != nullptr); if (tensor == nullptr) { @@ -2070,24 +2134,24 @@ public: return ggml_get_tensor(cache_ctx, name.c_str()); } - bool compute(get_graph_cb_t get_graph, - int n_threads, - bool free_compute_buffer_immediately = true, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) { + template + std::optional> compute(get_graph_cb_t get_graph, + int n_threads, + bool free_compute_buffer_immediately, + bool no_return = false) { if (!offload_params_to_runtime_backend()) { LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str()); - return false; + return std::nullopt; } if (!alloc_compute_buffer(get_graph)) { LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); - return false; + return std::nullopt; } reset_compute_ctx(); ggml_cgraph* gf = get_compute_graph(get_graph); if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { LOG_ERROR("%s alloc compute graph failed", get_desc().c_str()); - return false; + return std::nullopt; } copy_data_to_backend_tensor(); if (ggml_backend_is_cpu(runtime_backend)) { @@ -2097,26 +2161,19 @@ public: ggml_status status = ggml_backend_graph_compute(runtime_backend, gf); if (status != GGML_STATUS_SUCCESS) { LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status)); - return false; + return std::nullopt; } -#ifdef GGML_PERF - ggml_graph_print(gf); -#endif copy_cache_tensors_to_cache_buffer(); - if (output != nullptr) { - auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str()); - if (*output == nullptr && output_ctx != nullptr) { - *output = ggml_dup_tensor(output_ctx, result); - } - if (*output != nullptr) { - ggml_ext_backend_tensor_get_and_sync(runtime_backend, result, (*output)->data, 0, ggml_nbytes(*output)); - } + auto 
result = ggml_get_tensor(compute_ctx, final_result_name.c_str()); + std::optional> output; + if (!no_return) { + output = sd::make_sd_tensor_from_ggml(result); } if (free_compute_buffer_immediately) { free_compute_buffer(); } - return true; + return output; } void set_flash_attention_enabled(bool enabled) { diff --git a/src/latent-preview.h b/src/latent-preview.h index 5078a6b..7f30734 100644 --- a/src/latent-preview.h +++ b/src/latent-preview.h @@ -1,6 +1,8 @@ +#include #include #include #include "ggml.h" +#include "tensor.hpp" const float wan_21_latent_rgb_proj[16][3] = { {0.015123f, -0.148418f, 0.479828f}, @@ -232,3 +234,67 @@ void preview_latent_video(uint8_t* buffer, ggml_tensor* latents, const float (*l } } } + +static inline bool preview_latent_tensor_is_video(const sd::Tensor& latents) { + return latents.dim() == 5; +} + +void preview_latent_video(uint8_t* buffer, const sd::Tensor& latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) { + uint32_t latent_width = static_cast(latents.shape()[0]); + uint32_t latent_height = static_cast(latents.shape()[1]); + bool is_video = preview_latent_tensor_is_video(latents); + uint32_t frames = is_video ? static_cast(latents.shape()[2]) : 1; + uint32_t dim = is_video ? 
static_cast(latents.shape()[3]) : static_cast(latents.shape()[2]); + + uint32_t rgb_width = latent_width * patch_size; + uint32_t rgb_height = latent_height * patch_size; + uint32_t unpatched_dim = dim / (patch_size * patch_size); + + for (uint32_t k = 0; k < frames; k++) { + for (uint32_t rgb_x = 0; rgb_x < rgb_width; rgb_x++) { + for (uint32_t rgb_y = 0; rgb_y < rgb_height; rgb_y++) { + uint32_t latent_x = rgb_x / patch_size; + uint32_t latent_y = rgb_y / patch_size; + + uint32_t channel_offset = 0; + if (patch_size > 1) { + channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size)); + } + + size_t pixel_id = k * rgb_width * rgb_height + rgb_y * rgb_width + rgb_x; + auto latent_value = [&](uint32_t latent_channel) -> float { + return is_video + ? latents.values()[latent_x + latent_width * (latent_y + latent_height * (k + frames * latent_channel))] + : latents.values()[latent_x + latent_width * (latent_y + latent_height * latent_channel)]; + }; + + float r = 0.f, g = 0.f, b = 0.f; + if (latent_rgb_proj != nullptr) { + for (uint32_t d = 0; d < unpatched_dim; d++) { + uint32_t latent_channel = d * patch_size * patch_size + channel_offset; + float value = latent_value(latent_channel); + r += value * latent_rgb_proj[d][0]; + g += value * latent_rgb_proj[d][1]; + b += value * latent_rgb_proj[d][2]; + } + } else { + r = latent_value(0); + g = latent_value(1); + b = latent_value(2); + } + if (latent_rgb_bias != nullptr) { + r += latent_rgb_bias[0]; + g += latent_rgb_bias[1]; + b += latent_rgb_bias[2]; + } + r = std::min(1.0f, std::max(0.0f, r * .5f + .5f)); + g = std::min(1.0f, std::max(0.0f, g * .5f + .5f)); + b = std::min(1.0f, std::max(0.0f, b * .5f + .5f)); + + buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255); + buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255); + buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255); + } + } + } +} diff --git a/src/llm.hpp b/src/llm.hpp index 5a9c25c..c6c2961 100644 --- a/src/llm.hpp +++ b/src/llm.hpp @@ -194,6 +194,7 @@ namespace 
LLM { bool padding = false) { if (add_bos_token) { tokens.insert(tokens.begin(), BOS_TOKEN_ID); + weights.insert(weights.begin(), 1.f); } if (max_length > 0 && padding) { size_t n = static_cast(std::ceil(tokens.size() * 1.f / max_length)); @@ -1180,16 +1181,17 @@ namespace LLM { return hidden_states; } - ggml_cgraph* build_graph(ggml_tensor* input_ids, - ggml_tensor* attention_mask, - std::vector> image_embeds, + ggml_cgraph* build_graph(const sd::Tensor& input_ids_tensor, + const sd::Tensor& attention_mask_tensor, + const std::vector>>& image_embeds_tensor, std::set out_layers) { - ggml_cgraph* gf = ggml_new_graph(compute_ctx); - - input_ids = to_backend(input_ids); - - for (auto& image_embed : image_embeds) { - image_embed.second = to_backend(image_embed.second); + ggml_cgraph* gf = ggml_new_graph(compute_ctx); + ggml_tensor* input_ids = make_input(input_ids_tensor); + std::vector> image_embeds; + image_embeds.reserve(image_embeds_tensor.size()); + for (const auto& [idx, embed_tensor] : image_embeds_tensor) { + ggml_tensor* embed = make_input(embed_tensor); + image_embeds.emplace_back(idx, embed); } int64_t n_tokens = input_ids->ne[0]; @@ -1213,8 +1215,9 @@ namespace LLM { input_pos_vec.size()); set_backend_tensor_data(input_pos, input_pos_vec.data()); - if (attention_mask != nullptr) { - attention_mask = to_backend(attention_mask); + ggml_tensor* attention_mask = nullptr; + if (!attention_mask_tensor.empty()) { + attention_mask = make_input(attention_mask_tensor); } else { attention_mask_vec.resize(n_tokens * n_tokens); for (int i0 = 0; i0 < n_tokens; i0++) { @@ -1239,17 +1242,15 @@ namespace LLM { return gf; } - bool compute(const int n_threads, - ggml_tensor* input_ids, - ggml_tensor* attention_mask, - std::vector> image_embeds, - std::set out_layers, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { + sd::Tensor compute(const int n_threads, + const sd::Tensor& input_ids, + const sd::Tensor& attention_mask, + const std::vector>>& image_embeds, + 
std::set out_layers) { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(input_ids, attention_mask, image_embeds, out_layers); }; - return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); + return take_or_empty(GGMLRunner::compute(get_graph, n_threads, true)); } int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) { @@ -1288,8 +1289,9 @@ namespace LLM { return image; } - ggml_cgraph* build_encode_image_graph(ggml_tensor* image) { - ggml_cgraph* gf = new_graph_custom(LLM_GRAPH_SIZE); + ggml_cgraph* build_encode_image_graph(const sd::Tensor& image_tensor) { + ggml_cgraph* gf = new_graph_custom(LLM_GRAPH_SIZE); + ggml_tensor* image = make_input(image_tensor); GGML_ASSERT(image->ne[1] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); GGML_ASSERT(image->ne[0] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); @@ -1301,8 +1303,6 @@ namespace LLM { int llm_grid_w = grid_w / params.vision.spatial_merge_size; int vit_merger_window_size = params.vision.window_size / params.vision.patch_size / params.vision.spatial_merge_size; - image = to_backend(image); - auto pixel_values = process_image(compute_ctx, image); // window index @@ -1411,14 +1411,12 @@ namespace LLM { return gf; } - void encode_image(const int n_threads, - ggml_tensor* image, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { + sd::Tensor encode_image(const int n_threads, + const sd::Tensor& image) { auto get_graph = [&]() -> ggml_cgraph* { return build_encode_image_graph(image); }; - GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return take_or_empty(GGMLRunner::compute(get_graph, n_threads, false)); } }; @@ -1497,39 +1495,41 @@ namespace LLM { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); bool test_mistral = false; bool test_qwen3 = 
true; bool test_vit = false; bool test_decoder_with_vit = false; if (test_decoder_with_vit) { - ggml_tensor* image_embed = nullptr; + sd::Tensor image_embed; { - auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin"); - print_ggml_tensor(image, false, "image"); - ggml_tensor* out = nullptr; + auto image = sd::load_tensor_from_file_as_tensor("qwen2vl_normalized.bin"); + print_sd_tensor(image, false, "image"); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - model.encode_image(8, image, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = model.encode_image(8, image); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out, false, "image_embed"); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out, false, "image_embed"); image_embed = out; LOG_DEBUG("llm encode_image test done in %lldms", t1 - t0); } std::string placeholder = "<|image_pad|>"; std::string img_prompt = "Picture 1: <|vision_start|>"; // [24669, 220, 16, 25, 220, 151652] - int64_t num_image_tokens = image_embed->ne[1]; + int64_t num_image_tokens = image_embed.shape()[1]; img_prompt.reserve(num_image_tokens * placeholder.size()); for (int i = 0; i < num_image_tokens; i++) { img_prompt += placeholder; } img_prompt += "<|vision_end|>"; - std::vector> image_embeds; + std::vector>> image_embeds; image_embeds.emplace_back(64, image_embed); std::pair prompt_attn_range; @@ -1547,29 +1547,33 @@ namespace LLM { printf("%d ", token); } printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - ggml_tensor* out = nullptr; + auto input_ids = sd::Tensor::from_vector(tokens); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - model.compute(8, input_ids, nullptr, image_embeds, {}, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = model.compute(8, input_ids, sd::Tensor(), image_embeds, {}); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + 
GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("llm test done in %lldms", t1 - t0); } else if (test_vit) { - // auto image = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 280, 280, 3); + // auto image = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 280, 280, 3); // ggml_set_f32(image, 0.f); - auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin"); - print_ggml_tensor(image, false, "image"); - ggml_tensor* out = nullptr; + auto image = sd::load_tensor_from_file_as_tensor("qwen2vl_normalized.bin"); + print_sd_tensor(image, false, "image"); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - model.encode_image(8, image, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = model.encode_image(8, image); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out, false, "out"); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out, false, "out"); - // auto ref_out = load_tensor_from_file(work_ctx, "qwen2vl.bin"); + // auto ref_out = load_tensor_from_file(ctx, "qwen2vl.bin"); // ggml_ext_tensor_diff(ref_out, out, 0.01f); LOG_DEBUG("llm test done in %lldms", t1 - t0); @@ -1587,14 +1591,16 @@ namespace LLM { printf("%d ", token); } printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - ggml_tensor* out = nullptr; + auto input_ids = sd::Tensor::from_vector(tokens); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - model.compute(8, input_ids, nullptr, {}, {10, 20, 30}, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = model.compute(8, input_ids, sd::Tensor(), {}, {10, 20, 30}); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("llm test done in %lldms", t1 - t0); } else if (test_qwen3) { std::pair prompt_attn_range; @@ -1610,14 +1616,16 @@ namespace LLM { printf("%d ", token); } 
printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - ggml_tensor* out = nullptr; + auto input_ids = sd::Tensor::from_vector(tokens); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - model.compute(8, input_ids, nullptr, {}, {35}, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = model.compute(8, input_ids, sd::Tensor(), {}, {35}); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("llm test done in %lldms", t1 - t0); } else { std::pair prompt_attn_range; @@ -1633,14 +1641,16 @@ namespace LLM { printf("%d ", token); } printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - ggml_tensor* out = nullptr; + auto input_ids = sd::Tensor::from_vector(tokens); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - model.compute(8, input_ids, nullptr, {}, {}, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = model.compute(8, input_ids, sd::Tensor(), {}, {}); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("llm test done in %lldms", t1 - t0); } } diff --git a/src/lora.hpp b/src/lora.hpp index 7df04ea..d4a749e 100644 --- a/src/lora.hpp +++ b/src/lora.hpp @@ -792,7 +792,7 @@ struct LoraModel : public GGMLRunner { auto get_graph = [&]() -> ggml_cgraph* { return build_lora_graph(model_tensors, version); }; - GGMLRunner::compute(get_graph, n_threads, false); + GGMLRunner::compute(get_graph, n_threads, false, true); stat(); for (auto item : original_tensor_to_final_tensor) { ggml_tensor* original_tensor = item.first; diff --git a/src/mmdit.hpp b/src/mmdit.hpp index 7fbb2b2..e75736c 100644 --- a/src/mmdit.hpp +++ b/src/mmdit.hpp @@ -836,17 +836,17 @@ struct MMDiTRunner : public GGMLRunner { mmdit.get_param_tensors(tensors, prefix); } - 
ggml_cgraph* build_graph(ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* y, - std::vector skip_layers = std::vector()) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor = {}, + const sd::Tensor& y_tensor = {}, + std::vector skip_layers = std::vector()) { ggml_cgraph* gf = new_graph_custom(MMDIT_GRAPH_SIZE); - x = to_backend(x); - context = to_backend(context); - y = to_backend(y); - timesteps = to_backend(timesteps); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); + ggml_tensor* context = make_optional_input(context_tensor); + ggml_tensor* y = make_optional_input(y_tensor); auto runner_ctx = get_context(); ggml_tensor* out = mmdit.forward(&runner_ctx, @@ -861,14 +861,12 @@ struct MMDiTRunner : public GGMLRunner { return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* y, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr, - std::vector skip_layers = std::vector()) { + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context = {}, + const sd::Tensor& y = {}, + std::vector skip_layers = std::vector()) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size]([N, 154, 4096]) or [1, max_position, hidden_size] @@ -877,7 +875,7 @@ struct MMDiTRunner : public GGMLRunner { return build_graph(x, timesteps, context, y, skip_layers); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); } void test() { @@ -886,35 +884,41 @@ struct MMDiTRunner : public GGMLRunner { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + 
ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); { // cpu f16: pass // cpu f32: pass // cuda f16: pass // cuda f32: pass - auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 128, 128, 16, 1); + sd::Tensor x({128, 128, 16, 1}); std::vector timesteps_vec(1, 999.f); - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); - ggml_set_f32(x, 0.01f); + auto timesteps = sd::Tensor::from_vector(timesteps_vec); + x.fill_(0.01f); // print_ggml_tensor(x); - auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 4096, 154, 1); - ggml_set_f32(context, 0.01f); + sd::Tensor context({4096, 154, 1}); + context.fill_(0.01f); // print_ggml_tensor(context); - auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 2048, 1); - ggml_set_f32(y, 0.01f); + sd::Tensor y({2048, 1}); + y.fill_(0.01f); // print_ggml_tensor(y); - ggml_tensor* out = nullptr; + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - compute(8, x, timesteps, context, y, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = compute(8, + x, + timesteps, + context, + y); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("mmdit test done in %lldms", t1 - t0); } } diff --git a/src/pmid.hpp b/src/pmid.hpp index 30c4732..f19a8c3 100644 --- a/src/pmid.hpp +++ b/src/pmid.hpp @@ -443,11 +443,10 @@ public: id_encoder2.get_param_tensors(tensors, prefix); } - ggml_cgraph* build_graph( // ggml_allocr* allocr, - ggml_tensor* id_pixel_values, - ggml_tensor* prompt_embeds, - std::vector& class_tokens_mask, - ggml_tensor* id_embeds) { + ggml_cgraph* build_graph(const sd::Tensor& id_pixel_values_tensor, + const sd::Tensor& prompt_embeds_tensor, + std::vector& class_tokens_mask, + const sd::Tensor& id_embeds_tensor = {}) { ctm.clear(); ctmf16.clear(); ctmpos.clear(); @@ -460,16 +459,16 @@ public: ggml_cgraph* gf = ggml_new_graph(compute_ctx); + ggml_tensor* 
id_pixel_values = make_input(id_pixel_values_tensor); + ggml_tensor* prompt_embeds = make_input(prompt_embeds_tensor); + ggml_tensor* id_embeds = make_optional_input(id_embeds_tensor); + int64_t hidden_size = prompt_embeds->ne[0]; int64_t seq_length = prompt_embeds->ne[1]; ggml_type type = GGML_TYPE_F32; ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(runner_ctx.ggml_ctx, type, class_tokens_mask.size()); - ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values); - ggml_tensor* prompt_embeds_d = to_backend(prompt_embeds); - ggml_tensor* id_embeds_d = to_backend(id_embeds); - ggml_tensor* left = nullptr; ggml_tensor* right = nullptr; for (int i = 0; i < class_tokens_mask.size(); i++) { @@ -529,18 +528,18 @@ public: ggml_tensor* updated_prompt_embeds = nullptr; if (pm_version == PM_VERSION_1) updated_prompt_embeds = id_encoder.forward(&runner_ctx, - id_pixel_values_d, - prompt_embeds_d, + id_pixel_values, + prompt_embeds, class_tokens_mask_d, class_tokens_mask_pos, left, right); else if (pm_version == PM_VERSION_2) updated_prompt_embeds = id_encoder2.forward(&runner_ctx, - id_pixel_values_d, - prompt_embeds_d, + id_pixel_values, + prompt_embeds, class_tokens_mask_d, class_tokens_mask_pos, - id_embeds_d, + id_embeds, left, right); ggml_build_forward_expand(gf, updated_prompt_embeds); @@ -548,20 +547,16 @@ public: return gf; } - bool compute(const int n_threads, - ggml_tensor* id_pixel_values, - ggml_tensor* prompt_embeds, - ggml_tensor* id_embeds, - std::vector& class_tokens_mask, - ggml_tensor** updated_prompt_embeds, - ggml_context* output_ctx) { + sd::Tensor compute(const int n_threads, + const sd::Tensor& id_pixel_values, + const sd::Tensor& prompt_embeds, + const sd::Tensor& id_embeds, + std::vector& class_tokens_mask) { auto get_graph = [&]() -> ggml_cgraph* { - // return build_graph(compute_allocr, id_pixel_values, prompt_embeds, class_tokens_mask); return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds); }; - // 
GGMLRunner::compute(get_graph, n_threads, updated_prompt_embeds); - return GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx); + return take_or_empty(GGMLRunner::compute(get_graph, n_threads, true)); } }; diff --git a/src/preprocessing.hpp b/src/preprocessing.hpp index ca05ca2..7c83a28 100644 --- a/src/preprocessing.hpp +++ b/src/preprocessing.hpp @@ -1,179 +1,241 @@ #ifndef __PREPROCESSING_HPP__ #define __PREPROCESSING_HPP__ +#include +#include + #include "ggml_extend.hpp" + #define M_PI_ 3.14159265358979323846f -void convolve(ggml_tensor* input, ggml_tensor* output, ggml_tensor* kernel, int padding) { - ggml_init_params params; - params.mem_size = 80 * input->ne[0] * input->ne[1]; // 20M for 512x512 - params.mem_buffer = nullptr; - params.no_alloc = false; - ggml_context* ctx0 = ggml_init(params); - ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1); - ggml_fp32_to_fp16_row((float*)kernel->data, (ggml_fp16_t*)kernel_fp16->data, ggml_nelements(kernel)); - ggml_tensor* h = ggml_conv_2d(ctx0, kernel_fp16, input, 1, 1, padding, padding, 1, 1); - ggml_cgraph* gf = ggml_new_graph(ctx0); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, h, output)); - ggml_graph_compute_with_ctx(ctx0, gf, 1); - ggml_free(ctx0); +static inline int64_t preprocessing_offset_4d(const sd::Tensor& tensor, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) { + const auto& shape = tensor.shape(); + int64_t n0 = shape.size() > 0 ? shape[0] : 1; + int64_t n1 = shape.size() > 1 ? shape[1] : 1; + int64_t n2 = shape.size() > 2 ? 
shape[2] : 1; + return ((i3 * n2 + i2) * n1 + i1) * n0 + i0; } -void gaussian_kernel(ggml_tensor* kernel) { - int ks_mid = static_cast(kernel->ne[0] / 2); +static inline float preprocessing_get_4d(const sd::Tensor& tensor, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) { + return tensor.values()[static_cast(preprocessing_offset_4d(tensor, i0, i1, i2, i3))]; +} + +static inline void preprocessing_set_4d(sd::Tensor& tensor, float value, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) { + tensor.values()[static_cast(preprocessing_offset_4d(tensor, i0, i1, i2, i3))] = value; +} + +static inline sd::Tensor sd_image_to_preprocessing_tensor(sd_image_t image) { + sd::Tensor tensor({static_cast(image.width), static_cast(image.height), static_cast(image.channel), 1}); + for (uint32_t y = 0; y < image.height; ++y) { + for (uint32_t x = 0; x < image.width; ++x) { + for (uint32_t c = 0; c < image.channel; ++c) { + preprocessing_set_4d(tensor, sd_image_get_f32(image, x, y, c), x, y, c, 0); + } + } + } + return tensor; +} + +static inline void preprocessing_tensor_to_sd_image(const sd::Tensor& tensor, uint8_t* image_data) { + GGML_ASSERT(tensor.dim() == 4); + GGML_ASSERT(tensor.shape()[3] == 1); + GGML_ASSERT(image_data != nullptr); + + int width = static_cast(tensor.shape()[0]); + int height = static_cast(tensor.shape()[1]); + int channel = static_cast(tensor.shape()[2]); + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < channel; ++c) { + float value = preprocessing_get_4d(tensor, x, y, c, 0); + value = std::min(1.0f, std::max(0.0f, value)); + image_data[(y * width + x) * channel + c] = static_cast(std::round(value * 255.0f)); + } + } + } +} + +static inline sd::Tensor gaussian_kernel_tensor(int kernel_size) { + sd::Tensor kernel({kernel_size, kernel_size, 1, 1}); + int ks_mid = kernel_size / 2; float sigma = 1.4f; - float normal = 1.f / (2.0f * M_PI_ * powf(sigma, 2.0f)); - for (int y = 0; y < kernel->ne[0]; 
y++) { + float normal = 1.f / (2.0f * M_PI_ * std::pow(sigma, 2.0f)); + for (int y = 0; y < kernel_size; ++y) { float gx = static_cast(-ks_mid + y); - for (int x = 0; x < kernel->ne[1]; x++) { + for (int x = 0; x < kernel_size; ++x) { float gy = static_cast(-ks_mid + x); - float k_ = expf(-((gx * gx + gy * gy) / (2.0f * powf(sigma, 2.0f)))) * normal; - ggml_ext_tensor_set_f32(kernel, k_, x, y); + float k = std::exp(-((gx * gx + gy * gy) / (2.0f * std::pow(sigma, 2.0f)))) * normal; + preprocessing_set_4d(kernel, k, x, y, 0, 0); } } + return kernel; } -void grayscale(ggml_tensor* rgb_img, ggml_tensor* grayscale) { - for (int iy = 0; iy < rgb_img->ne[1]; iy++) { - for (int ix = 0; ix < rgb_img->ne[0]; ix++) { - float r = ggml_ext_tensor_get_f32(rgb_img, ix, iy); - float g = ggml_ext_tensor_get_f32(rgb_img, ix, iy, 1); - float b = ggml_ext_tensor_get_f32(rgb_img, ix, iy, 2); +static inline sd::Tensor convolve_tensor(const sd::Tensor& input, const sd::Tensor& kernel, int padding) { + GGML_ASSERT(input.dim() == 4); + GGML_ASSERT(kernel.dim() == 4); + GGML_ASSERT(input.shape()[3] == 1); + GGML_ASSERT(kernel.shape()[2] == 1); + GGML_ASSERT(kernel.shape()[3] == 1); + + sd::Tensor output(input.shape()); + int64_t width = input.shape()[0]; + int64_t height = input.shape()[1]; + int64_t channels = input.shape()[2]; + int64_t kernel_w = kernel.shape()[0]; + int64_t kernel_h = kernel.shape()[1]; + + for (int64_t c = 0; c < channels; ++c) { + for (int64_t y = 0; y < height; ++y) { + for (int64_t x = 0; x < width; ++x) { + float sum = 0.0f; + for (int64_t ky = 0; ky < kernel_h; ++ky) { + int64_t iy = y + ky - padding; + if (iy < 0 || iy >= height) { + continue; + } + for (int64_t kx = 0; kx < kernel_w; ++kx) { + int64_t ix = x + kx - padding; + if (ix < 0 || ix >= width) { + continue; + } + sum += preprocessing_get_4d(input, ix, iy, c, 0) * preprocessing_get_4d(kernel, kx, ky, 0, 0); + } + } + preprocessing_set_4d(output, sum, x, y, c, 0); + } + } + } + return output; +} + +static 
inline sd::Tensor grayscale_tensor(const sd::Tensor& rgb_img) { + GGML_ASSERT(rgb_img.dim() == 4); + GGML_ASSERT(rgb_img.shape()[2] >= 3); + sd::Tensor grayscale({rgb_img.shape()[0], rgb_img.shape()[1], 1, rgb_img.shape()[3]}); + for (int64_t iy = 0; iy < rgb_img.shape()[1]; ++iy) { + for (int64_t ix = 0; ix < rgb_img.shape()[0]; ++ix) { + float r = preprocessing_get_4d(rgb_img, ix, iy, 0, 0); + float g = preprocessing_get_4d(rgb_img, ix, iy, 1, 0); + float b = preprocessing_get_4d(rgb_img, ix, iy, 2, 0); float gray = 0.2989f * r + 0.5870f * g + 0.1140f * b; - ggml_ext_tensor_set_f32(grayscale, gray, ix, iy); + preprocessing_set_4d(grayscale, gray, ix, iy, 0, 0); } } + return grayscale; } -void prop_hypot(ggml_tensor* x, ggml_tensor* y, ggml_tensor* h) { - int n_elements = static_cast(ggml_nelements(h)); - float* dx = (float*)x->data; - float* dy = (float*)y->data; - float* dh = (float*)h->data; - for (int i = 0; i < n_elements; i++) { - dh[i] = sqrtf(dx[i] * dx[i] + dy[i] * dy[i]); +static inline sd::Tensor tensor_hypot(const sd::Tensor& x, const sd::Tensor& y) { + sd::tensor_check_same_shape(x, y); + sd::Tensor out(x.shape()); + for (int64_t i = 0; i < out.numel(); ++i) { + out[i] = std::sqrt(x[i] * x[i] + y[i] * y[i]); } + return out; } -void prop_arctan2(ggml_tensor* x, ggml_tensor* y, ggml_tensor* h) { - int n_elements = static_cast(ggml_nelements(h)); - float* dx = (float*)x->data; - float* dy = (float*)y->data; - float* dh = (float*)h->data; - for (int i = 0; i < n_elements; i++) { - dh[i] = atan2f(dy[i], dx[i]); +static inline sd::Tensor tensor_arctan2(const sd::Tensor& x, const sd::Tensor& y) { + sd::tensor_check_same_shape(x, y); + sd::Tensor out(x.shape()); + for (int64_t i = 0; i < out.numel(); ++i) { + out[i] = std::atan2(y[i], x[i]); } + return out; } -void normalize_tensor(ggml_tensor* g) { - int n_elements = static_cast(ggml_nelements(g)); - float* dg = (float*)g->data; - float max = -INFINITY; - for (int i = 0; i < n_elements; i++) { - max = dg[i] 
> max ? dg[i] : max; +static inline void normalize_tensor(sd::Tensor* g) { + GGML_ASSERT(g != nullptr); + if (g->empty()) { + return; } - max = 1.0f / max; - for (int i = 0; i < n_elements; i++) { - dg[i] *= max; + float max_value = -std::numeric_limits::infinity(); + for (int64_t i = 0; i < g->numel(); ++i) { + max_value = std::max(max_value, (*g)[i]); } + if (max_value == 0.0f || !std::isfinite(max_value)) { + return; + } + *g *= (1.0f / max_value); } -void non_max_supression(ggml_tensor* result, ggml_tensor* G, ggml_tensor* D) { - for (int iy = 1; iy < result->ne[1] - 1; iy++) { - for (int ix = 1; ix < result->ne[0] - 1; ix++) { - float angle = ggml_ext_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_; - angle = angle < 0.0f ? angle += 180.0f : angle; +static inline sd::Tensor non_max_supression(const sd::Tensor& G, const sd::Tensor& D) { + GGML_ASSERT(G.shape() == D.shape()); + sd::Tensor result = sd::Tensor::zeros(G.shape()); + for (int64_t iy = 1; iy < result.shape()[1] - 1; ++iy) { + for (int64_t ix = 1; ix < result.shape()[0] - 1; ++ix) { + float angle = preprocessing_get_4d(D, ix, iy, 0, 0) * 180.0f / M_PI_; + angle = angle < 0.0f ? 
angle + 180.0f : angle; float q = 1.0f; float r = 1.0f; - // angle 0 - if ((0 >= angle && angle < 22.5f) || (157.5f >= angle && angle <= 180)) { - q = ggml_ext_tensor_get_f32(G, ix, iy + 1); - r = ggml_ext_tensor_get_f32(G, ix, iy - 1); - } - // angle 45 - else if (22.5f >= angle && angle < 67.5f) { - q = ggml_ext_tensor_get_f32(G, ix + 1, iy - 1); - r = ggml_ext_tensor_get_f32(G, ix - 1, iy + 1); - } - // angle 90 - else if (67.5f >= angle && angle < 112.5) { - q = ggml_ext_tensor_get_f32(G, ix + 1, iy); - r = ggml_ext_tensor_get_f32(G, ix - 1, iy); - } - // angle 135 - else if (112.5 >= angle && angle < 157.5f) { - q = ggml_ext_tensor_get_f32(G, ix - 1, iy - 1); - r = ggml_ext_tensor_get_f32(G, ix + 1, iy + 1); + if ((0 >= angle && angle < 22.5f) || (157.5f >= angle && angle <= 180.0f)) { + q = preprocessing_get_4d(G, ix, iy + 1, 0, 0); + r = preprocessing_get_4d(G, ix, iy - 1, 0, 0); + } else if (22.5f >= angle && angle < 67.5f) { + q = preprocessing_get_4d(G, ix + 1, iy - 1, 0, 0); + r = preprocessing_get_4d(G, ix - 1, iy + 1, 0, 0); + } else if (67.5f >= angle && angle < 112.5f) { + q = preprocessing_get_4d(G, ix + 1, iy, 0, 0); + r = preprocessing_get_4d(G, ix - 1, iy, 0, 0); + } else if (112.5f >= angle && angle < 157.5f) { + q = preprocessing_get_4d(G, ix - 1, iy - 1, 0, 0); + r = preprocessing_get_4d(G, ix + 1, iy + 1, 0, 0); } - float cur = ggml_ext_tensor_get_f32(G, ix, iy); - if ((cur >= q) && (cur >= r)) { - ggml_ext_tensor_set_f32(result, cur, ix, iy); - } else { - ggml_ext_tensor_set_f32(result, 0.0f, ix, iy); - } + float cur = preprocessing_get_4d(G, ix, iy, 0, 0); + preprocessing_set_4d(result, (cur >= q && cur >= r) ? 
cur : 0.0f, ix, iy, 0, 0); } } + return result; } -void threshold_hystersis(ggml_tensor* img, float high_threshold, float low_threshold, float weak, float strong) { - int n_elements = static_cast(ggml_nelements(img)); - float* imd = (float*)img->data; - float max = -INFINITY; - for (int i = 0; i < n_elements; i++) { - max = imd[i] > max ? imd[i] : max; +static inline void threshold_hystersis(sd::Tensor* img, float high_threshold, float low_threshold, float weak, float strong) { + GGML_ASSERT(img != nullptr); + if (img->empty()) { + return; } - float ht = max * high_threshold; + float max_value = -std::numeric_limits::infinity(); + for (int64_t i = 0; i < img->numel(); ++i) { + max_value = std::max(max_value, (*img)[i]); + } + + float ht = max_value * high_threshold; float lt = ht * low_threshold; - for (int i = 0; i < n_elements; i++) { - float img_v = imd[i]; - if (img_v >= ht) { // strong pixel - imd[i] = strong; - } else if (img_v <= ht && img_v >= lt) { // strong pixel - imd[i] = weak; + for (int64_t i = 0; i < img->numel(); ++i) { + float img_v = (*img)[i]; + if (img_v >= ht) { + (*img)[i] = strong; + } else if (img_v <= ht && img_v >= lt) { + (*img)[i] = weak; } } - for (int iy = 0; iy < img->ne[1]; iy++) { - for (int ix = 0; ix < img->ne[0]; ix++) { - if (ix >= 3 && ix <= img->ne[0] - 3 && iy >= 3 && iy <= img->ne[1] - 3) { - ggml_ext_tensor_set_f32(img, ggml_ext_tensor_get_f32(img, ix, iy), ix, iy); - } else { - ggml_ext_tensor_set_f32(img, 0.0f, ix, iy); + for (int64_t iy = 0; iy < img->shape()[1]; ++iy) { + for (int64_t ix = 0; ix < img->shape()[0]; ++ix) { + if (!(ix >= 3 && ix <= img->shape()[0] - 3 && iy >= 3 && iy <= img->shape()[1] - 3)) { + preprocessing_set_4d(*img, 0.0f, ix, iy, 0, 0); } } } - // hysteresis - for (int iy = 1; iy < img->ne[1] - 1; iy++) { - for (int ix = 1; ix < img->ne[0] - 1; ix++) { - float imd_v = ggml_ext_tensor_get_f32(img, ix, iy); + for (int64_t iy = 1; iy < img->shape()[1] - 1; ++iy) { + for (int64_t ix = 1; ix < 
img->shape()[0] - 1; ++ix) { + float imd_v = preprocessing_get_4d(*img, ix, iy, 0, 0); if (imd_v == weak) { - if (ggml_ext_tensor_get_f32(img, ix + 1, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix + 1, iy) == strong || - ggml_ext_tensor_get_f32(img, ix, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix, iy + 1) == strong || - ggml_ext_tensor_get_f32(img, ix - 1, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix - 1, iy) == strong) { - ggml_ext_tensor_set_f32(img, strong, ix, iy); - } else { - ggml_ext_tensor_set_f32(img, 0.0f, ix, iy); - } + bool has_strong_neighbor = + preprocessing_get_4d(*img, ix + 1, iy - 1, 0, 0) == strong || + preprocessing_get_4d(*img, ix + 1, iy, 0, 0) == strong || + preprocessing_get_4d(*img, ix, iy - 1, 0, 0) == strong || + preprocessing_get_4d(*img, ix, iy + 1, 0, 0) == strong || + preprocessing_get_4d(*img, ix - 1, iy - 1, 0, 0) == strong || + preprocessing_get_4d(*img, ix - 1, iy, 0, 0) == strong; + preprocessing_set_4d(*img, has_strong_neighbor ? 
strong : 0.0f, ix, iy, 0, 0); } } } } bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) { - ggml_init_params params; - params.mem_size = static_cast(40 * img.width * img.height); // 10MB for 512x512 - params.mem_buffer = nullptr; - params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - - if (!work_ctx) { - LOG_ERROR("ggml_init() failed"); - return false; - } - float kX[9] = { -1, 0, 1, -2, 0, 2, @@ -184,43 +246,33 @@ bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, 0, 0, 0, -1, -2, -1}; - // generate kernel - int kernel_size = 5; - ggml_tensor* gkernel = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kernel_size, kernel_size, 1, 1); - ggml_tensor* sf_kx = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1); - memcpy(sf_kx->data, kX, ggml_nbytes(sf_kx)); - ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1); - memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky)); - gaussian_kernel(gkernel); - ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 3, 1); - ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 1, 1); - ggml_tensor* iX = ggml_dup_tensor(work_ctx, image_gray); - ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray); - ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray); - ggml_tensor* tetha = ggml_dup_tensor(work_ctx, image_gray); - sd_image_to_ggml_tensor(img, image); - grayscale(image, image_gray); - convolve(image_gray, image_gray, gkernel, 2); - convolve(image_gray, iX, sf_kx, 1); - convolve(image_gray, iY, sf_ky, 1); - prop_hypot(iX, iY, G); - normalize_tensor(G); - prop_arctan2(iX, iY, tetha); - non_max_supression(image_gray, G, tetha); - threshold_hystersis(image_gray, high_threshold, low_threshold, weak, strong); - // to RGB channels - for (uint32_t iy = 0; iy < img.height; iy++) { - for (uint32_t ix = 0; ix < img.width; ix++) { - 
float gray = ggml_ext_tensor_get_f32(image_gray, ix, iy); + sd::Tensor gkernel = gaussian_kernel_tensor(5); + sd::Tensor sf_kx({3, 3, 1, 1}, std::vector(kX, kX + 9)); + sd::Tensor sf_ky({3, 3, 1, 1}, std::vector(kY, kY + 9)); + + sd::Tensor image = sd_image_to_preprocessing_tensor(img); + sd::Tensor image_gray = grayscale_tensor(image); + image_gray = convolve_tensor(image_gray, gkernel, 2); + sd::Tensor iX = convolve_tensor(image_gray, sf_kx, 1); + sd::Tensor iY = convolve_tensor(image_gray, sf_ky, 1); + sd::Tensor G = tensor_hypot(iX, iY); + normalize_tensor(&G); + sd::Tensor theta = tensor_arctan2(iX, iY); + image_gray = non_max_supression(G, theta); + threshold_hystersis(&image_gray, high_threshold, low_threshold, weak, strong); + + for (uint32_t iy = 0; iy < img.height; ++iy) { + for (uint32_t ix = 0; ix < img.width; ++ix) { + float gray = preprocessing_get_4d(image_gray, ix, iy, 0, 0); gray = inverse ? 1.0f - gray : gray; - ggml_ext_tensor_set_f32(image, gray, ix, iy); - ggml_ext_tensor_set_f32(image, gray, ix, iy, 1); - ggml_ext_tensor_set_f32(image, gray, ix, iy, 2); + for (uint32_t c = 0; c < img.channel; ++c) { + preprocessing_set_4d(image, gray, ix, iy, c, 0); + } } } - ggml_tensor_to_sd_image(image, img.data); - ggml_free(work_ctx); + + preprocessing_tensor_to_sd_image(image, img.data); return true; } -#endif // __PREPROCESSING_HPP__ \ No newline at end of file +#endif // __PREPROCESSING_HPP__ diff --git a/src/qwen_image.hpp b/src/qwen_image.hpp index 68af0e8..83c8cec 100644 --- a/src/qwen_image.hpp +++ b/src/qwen_image.hpp @@ -525,20 +525,21 @@ namespace Qwen { qwen_image.get_param_tensors(tensors, prefix); } - ggml_cgraph* build_graph(ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - std::vector ref_latents = {}, - bool increase_ref_index = false) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor, + const std::vector>& ref_latents_tensor = {}, + bool 
increase_ref_index = false) { + ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); GGML_ASSERT(x->ne[3] == 1); - ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE); - - x = to_backend(x); - context = to_backend(context); - timesteps = to_backend(timesteps); - - for (int i = 0; i < ref_latents.size(); i++) { - ref_latents[i] = to_backend(ref_latents[i]); + GGML_ASSERT(!context_tensor.empty()); + ggml_tensor* context = make_input(context_tensor); + std::vector ref_latents; + ref_latents.reserve(ref_latents_tensor.size()); + for (const auto& ref_latent_tensor : ref_latents_tensor) { + ref_latents.push_back(make_input(ref_latent_tensor)); } pe_vec = Rope::gen_qwen_image_pe(static_cast(x->ne[1]), @@ -600,14 +601,12 @@ namespace Qwen { return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - std::vector ref_latents = {}, - bool increase_ref_index = false, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) { + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context, + const std::vector>& ref_latents = {}, + bool increase_ref_index = false) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size] @@ -615,7 +614,7 @@ namespace Qwen { return build_graph(x, timesteps, context, ref_latents, increase_ref_index); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); } void test() { @@ -624,30 +623,37 @@ namespace Qwen { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); { - // auto x = ggml_new_tensor_4d(work_ctx, 
GGML_TYPE_F32, 16, 16, 16, 1); + // auto x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 16, 1); // ggml_set_f32(x, 0.01f); - auto x = load_tensor_from_file(work_ctx, "./qwen_image_x.bin"); - print_ggml_tensor(x); + auto x = sd::load_tensor_from_file_as_tensor("./qwen_image_x.bin"); + print_sd_tensor(x); std::vector timesteps_vec(1, 1000.f); - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); + auto timesteps = sd::Tensor::from_vector(timesteps_vec); - // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 3584, 256, 1); + // auto context = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3584, 256, 1); // ggml_set_f32(context, 0.01f); - auto context = load_tensor_from_file(work_ctx, "./qwen_image_context.bin"); - print_ggml_tensor(context); + auto context = sd::load_tensor_from_file_as_tensor("./qwen_image_context.bin"); + print_sd_tensor(context); - ggml_tensor* out = nullptr; + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - compute(8, x, timesteps, context, {}, false, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = compute(8, + x, + timesteps, + context, + {}, + false); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("qwen_image test done in %lldms", t1 - t0); } } diff --git a/src/sample-cache.cpp b/src/sample-cache.cpp new file mode 100644 index 0000000..5739178 --- /dev/null +++ b/src/sample-cache.cpp @@ -0,0 +1,361 @@ +#include "sample-cache.h" + +namespace sd_sample { + + static float get_cache_reuse_threshold(const sd_cache_params_t& params) { + float reuse_threshold = params.reuse_threshold; + if (reuse_threshold == INFINITY) { + if (params.mode == SD_CACHE_EASYCACHE) { + reuse_threshold = 0.2f; + } else if (params.mode == SD_CACHE_UCACHE) { + reuse_threshold = 1.0f; + } + } + return std::max(0.0f, reuse_threshold); + } + + bool SampleCacheRuntime::easycache_enabled() const { + return mode 
== SampleCacheMode::EASYCACHE; + } + + bool SampleCacheRuntime::ucache_enabled() const { + return mode == SampleCacheMode::UCACHE; + } + + bool SampleCacheRuntime::cachedit_enabled() const { + return mode == SampleCacheMode::CACHEDIT; + } + + static bool has_valid_cache_percent_range(const sd_cache_params_t& cache_params) { + if (cache_params.mode != SD_CACHE_EASYCACHE && cache_params.mode != SD_CACHE_UCACHE) { + return true; + } + + return cache_params.start_percent >= 0.0f && + cache_params.start_percent < 1.0f && + cache_params.end_percent > 0.0f && + cache_params.end_percent <= 1.0f && + cache_params.start_percent < cache_params.end_percent; + } + + static void init_easycache_runtime(SampleCacheRuntime& runtime, + SDVersion version, + const sd_cache_params_t& cache_params, + Denoiser* denoiser) { + if (!sd_version_is_dit(version)) { + LOG_WARN("EasyCache requested but not supported for this model type"); + return; + } + + EasyCacheConfig config; + config.enabled = true; + config.reuse_threshold = get_cache_reuse_threshold(cache_params); + config.start_percent = cache_params.start_percent; + config.end_percent = cache_params.end_percent; + + runtime.easycache.init(config, denoiser); + if (!runtime.easycache.enabled()) { + LOG_WARN("EasyCache requested but could not be initialized for this run"); + return; + } + + runtime.mode = SampleCacheMode::EASYCACHE; + LOG_INFO("EasyCache enabled - threshold: %.3f, start: %.2f, end: %.2f", + config.reuse_threshold, + config.start_percent, + config.end_percent); + } + + static void init_ucache_runtime(SampleCacheRuntime& runtime, + SDVersion version, + const sd_cache_params_t& cache_params, + Denoiser* denoiser, + const std::vector& sigmas) { + if (!sd_version_is_unet(version)) { + LOG_WARN("UCache requested but not supported for this model type (only UNET models)"); + return; + } + + UCacheConfig config; + config.enabled = true; + config.reuse_threshold = get_cache_reuse_threshold(cache_params); + config.start_percent = 
cache_params.start_percent; + config.end_percent = cache_params.end_percent; + config.error_decay_rate = std::max(0.0f, std::min(1.0f, cache_params.error_decay_rate)); + config.use_relative_threshold = cache_params.use_relative_threshold; + config.reset_error_on_compute = cache_params.reset_error_on_compute; + + runtime.ucache.init(config, denoiser); + if (!runtime.ucache.enabled()) { + LOG_WARN("UCache requested but could not be initialized for this run"); + return; + } + + runtime.ucache.set_sigmas(sigmas); + runtime.mode = SampleCacheMode::UCACHE; + LOG_INFO("UCache enabled - threshold: %.3f, start: %.2f, end: %.2f, decay: %.2f, relative: %s, reset: %s", + config.reuse_threshold, + config.start_percent, + config.end_percent, + config.error_decay_rate, + config.use_relative_threshold ? "true" : "false", + config.reset_error_on_compute ? "true" : "false"); + } + + static void init_cachedit_runtime(SampleCacheRuntime& runtime, + SDVersion version, + const sd_cache_params_t& cache_params, + const std::vector& sigmas) { + if (!sd_version_is_dit(version)) { + LOG_WARN("CacheDIT requested but not supported for this model type (only DiT models)"); + return; + } + + DBCacheConfig dbcfg; + dbcfg.enabled = (cache_params.mode == SD_CACHE_DBCACHE || cache_params.mode == SD_CACHE_CACHE_DIT); + dbcfg.Fn_compute_blocks = cache_params.Fn_compute_blocks; + dbcfg.Bn_compute_blocks = cache_params.Bn_compute_blocks; + dbcfg.residual_diff_threshold = cache_params.residual_diff_threshold; + dbcfg.max_warmup_steps = cache_params.max_warmup_steps; + dbcfg.max_cached_steps = cache_params.max_cached_steps; + dbcfg.max_continuous_cached_steps = cache_params.max_continuous_cached_steps; + if (cache_params.scm_mask != nullptr && strlen(cache_params.scm_mask) > 0) { + dbcfg.steps_computation_mask = parse_scm_mask(cache_params.scm_mask); + } + dbcfg.scm_policy_dynamic = cache_params.scm_policy_dynamic; + + TaylorSeerConfig tcfg; + tcfg.enabled = (cache_params.mode == SD_CACHE_TAYLORSEER || 
cache_params.mode == SD_CACHE_CACHE_DIT); + tcfg.n_derivatives = cache_params.taylorseer_n_derivatives; + tcfg.skip_interval_steps = cache_params.taylorseer_skip_interval; + + runtime.cachedit.init(dbcfg, tcfg); + if (!runtime.cachedit.enabled()) { + LOG_WARN("CacheDIT requested but could not be initialized for this run"); + return; + } + + runtime.cachedit.set_sigmas(sigmas); + runtime.mode = SampleCacheMode::CACHEDIT; + LOG_INFO("CacheDIT enabled - mode: %s, Fn: %d, Bn: %d, threshold: %.3f, warmup: %d", + cache_params.mode == SD_CACHE_CACHE_DIT ? "DBCache+TaylorSeer" : (cache_params.mode == SD_CACHE_DBCACHE ? "DBCache" : "TaylorSeer"), + dbcfg.Fn_compute_blocks, + dbcfg.Bn_compute_blocks, + dbcfg.residual_diff_threshold, + dbcfg.max_warmup_steps); + } + + static void init_spectrum_runtime(SampleCacheRuntime& runtime, + SDVersion version, + const sd_cache_params_t& cache_params, + const std::vector& sigmas) { + if (!sd_version_is_unet(version) && !sd_version_is_dit(version)) { + LOG_WARN("Spectrum requested but not supported for this model type (only UNET and DiT models)"); + return; + } + + SpectrumConfig config; + config.w = cache_params.spectrum_w; + config.m = cache_params.spectrum_m; + config.lam = cache_params.spectrum_lam; + config.window_size = cache_params.spectrum_window_size; + config.flex_window = cache_params.spectrum_flex_window; + config.warmup_steps = cache_params.spectrum_warmup_steps; + config.stop_percent = cache_params.spectrum_stop_percent; + + size_t total_steps = sigmas.size() > 0 ? 
sigmas.size() - 1 : 0; + runtime.spectrum.init(config, total_steps); + runtime.spectrum_enabled = true; + + LOG_INFO("Spectrum enabled - w: %.2f, m: %d, lam: %.2f, window: %d, flex: %.2f, warmup: %d, stop: %.0f%%", + config.w, config.m, config.lam, + config.window_size, config.flex_window, + config.warmup_steps, config.stop_percent * 100.0f); + } + + SampleCacheRuntime init_sample_cache_runtime(SDVersion version, + const sd_cache_params_t* cache_params, + Denoiser* denoiser, + const std::vector& sigmas) { + SampleCacheRuntime runtime; + if (cache_params == nullptr || cache_params->mode == SD_CACHE_DISABLED) { + return runtime; + } + + if (!has_valid_cache_percent_range(*cache_params)) { + LOG_WARN("Cache disabled due to invalid percent range (start=%.3f, end=%.3f)", + cache_params->start_percent, + cache_params->end_percent); + return runtime; + } + + switch (cache_params->mode) { + case SD_CACHE_EASYCACHE: + init_easycache_runtime(runtime, version, *cache_params, denoiser); + break; + case SD_CACHE_UCACHE: + init_ucache_runtime(runtime, version, *cache_params, denoiser, sigmas); + break; + case SD_CACHE_DBCACHE: + case SD_CACHE_TAYLORSEER: + case SD_CACHE_CACHE_DIT: + init_cachedit_runtime(runtime, version, *cache_params, sigmas); + break; + case SD_CACHE_SPECTRUM: + init_spectrum_runtime(runtime, version, *cache_params, sigmas); + break; + default: + break; + } + + return runtime; + } + + SampleStepCacheDispatcher::SampleStepCacheDispatcher(SampleCacheRuntime& runtime, int step, float sigma) + : runtime(runtime), step(step), sigma(sigma), step_index(step > 0 ? 
(step - 1) : -1) { + if (step_index < 0) { + return; + } + + switch (runtime.mode) { + case SampleCacheMode::EASYCACHE: + runtime.easycache.begin_step(step_index, sigma); + break; + case SampleCacheMode::UCACHE: + runtime.ucache.begin_step(step_index, sigma); + break; + case SampleCacheMode::CACHEDIT: + runtime.cachedit.begin_step(step_index, sigma); + break; + case SampleCacheMode::NONE: + break; + } + } + + bool SampleStepCacheDispatcher::before_condition(const void* condition, + const sd::Tensor& input, + sd::Tensor* output) { + if (step_index < 0 || condition == nullptr || output == nullptr) { + return false; + } + + switch (runtime.mode) { + case SampleCacheMode::EASYCACHE: + return runtime.easycache.before_condition(condition, input, output, sigma, step_index); + case SampleCacheMode::UCACHE: + return runtime.ucache.before_condition(condition, input, output, sigma, step_index); + case SampleCacheMode::CACHEDIT: + return runtime.cachedit.before_condition(condition, input, output, sigma, step_index); + case SampleCacheMode::NONE: + return false; + } + + return false; + } + + void SampleStepCacheDispatcher::after_condition(const void* condition, + const sd::Tensor& input, + const sd::Tensor& output) { + if (step_index < 0 || condition == nullptr) { + return; + } + + switch (runtime.mode) { + case SampleCacheMode::EASYCACHE: + runtime.easycache.after_condition(condition, input, output); + break; + case SampleCacheMode::UCACHE: + runtime.ucache.after_condition(condition, input, output); + break; + case SampleCacheMode::CACHEDIT: + runtime.cachedit.after_condition(condition, input, output); + break; + case SampleCacheMode::NONE: + break; + } + } + + bool SampleStepCacheDispatcher::is_step_skipped() const { + switch (runtime.mode) { + case SampleCacheMode::EASYCACHE: + return runtime.easycache.is_step_skipped(); + case SampleCacheMode::UCACHE: + return runtime.ucache.is_step_skipped(); + case SampleCacheMode::CACHEDIT: + return runtime.cachedit.is_step_skipped(); + 
case SampleCacheMode::NONE: + return false; + } + + return false; + } + + void log_sample_cache_summary(const SampleCacheRuntime& runtime, size_t total_steps) { + if (runtime.easycache_enabled()) { + if (runtime.easycache.total_steps_skipped > 0 && total_steps > 0) { + if (runtime.easycache.total_steps_skipped < static_cast(total_steps)) { + double speedup = static_cast(total_steps) / + static_cast(total_steps - runtime.easycache.total_steps_skipped); + LOG_INFO("EasyCache skipped %d/%zu steps (%.2fx estimated speedup)", + runtime.easycache.total_steps_skipped, + total_steps, + speedup); + } else { + LOG_INFO("EasyCache skipped %d/%zu steps", + runtime.easycache.total_steps_skipped, + total_steps); + } + } else if (total_steps > 0) { + LOG_INFO("EasyCache completed without skipping steps"); + } + } + + if (runtime.ucache_enabled()) { + if (runtime.ucache.total_steps_skipped > 0 && total_steps > 0) { + if (runtime.ucache.total_steps_skipped < static_cast(total_steps)) { + double speedup = static_cast(total_steps) / + static_cast(total_steps - runtime.ucache.total_steps_skipped); + LOG_INFO("UCache skipped %d/%zu steps (%.2fx estimated speedup)", + runtime.ucache.total_steps_skipped, + total_steps, + speedup); + } else { + LOG_INFO("UCache skipped %d/%zu steps", + runtime.ucache.total_steps_skipped, + total_steps); + } + } else if (total_steps > 0) { + LOG_INFO("UCache completed without skipping steps"); + } + } + + if (runtime.cachedit_enabled()) { + if (runtime.cachedit.total_steps_skipped > 0 && total_steps > 0) { + if (runtime.cachedit.total_steps_skipped < static_cast(total_steps)) { + double speedup = static_cast(total_steps) / + static_cast(total_steps - runtime.cachedit.total_steps_skipped); + LOG_INFO("CacheDIT skipped %d/%zu steps (%.2fx estimated speedup)", + runtime.cachedit.total_steps_skipped, + total_steps, + speedup); + } else { + LOG_INFO("CacheDIT skipped %d/%zu steps", + runtime.cachedit.total_steps_skipped, + total_steps); + } + } else if 
(total_steps > 0) { + LOG_INFO("CacheDIT completed without skipping steps"); + } + } + + if (runtime.spectrum_enabled && runtime.spectrum.total_steps_skipped > 0 && total_steps > 0) { + double speedup = static_cast(total_steps) / + static_cast(total_steps - runtime.spectrum.total_steps_skipped); + LOG_INFO("Spectrum skipped %d/%zu steps (%.2fx estimated speedup)", + runtime.spectrum.total_steps_skipped, + total_steps, + speedup); + } + } + +} // namespace sd_sample diff --git a/src/sample-cache.h b/src/sample-cache.h new file mode 100644 index 0000000..398ad06 --- /dev/null +++ b/src/sample-cache.h @@ -0,0 +1,61 @@ +#ifndef __SAMPLE_CACHE_H__ +#define __SAMPLE_CACHE_H__ + +#include + +#include "cache_dit.hpp" +#include "denoiser.hpp" +#include "easycache.hpp" +#include "model.h" +#include "spectrum.hpp" +#include "tensor.hpp" +#include "ucache.hpp" +#include "util.h" + +namespace sd_sample { + + enum class SampleCacheMode { + NONE, + EASYCACHE, + UCACHE, + CACHEDIT, + }; + + struct SampleCacheRuntime { + SampleCacheMode mode = SampleCacheMode::NONE; + + EasyCacheState easycache; + UCacheState ucache; + CacheDitConditionState cachedit; + SpectrumState spectrum; + + bool spectrum_enabled = false; + + bool easycache_enabled() const; + bool ucache_enabled() const; + bool cachedit_enabled() const; + }; + + struct SampleStepCacheDispatcher { + SampleCacheRuntime& runtime; + int step; + float sigma; + int step_index; + + SampleStepCacheDispatcher(SampleCacheRuntime& runtime, int step, float sigma); + + bool before_condition(const void* condition, const sd::Tensor& input, sd::Tensor* output); + void after_condition(const void* condition, const sd::Tensor& input, const sd::Tensor& output); + bool is_step_skipped() const; + }; + + SampleCacheRuntime init_sample_cache_runtime(SDVersion version, + const sd_cache_params_t* cache_params, + Denoiser* denoiser, + const std::vector& sigmas); + + void log_sample_cache_summary(const SampleCacheRuntime& runtime, size_t total_steps); + 
+} // namespace sd_sample + +#endif // __SAMPLE_CACHE_H__ diff --git a/src/spectrum.hpp b/src/spectrum.hpp index 9542a8f..add1796 100644 --- a/src/spectrum.hpp +++ b/src/spectrum.hpp @@ -6,6 +6,7 @@ #include #include "ggml_extend.hpp" +#include "tensor.hpp" struct SpectrumConfig { float w = 0.40f; @@ -57,11 +58,8 @@ struct SpectrumState { return (num_cached + 1) % ws != 0; } - void update(const ggml_tensor* denoised) { - int64_t ne = ggml_nelements(denoised); - const float* data = (const float*)denoised->data; - - H_buf.emplace_back(data, data + ne); + void update(const sd::Tensor& denoised) { + H_buf.emplace_back(denoised.data(), denoised.data() + denoised.numel()); T_buf.push_back(taus(cnt)); while ((int)H_buf.size() > K) { @@ -76,13 +74,13 @@ struct SpectrumState { cnt++; } - void predict(ggml_tensor* denoised) { + void predict(sd::Tensor* denoised) { + GGML_ASSERT(denoised != nullptr); int64_t F = (int64_t)H_buf[0].size(); int K_curr = (int)H_buf.size(); int M1 = config.m + 1; float tau_at = taus(cnt); - // Design matrix X: K_curr x M1 (Chebyshev basis) std::vector X(K_curr * M1); for (int i = 0; i < K_curr; i++) { X[i * M1] = 1.0f; @@ -92,7 +90,6 @@ struct SpectrumState { X[i * M1 + j] = 2.0f * T_buf[i] * X[i * M1 + j - 1] - X[i * M1 + j - 2]; } - // x_star: Chebyshev basis at current tau std::vector x_star(M1); x_star[0] = 1.0f; if (M1 > 1) @@ -100,7 +97,6 @@ struct SpectrumState { for (int j = 2; j < M1; j++) x_star[j] = 2.0f * tau_at * x_star[j - 1] - x_star[j - 2]; - // XtX = X^T X + lambda I std::vector XtX(M1 * M1, 0.0f); for (int i = 0; i < M1; i++) { for (int j = 0; j < M1; j++) { @@ -111,7 +107,6 @@ struct SpectrumState { } } - // Cholesky decomposition std::vector L(M1 * M1, 0.0f); if (!cholesky_decompose(XtX.data(), L.data(), M1)) { float trace = 0.0f; @@ -122,18 +117,15 @@ struct SpectrumState { cholesky_decompose(XtX.data(), L.data(), M1); } - // Solve XtX v = x_star std::vector v(M1); cholesky_solve(L.data(), x_star.data(), v.data(), M1); - // 
Prediction weights per history entry std::vector weights(K_curr, 0.0f); for (int k = 0; k < K_curr; k++) for (int j = 0; j < M1; j++) weights[k] += X[k * M1 + j] * v[j]; - // Blend Chebyshev and Taylor predictions - float* out = (float*)denoised->data; + float* out = denoised->data(); float w_cheb = config.w; float w_taylor = 1.0f - w_cheb; const float* h_last = H_buf.back().data(); diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index bbf2f97..a59ff23 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -8,18 +8,15 @@ #include "util.h" #include "auto_encoder_kl.hpp" -#include "cache_dit.hpp" #include "conditioner.hpp" #include "control.hpp" #include "denoiser.hpp" #include "diffusion_model.hpp" -#include "easycache.hpp" #include "esrgan.hpp" #include "lora.hpp" #include "pmid.hpp" -#include "spectrum.hpp" +#include "sample-cache.h" #include "tae.hpp" -#include "ucache.hpp" #include "vae.hpp" #include "latent-preview.h" @@ -78,7 +75,7 @@ const char* sampling_methods_str[] = { void calculate_alphas_cumprod(float* alphas_cumprod, float linear_start = 0.00085f, - float linear_end = 0.0120, + float linear_end = 0.0120f, int timesteps = TIMESTEPS) { float ls_sqrt = sqrtf(linear_start); float le_sqrt = sqrtf(linear_end); @@ -95,387 +92,14 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) { float reuse_threshold = params.reuse_threshold; if (reuse_threshold == INFINITY) { if (params.mode == SD_CACHE_EASYCACHE) { - reuse_threshold = 0.2; + reuse_threshold = 0.2f; } else if (params.mode == SD_CACHE_UCACHE) { - reuse_threshold = 1.0; + reuse_threshold = 1.0f; } } return std::max(0.0f, reuse_threshold); } -enum class SampleCacheMode { - NONE, - EASYCACHE, - UCACHE, - CACHEDIT, -}; - -struct SampleCacheRuntime { - SampleCacheMode mode = SampleCacheMode::NONE; - - EasyCacheState easycache; - UCacheState ucache; - CacheDitConditionState cachedit; - SpectrumState spectrum; - - bool spectrum_enabled = false; - - bool 
has_step_cache() const { - return mode != SampleCacheMode::NONE; - } - - bool easycache_enabled() const { - return mode == SampleCacheMode::EASYCACHE; - } - - bool ucache_enabled() const { - return mode == SampleCacheMode::UCACHE; - } - - bool cachedit_enabled() const { - return mode == SampleCacheMode::CACHEDIT; - } -}; - -static bool has_valid_cache_percent_range(const sd_cache_params_t& cache_params) { - if (cache_params.mode != SD_CACHE_EASYCACHE && cache_params.mode != SD_CACHE_UCACHE) { - return true; - } - - return cache_params.start_percent >= 0.0f && - cache_params.start_percent < 1.0f && - cache_params.end_percent > 0.0f && - cache_params.end_percent <= 1.0f && - cache_params.start_percent < cache_params.end_percent; -} - -static void init_easycache_runtime(SampleCacheRuntime& runtime, - SDVersion version, - const sd_cache_params_t& cache_params, - Denoiser* denoiser) { - if (!sd_version_is_dit(version)) { - LOG_WARN("EasyCache requested but not supported for this model type"); - return; - } - - EasyCacheConfig config; - config.enabled = true; - config.reuse_threshold = get_cache_reuse_threshold(cache_params); - config.start_percent = cache_params.start_percent; - config.end_percent = cache_params.end_percent; - - runtime.easycache.init(config, denoiser); - if (!runtime.easycache.enabled()) { - LOG_WARN("EasyCache requested but could not be initialized for this run"); - return; - } - - runtime.mode = SampleCacheMode::EASYCACHE; - LOG_INFO("EasyCache enabled - threshold: %.3f, start: %.2f, end: %.2f", - config.reuse_threshold, - config.start_percent, - config.end_percent); -} - -static void init_ucache_runtime(SampleCacheRuntime& runtime, - SDVersion version, - const sd_cache_params_t& cache_params, - Denoiser* denoiser, - const std::vector& sigmas) { - if (!sd_version_is_unet(version)) { - LOG_WARN("UCache requested but not supported for this model type (only UNET models)"); - return; - } - - UCacheConfig config; - config.enabled = true; - 
config.reuse_threshold = get_cache_reuse_threshold(cache_params); - config.start_percent = cache_params.start_percent; - config.end_percent = cache_params.end_percent; - config.error_decay_rate = std::max(0.0f, std::min(1.0f, cache_params.error_decay_rate)); - config.use_relative_threshold = cache_params.use_relative_threshold; - config.reset_error_on_compute = cache_params.reset_error_on_compute; - - runtime.ucache.init(config, denoiser); - if (!runtime.ucache.enabled()) { - LOG_WARN("UCache requested but could not be initialized for this run"); - return; - } - - runtime.ucache.set_sigmas(sigmas); - runtime.mode = SampleCacheMode::UCACHE; - LOG_INFO("UCache enabled - threshold: %.3f, start: %.2f, end: %.2f, decay: %.2f, relative: %s, reset: %s", - config.reuse_threshold, - config.start_percent, - config.end_percent, - config.error_decay_rate, - config.use_relative_threshold ? "true" : "false", - config.reset_error_on_compute ? "true" : "false"); -} - -static void init_cachedit_runtime(SampleCacheRuntime& runtime, - SDVersion version, - const sd_cache_params_t& cache_params, - const std::vector& sigmas) { - if (!sd_version_is_dit(version)) { - LOG_WARN("CacheDIT requested but not supported for this model type (only DiT models)"); - return; - } - - DBCacheConfig dbcfg; - dbcfg.enabled = (cache_params.mode == SD_CACHE_DBCACHE || - cache_params.mode == SD_CACHE_CACHE_DIT); - dbcfg.Fn_compute_blocks = cache_params.Fn_compute_blocks; - dbcfg.Bn_compute_blocks = cache_params.Bn_compute_blocks; - dbcfg.residual_diff_threshold = cache_params.residual_diff_threshold; - dbcfg.max_warmup_steps = cache_params.max_warmup_steps; - dbcfg.max_cached_steps = cache_params.max_cached_steps; - dbcfg.max_continuous_cached_steps = cache_params.max_continuous_cached_steps; - if (cache_params.scm_mask != nullptr && strlen(cache_params.scm_mask) > 0) { - dbcfg.steps_computation_mask = parse_scm_mask(cache_params.scm_mask); - } - dbcfg.scm_policy_dynamic = cache_params.scm_policy_dynamic; - 
- TaylorSeerConfig tcfg; - tcfg.enabled = (cache_params.mode == SD_CACHE_TAYLORSEER || - cache_params.mode == SD_CACHE_CACHE_DIT); - tcfg.n_derivatives = cache_params.taylorseer_n_derivatives; - tcfg.skip_interval_steps = cache_params.taylorseer_skip_interval; - - runtime.cachedit.init(dbcfg, tcfg); - if (!runtime.cachedit.enabled()) { - LOG_WARN("CacheDIT requested but could not be initialized for this run"); - return; - } - - runtime.cachedit.set_sigmas(sigmas); - runtime.mode = SampleCacheMode::CACHEDIT; - LOG_INFO("CacheDIT enabled - mode: %s, Fn: %d, Bn: %d, threshold: %.3f, warmup: %d", - cache_params.mode == SD_CACHE_CACHE_DIT ? "DBCache+TaylorSeer" : (cache_params.mode == SD_CACHE_DBCACHE ? "DBCache" : "TaylorSeer"), - dbcfg.Fn_compute_blocks, - dbcfg.Bn_compute_blocks, - dbcfg.residual_diff_threshold, - dbcfg.max_warmup_steps); -} - -static void init_spectrum_runtime(SampleCacheRuntime& runtime, - SDVersion version, - const sd_cache_params_t& cache_params, - const std::vector& sigmas) { - if (!sd_version_is_unet(version) && !sd_version_is_dit(version)) { - LOG_WARN("Spectrum requested but not supported for this model type (only UNET and DiT models)"); - return; - } - - SpectrumConfig config; - config.w = cache_params.spectrum_w; - config.m = cache_params.spectrum_m; - config.lam = cache_params.spectrum_lam; - config.window_size = cache_params.spectrum_window_size; - config.flex_window = cache_params.spectrum_flex_window; - config.warmup_steps = cache_params.spectrum_warmup_steps; - config.stop_percent = cache_params.spectrum_stop_percent; - - size_t total_steps = sigmas.size() > 0 ? 
sigmas.size() - 1 : 0; - runtime.spectrum.init(config, total_steps); - runtime.spectrum_enabled = true; - - LOG_INFO("Spectrum enabled - w: %.2f, m: %d, lam: %.2f, window: %d, flex: %.2f, warmup: %d, stop: %.0f%%", - config.w, config.m, config.lam, - config.window_size, config.flex_window, - config.warmup_steps, config.stop_percent * 100.0f); -} - -static SampleCacheRuntime init_sample_cache_runtime(SDVersion version, - const sd_cache_params_t* cache_params, - Denoiser* denoiser, - const std::vector& sigmas) { - SampleCacheRuntime runtime; - if (cache_params == nullptr || cache_params->mode == SD_CACHE_DISABLED) { - return runtime; - } - - if (!has_valid_cache_percent_range(*cache_params)) { - LOG_WARN("Cache disabled due to invalid percent range (start=%.3f, end=%.3f)", - cache_params->start_percent, - cache_params->end_percent); - return runtime; - } - - switch (cache_params->mode) { - case SD_CACHE_EASYCACHE: - init_easycache_runtime(runtime, version, *cache_params, denoiser); - break; - case SD_CACHE_UCACHE: - init_ucache_runtime(runtime, version, *cache_params, denoiser, sigmas); - break; - case SD_CACHE_DBCACHE: - case SD_CACHE_TAYLORSEER: - case SD_CACHE_CACHE_DIT: - init_cachedit_runtime(runtime, version, *cache_params, sigmas); - break; - case SD_CACHE_SPECTRUM: - init_spectrum_runtime(runtime, version, *cache_params, sigmas); - break; - default: - break; - } - - return runtime; -} - -struct SampleStepCacheDispatcher { - SampleCacheRuntime& runtime; - int step; - float sigma; - int step_index; - - SampleStepCacheDispatcher(SampleCacheRuntime& runtime, int step, float sigma) - : runtime(runtime), step(step), sigma(sigma), step_index(step > 0 ? 
(step - 1) : -1) { - if (step_index < 0) { - return; - } - - switch (runtime.mode) { - case SampleCacheMode::EASYCACHE: - runtime.easycache.begin_step(step_index, sigma); - break; - case SampleCacheMode::UCACHE: - runtime.ucache.begin_step(step_index, sigma); - break; - case SampleCacheMode::CACHEDIT: - runtime.cachedit.begin_step(step_index, sigma); - break; - case SampleCacheMode::NONE: - break; - } - } - - bool before_condition(const SDCondition* condition, ggml_tensor* input, ggml_tensor* output) { - if (step_index < 0 || condition == nullptr || input == nullptr || output == nullptr) { - return false; - } - - switch (runtime.mode) { - case SampleCacheMode::EASYCACHE: - return runtime.easycache.before_condition(condition, input, output, sigma, step_index); - case SampleCacheMode::UCACHE: - return runtime.ucache.before_condition(condition, input, output, sigma, step_index); - case SampleCacheMode::CACHEDIT: - return runtime.cachedit.before_condition(condition, input, output, sigma, step_index); - case SampleCacheMode::NONE: - return false; - } - - return false; - } - - void after_condition(const SDCondition* condition, ggml_tensor* input, ggml_tensor* output) { - if (step_index < 0 || condition == nullptr || input == nullptr || output == nullptr) { - return; - } - - switch (runtime.mode) { - case SampleCacheMode::EASYCACHE: - runtime.easycache.after_condition(condition, input, output); - break; - case SampleCacheMode::UCACHE: - runtime.ucache.after_condition(condition, input, output); - break; - case SampleCacheMode::CACHEDIT: - runtime.cachedit.after_condition(condition, input, output); - break; - case SampleCacheMode::NONE: - break; - } - } - - bool is_step_skipped() const { - switch (runtime.mode) { - case SampleCacheMode::EASYCACHE: - return runtime.easycache.is_step_skipped(); - case SampleCacheMode::UCACHE: - return runtime.ucache.is_step_skipped(); - case SampleCacheMode::CACHEDIT: - return runtime.cachedit.is_step_skipped(); - case SampleCacheMode::NONE: 
- return false; - } - - return false; - } -}; - -static void log_sample_cache_summary(const SampleCacheRuntime& runtime, size_t total_steps) { - if (runtime.easycache_enabled()) { - if (runtime.easycache.total_steps_skipped > 0 && total_steps > 0) { - if (runtime.easycache.total_steps_skipped < static_cast(total_steps)) { - double speedup = static_cast(total_steps) / - static_cast(total_steps - runtime.easycache.total_steps_skipped); - LOG_INFO("EasyCache skipped %d/%zu steps (%.2fx estimated speedup)", - runtime.easycache.total_steps_skipped, - total_steps, - speedup); - } else { - LOG_INFO("EasyCache skipped %d/%zu steps", - runtime.easycache.total_steps_skipped, - total_steps); - } - } else if (total_steps > 0) { - LOG_INFO("EasyCache completed without skipping steps"); - } - } - - if (runtime.ucache_enabled()) { - if (runtime.ucache.total_steps_skipped > 0 && total_steps > 0) { - if (runtime.ucache.total_steps_skipped < static_cast(total_steps)) { - double speedup = static_cast(total_steps) / - static_cast(total_steps - runtime.ucache.total_steps_skipped); - LOG_INFO("UCache skipped %d/%zu steps (%.2fx estimated speedup)", - runtime.ucache.total_steps_skipped, - total_steps, - speedup); - } else { - LOG_INFO("UCache skipped %d/%zu steps", - runtime.ucache.total_steps_skipped, - total_steps); - } - } else if (total_steps > 0) { - LOG_INFO("UCache completed without skipping steps"); - } - } - - if (runtime.cachedit_enabled()) { - if (runtime.cachedit.total_steps_skipped > 0 && total_steps > 0) { - if (runtime.cachedit.total_steps_skipped < static_cast(total_steps)) { - double speedup = static_cast(total_steps) / - static_cast(total_steps - runtime.cachedit.total_steps_skipped); - LOG_INFO("CacheDIT skipped %d/%zu steps (%.2fx estimated speedup), accum_diff: %.4f", - runtime.cachedit.total_steps_skipped, - total_steps, - speedup, - runtime.cachedit.accumulated_residual_diff); - } else { - LOG_INFO("CacheDIT skipped %d/%zu steps, accum_diff: %.4f", - 
runtime.cachedit.total_steps_skipped, - total_steps, - runtime.cachedit.accumulated_residual_diff); - } - } else if (total_steps > 0) { - LOG_INFO("CacheDIT completed without skipping steps"); - } - } - - if (runtime.spectrum_enabled && runtime.spectrum.total_steps_skipped > 0 && total_steps > 0) { - double speedup = static_cast(total_steps) / - static_cast(total_steps - runtime.spectrum.total_steps_skipped); - LOG_INFO("Spectrum skipped %d/%zu steps (%.2fx estimated speedup)", - runtime.spectrum.total_steps_skipped, - total_steps, - speedup); - } -} - /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { @@ -1279,7 +903,7 @@ public: if (pred_type == PREDICTION_COUNT) { if (sd_version_is_sd2(version)) { // check is_using_v_parameterization_for_sd2 - if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) { + if (is_using_v_parameterization_for_sd2(sd_version_is_inpaint(version))) { pred_type = V_PRED; } else { pred_type = EPS_PRED; @@ -1369,43 +993,31 @@ public: return true; } - bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false) { - ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); - ggml_set_f32(x_t, 0.5); - ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); - ggml_set_f32(c, 0.5); - - ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); - ggml_set_f32(timesteps, 999); - - ggml_tensor* concat = is_inpaint ? 
ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : nullptr; - if (concat != nullptr) { - ggml_set_f32(concat, 0); + bool is_using_v_parameterization_for_sd2(bool is_inpaint = false) { + sd::Tensor x_t = sd::full({8, 8, 4, 1}, 0.5f); + sd::Tensor c = sd::full({1024, 2, 1, 1}, 0.5f); + sd::Tensor steps = sd::full({1}, 999.0f); + sd::Tensor concat; + if (is_inpaint) { + concat = sd::zeros({8, 8, 5, 1}); } - int64_t t0 = ggml_time_ms(); - ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); + int64_t t0 = ggml_time_ms(); + sd::Tensor out; DiffusionParams diffusion_params; - diffusion_params.x = x_t; - diffusion_params.timesteps = timesteps; - diffusion_params.context = c; - diffusion_params.c_concat = concat; - diffusion_model->compute(n_threads, diffusion_params, &out); + diffusion_params.x = &x_t; + diffusion_params.timesteps = &steps; + diffusion_params.context = &c; + if (!concat.empty()) { + diffusion_params.c_concat = &concat; + } + auto out_opt = diffusion_model->compute(n_threads, diffusion_params); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); diffusion_model->free_compute_buffer(); - double result = 0.f; - { - float* vec_x = (float*)x_t->data; - float* vec_out = (float*)out->data; - - int64_t n = ggml_nelements(out); - - for (int i = 0; i < n; i++) { - result += ((double)vec_out[i] - (double)vec_x[i]); - } - result /= n; - } - int64_t t1 = ggml_time_ms(); + double result = static_cast((out - x_t).mean()); + int64_t t1 = ggml_time_ms(); LOG_DEBUG("check is_using_v_parameterization_for_sd2, taking %.2fs", (t1 - t0) * 1.0f / 1000); return result < -1; } @@ -1643,8 +1255,7 @@ public: } } - SDCondition get_pmid_conditon(ggml_context* work_ctx, - sd_pm_params_t pm_params, + SDCondition get_pmid_conditon(sd_pm_params_t pm_params, ConditionerParams& condition_params) { SDCondition id_cond; if (use_pmid) { @@ -1663,60 +1274,60 @@ public: if (pm_params.id_images_count > 0) { int clip_image_size = 224; pmid_model->style_strength = 
pm_params.style_strength; - - auto id_image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, clip_image_size, clip_image_size, 3, pm_params.id_images_count); - - std::vector processed_id_images; + sd::Tensor id_image_tensor; for (int i = 0; i < pm_params.id_images_count; i++) { - sd_image_f32_t id_image = sd_image_t_to_sd_image_f32_t(pm_params.id_images[i]); - sd_image_f32_t processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size); - free(id_image.data); - id_image.data = nullptr; - processed_id_images.push_back(processed_id_image); + auto id_image = sd_image_to_tensor(pm_params.id_images[i]); + auto processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size); + if (id_image_tensor.empty()) { + id_image_tensor = processed_id_image; + } else { + id_image_tensor = sd::ops::concat(id_image_tensor, processed_id_image, 3); + } } - ggml_ext_tensor_iter(id_image_tensor, [&](ggml_tensor* id_image_tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = sd_image_get_f32(processed_id_images[i3], i0, i1, i2, false); - ggml_ext_tensor_set_f32(id_image_tensor, value, i0, i1, i2, i3); - }); - - for (auto& image : processed_id_images) { - free(image.data); - image.data = nullptr; - } - processed_id_images.clear(); - int64_t t0 = ggml_time_ms(); condition_params.num_input_imgs = pm_params.id_images_count; - auto cond_tup = cond_stage_model->get_learned_condition_with_trigger(work_ctx, - n_threads, + auto cond_tup = cond_stage_model->get_learned_condition_with_trigger(n_threads, condition_params); id_cond = std::get<0>(cond_tup); auto class_tokens_mask = std::get<1>(cond_tup); - ggml_tensor* id_embeds = nullptr; + sd::Tensor id_embeds; if (pmv2 && pm_params.id_embed_path != nullptr) { - id_embeds = load_tensor_from_file(work_ctx, pm_params.id_embed_path); + try { + id_embeds = sd::load_tensor_from_file_as_tensor(pm_params.id_embed_path); + } catch (const std::exception&) { + id_embeds = {}; + } } - if (pmv2 && 
id_embeds == nullptr) { + if (pmv2 && id_embeds.empty()) { LOG_WARN("Provided PhotoMaker images, but NO valid ID embeds file for PM v2"); LOG_WARN("Turn off PhotoMaker"); use_pmid = false; } else { - if (pmv2 && pm_params.id_images_count != id_embeds->ne[1]) { - LOG_WARN("PhotoMaker image count (%d) does NOT match ID embeds (%d). You should run face_detect.py again.", pm_params.id_images_count, id_embeds->ne[1]); + if (pmv2 && pm_params.id_images_count != id_embeds.shape()[1]) { + LOG_WARN("PhotoMaker image count (%d) does NOT match ID embeds (%d). You should run face_detect.py again.", pm_params.id_images_count, static_cast(id_embeds.shape()[1])); LOG_WARN("Turn off PhotoMaker"); use_pmid = false; } else { - ggml_tensor* res = nullptr; - pmid_model->compute(n_threads, id_image_tensor, id_cond.c_crossattn, id_embeds, class_tokens_mask, &res, work_ctx); - id_cond.c_crossattn = res; - int64_t t1 = ggml_time_ms(); - LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0); + auto res = pmid_model->compute(n_threads, + id_image_tensor, + id_cond.c_crossattn, + id_embeds, + class_tokens_mask); + if (res.empty()) { + LOG_ERROR("Photomaker ID Stacking failed"); + LOG_WARN("Turn off PhotoMaker"); + use_pmid = false; + } else { + id_cond.c_crossattn = std::move(res); + int64_t t1 = ggml_time_ms(); + LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0); + // Encode input prompt without the trigger word for delayed conditioning + condition_params.text = cond_stage_model->remove_trigger_from_prompt(condition_params.text); + } if (free_params_immediately) { pmid_model->free_params_buffer(); } - // Encode input prompt without the trigger word for delayed conditioning - condition_params.text = cond_stage_model->remove_trigger_from_prompt(work_ctx, condition_params.text); } } } else { @@ -1728,108 +1339,37 @@ public: return id_cond; } - ggml_tensor* get_clip_vision_output(ggml_context* work_ctx, - sd_image_t init_image, - bool return_pooled = true, - int 
clip_skip = -1, - bool zero_out_masked = false) { - ggml_tensor* output = nullptr; + sd::Tensor get_clip_vision_output(const sd::Tensor& image, + bool return_pooled = true, + int clip_skip = -1, + bool zero_out_masked = false) { + sd::Tensor output; if (zero_out_masked) { if (return_pooled) { - output = ggml_new_tensor_1d(work_ctx, - GGML_TYPE_F32, - clip_vision->vision_model.projection_dim); + output = sd::zeros({clip_vision->vision_model.projection_dim}); } else { - output = ggml_new_tensor_2d(work_ctx, - GGML_TYPE_F32, - clip_vision->vision_model.hidden_size, - 257); + output = sd::zeros({clip_vision->vision_model.hidden_size, 257}); } - - ggml_set_f32(output, 0.f); } else { - sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); - sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size, clip_vision->vision_model.image_size); - free(image.data); - image.data = nullptr; - - ggml_tensor* pixel_values = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1); - sd_image_f32_to_ggml_tensor(resized_image, pixel_values, false); - free(resized_image.data); - resized_image.data = nullptr; - - // print_ggml_tensor(pixel_values); - clip_vision->compute(n_threads, pixel_values, return_pooled, clip_skip, &output, work_ctx); - // print_ggml_tensor(c_crossattn); + auto pixel_values = clip_preprocess(image, clip_vision->vision_model.image_size, clip_vision->vision_model.image_size); + auto output_opt = clip_vision->compute(n_threads, pixel_values, return_pooled, clip_skip); + if (output_opt.empty()) { + LOG_ERROR("clip_vision compute failed"); + return {}; + } + output = std::move(output_opt); } return output; } - SDCondition get_svd_condition(ggml_context* work_ctx, - sd_image_t init_image, - int width, - int height, - int fps = 6, - int motion_bucket_id = 127, - float augmentation_level = 0.f, - bool zero_out_masked = false) { - // c_crossattn - int64_t t0 = ggml_time_ms(); - ggml_tensor* 
c_crossattn = get_clip_vision_output(work_ctx, init_image, true, -1, zero_out_masked); - - // c_concat - ggml_tensor* c_concat = nullptr; - { - if (zero_out_masked) { - c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / get_vae_scale_factor(), height / get_vae_scale_factor(), 4, 1); - ggml_set_f32(c_concat, 0.f); - } else { - ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - - if (width != init_image.width || height != init_image.height) { - sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); - sd_image_f32_t resized_image = resize_sd_image_f32_t(image, width, height); - free(image.data); - image.data = nullptr; - sd_image_f32_to_ggml_tensor(resized_image, init_img, false); - free(resized_image.data); - resized_image.data = nullptr; - } else { - sd_image_to_ggml_tensor(init_image, init_img); - } - if (augmentation_level > 0.f) { - ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_img); - ggml_ext_im_set_randn_f32(noise, rng); - // encode_pixels += torch.randn_like(pixels) * augmentation_level - ggml_ext_tensor_scale_inplace(noise, augmentation_level); - ggml_ext_tensor_add_inplace(init_img, noise); - } - c_concat = encode_first_stage(work_ctx, init_img); - } - } - - // y - ggml_tensor* y = nullptr; - { - y = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->get_adm_in_channels()); - int out_dim = 256; - int fps_id = fps - 1; - std::vector timesteps = {(float)fps_id, (float)motion_bucket_id, augmentation_level}; - set_timestep_embedding(timesteps, y, out_dim); - } - int64_t t1 = ggml_time_ms(); - LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0); - return {c_crossattn, y, c_concat}; - } - std::vector process_timesteps(const std::vector& timesteps, - ggml_tensor* init_latent, - ggml_tensor* denoise_mask) { + const sd::Tensor& init_latent, + const sd::Tensor& denoise_mask) { if (diffusion_model->get_desc() == "Wan2.2-TI2V-5B") { - auto new_timesteps = 
std::vector(init_latent->ne[2], timesteps[0]); + auto new_timesteps = std::vector(static_cast(init_latent.shape()[2]), timesteps[0]); - if (denoise_mask != nullptr) { - float value = ggml_ext_tensor_get_f32(denoise_mask, 0, 0, 0, 0); + if (!denoise_mask.empty()) { + float value = denoise_mask.dim() == 5 ? denoise_mask.index(0, 0, 0, 0, 0) : denoise_mask.index(0, 0, 0, 0); if (value == 0.f) { new_timesteps[0] = 0.f; } @@ -1840,40 +1380,19 @@ public: } } - // a = a * mask + b * (1 - mask) - void apply_mask(ggml_tensor* a, ggml_tensor* b, ggml_tensor* mask) { - for (int64_t i0 = 0; i0 < a->ne[0]; i0++) { - for (int64_t i1 = 0; i1 < a->ne[1]; i1++) { - for (int64_t i2 = 0; i2 < a->ne[2]; i2++) { - for (int64_t i3 = 0; i3 < a->ne[3]; i3++) { - float a_value = ggml_ext_tensor_get_f32(a, i0, i1, i2, i3); - float b_value = ggml_ext_tensor_get_f32(b, i0, i1, i2, i3); - float mask_value = ggml_ext_tensor_get_f32(mask, i0 % mask->ne[0], i1 % mask->ne[1], i2 % mask->ne[2], i3 % mask->ne[3]); - ggml_ext_tensor_set_f32(a, a_value * mask_value + b_value * (1 - mask_value), i0, i1, i2, i3); - } - } - } - } - } - - void preview_image(ggml_context* work_ctx, - int step, - ggml_tensor* latents, + void preview_image(int step, + const sd::Tensor& latents, enum SDVersion version, preview_t preview_mode, - ggml_tensor* result, std::function step_callback, void* step_callback_data, bool is_noisy) { - const uint32_t channel = 3; - uint32_t width = static_cast(latents->ne[0]); - uint32_t height = static_cast(latents->ne[1]); - uint32_t dim = static_cast(latents->ne[ggml_n_dims(latents) - 1]); - if (preview_mode == PREVIEW_PROJ) { - int patch_sz = 1; - const float(*latent_rgb_proj)[channel] = nullptr; - float* latent_rgb_bias = nullptr; + int patch_sz = 1; + const float(*latent_rgb_proj)[3] = nullptr; + float* latent_rgb_bias = nullptr; + bool is_video = preview_latent_tensor_is_video(latents); + uint32_t dim = is_video ? 
static_cast(latents.shape()[3]) : static_cast(latents.shape()[2]); if (dim == 128) { if (sd_version_is_flux2(version)) { @@ -1887,12 +1406,9 @@ public: latent_rgb_bias = wan_22_latent_rgb_bias; } else { LOG_WARN("No latent to RGB projection known for this model"); - // unknown model return; } } else if (dim == 16) { - // 16 channels VAE -> Flux or SD3 - if (sd_version_is_sd3(version)) { latent_rgb_proj = sd3_latent_rgb_proj; latent_rgb_bias = sd3_latent_rgb_bias; @@ -1904,12 +1420,9 @@ public: latent_rgb_bias = wan_21_latent_rgb_bias; } else { LOG_WARN("No latent to RGB projection known for this model"); - // unknown model return; } - } else if (dim == 4) { - // 4 channels VAE if (sd_version_is_sdxl(version)) { latent_rgb_proj = sdxl_latent_rgb_proj; latent_rgb_bias = sdxl_latent_rgb_bias; @@ -1917,459 +1430,394 @@ public: latent_rgb_proj = sd_latent_rgb_proj; latent_rgb_bias = sd_latent_rgb_bias; } else { - // unknown model LOG_WARN("No latent to RGB projection known for this model"); return; } - } else if (dim == 3) { - // Do nothing, assuming already RGB latents - } else { + } else if (dim != 3) { LOG_WARN("No latent to RGB projection known for this model"); - // unknown latent space return; } - uint32_t frames = 1; - if (ggml_n_dims(latents) == 4) { - frames = static_cast(latents->ne[2]); - } - - uint32_t img_width = width * patch_sz; - uint32_t img_height = height * patch_sz; - - uint8_t* data = (uint8_t*)malloc(frames * img_width * img_height * channel * sizeof(uint8_t)); + uint32_t frames = is_video ? 
static_cast(latents.shape()[2]) : 1; + uint32_t img_width = static_cast(latents.shape()[0]) * patch_sz; + uint32_t img_height = static_cast(latents.shape()[1]) * patch_sz; + uint8_t* data = (uint8_t*)malloc(frames * img_width * img_height * 3 * sizeof(uint8_t)); + GGML_ASSERT(data != nullptr); preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, patch_sz); sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t)); + GGML_ASSERT(images != nullptr); for (uint32_t i = 0; i < frames; i++) { - images[i] = {img_width, img_height, channel, data + i * img_width * img_height * channel}; + images[i] = {img_width, img_height, 3, data + i * img_width * img_height * 3}; } step_callback(step, frames, images, is_noisy, step_callback_data); free(data); free(images); - } else { - if (preview_mode == PREVIEW_VAE || preview_mode == PREVIEW_TAE) { - if (preview_vae) { - latents = preview_vae->diffusion_to_vae_latents(work_ctx, latents); - result = preview_vae->decode(n_threads, work_ctx, latents, vae_tiling_params, false, circular_x, circular_y, result, true); - } else { - latents = first_stage_model->diffusion_to_vae_latents(work_ctx, latents); - result = first_stage_model->decode(n_threads, work_ctx, latents, vae_tiling_params, false, circular_x, circular_y, result, true); - } + return; + } + + if (preview_mode == PREVIEW_VAE || preview_mode == PREVIEW_TAE) { + sd::Tensor vae_latents; + sd::Tensor decoded; + bool is_video = preview_latent_tensor_is_video(latents); + if (preview_vae) { + vae_latents = preview_vae->diffusion_to_vae_latents(latents); + decoded = preview_vae->decode(n_threads, vae_latents, vae_tiling_params, is_video, circular_x, circular_y, true); } else { + vae_latents = first_stage_model->diffusion_to_vae_latents(latents); + decoded = first_stage_model->decode(n_threads, vae_latents, vae_tiling_params, is_video, circular_x, circular_y, true); + } + if (decoded.empty()) { + LOG_ERROR("preview decode failed at step %d", step); return; } - 
ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f); - uint32_t frames = 1; - if (ggml_n_dims(latents) == 4) { - frames = static_cast(result->ne[2]); - } - + is_video = preview_latent_tensor_is_video(decoded); + uint32_t frames = is_video ? static_cast(decoded.shape()[2]) : 1; sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t)); - // print_ggml_tensor(result,true); - for (size_t i = 0; i < frames; i++) { - images[i].width = static_cast(result->ne[0]); - images[i].height = static_cast(result->ne[1]); - images[i].channel = 3; - images[i].data = ggml_tensor_to_sd_image(result, static_cast(i), ggml_n_dims(latents) == 4); + GGML_ASSERT(images != nullptr); + for (uint32_t i = 0; i < frames; ++i) { + images[i] = tensor_to_sd_image(decoded, static_cast(i)); } step_callback(step, frames, images, is_noisy, step_callback_data); - - ggml_ext_tensor_scale_inplace(result, 0); - for (uint32_t i = 0; i < frames; i++) { + for (uint32_t i = 0; i < frames; ++i) { free(images[i].data); } - free(images); + return; + } + + if (preview_mode != PREVIEW_NONE) { + LOG_WARN("Unsupported preview mode: %d", static_cast(preview_mode)); } } - ggml_tensor* sample(ggml_context* work_ctx, - std::shared_ptr work_diffusion_model, - bool inverse_noise_scaling, - ggml_tensor* init_latent, - ggml_tensor* noise, - SDCondition cond, - SDCondition uncond, - SDCondition img_cond, - ggml_tensor* control_hint, - float control_strength, - sd_guidance_params_t guidance, - float eta, - int shifted_timestep, - sample_method_t method, - const std::vector& sigmas, - int start_merge_step, - SDCondition id_cond, - std::vector ref_latents = {}, - bool increase_ref_index = false, - ggml_tensor* denoise_mask = nullptr, - ggml_tensor* vace_context = nullptr, - float vace_strength = 1.f, - const sd_cache_params_t* cache_params = nullptr) { - if (shifted_timestep > 0 && !sd_version_is_sdxl(version)) { - LOG_WARN("timestep shifting is only supported for SDXL models!"); - shifted_timestep = 0; + std::vector 
prepare_sample_timesteps(float sigma, + int shifted_timestep) { + float t = denoiser->sigma_to_t(sigma); + if (shifted_timestep > 0) { + float shifted_t_float = t * (float(shifted_timestep) / float(TIMESTEPS)); + int64_t shifted_t = static_cast(roundf(shifted_t_float)); + shifted_t = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t)); + LOG_DEBUG("shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma); + return std::vector{(float)shifted_t}; } + if (sd_version_is_anima(version)) { + return std::vector{t / static_cast(TIMESTEPS)}; + } + if (sd_version_is_z_image(version)) { + return std::vector{1000.f - t}; + } + return std::vector{t}; + } + + void adjust_sample_step_scalings(int shifted_timestep, + const std::vector& timesteps_vec, + float c_in, + float* c_skip, + float* c_out) { + GGML_ASSERT(c_skip != nullptr); + GGML_ASSERT(c_out != nullptr); + if (shifted_timestep <= 0) { + return; + } + + int64_t shifted_t_idx = static_cast(roundf(timesteps_vec[0])); + float shifted_sigma = denoiser->t_to_sigma((float)shifted_t_idx); + std::vector shifted_scaling = denoiser->get_scalings(shifted_sigma); + float shifted_c_skip = shifted_scaling[0]; + float shifted_c_out = shifted_scaling[1]; + float shifted_c_in = shifted_scaling[2]; + + *c_skip = shifted_c_skip * c_in / shifted_c_in; + *c_out = shifted_c_out; + } + + struct SamplePreviewContext { + sd_preview_cb_t callback = nullptr; + void* data = nullptr; + preview_t mode = PREVIEW_NONE; + }; + + SamplePreviewContext prepare_sample_preview_context() { + return SamplePreviewContext{sd_get_preview_callback(), + sd_get_preview_callback_data(), + sd_get_preview_mode()}; + } + + void report_sample_progress(int step, size_t total_steps, int64_t t0) { + int64_t t1 = ggml_time_us(); + if (step > 0 || step == -(int)total_steps) { + int showstep = std::abs(step); + pretty_progress(showstep, (int)total_steps, (t1 - t0) / 1000000.f / showstep); + } + } + + void compute_sample_controls(const 
sd::Tensor& control_image, + const sd::Tensor& noised_input, + const sd::Tensor& timesteps_tensor, + const SDCondition& condition, + std::vector>* controls) { + GGML_ASSERT(controls != nullptr); + controls->clear(); + if (control_image.empty() || control_net == nullptr) { + return; + } + + auto control_result = control_net->compute(n_threads, + noised_input, + control_image, + timesteps_tensor, + condition.c_crossattn, + condition.c_vector); + if (!control_result.has_value()) { + LOG_ERROR("controlnet compute failed"); + return; + } + + *controls = std::move(*control_result); + } + + sd::Tensor sample(const std::shared_ptr& work_diffusion_model, + bool inverse_noise_scaling, + const sd::Tensor& init_latent, + sd::Tensor noise, + const SDCondition& cond, + const SDCondition& uncond, + const SDCondition& img_cond, + const SDCondition& id_cond, + const sd::Tensor& control_image, + float control_strength, + const sd_guidance_params_t& guidance, + float eta, + int shifted_timestep, + sample_method_t method, + const std::vector& sigmas, + int start_merge_step, + const std::vector>& ref_latents, + bool increase_ref_index, + const sd::Tensor& denoise_mask, + const sd::Tensor& vace_context, + float vace_strength, + const sd_cache_params_t* cache_params) { std::vector skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count); - - float cfg_scale = guidance.txt_cfg; - if (cfg_scale < 1.f) { - if (cfg_scale == 0.f) { - // Diffusers follow the convention from the original paper - // (https://arxiv.org/abs/2207.12598v1), so many distilled model docs - // recommend 0 as guidance; warn the user that it'll disable prompt folowing - LOG_WARN("unconditioned mode, images won't follow the prompt (use cfg-scale=1 for distilled models)"); - } else { - LOG_WARN("cfg value out of expected range may produce unexpected results"); - } - } - - float img_cfg_scale = std::isfinite(guidance.img_cfg) ? 
guidance.img_cfg : guidance.txt_cfg; + float cfg_scale = guidance.txt_cfg; + float img_cfg_scale = guidance.img_cfg; float slg_scale = guidance.slg.scale; - if (img_cfg_scale != cfg_scale && !sd_version_is_inpaint_or_unet_edit(version)) { - LOG_WARN("2-conditioning CFG is not supported with this model, disabling it for better performance..."); - img_cfg_scale = cfg_scale; + sd_sample::SampleCacheRuntime cache_runtime = sd_sample::init_sample_cache_runtime(version, + cache_params, + denoiser.get(), + sigmas); + size_t steps = sigmas.size() - 1; + bool has_skiplayer = slg_scale != 0.0f && !skip_layers.empty(); + if (has_skiplayer && !sd_version_is_dit(version)) { + has_skiplayer = false; + LOG_WARN("SLG is incompatible with this model type"); } - SampleCacheRuntime cache_runtime = init_sample_cache_runtime(version, cache_params, denoiser.get(), sigmas); + int64_t t0 = ggml_time_us(); + sd::Tensor x_t = !noise.empty() + ? denoiser->noise_scaling(sigmas[0], noise, init_latent) + : init_latent; + sd::Tensor denoised = x_t; + SamplePreviewContext preview = prepare_sample_preview_context(); - size_t steps = sigmas.size() - 1; - ggml_tensor* x = ggml_ext_dup_and_cpy_tensor(work_ctx, init_latent); - - if (noise) { - x = denoiser->noise_scaling(sigmas[0], noise, x); - } - - ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x); - - bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != nullptr; - bool has_img_cond = cfg_scale != img_cfg_scale && img_cond.c_crossattn != nullptr; - bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; - - // denoise wrapper - ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); - ggml_tensor* out_uncond = nullptr; - ggml_tensor* out_skip = nullptr; - ggml_tensor* out_img_cond = nullptr; - - if (has_unconditioned) { - out_uncond = ggml_dup_tensor(work_ctx, x); - } - if (has_skiplayer) { - if (sd_version_is_dit(version)) { - out_skip = ggml_dup_tensor(work_ctx, x); - } else { - has_skiplayer = false; - 
LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]); - } - } - if (has_img_cond) { - out_img_cond = ggml_dup_tensor(work_ctx, x); - } - ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); - - int64_t t0 = ggml_time_us(); - - ggml_tensor* preview_tensor = nullptr; - auto sd_preview_mode = sd_get_preview_mode(); - if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) { - int64_t W = x->ne[0] * get_vae_scale_factor(); - int64_t H = x->ne[1] * get_vae_scale_factor(); - if (ggml_n_dims(x) == 4) { - // assuming video mode (if batch processing gets implemented this will break) - int64_t T = x->ne[2]; - if (sd_version_is_wan(version)) { - T = ((T - 1) * 4) + 1; - } - preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, - W, - H, - T, - 3); - } else { - preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, - W, - H, - 3, - x->ne[3]); - } - } - - auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { - auto sd_preview_cb = sd_get_preview_callback(); - auto sd_preview_cb_data = sd_get_preview_callback_data(); - auto sd_preview_mode = sd_get_preview_mode(); + auto denoise = [&](const sd::Tensor& x, float sigma, int step) -> sd::Tensor { if (step == 1 || step == -1) { pretty_progress(0, (int)steps, 0); } - DiffusionParams diffusion_params; - SampleStepCacheDispatcher step_cache(cache_runtime, step, sigma); - std::vector scaling = denoiser->get_scalings(sigma); GGML_ASSERT(scaling.size() == 3); float c_skip = scaling[0]; float c_out = scaling[1]; float c_in = scaling[2]; - float t = denoiser->sigma_to_t(sigma); - std::vector timesteps_vec; - if (shifted_timestep > 0 && sd_version_is_sdxl(version)) { - float shifted_t_float = t * (float(shifted_timestep) / float(TIMESTEPS)); - int64_t shifted_t = static_cast(roundf(shifted_t_float)); - shifted_t = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t)); - LOG_DEBUG("shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, 
shifted_t, sigma); - timesteps_vec.assign(1, (float)shifted_t); - } else if (sd_version_is_anima(version)) { - // Anima uses normalized flow timesteps. - timesteps_vec.assign(1, t / static_cast(TIMESTEPS)); - } else if (sd_version_is_z_image(version)) { - timesteps_vec.assign(1, 1000.f - t); - } else { - timesteps_vec.assign(1, t); + std::vector timesteps_vec = prepare_sample_timesteps(sigma, shifted_timestep); + timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask); + adjust_sample_step_scalings(shifted_timestep, timesteps_vec, c_in, &c_skip, &c_out); + + sd::Tensor timesteps_tensor({static_cast(timesteps_vec.size())}, timesteps_vec); + sd::Tensor guidance_tensor({1}, std::vector{guidance.distilled_guidance}); + sd::Tensor noised_input = x * c_in; + if (!denoise_mask.empty() && version == VERSION_WAN2_2_TI2V) { + noised_input = noised_input * denoise_mask + init_latent * (1.0f - denoise_mask); } - timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask); - if (cache_runtime.spectrum_enabled && cache_runtime.spectrum.should_predict()) { - cache_runtime.spectrum.predict(denoised); - - if (denoise_mask != nullptr) { - apply_mask(denoised, init_latent, denoise_mask); + cache_runtime.spectrum.predict(&denoised); + if (!denoise_mask.empty()) { + denoised = denoised * denoise_mask + init_latent * (1.0f - denoise_mask); } - - if (sd_preview_cb != nullptr && sd_should_preview_denoised()) { - if (step % sd_get_preview_interval() == 0) { - preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, false); - } - } - - int64_t t1 = ggml_time_us(); - if (step > 0 || step == -(int)steps) { - int showstep = std::abs(step); - pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep); + if (sd_should_preview_denoised() && preview.callback != nullptr) { + preview_image(step, denoised, version, preview.mode, preview.callback, preview.data, false); } + 
report_sample_progress(step, steps, t0); return denoised; } - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); - std::vector guidance_vec(1, guidance.distilled_guidance); - auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); - - copy_ggml_tensor(noised_input, input); - // noised_input = noised_input * c_in - ggml_ext_tensor_scale_inplace(noised_input, c_in); - - if (denoise_mask != nullptr && version == VERSION_WAN2_2_TI2V) { - apply_mask(noised_input, init_latent, denoise_mask); - } - if (sd_preview_cb != nullptr && sd_should_preview_noisy()) { - if (step % sd_get_preview_interval() == 0) { - preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, true); - } + if (sd_should_preview_noisy() && preview.callback != nullptr) { + preview_image(step, noised_input, version, preview.mode, preview.callback, preview.data, true); } - std::vector controls; - - if (control_hint != nullptr && control_net != nullptr) { - if (control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector)) { - controls = control_net->controls; - } else { - LOG_ERROR("controlnet compute failed"); - } - // print_ggml_tensor(controls[12]); - // GGML_ASSERT(0); - } - - diffusion_params.x = noised_input; - diffusion_params.timesteps = timesteps; - diffusion_params.guidance = guidance_tensor; - diffusion_params.ref_latents = ref_latents; + sd::Tensor cond_out; + sd::Tensor uncond_out; + sd::Tensor img_cond_out; + sd::Tensor skip_cond_out; + sd_sample::SampleStepCacheDispatcher step_cache(cache_runtime, step, sigma); + std::vector> controls; + DiffusionParams diffusion_params; + diffusion_params.x = &noised_input; + diffusion_params.timesteps = ×teps_tensor; + diffusion_params.guidance = &guidance_tensor; + diffusion_params.ref_latents = &ref_latents; diffusion_params.increase_ref_index = increase_ref_index; - diffusion_params.controls = controls; + 
diffusion_params.controls = &controls; diffusion_params.control_strength = control_strength; - diffusion_params.vace_context = vace_context; + diffusion_params.vace_context = vace_context.empty() ? nullptr : &vace_context; diffusion_params.vace_strength = vace_strength; + diffusion_params.skip_layers = nullptr; - auto run_diffusion_condition = [&](const SDCondition* condition, ggml_tensor** output_tensor) -> bool { - if (step_cache.before_condition(condition, diffusion_params.x, *output_tensor)) { - return true; + compute_sample_controls(control_image, + noised_input, + timesteps_tensor, + cond, + &controls); + + auto run_condition = [&](const SDCondition& condition, + const sd::Tensor* c_concat_override = nullptr, + const std::vector* local_skip_layers = nullptr) -> sd::Tensor { + diffusion_params.context = condition.c_crossattn.empty() ? nullptr : &condition.c_crossattn; + diffusion_params.c_concat = c_concat_override != nullptr ? c_concat_override : (condition.c_concat.empty() ? nullptr : &condition.c_concat); + diffusion_params.y = condition.c_vector.empty() ? nullptr : &condition.c_vector; + diffusion_params.t5_ids = condition.c_t5_ids.empty() ? nullptr : &condition.c_t5_ids; + diffusion_params.t5_weights = condition.c_t5_weights.empty() ? 
nullptr : &condition.c_t5_weights; + diffusion_params.skip_layers = local_skip_layers; + + sd::Tensor cached_output; + if (step_cache.before_condition(&condition, noised_input, &cached_output)) { + return std::move(cached_output); } - if (!work_diffusion_model->compute(n_threads, - diffusion_params, - output_tensor)) { + auto output_opt = work_diffusion_model->compute(n_threads, diffusion_params); + if (output_opt.empty()) { LOG_ERROR("diffusion model compute failed"); - return false; + return sd::Tensor(); } - step_cache.after_condition(condition, diffusion_params.x, *output_tensor); - return true; + step_cache.after_condition(&condition, noised_input, output_opt); + return output_opt; }; - const SDCondition* active_condition = nullptr; - ggml_tensor** active_output = &out_cond; if (start_merge_step == -1 || step <= start_merge_step) { - // cond - diffusion_params.context = cond.c_crossattn; - diffusion_params.c_concat = cond.c_concat; - diffusion_params.y = cond.c_vector; - active_condition = &cond; + cond_out = run_condition(cond); + if (cond_out.empty()) { + return {}; + } } else { - diffusion_params.context = id_cond.c_crossattn; - diffusion_params.c_concat = cond.c_concat; - diffusion_params.y = id_cond.c_vector; - active_condition = &id_cond; - } - - if (!run_diffusion_condition(active_condition, active_output)) { - return nullptr; - } - - bool current_step_skipped = step_cache.is_step_skipped(); - - float* negative_data = nullptr; - if (has_unconditioned) { - // uncond - if (!current_step_skipped && control_hint != nullptr && control_net != nullptr) { - if (control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector)) { - controls = control_net->controls; - } else { - LOG_ERROR("controlnet compute failed"); - } + GGML_ASSERT(!id_cond.empty()); + cond_out = run_condition(id_cond, + cond.c_concat.empty() ? 
nullptr : &cond.c_concat); + if (cond_out.empty()) { + return {}; } - current_step_skipped = step_cache.is_step_skipped(); - diffusion_params.controls = controls; - diffusion_params.context = uncond.c_crossattn; - diffusion_params.c_concat = uncond.c_concat; - diffusion_params.y = uncond.c_vector; - if (!run_diffusion_condition(&uncond, &out_uncond)) { - return nullptr; - } - negative_data = (float*)out_uncond->data; } - float* img_cond_data = nullptr; - if (has_img_cond) { - diffusion_params.context = img_cond.c_crossattn; - diffusion_params.c_concat = img_cond.c_concat; - diffusion_params.y = img_cond.c_vector; - if (!run_diffusion_condition(&img_cond, &out_img_cond)) { - return nullptr; + if (!uncond.empty()) { + if (!step_cache.is_step_skipped()) { + compute_sample_controls(control_image, + noised_input, + timesteps_tensor, + uncond, + &controls); + } + uncond_out = run_condition(uncond); + if (uncond_out.empty()) { + return {}; } - img_cond_data = (float*)out_img_cond->data; } - - int step_count = static_cast(sigmas.size()); - bool is_skiplayer_step = has_skiplayer && step > (int)(guidance.slg.layer_start * step_count) && step < (int)(guidance.slg.layer_end * step_count); - float* skip_layer_data = has_skiplayer ? (float*)out_skip->data : nullptr; + if (!img_cond.empty()) { + img_cond_out = run_condition(img_cond, + cond.c_concat.empty() ? 
nullptr : &cond.c_concat); + if (img_cond_out.empty()) { + return {}; + } + } + bool is_skiplayer_step = has_skiplayer && + step > (int)(guidance.slg.layer_start * static_cast(sigmas.size())) && + step < (int)(guidance.slg.layer_end * static_cast(sigmas.size())); if (is_skiplayer_step) { LOG_DEBUG("Skipping layers at step %d\n", step); if (!step_cache.is_step_skipped()) { - // skip layer (same as conditioned) - diffusion_params.context = cond.c_crossattn; - diffusion_params.c_concat = cond.c_concat; - diffusion_params.y = cond.c_vector; - diffusion_params.skip_layers = skip_layers; - if (!work_diffusion_model->compute(n_threads, - diffusion_params, - &out_skip)) { - LOG_ERROR("diffusion model compute failed"); - return nullptr; + skip_cond_out = run_condition(cond, + cond.c_concat.empty() ? nullptr : &cond.c_concat, + &skip_layers); + if (skip_cond_out.empty()) { + return {}; } } - skip_layer_data = (float*)out_skip->data; - } - float* vec_denoised = (float*)denoised->data; - float* vec_input = (float*)input->data; - float* positive_data = (float*)out_cond->data; - int ne_elements = (int)ggml_nelements(denoised); - - if (shifted_timestep > 0 && sd_version_is_sdxl(version)) { - int64_t shifted_t_idx = static_cast(roundf(timesteps_vec[0])); - float shifted_sigma = denoiser->t_to_sigma((float)shifted_t_idx); - std::vector shifted_scaling = denoiser->get_scalings(shifted_sigma); - float shifted_c_skip = shifted_scaling[0]; - float shifted_c_out = shifted_scaling[1]; - float shifted_c_in = shifted_scaling[2]; - - c_skip = shifted_c_skip * c_in / shifted_c_in; - c_out = shifted_c_out; } - for (int i = 0; i < ne_elements; i++) { - float latent_result = positive_data[i]; - if (has_unconditioned) { - // out_uncond + cfg_scale * (out_cond - out_uncond) - if (has_img_cond) { - // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond) - latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + 
cfg_scale * (positive_data[i] - img_cond_data[i]); - } else { - // img_cfg_scale == cfg_scale - latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); - } - } else if (has_img_cond) { - // img_cfg_scale == 1 - latent_result = img_cond_data[i] + cfg_scale * (positive_data[i] - img_cond_data[i]); + GGML_ASSERT(!cond_out.empty()); + sd::Tensor latent_result = cond_out; + if (!uncond_out.empty()) { + if (!img_cond_out.empty()) { + latent_result = uncond_out + + img_cfg_scale * (img_cond_out - uncond_out) + + cfg_scale * (cond_out - img_cond_out); + } else { + latent_result = uncond_out + cfg_scale * (cond_out - uncond_out); } - if (is_skiplayer_step) { - latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale; - } - // v = latent_result, eps = latent_result - // denoised = (v * c_out + input * c_skip) or (input + eps * c_out) - vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; + } else if (!img_cond_out.empty()) { + latent_result = img_cond_out + cfg_scale * (cond_out - img_cond_out); } + if (is_skiplayer_step && !skip_cond_out.empty()) { + latent_result += (cond_out - skip_cond_out) * slg_scale; + } + denoised = latent_result * c_out + x * c_skip; if (cache_runtime.spectrum_enabled) { cache_runtime.spectrum.update(denoised); } - - if (denoise_mask != nullptr) { - apply_mask(denoised, init_latent, denoise_mask); + if (!denoise_mask.empty()) { + denoised = denoised * denoise_mask + init_latent * (1.0f - denoise_mask); } - - if (sd_preview_cb != nullptr && sd_should_preview_denoised()) { - if (step % sd_get_preview_interval() == 0) { - preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, false); - } - } - - int64_t t1 = ggml_time_us(); - if (step > 0 || step == -(int)steps) { - int showstep = std::abs(step); - pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep); - // LOG_INFO("step %d sampling completed taking 
%.2fs", step, (t1 - t0) * 1.0f / 1000000); + if (sd_should_preview_denoised() && preview.callback != nullptr) { + preview_image(step, denoised, version, preview.mode, preview.callback, preview.data, false); } + report_sample_progress(step, steps, t0); return denoised; }; - if (!sample_k_diffusion(method, denoise, work_ctx, x, sigmas, sampler_rng, eta)) { + auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta); + if (x0_opt.empty()) { LOG_ERROR("Diffusion model sampling failed"); if (control_net) { control_net->free_control_ctx(); control_net->free_compute_buffer(); } - diffusion_model->free_compute_buffer(); - return NULL; + if (work_diffusion_model) { + work_diffusion_model->free_compute_buffer(); + } + return {}; } - size_t total_steps = sigmas.size() > 0 ? sigmas.size() - 1 : 0; - log_sample_cache_summary(cache_runtime, total_steps); - + auto x0 = std::move(x0_opt); + sd_sample::log_sample_cache_summary(cache_runtime, steps); if (inverse_noise_scaling) { - x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); + x0 = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x0); } if (control_net) { control_net->free_control_ctx(); control_net->free_compute_buffer(); } - work_diffusion_model->free_compute_buffer(); - return x; + if (work_diffusion_model) { + work_diffusion_model->free_compute_buffer(); + } + return x0; } int get_vae_scale_factor() { @@ -2409,11 +1857,10 @@ public: return (h / vae_scale_factor) * (w / vae_scale_factor); } - ggml_tensor* generate_init_latent(ggml_context* work_ctx, - int width, - int height, - int frames = 1, - bool video = false) { + sd::Tensor generate_init_latent(int width, + int height, + int frames = 1, + bool video = false) { int vae_scale_factor = get_vae_scale_factor(); int W = width / vae_scale_factor; int H = height / vae_scale_factor; @@ -2422,34 +1869,35 @@ public: T = ((T - 1) / 4) + 1; } int C = get_latent_channel(); - ggml_tensor* init_latent; if (video) { - init_latent = 
ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, T, C); - } else { - init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); + return sd::zeros({W, H, T, C, 1}); } - ggml_set_f32(init_latent, 0.f); - return init_latent; + return sd::zeros({W, H, C, 1}); } - ggml_tensor* encode_to_vae_latents(ggml_context* work_ctx, ggml_tensor* x) { - ggml_tensor* vae_output = first_stage_model->encode(n_threads, work_ctx, x, vae_tiling_params, circular_x, circular_y); - ggml_tensor* latents = first_stage_model->vae_output_to_latents(work_ctx, vae_output, rng); + sd::Tensor encode_to_vae_latents(const sd::Tensor& x) { + auto latents = first_stage_model->encode(n_threads, x, vae_tiling_params, circular_x, circular_y); + if (latents.empty()) { + return {}; + } + latents = first_stage_model->vae_output_to_latents(latents, rng); return latents; } - ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { - ggml_tensor* latents = encode_to_vae_latents(work_ctx, x); + sd::Tensor encode_first_stage(const sd::Tensor& x) { + auto latents = encode_to_vae_latents(x); + if (latents.empty()) { + return {}; + } if (version != VERSION_SD1_PIX2PIX) { - latents = first_stage_model->vae_to_diffuison_latents(work_ctx, latents); + latents = first_stage_model->vae_to_diffusion_latents(latents); } return latents; } - ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode_video = false) { - x = first_stage_model->diffusion_to_vae_latents(work_ctx, x); - x = first_stage_model->decode(n_threads, work_ctx, x, vae_tiling_params, decode_video, circular_x, circular_y); - return x; + sd::Tensor decode_first_stage(const sd::Tensor& x, bool decode_video = false) { + auto latents = first_stage_model->diffusion_to_vae_latents(x); + return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y); } void set_flow_shift(float flow_shift = INFINITY) { @@ -2966,667 +2414,216 @@ enum scheduler_t 
sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_me return DISCRETE_SCHEDULER; } -sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, - ggml_context* work_ctx, - ggml_tensor* init_latent, - std::string prompt, - std::string negative_prompt, - int clip_skip, - sd_guidance_params_t guidance, - float eta, - int shifted_timestep, - int width, - int height, - enum sample_method_t sample_method, - const std::vector& sigmas, - int64_t seed, - int batch_count, - sd_image_t control_image, - float control_strength, - sd_pm_params_t pm_params, - std::vector ref_images, - std::vector ref_latents, - bool increase_ref_index, - ggml_tensor* concat_latent = nullptr, - ggml_tensor* denoise_mask = nullptr, - const sd_cache_params_t* cache_params = nullptr) { - if (seed < 0) { - // Generally, when using the provided command line, the seed is always >0. - // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library - // by a third party with a seed <0, let's incorporate randomization here. 
- srand((int)time(nullptr)); - seed = rand(); +static int64_t resolve_seed(int64_t seed) { + if (seed >= 0) { + return seed; } - - if (!std::isfinite(guidance.img_cfg)) { - guidance.img_cfg = guidance.txt_cfg; - } - - int sample_steps = static_cast(sigmas.size() - 1); - - int64_t t0 = ggml_time_ms(); - - ConditionerParams condition_params; - condition_params.text = prompt; - condition_params.clip_skip = clip_skip; - condition_params.width = width; - condition_params.height = height; - condition_params.ref_images = ref_images; - condition_params.adm_in_channels = static_cast(sd_ctx->sd->diffusion_model->get_adm_in_channels()); - - // Photo Maker - SDCondition id_cond = sd_ctx->sd->get_pmid_conditon(work_ctx, pm_params, condition_params); - - // Get learned condition - condition_params.zero_out_masked = false; - SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, - sd_ctx->sd->n_threads, - condition_params); - - SDCondition uncond; - if (guidance.txt_cfg != 1.0 || - (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) { - bool zero_out_masked = false; - if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) { - zero_out_masked = true; - } - condition_params.text = negative_prompt; - condition_params.zero_out_masked = zero_out_masked; - uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, - sd_ctx->sd->n_threads, - condition_params); - } - int64_t t1 = ggml_time_ms(); - LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0); - - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->cond_stage_model->free_params_buffer(); - } - - // Control net hint - ggml_tensor* image_hint = nullptr; - if (control_image.data != nullptr) { - image_hint = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - sd_image_to_ggml_tensor(control_image, image_hint); - } - - // Sample - 
std::vector final_latents; // collect latents to decode - int C = sd_ctx->sd->get_latent_channel(); - int W = width / sd_ctx->sd->get_vae_scale_factor(); - int H = height / sd_ctx->sd->get_vae_scale_factor(); - - ggml_tensor* control_latent = nullptr; - if (sd_version_is_control(sd_ctx->sd->version) && image_hint != nullptr) { - control_latent = sd_ctx->sd->encode_first_stage(work_ctx, image_hint); - ggml_ext_tensor_scale_inplace(control_latent, control_strength); - } - - if (sd_version_is_inpaint(sd_ctx->sd->version)) { - int64_t mask_channels = 1; - if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - mask_channels = 8 * 8; // flatten the whole mask - } else if (sd_ctx->sd->version == VERSION_FLEX_2) { - mask_channels = 1 + init_latent->ne[2]; - } - auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1); - // no mask, set the whole image as masked - for (int64_t x = 0; x < empty_latent->ne[0]; x++) { - for (int64_t y = 0; y < empty_latent->ne[1]; y++) { - if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - // TODO: this might be wrong - for (int64_t c = 0; c < init_latent->ne[2]; c++) { - ggml_ext_tensor_set_f32(empty_latent, 0, x, y, c); - } - for (int64_t c = init_latent->ne[2]; c < empty_latent->ne[2]; c++) { - ggml_ext_tensor_set_f32(empty_latent, 1, x, y, c); - } - } else if (sd_ctx->sd->version == VERSION_FLEX_2) { - for (int64_t c = 0; c < empty_latent->ne[2]; c++) { - // 0x16,1x1,0x16 - ggml_ext_tensor_set_f32(empty_latent, c == init_latent->ne[2], x, y, c); - } - } else { - ggml_ext_tensor_set_f32(empty_latent, 1, x, y, 0); - for (int64_t c = 1; c < empty_latent->ne[2]; c++) { - ggml_ext_tensor_set_f32(empty_latent, 0, x, y, c); - } - } - } - } - - if (sd_ctx->sd->version == VERSION_FLEX_2 && control_latent != nullptr && sd_ctx->sd->control_net == nullptr) { - bool no_inpaint = concat_latent == nullptr; - if (no_inpaint) { - concat_latent = ggml_new_tensor_4d(work_ctx, 
GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1); - } - // fill in the control image here - for (int64_t x = 0; x < control_latent->ne[0]; x++) { - for (int64_t y = 0; y < control_latent->ne[1]; y++) { - if (no_inpaint) { - for (int64_t c = 0; c < concat_latent->ne[2] - control_latent->ne[2]; c++) { - // 0x16,1x1,0x16 - ggml_ext_tensor_set_f32(concat_latent, c == init_latent->ne[2], x, y, c); - } - } - for (int64_t c = 0; c < control_latent->ne[2]; c++) { - float v = ggml_ext_tensor_get_f32(control_latent, x, y, c); - ggml_ext_tensor_set_f32(concat_latent, v, x, y, concat_latent->ne[2] - control_latent->ne[2] + c); - } - } - } - } else if (concat_latent == nullptr) { - concat_latent = empty_latent; - } - cond.c_concat = concat_latent; - uncond.c_concat = empty_latent; - denoise_mask = nullptr; - } else if (sd_version_is_unet_edit(sd_ctx->sd->version)) { - auto empty_latent = ggml_dup_tensor(work_ctx, init_latent); - ggml_set_f32(empty_latent, 0); - uncond.c_concat = empty_latent; - cond.c_concat = ref_latents[0]; - if (cond.c_concat == nullptr) { - cond.c_concat = empty_latent; - } - } else if (sd_version_is_control(sd_ctx->sd->version)) { - auto empty_latent = ggml_dup_tensor(work_ctx, init_latent); - ggml_set_f32(empty_latent, 0); - uncond.c_concat = empty_latent; - if (sd_ctx->sd->control_net == nullptr) { - cond.c_concat = control_latent; - } - if (cond.c_concat == nullptr) { - cond.c_concat = empty_latent; - } - } - SDCondition img_cond; - if (uncond.c_crossattn != nullptr && - (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) { - img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat); - } - for (int b = 0; b < batch_count; b++) { - int64_t sampling_start = ggml_time_ms(); - int64_t cur_seed = seed + b; - LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed); - - sd_ctx->sd->rng->manual_seed(cur_seed); - 
sd_ctx->sd->sampler_rng->manual_seed(cur_seed); - ggml_tensor* x_t = init_latent; - ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); - ggml_ext_im_set_randn_f32(noise, sd_ctx->sd->rng); - - int start_merge_step = -1; - if (sd_ctx->sd->use_pmid) { - start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * sample_steps); - // if (start_merge_step > 30) - // start_merge_step = 30; - LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step); - } - - ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, - sd_ctx->sd->diffusion_model, - true, - x_t, - noise, - cond, - uncond, - img_cond, - image_hint, - control_strength, - guidance, - eta, - shifted_timestep, - sample_method, - sigmas, - start_merge_step, - id_cond, - ref_latents, - increase_ref_index, - denoise_mask, - nullptr, - 1.0f, - cache_params); - int64_t sampling_end = ggml_time_ms(); - if (x_0 != nullptr) { - // print_ggml_tensor(x_0); - LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); - final_latents.push_back(x_0); - } else { - LOG_ERROR("sampling for image %d/%d failed after %.2fs", b + 1, batch_count, (sampling_end - sampling_start) * 1.0f / 1000); - } - } - - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } - int64_t t3 = ggml_time_ms(); - LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000); - - // Decode to image - LOG_INFO("decoding %zu latents", final_latents.size()); - std::vector decoded_images; // collect decoded images - for (size_t i = 0; i < final_latents.size(); i++) { - t1 = ggml_time_ms(); - ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */); - // print_ggml_tensor(img); - if (img != nullptr) { - decoded_images.push_back(img); - } - int64_t t2 = ggml_time_ms(); - LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000); - } - - 
int64_t t4 = ggml_time_ms(); - LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->first_stage_model->free_params_buffer(); - } - - sd_ctx->sd->lora_stat(); - - sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t)); - if (result_images == nullptr) { - ggml_free(work_ctx); - return nullptr; - } - memset(result_images, 0, batch_count * sizeof(sd_image_t)); - - for (size_t i = 0; i < decoded_images.size(); i++) { - result_images[i].width = width; - result_images[i].height = height; - result_images[i].channel = 3; - result_images[i].data = ggml_tensor_to_sd_image(decoded_images[i]); - } - ggml_free(work_ctx); - - return result_images; + srand((int)time(nullptr)); + return rand(); } -sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) { - sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params; +static enum sample_method_t resolve_sample_method(sd_ctx_t* sd_ctx, enum sample_method_t sample_method) { + if (sample_method == SAMPLE_METHOD_COUNT) { + return sd_get_default_sample_method(sd_ctx); + } + return sample_method; +} - int width = sd_img_gen_params->width; - int height = sd_img_gen_params->height; +static scheduler_t resolve_scheduler(sd_ctx_t* sd_ctx, + scheduler_t scheduler, + enum sample_method_t sample_method) { + if (scheduler == SCHEDULER_COUNT) { + return sd_get_default_scheduler(sd_ctx, sample_method); + } + return scheduler; +} - int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor(); - int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor(); - int spatial_multiple = vae_scale_factor * diffusion_model_down_factor; +struct GenerationRequest { + std::string prompt; + std::string negative_prompt; + int width = -1; + int height = -1; + int clip_skip = -1; + int vae_scale_factor = -1; + int diffusion_model_down_factor = -1; + int64_t seed = -1; + bool use_uncond = false; 
+ bool use_img_cond = false; + bool use_high_noise_uncond = false; + bool use_high_noise_img_cond = false; + const sd_cache_params_t* cache_params = nullptr; + int batch_count = 1; + int shifted_timestep = 0; + float strength = 1.f; + float control_strength = 0.f; + float eta = 0.f; + bool increase_ref_index = false; + bool auto_resize_ref_image = false; + sd_guidance_params_t guidance = {}; + sd_guidance_params_t high_noise_guidance = {}; + sd_pm_params_t pm_params = {}; + int frames = -1; + float vace_strength = 1.f; + + GenerationRequest(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) { + prompt = SAFE_STR(sd_img_gen_params->prompt); + negative_prompt = SAFE_STR(sd_img_gen_params->negative_prompt); + width = sd_img_gen_params->width; + height = sd_img_gen_params->height; + vae_scale_factor = sd_ctx->sd->get_vae_scale_factor(); + diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor(); + seed = sd_img_gen_params->seed; + batch_count = sd_img_gen_params->batch_count; + clip_skip = sd_img_gen_params->clip_skip; + shifted_timestep = sd_img_gen_params->sample_params.shifted_timestep; + strength = sd_img_gen_params->strength; + control_strength = sd_img_gen_params->control_strength; + eta = sd_img_gen_params->sample_params.eta; + increase_ref_index = sd_img_gen_params->increase_ref_index; + auto_resize_ref_image = sd_img_gen_params->auto_resize_ref_image; + guidance = sd_img_gen_params->sample_params.guidance; + pm_params = sd_img_gen_params->pm_params; + cache_params = &sd_img_gen_params->cache; + resolve(sd_ctx); + } + + GenerationRequest(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) { + prompt = SAFE_STR(sd_vid_gen_params->prompt); + negative_prompt = SAFE_STR(sd_vid_gen_params->negative_prompt); + width = sd_vid_gen_params->width; + height = sd_vid_gen_params->height; + frames = (sd_vid_gen_params->video_frames - 1) / 4 * 4 + 1; + clip_skip = sd_vid_gen_params->clip_skip; + vae_scale_factor = 
sd_ctx->sd->get_vae_scale_factor(); + diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor(); + seed = sd_vid_gen_params->seed; + cache_params = &sd_vid_gen_params->cache; + vace_strength = sd_vid_gen_params->vace_strength; + guidance = sd_vid_gen_params->sample_params.guidance; + high_noise_guidance = sd_vid_gen_params->high_noise_sample_params.guidance; + resolve(sd_ctx); + } + + void align_generation_request_size() { + int spatial_multiple = vae_scale_factor * diffusion_model_down_factor; + int width_offset = align_up_offset(width, spatial_multiple); + int height_offset = align_up_offset(height, spatial_multiple); + if (width_offset <= 0 && height_offset <= 0) { + return; + } + + int original_width = width; + int original_height = height; - int width_offset = align_up_offset(width, spatial_multiple); - int height_offset = align_up_offset(height, spatial_multiple); - if (width_offset > 0 || height_offset > 0) { width += width_offset; height += height_offset; - LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_img_gen_params->width, sd_img_gen_params->height, width, height, spatial_multiple); + LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", + original_width, + original_height, + width, + height, + spatial_multiple); } - bool circular_x = sd_ctx->sd->circular_x; - bool circular_y = sd_ctx->sd->circular_y; - - if (!sd_img_gen_params->vae_tiling_params.enabled) { - if (sd_ctx->sd->first_stage_model) { - sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); - } - if (sd_ctx->sd->preview_vae) { - sd_ctx->sd->preview_vae->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); - } - } else { - int tile_size_x, tile_size_y; - float _overlap; - int latent_size_x = width / sd_ctx->sd->get_vae_scale_factor(); - int latent_size_y = height / sd_ctx->sd->get_vae_scale_factor(); - sd_ctx->sd->first_stage_model->get_tile_sizes(tile_size_x, tile_size_y, _overlap, 
sd_img_gen_params->vae_tiling_params, latent_size_x, latent_size_y); - - // force disable circular padding for vae if tiling is enabled unless latent is smaller than tile size - // otherwise it will cause artifacts at the edges of the tiles - sd_ctx->sd->circular_x = sd_ctx->sd->circular_x && (tile_size_x >= latent_size_x); - sd_ctx->sd->circular_y = sd_ctx->sd->circular_y && (tile_size_y >= latent_size_y); - - if (sd_ctx->sd->first_stage_model) { - sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); - } - if (sd_ctx->sd->preview_vae) { - sd_ctx->sd->preview_vae->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + static void resolve_guidance(sd_ctx_t* sd_ctx, + sd_guidance_params_t* guidance, + bool* use_uncond, + bool* use_img_cond, + const char* stage_name = nullptr) { + GGML_ASSERT(guidance != nullptr); + GGML_ASSERT(use_uncond != nullptr); + GGML_ASSERT(use_img_cond != nullptr); + // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond) + // img_cfg == txt_cfg means that img_cfg is not used + if (!std::isfinite(guidance->img_cfg)) { + guidance->img_cfg = guidance->txt_cfg; } - // disable circular tiling if it's enabled for the VAE - sd_ctx->sd->circular_x = circular_x && (tile_size_x < latent_size_x); - sd_ctx->sd->circular_y = circular_y && (tile_size_y < latent_size_y); - } - - LOG_DEBUG("generate_image %dx%d", width, height); - if (sd_ctx == nullptr || sd_img_gen_params == nullptr) { - return nullptr; - } - - ggml_init_params params; - params.mem_size = static_cast(1024 * 1024) * 1024; // 1G - params.mem_buffer = nullptr; - params.no_alloc = false; - // LOG_DEBUG("mem_size %u ", params.mem_size); - - ggml_context* work_ctx = ggml_init(params); - if (!work_ctx) { - LOG_ERROR("ggml_init() failed"); - return nullptr; - } - - int64_t seed = sd_img_gen_params->seed; - if (seed < 0) { - srand((int)time(nullptr)); - seed = rand(); - } - 
sd_ctx->sd->rng->manual_seed(seed); - sd_ctx->sd->sampler_rng->manual_seed(seed); - - size_t t0 = ggml_time_ms(); - - sd_ctx->sd->set_flow_shift(sd_img_gen_params->sample_params.flow_shift); - - // Apply lora - sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count); - - enum sample_method_t sample_method = sd_img_gen_params->sample_params.sample_method; - if (sample_method == SAMPLE_METHOD_COUNT) { - sample_method = sd_get_default_sample_method(sd_ctx); - } - LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - - int sample_steps = sd_img_gen_params->sample_params.sample_steps; - std::vector sigmas; - if (sd_img_gen_params->sample_params.custom_sigmas_count > 0) { - sigmas = std::vector(sd_img_gen_params->sample_params.custom_sigmas, - sd_img_gen_params->sample_params.custom_sigmas + sd_img_gen_params->sample_params.custom_sigmas_count); - if (sample_steps != sigmas.size() - 1) { - sample_steps = static_cast(sigmas.size()) - 1; - LOG_WARN("sample_steps != custom_sigmas_count - 1, set sample_steps to %d", sample_steps); + if (!sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version)) { + guidance->img_cfg = guidance->txt_cfg; } - } else { - scheduler_t scheduler = sd_img_gen_params->sample_params.scheduler; - if (scheduler == SCHEDULER_COUNT) { - scheduler = sd_get_default_scheduler(sd_ctx, sample_method); + + if (guidance->txt_cfg != 1.f) { + *use_uncond = true; } - sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps, - sd_ctx->sd->get_image_seq_len(height, width), - scheduler, - sd_ctx->sd->version); - } - ggml_tensor* init_latent = nullptr; - ggml_tensor* concat_latent = nullptr; - ggml_tensor* denoise_mask = nullptr; - if (sd_img_gen_params->init_image.data) { - LOG_INFO("IMG2IMG"); + if (guidance->img_cfg != guidance->txt_cfg) { + *use_img_cond = true; + *use_uncond = true; + } - size_t t_enc = static_cast(sample_steps * sd_img_gen_params->strength); - if (t_enc == sample_steps) - t_enc--; - LOG_INFO("target 
t_enc is %zu steps", t_enc); - std::vector sigma_sched; - sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end()); - sigmas = sigma_sched; - - ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1); - - sd_image_to_ggml_tensor(sd_img_gen_params->mask_image, mask_img); - sd_image_to_ggml_tensor(sd_img_gen_params->init_image, init_img); - - if (sd_version_is_inpaint(sd_ctx->sd->version)) { - int64_t mask_channels = 1; - if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - mask_channels = vae_scale_factor * vae_scale_factor; // flatten the whole mask - } else if (sd_ctx->sd->version == VERSION_FLEX_2) { - mask_channels = 1 + sd_ctx->sd->get_latent_channel(); - } - ggml_tensor* masked_latent = nullptr; - - if (sd_ctx->sd->version != VERSION_FLEX_2) { - // most inpaint models mask before vae - ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - ggml_ext_tensor_apply_mask(init_img, mask_img, masked_img); - masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); - init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + if (guidance->txt_cfg < 1.f) { + const char* prefix = stage_name == nullptr ? 
"" : stage_name; + if (guidance->txt_cfg == 0.f) { + LOG_WARN("%sunconditioned mode, images won't follow the prompt (use cfg-scale=1 for distilled models)", + prefix); } else { - // mask after vae - init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], 1); - ggml_ext_tensor_apply_mask(init_latent, mask_img, masked_latent, 0.); - } - concat_latent = ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - masked_latent->ne[0], - masked_latent->ne[1], - mask_channels + masked_latent->ne[2], - 1); - for (int ix = 0; ix < masked_latent->ne[0]; ix++) { - for (int iy = 0; iy < masked_latent->ne[1]; iy++) { - int mx = ix * vae_scale_factor; - int my = iy * vae_scale_factor; - if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - for (int k = 0; k < masked_latent->ne[2]; k++) { - float v = ggml_ext_tensor_get_f32(masked_latent, ix, iy, k); - ggml_ext_tensor_set_f32(concat_latent, v, ix, iy, k); - } - // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image - for (int x = 0; x < vae_scale_factor; x++) { - for (int y = 0; y < vae_scale_factor; y++) { - float m = ggml_ext_tensor_get_f32(mask_img, mx + x, my + y); - // TODO: check if the way the mask is flattened is correct (is it supposed to be x*vae_scale_factor+y or x+vae_scale_factor*y?) 
- // python code was using "b (h vae_scale_factor) (w vae_scale_factor) -> b (vae_scale_factor vae_scale_factor) h w" - ggml_ext_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * vae_scale_factor + y); - } - } - } else if (sd_ctx->sd->version == VERSION_FLEX_2) { - float m = ggml_ext_tensor_get_f32(mask_img, mx, my); - // masked image - for (int k = 0; k < masked_latent->ne[2]; k++) { - float v = ggml_ext_tensor_get_f32(masked_latent, ix, iy, k); - ggml_ext_tensor_set_f32(concat_latent, v, ix, iy, k); - } - // downsampled mask - ggml_ext_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2]); - // control (todo: support this) - for (int k = 0; k < masked_latent->ne[2]; k++) { - ggml_ext_tensor_set_f32(concat_latent, 0, ix, iy, masked_latent->ne[2] + 1 + k); - } - } else { - float m = ggml_ext_tensor_get_f32(mask_img, mx, my); - ggml_ext_tensor_set_f32(concat_latent, m, ix, iy, 0); - for (int k = 0; k < masked_latent->ne[2]; k++) { - float v = ggml_ext_tensor_get_f32(masked_latent, ix, iy, k); - ggml_ext_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels); - } - } - } - } - } else { - init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - } - - { - // LOG_WARN("Inpainting with a base model is not great"); - denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / vae_scale_factor, height / vae_scale_factor, 1, 1); - for (int ix = 0; ix < denoise_mask->ne[0]; ix++) { - for (int iy = 0; iy < denoise_mask->ne[1]; iy++) { - int mx = ix * vae_scale_factor; - int my = iy * vae_scale_factor; - float m = ggml_ext_tensor_get_f32(mask_img, mx, my); - ggml_ext_tensor_set_f32(denoise_mask, m, ix, iy); - } + LOG_WARN("%scfg value out of expected range may produce unexpected results", prefix); } } - } else { - LOG_INFO("TXT2IMG"); - if (sd_version_is_inpaint(sd_ctx->sd->version)) { - LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask"); - } - init_latent = 
sd_ctx->sd->generate_init_latent(work_ctx, width, height); } - sd_guidance_params_t guidance = sd_img_gen_params->sample_params.guidance; - std::vector ref_images; - for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) { - ref_images.push_back(&sd_img_gen_params->ref_images[i]); - } + void resolve(sd_ctx_t* sd_ctx) { + align_generation_request_size(); + seed = resolve_seed(seed); - std::vector empty_image_data; - sd_image_t empty_image = {(uint32_t)width, (uint32_t)height, 3, nullptr}; - if (ref_images.empty() && sd_version_is_unet_edit(sd_ctx->sd->version)) { - LOG_WARN("This model needs at least one reference image; using an empty reference"); - empty_image_data.resize(width * height * 3); - ref_images.push_back(&empty_image); - empty_image.data = empty_image_data.data(); - guidance.img_cfg = 0.f; - } - - if (ref_images.size() > 0) { - LOG_INFO("EDIT mode"); - } - - std::vector ref_latents; - for (int i = 0; i < ref_images.size(); i++) { - ggml_tensor* img; - if (sd_img_gen_params->auto_resize_ref_image) { - LOG_DEBUG("auto resize ref images"); - sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]); - int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height); - double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height); - double vae_height = vae_width * ref_image.height / ref_image.width; - - int factor = 16; - if (sd_version_is_qwen_image(sd_ctx->sd->version)) { - factor = 32; - } - - vae_height = round(vae_height / factor) * factor; - vae_width = round(vae_width / factor) * factor; - - sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast(vae_width), static_cast(vae_height)); - free(ref_image.data); - ref_image.data = nullptr; - - LOG_DEBUG("resize vae ref image %d from %dx%d to %dx%d", i, ref_image.height, ref_image.width, resized_image.height, resized_image.width); - - img = ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - resized_image.width, - resized_image.height, - 3, - 1); - 
sd_image_f32_to_ggml_tensor(resized_image, img); - free(resized_image.data); - resized_image.data = nullptr; - } else { - img = ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - ref_images[i]->width, - ref_images[i]->height, - 3, - 1); - sd_image_to_ggml_tensor(*ref_images[i], img); + resolve_guidance(sd_ctx, &guidance, &use_uncond, &use_img_cond); + if (sd_ctx->sd->high_noise_diffusion_model) { + resolve_guidance(sd_ctx, + &high_noise_guidance, + &use_high_noise_uncond, + &use_high_noise_img_cond, + "high noise: "); } - // print_ggml_tensor(img, false, "img"); - - ggml_tensor* latent = sd_ctx->sd->encode_first_stage(work_ctx, img); - ref_latents.push_back(latent); - } - - if (sd_img_gen_params->init_image.data != nullptr || sd_img_gen_params->ref_images_count > 0) { - size_t t1 = ggml_time_ms(); - LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - } - - sd_image_t* result_images = generate_image_internal(sd_ctx, - work_ctx, - init_latent, - SAFE_STR(sd_img_gen_params->prompt), - SAFE_STR(sd_img_gen_params->negative_prompt), - sd_img_gen_params->clip_skip, - guidance, - sd_img_gen_params->sample_params.eta, - sd_img_gen_params->sample_params.shifted_timestep, - width, - height, - sample_method, - sigmas, - seed, - sd_img_gen_params->batch_count, - sd_img_gen_params->control_image, - sd_img_gen_params->control_strength, - sd_img_gen_params->pm_params, - ref_images, - ref_latents, - sd_img_gen_params->increase_ref_index, - concat_latent, - denoise_mask, - &sd_img_gen_params->cache); - - // restore circular params - sd_ctx->sd->circular_x = circular_x; - sd_ctx->sd->circular_y = circular_y; - - size_t t2 = ggml_time_ms(); - - LOG_INFO("generate_image completed in %.2fs", (t2 - t0) * 1.0f / 1000); - - return result_images; -} - -SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out) { - if (sd_ctx == nullptr || sd_vid_gen_params == nullptr) { - return nullptr; - } - 
sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params; - - std::string prompt = SAFE_STR(sd_vid_gen_params->prompt); - std::string negative_prompt = SAFE_STR(sd_vid_gen_params->negative_prompt); - - int width = sd_vid_gen_params->width; - int height = sd_vid_gen_params->height; - int frames = sd_vid_gen_params->video_frames; - frames = (frames - 1) / 4 * 4 + 1; - int sample_steps = sd_vid_gen_params->sample_params.sample_steps; - - int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor(); - int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor(); - int spatial_multiple = vae_scale_factor * diffusion_model_down_factor; - - int width_offset = align_up_offset(width, spatial_multiple); - int height_offset = align_up_offset(height, spatial_multiple); - if (width_offset > 0 || height_offset > 0) { - width += width_offset; - height += height_offset; - LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_vid_gen_params->width, sd_vid_gen_params->height, width, height, spatial_multiple); - } - LOG_INFO("generate_video %dx%dx%d", width, height, frames); - - sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift); - - enum sample_method_t sample_method = sd_vid_gen_params->sample_params.sample_method; - if (sample_method == SAMPLE_METHOD_COUNT) { - sample_method = sd_get_default_sample_method(sd_ctx); - } - LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - - int high_noise_sample_steps = 0; - if (sd_ctx->sd->high_noise_diffusion_model) { - high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps; - } - - int total_steps = sample_steps; - - if (high_noise_sample_steps > 0) { - total_steps += high_noise_sample_steps; + if (shifted_timestep > 0 && !sd_version_is_sdxl(sd_ctx->sd->version)) { + LOG_WARN("timestep shifting is only supported for SDXL models!"); + shifted_timestep = 0; + } } +}; +struct SamplePlan { + enum sample_method_t sample_method = SAMPLE_METHOD_COUNT; + 
enum sample_method_t high_noise_sample_method = SAMPLE_METHOD_COUNT; + int sample_steps = 0; + int high_noise_sample_steps = 0; + int total_steps = 0; + float moe_boundary = 0.f; + int start_merge_step = -1; std::vector sigmas; - if (sd_vid_gen_params->sample_params.custom_sigmas_count > 0) { - sigmas = std::vector(sd_vid_gen_params->sample_params.custom_sigmas, - sd_vid_gen_params->sample_params.custom_sigmas + sd_vid_gen_params->sample_params.custom_sigmas_count); - if (total_steps != sigmas.size() - 1) { + + SamplePlan(sd_ctx_t* sd_ctx, + const sd_img_gen_params_t* sd_img_gen_params, + const GenerationRequest& request) { + sample_method = sd_img_gen_params->sample_params.sample_method; + sample_steps = sd_img_gen_params->sample_params.sample_steps; + resolve(sd_ctx, &request, &sd_img_gen_params->sample_params); + } + + SamplePlan(sd_ctx_t* sd_ctx, + const sd_vid_gen_params_t* sd_vid_gen_params, + const GenerationRequest& request) { + sample_method = sd_vid_gen_params->sample_params.sample_method; + sample_steps = sd_vid_gen_params->sample_params.sample_steps; + if (sd_ctx->sd->high_noise_diffusion_model) { + high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps; + high_noise_sample_method = sd_vid_gen_params->high_noise_sample_params.sample_method; + } + moe_boundary = sd_vid_gen_params->moe_boundary; + resolve(sd_ctx, &request, &sd_vid_gen_params->sample_params); + } + + void resolve(sd_ctx_t* sd_ctx, + const GenerationRequest* request, + const sd_sample_params_t* sample_params) { + sample_method = resolve_sample_method(sd_ctx, sample_method); + + total_steps = sample_steps + std::max(0, high_noise_sample_steps); + + if (sample_params->custom_sigmas_count > 0) { + sigmas = std::vector(sample_params->custom_sigmas, + sample_params->custom_sigmas + sample_params->custom_sigmas_count); total_steps = static_cast(sigmas.size()) - 1; LOG_WARN("total_steps != custom_sigmas_count - 1, set total_steps to %d", total_steps); if (sample_steps 
>= total_steps) { @@ -3637,60 +2634,559 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s high_noise_sample_steps = total_steps - sample_steps; LOG_WARN("total_steps != custom_sigmas_count - 1, set high_noise_sample_steps to %d", high_noise_sample_steps); } + } else { + scheduler_t scheduler = resolve_scheduler(sd_ctx, + sample_params->scheduler, + sample_method); + sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps, + sd_ctx->sd->get_image_seq_len(request->height, request->width), + scheduler, + sd_ctx->sd->version); } - } else { - scheduler_t scheduler = sd_vid_gen_params->sample_params.scheduler; - if (scheduler == SCHEDULER_COUNT) { - scheduler = sd_get_default_scheduler(sd_ctx, sample_method); - } - sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps, - 0, - scheduler, - sd_ctx->sd->version); - } - if (high_noise_sample_steps < 0) { - // timesteps �?sigmas for Flow models (like wan2.2 a14b) - for (size_t i = 0; i < sigmas.size(); ++i) { - if (sigmas[i] < sd_vid_gen_params->moe_boundary) { - high_noise_sample_steps = static_cast(i); - break; + if (high_noise_sample_steps < 0) { + for (size_t i = 0; i < sigmas.size(); ++i) { + if (sigmas[i] < moe_boundary) { + high_noise_sample_steps = static_cast(i); + break; + } } + LOG_DEBUG("switching from high noise model at step %d", high_noise_sample_steps); } - LOG_DEBUG("switching from high noise model at step %d", high_noise_sample_steps); + + LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); + if (high_noise_sample_steps > 0) { + high_noise_sample_method = resolve_sample_method(sd_ctx, + high_noise_sample_method); + LOG_INFO("sampling(high noise) using %s method", sampling_methods_str[high_noise_sample_method]); + } + + if (sd_ctx->sd->use_pmid) { + start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * total_steps); + LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step); + } + } +}; + +struct ImageGenerationLatents { + 
sd::Tensor init_latent; + sd::Tensor concat_latent; + sd::Tensor uncond_concat_latent; + sd::Tensor control_image; + std::vector> ref_images; + std::vector> ref_latents; + sd::Tensor denoise_mask; + sd::Tensor clip_vision_output; + sd::Tensor vace_context; + int64_t ref_image_num = 0; +}; + +struct ImageGenerationEmbeds { + SDCondition cond; + SDCondition uncond; + SDCondition img_cond; + SDCondition id_cond; +}; + +struct CircularAxesState { + bool circular_x = false; + bool circular_y = false; +}; + +static CircularAxesState configure_image_vae_axes(sd_ctx_t* sd_ctx, + const sd_img_gen_params_t* sd_img_gen_params, + const GenerationRequest& request) { + CircularAxesState original_axes = {sd_ctx->sd->circular_x, sd_ctx->sd->circular_y}; + + if (!sd_img_gen_params->vae_tiling_params.enabled) { + if (sd_ctx->sd->first_stage_model) { + sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + } + if (sd_ctx->sd->preview_vae) { + sd_ctx->sd->preview_vae->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + } + return original_axes; } - ggml_init_params params; - params.mem_size = static_cast(1024 * 1024) * 1024; // 1G - params.mem_buffer = nullptr; - params.no_alloc = false; - // LOG_DEBUG("mem_size %u ", params.mem_size); + int tile_size_x, tile_size_y; + float overlap; + int latent_size_x = request.width / request.vae_scale_factor; + int latent_size_y = request.height / request.vae_scale_factor; + sd_ctx->sd->first_stage_model->get_tile_sizes(tile_size_x, + tile_size_y, + overlap, + sd_img_gen_params->vae_tiling_params, + latent_size_x, + latent_size_y); - ggml_context* work_ctx = ggml_init(params); - if (!work_ctx) { - LOG_ERROR("ggml_init() failed"); + sd_ctx->sd->circular_x = sd_ctx->sd->circular_x && (tile_size_x >= latent_size_x); + sd_ctx->sd->circular_y = sd_ctx->sd->circular_y && (tile_size_y >= latent_size_y); + + if (sd_ctx->sd->first_stage_model) { + 
sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + } + if (sd_ctx->sd->preview_vae) { + sd_ctx->sd->preview_vae->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + } + + sd_ctx->sd->circular_x = original_axes.circular_x && (tile_size_x < latent_size_x); + sd_ctx->sd->circular_y = original_axes.circular_y && (tile_size_y < latent_size_y); + + return original_axes; +} + +static void restore_image_vae_axes(sd_ctx_t* sd_ctx, const CircularAxesState& original_axes) { + sd_ctx->sd->circular_x = original_axes.circular_x; + sd_ctx->sd->circular_y = original_axes.circular_y; +} + +class ImageVaeAxesGuard { +private: + sd_ctx_t* sd_ctx = nullptr; + CircularAxesState original_axes; + +public: + ImageVaeAxesGuard(sd_ctx_t* sd_ctx, + const sd_img_gen_params_t* sd_img_gen_params, + const GenerationRequest& request) + : sd_ctx(sd_ctx), + original_axes(configure_image_vae_axes(sd_ctx, sd_img_gen_params, request)) {} + + ~ImageVaeAxesGuard() { + restore_image_vae_axes(sd_ctx, original_axes); + } + + ImageVaeAxesGuard(const ImageVaeAxesGuard&) = delete; + ImageVaeAxesGuard& operator=(const ImageVaeAxesGuard&) = delete; +}; + +static std::optional prepare_image_generation_latents(sd_ctx_t* sd_ctx, + const sd_img_gen_params_t* sd_img_gen_params, + GenerationRequest* request, + SamplePlan* plan) { + int64_t prepare_start_ms = ggml_time_ms(); + + sd::Tensor init_image_tensor; + sd::Tensor control_image_tensor; + sd::Tensor mask_image_tensor; + + if (sd_img_gen_params->init_image.data != nullptr) { + LOG_INFO("IMG2IMG"); + + if (request->strength < 1.f) { + size_t t_enc = static_cast(plan->sample_steps * request->strength); + if (t_enc == static_cast(plan->sample_steps)) { + t_enc--; + } + LOG_INFO("target t_enc is %zu steps", t_enc); + std::vector sigma_sched; + sigma_sched.assign(plan->sigmas.begin() + plan->sample_steps - t_enc - 1, plan->sigmas.end()); + plan->sigmas = std::move(sigma_sched); + plan->sample_steps = 
static_cast(plan->sigmas.size() - 1); + } + + init_image_tensor = sd_image_to_tensor(sd_img_gen_params->init_image, request->width, request->height); + } + + if (sd_img_gen_params->mask_image.data != nullptr) { + mask_image_tensor = sd_image_to_tensor(sd_img_gen_params->mask_image, request->width, request->height); + mask_image_tensor = sd::ops::round(mask_image_tensor); + } + + if (sd_img_gen_params->control_image.data != nullptr) { + control_image_tensor = sd_image_to_tensor(sd_img_gen_params->control_image, request->width, request->height); + } + + if (init_image_tensor.empty() || mask_image_tensor.empty()) { + if (sd_version_is_inpaint(sd_ctx->sd->version)) { + LOG_WARN("inpainting model requires both an init image and a mask image."); + } + } + + if (mask_image_tensor.empty()) { + mask_image_tensor = sd::full({request->width, request->height, 1, 1}, 1.f); + } + + sd::Tensor latent_mask = sd::ops::interpolate(mask_image_tensor, + {request->width / request->vae_scale_factor, + request->height / request->vae_scale_factor, + 1, + 1}); + + sd::Tensor init_latent; + sd::Tensor control_latent; + if (init_image_tensor.empty()) { + init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height); + } else { + init_latent = sd_ctx->sd->encode_first_stage(init_image_tensor); + if (init_latent.empty()) { + LOG_ERROR("failed to encode init image"); + return std::nullopt; + } + } + + if (!control_image_tensor.empty() && !sd_ctx->sd->vae_decode_only) { + control_latent = sd_ctx->sd->encode_first_stage(control_image_tensor); + if (control_latent.empty()) { + LOG_ERROR("failed to encode control image"); + return std::nullopt; + } + } + + std::vector> ref_images; + for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) { + ref_images.push_back(sd_image_to_tensor(sd_img_gen_params->ref_images[i])); + } + + if (ref_images.empty() && sd_version_is_unet_edit(sd_ctx->sd->version)) { + LOG_WARN("This model needs at least one reference image; using an empty 
reference"); + ref_images.push_back(sd::zeros({request->width, request->height, 3, 1})); + request->guidance.img_cfg = request->guidance.txt_cfg; + } + + if (!ref_images.empty()) { + LOG_INFO("EDIT mode"); + } + + std::vector> ref_latents; + for (size_t i = 0; i < ref_images.size(); i++) { + sd::Tensor ref_latent; + if (request->auto_resize_ref_image) { + LOG_DEBUG("auto resize ref images"); + int vae_image_size = std::min(1024 * 1024, request->width * request->height); + double vae_width = sqrt(vae_image_size * ref_images[i].shape()[0] / ref_images[i].shape()[1]); + double vae_height = vae_width * ref_images[i].shape()[1] / ref_images[i].shape()[0]; + + int factor = sd_version_is_qwen_image(sd_ctx->sd->version) ? 32 : 16; + vae_height = round(vae_height / factor) * factor; + vae_width = round(vae_width / factor) * factor; + + auto resized_ref_img = sd::ops::interpolate(ref_images[i], + {static_cast(vae_width), static_cast(vae_height), 3, 1}); + + LOG_DEBUG("resize vae ref image %d from %" PRId64 "x%" PRId64 " to %" PRId64 "x%" PRId64, + static_cast(i), + ref_images[i].shape()[1], + ref_images[i].shape()[0], + resized_ref_img.shape()[1], + resized_ref_img.shape()[0]); + + ref_latent = sd_ctx->sd->encode_first_stage(resized_ref_img); + } else { + ref_latent = sd_ctx->sd->encode_first_stage(ref_images[i]); + } + if (ref_latent.empty()) { + LOG_ERROR("failed to encode reference image %d", static_cast(i)); + return std::nullopt; + } + + ref_latents.push_back(std::move(ref_latent)); + } + + sd::Tensor concat_latent; + sd::Tensor uncond_concat_latent; + if (sd_version_is_inpaint(sd_ctx->sd->version)) { + sd::Tensor masked_init_latent; + + if (sd_ctx->sd->version != VERSION_FLEX_2) { + if (!init_image_tensor.empty()) { + auto masked_image = ((1.0f - mask_image_tensor) * (init_image_tensor - 0.5f)) + 0.5f; + masked_init_latent = sd_ctx->sd->encode_first_stage(masked_image); + if (masked_init_latent.empty()) { + LOG_ERROR("failed to encode masked init image"); + return 
std::nullopt; + } + } else { + masked_init_latent = sd::Tensor::zeros_like(init_latent); + } + } else { + masked_init_latent = ((1.0f - latent_mask) * init_latent); + } + + auto uncond_masked_init_latent = sd::Tensor::zeros_like(masked_init_latent); + + if (sd_ctx->sd->version == VERSION_FLUX_FILL) { + auto mask = mask_image_tensor.reshape({request->vae_scale_factor, + request->width / request->vae_scale_factor, + request->vae_scale_factor, + request->height / request->vae_scale_factor}); + mask = mask.permute({1, 3, 0, 2}).reshape({request->width / request->vae_scale_factor, request->height / request->vae_scale_factor, request->vae_scale_factor * request->vae_scale_factor, 1}); + + concat_latent = sd::ops::concat(masked_init_latent, mask, 2); + uncond_concat_latent = sd::ops::concat(uncond_masked_init_latent, mask, 2); + } else if (sd_ctx->sd->version == VERSION_FLEX_2) { + concat_latent = sd::ops::concat(masked_init_latent, latent_mask, 2); + if (!control_latent.empty()) { + concat_latent = sd::ops::concat(concat_latent, control_latent, 2); + } else { + concat_latent = sd::ops::concat(concat_latent, sd::Tensor::zeros_like(masked_init_latent), 2); + } + + uncond_concat_latent = sd::ops::concat(uncond_masked_init_latent, latent_mask, 2); + uncond_concat_latent = sd::ops::concat(uncond_concat_latent, sd::Tensor::zeros_like(masked_init_latent), 2); + } else { // SD1.x SD2.x SDXL inpaint + concat_latent = sd::ops::concat(latent_mask, masked_init_latent, 2); + uncond_concat_latent = sd::ops::concat(latent_mask, uncond_masked_init_latent, 2); + } + } + if (sd_version_is_unet_edit(sd_ctx->sd->version)) { + concat_latent = sd::ops::interpolate(ref_latents[0], init_latent.shape()); + uncond_concat_latent = sd::Tensor::zeros_like(concat_latent); + } + if (sd_version_is_control(sd_ctx->sd->version)) { + if (!control_latent.empty()) { + concat_latent = control_latent; + } else { + concat_latent = sd::Tensor::zeros_like(init_latent); + } + uncond_concat_latent = 
sd::Tensor::zeros_like(concat_latent); + } + + if (sd_img_gen_params->init_image.data != nullptr || sd_img_gen_params->ref_images_count > 0) { + int64_t t1 = ggml_time_ms(); + LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - prepare_start_ms) * 1.0f / 1000); + } + + ImageGenerationLatents latents; + latents.init_latent = std::move(init_latent); + latents.concat_latent = std::move(concat_latent); + latents.uncond_concat_latent = std::move(uncond_concat_latent); + latents.control_image = std::move(control_image_tensor); + latents.ref_images = std::move(ref_images); + latents.ref_latents = std::move(ref_latents); + + if (sd_version_is_inpaint(sd_ctx->sd->version)) { + latents.denoise_mask = std::move(latent_mask); + } + + return latents; +} + +static std::optional prepare_image_generation_embeds(sd_ctx_t* sd_ctx, + const sd_img_gen_params_t* sd_img_gen_params, + GenerationRequest* request, + SamplePlan* plan, + ImageGenerationLatents* latents) { + ConditionerParams condition_params; + condition_params.text = request->prompt; + condition_params.clip_skip = request->clip_skip; + condition_params.width = request->width; + condition_params.height = request->height; + condition_params.ref_images = &latents->ref_images; + condition_params.adm_in_channels = static_cast(sd_ctx->sd->diffusion_model->get_adm_in_channels()); + + auto id_cond = sd_ctx->sd->get_pmid_conditon(request->pm_params, condition_params); + int64_t prepare_start_ms = ggml_time_ms(); + condition_params.zero_out_masked = false; + auto cond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads, + condition_params); + if (cond.c_concat.empty()) { + cond.c_concat = latents->concat_latent; // TODO: optimize + } + + SDCondition uncond; + if (request->use_uncond || request->use_high_noise_uncond) { + bool zero_out_masked = false; + if (sd_version_is_sdxl(sd_ctx->sd->version) && + request->negative_prompt.empty() && + !sd_ctx->sd->is_using_edm_v_parameterization) { + 
zero_out_masked = true; + } + condition_params.text = request->negative_prompt; + condition_params.zero_out_masked = zero_out_masked; + uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads, + condition_params); + if (uncond.c_concat.empty()) { + uncond.c_concat = latents->uncond_concat_latent; // TODO: optimize + } + } + + int64_t t1 = ggml_time_ms(); + LOG_INFO("get_learned_condition completed, taking %.2fs", (t1 - prepare_start_ms) * 1.0f / 1000); + + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->cond_stage_model->free_params_buffer(); + } + + ImageGenerationEmbeds embeds; + if (request->use_img_cond) { + embeds.img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat); + } + embeds.cond = std::move(cond); + embeds.uncond = std::move(uncond); + embeds.id_cond = std::move(id_cond); + + return embeds; +} + +static sd_image_t* decode_image_outputs(sd_ctx_t* sd_ctx, + const GenerationRequest& request, + const std::vector>& final_latents) { + if (final_latents.size() != static_cast(request.batch_count)) { + LOG_ERROR("expected %d latents, got %zu", request.batch_count, final_latents.size()); + return nullptr; + } + LOG_INFO("decoding %zu latents", final_latents.size()); + std::vector> decoded_images; + int64_t t0 = ggml_time_ms(); + + for (size_t i = 0; i < final_latents.size(); i++) { + int64_t t1 = ggml_time_ms(); + sd::Tensor image = sd_ctx->sd->decode_first_stage(final_latents[i]); + if (image.empty()) { + LOG_ERROR("decode_first_stage failed for latent %" PRId64, i + 1); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->first_stage_model->free_params_buffer(); + } + return nullptr; + } + decoded_images.push_back(std::move(image)); + int64_t t2 = ggml_time_ms(); + LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000); + } + + int64_t t4 = ggml_time_ms(); + LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t0) * 1.0f / 1000); + if 
(sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->first_stage_model->free_params_buffer(); + } + + sd_image_t* result_images = (sd_image_t*)calloc(request.batch_count, sizeof(sd_image_t)); + if (result_images == nullptr) { + return nullptr; + } + memset(result_images, 0, request.batch_count * sizeof(sd_image_t)); + + for (size_t i = 0; i < decoded_images.size(); i++) { + result_images[i] = tensor_to_sd_image(decoded_images[i]); + } + + return result_images; +} + +SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) { + if (sd_ctx == nullptr || sd_img_gen_params == nullptr) { return nullptr; } - int64_t seed = sd_vid_gen_params->seed; - if (seed < 0) { - seed = (int)time(nullptr); + int64_t t0 = ggml_time_ms(); + sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params; + GenerationRequest request(sd_ctx, sd_img_gen_params); + LOG_INFO("generate_image %dx%d", request.width, request.height); + + sd_ctx->sd->rng->manual_seed(request.seed); + sd_ctx->sd->sampler_rng->manual_seed(request.seed); + sd_ctx->sd->set_flow_shift(sd_img_gen_params->sample_params.flow_shift); + sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count); + + ImageVaeAxesGuard axes_guard(sd_ctx, sd_img_gen_params, request); + + SamplePlan plan(sd_ctx, sd_img_gen_params, request); + auto latents_opt = prepare_image_generation_latents(sd_ctx, + sd_img_gen_params, + &request, + &plan); + if (!latents_opt.has_value()) { + return nullptr; + } + ImageGenerationLatents latents = std::move(*latents_opt); + + auto embeds_opt = prepare_image_generation_embeds(sd_ctx, + sd_img_gen_params, + &request, + &plan, + &latents); + if (!embeds_opt.has_value()) { + return nullptr; + } + ImageGenerationEmbeds embeds = std::move(*embeds_opt); + + std::vector> final_latents; + int64_t denoise_start = ggml_time_ms(); + for (int b = 0; b < request.batch_count; b++) { + int64_t sampling_start = ggml_time_ms(); + int64_t cur_seed = 
request.seed + b; + LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, request.batch_count, cur_seed); + + sd_ctx->sd->rng->manual_seed(cur_seed); + sd_ctx->sd->sampler_rng->manual_seed(cur_seed); + sd::Tensor noise = sd::randn_like(latents.init_latent, sd_ctx->sd->rng); + + sd::Tensor x_0 = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model, + true, + latents.init_latent, + std::move(noise), + embeds.cond, + embeds.uncond, + embeds.img_cond, + embeds.id_cond, + latents.control_image, + request.control_strength, + request.guidance, + request.eta, + request.shifted_timestep, + plan.sample_method, + plan.sigmas, + plan.start_merge_step, + latents.ref_latents, + request.increase_ref_index, + latents.denoise_mask, + sd::Tensor(), + 1.f, + request.cache_params); + int64_t sampling_end = ggml_time_ms(); + if (!x_0.empty()) { + LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); + final_latents.push_back(std::move(x_0)); + continue; + } + + LOG_ERROR("sampling for image %d/%d failed after %.2fs", + b + 1, + request.batch_count, + (sampling_end - sampling_start) * 1.0f / 1000); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->diffusion_model->free_params_buffer(); + } + return nullptr; + } + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->diffusion_model->free_params_buffer(); + } + int64_t denoise_end = ggml_time_ms(); + LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", + final_latents.size(), + (denoise_end - denoise_start) * 1.0f / 1000); + + auto result = decode_image_outputs(sd_ctx, request, final_latents); + if (result == nullptr) { + return nullptr; } - sd_ctx->sd->rng->manual_seed(seed); - sd_ctx->sd->sampler_rng->manual_seed(seed); + sd_ctx->sd->lora_stat(); - int64_t t0 = ggml_time_ms(); + int64_t t1 = ggml_time_ms(); + LOG_INFO("generate_image completed in %.2fs", (t1 - t0) * 1.0f / 1000); + return result; +} - // Apply lora - sd_ctx->sd->apply_loras(sd_vid_gen_params->loras, 
sd_vid_gen_params->lora_count); +static std::optional prepare_video_generation_latents(sd_ctx_t* sd_ctx, + const sd_vid_gen_params_t* sd_vid_gen_params, + GenerationRequest* request) { + ImageGenerationLatents latents; + int64_t prepare_start_ms = ggml_time_ms(); + + sd::Tensor start_image; + sd::Tensor end_image; + + if (sd_vid_gen_params->init_image.data) { + start_image = sd_image_to_tensor(sd_vid_gen_params->init_image, request->width, request->height); + } + + if (sd_vid_gen_params->end_image.data) { + end_image = sd_image_to_tensor(sd_vid_gen_params->end_image, request->width, request->height); + } - ggml_tensor* init_latent = nullptr; - ggml_tensor* clip_vision_output = nullptr; - ggml_tensor* concat_latent = nullptr; - ggml_tensor* denoise_mask = nullptr; - ggml_tensor* vace_context = nullptr; - int64_t ref_image_num = 0; // for vace if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B" || sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-I2V-14B" || sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-1.3B" || @@ -3700,331 +3196,370 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B" || sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-1.3B" || sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-FLF2V-14B") { - if (sd_vid_gen_params->init_image.data) { - clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, -2); + if (!start_image.empty()) { + auto clip_vision_output = sd_ctx->sd->get_clip_vision_output(start_image, false, -2); + if (clip_vision_output.empty()) { + LOG_ERROR("failed to compute clip vision output for init image"); + return std::nullopt; + } + latents.clip_vision_output = std::move(clip_vision_output); } else { - clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, -2, true); + latents.clip_vision_output = 
sd_ctx->sd->get_clip_vision_output(start_image, false, -2, true); } if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-FLF2V-14B") { - ggml_tensor* end_image_clip_vision_output = nullptr; - if (sd_vid_gen_params->end_image.data) { - end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->end_image, false, -2); + sd::Tensor end_image_clip_vision_output; + if (!end_image.empty()) { + end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(end_image, false, -2); + if (end_image_clip_vision_output.empty()) { + LOG_ERROR("failed to compute clip vision output for end image"); + return std::nullopt; + } } else { - end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->end_image, false, -2, true); + end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(end_image, false, -2, true); } - clip_vision_output = ggml_ext_tensor_concat(work_ctx, clip_vision_output, end_image_clip_vision_output, 1); + latents.clip_vision_output = sd::ops::concat(latents.clip_vision_output, end_image_clip_vision_output, 1); } int64_t t1 = ggml_time_ms(); - LOG_INFO("get_clip_vision_output completed, taking %" PRId64 " ms", t1 - t0); + LOG_INFO("get_clip_vision_output completed, taking %" PRId64 " ms", t1 - prepare_start_ms); } - int64_t t1 = ggml_time_ms(); - ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 3); - ggml_ext_tensor_iter(image, [&](ggml_tensor* image, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = 0.5f; - if (i2 == 0 && sd_vid_gen_params->init_image.data) { // start image - value = *(sd_vid_gen_params->init_image.data + i1 * width * 3 + i0 * 3 + i3); - value /= 255.f; - } else if (i2 == frames - 1 && sd_vid_gen_params->end_image.data) { - value = *(sd_vid_gen_params->end_image.data + i1 * width * 3 + i0 * 3 + i3); - value /= 255.f; - } - ggml_ext_tensor_set_f32(image, value, i0, i1, i2, i3); - }); + int64_t t1 = 
ggml_time_ms(); + sd::Tensor image = sd::full({request->width, request->height, request->frames, 3, 1}, 0.5f); + if (!start_image.empty()) { + sd::ops::slice_assign(&image, 2, 0, 1, start_image.unsqueeze(2)); + } + if (!end_image.empty()) { + sd::ops::slice_assign(&image, 2, request->frames - 1, request->frames, end_image.unsqueeze(2)); + } - concat_latent = sd_ctx->sd->encode_first_stage(work_ctx, image); // [b*c, t, h/vae_scale_factor, w/vae_scale_factor] + auto concat_latent = sd_ctx->sd->encode_first_stage(image); // [b, c, t, h/vae_scale_factor, w/vae_scale_factor] + if (concat_latent.empty()) { + LOG_ERROR("failed to encode video conditioning frames"); + return std::nullopt; + } + latents.concat_latent = std::move(concat_latent); int64_t t2 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1); - ggml_tensor* concat_mask = ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - concat_latent->ne[0], - concat_latent->ne[1], - concat_latent->ne[2], - 4); // [b*4, t, w/vae_scale_factor, h/vae_scale_factor] - ggml_ext_tensor_iter(concat_mask, [&](ggml_tensor* concat_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = 0.0f; - if (i2 == 0 && sd_vid_gen_params->init_image.data) { // start image - value = 1.0f; - } else if (i2 == frames - 1 && sd_vid_gen_params->end_image.data && i3 == 3) { - value = 1.0f; - } - ggml_ext_tensor_set_f32(concat_mask, value, i0, i1, i2, i3); - }); - - concat_latent = ggml_ext_tensor_concat(work_ctx, concat_mask, concat_latent, 3); // [b*(c+4), t, h/vae_scale_factor, w/vae_scale_factor] - } else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-TI2V-5B" && sd_vid_gen_params->init_image.data) { + sd::Tensor concat_mask = sd::zeros({latents.concat_latent.shape()[0], + latents.concat_latent.shape()[1], + latents.concat_latent.shape()[2], + 4, + 1}); // [b, 4, t, h/vae_scale_factor, w/vae_scale_factor] + if (!start_image.empty()) { + sd::ops::fill_slice(&concat_mask, 2, 0, 1, 1.0f); + } + 
if (!end_image.empty()) { + auto last_channel = sd::ops::slice(concat_mask, 3, 3, 4); + sd::ops::fill_slice(&last_channel, 2, last_channel.shape()[2] - 1, last_channel.shape()[2], 1.0f); + sd::ops::slice_assign(&concat_mask, 3, 3, 4, last_channel); + } + latents.concat_latent = sd::ops::concat(concat_mask, latents.concat_latent, 3); // [b, 4+c, t, h/vae_scale_factor, w/vae_scale_factor] + } else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-TI2V-5B" && !start_image.empty()) { LOG_INFO("IMG2VID"); - int64_t t1 = ggml_time_ms(); - ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - sd_image_to_ggml_tensor(sd_vid_gen_params->init_image, init_img); - init_img = ggml_reshape_4d(work_ctx, init_img, width, height, 1, 3); + int64_t t1 = ggml_time_ms(); + auto init_img = start_image.reshape({start_image.shape()[0], start_image.shape()[1], 1, start_image.shape()[2], 1}); + auto init_image_latent = sd_ctx->sd->encode_first_stage(init_img); // [b, c, 1, h/vae_scale_factor, w/vae_scale_factor] + if (init_image_latent.empty()) { + LOG_ERROR("failed to encode init video frame"); + return std::nullopt; + } - auto init_image_latent = sd_ctx->sd->encode_to_vae_latents(work_ctx, init_img); // [b*c, 1, h/16, w/16] + latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true); // [b, c, t, h/vae_scale_factor, w/vae_scale_factor] + sd::ops::slice_assign(&latents.init_latent, 2, 0, init_image_latent.shape()[2], init_image_latent); - init_latent = sd_ctx->sd->generate_init_latent(work_ctx, width, height, frames, true); - denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], 1); - ggml_set_f32(denoise_mask, 1.f); - - init_latent = sd_ctx->sd->first_stage_model->diffusion_to_vae_latents(work_ctx, init_latent); - - ggml_ext_tensor_iter(init_image_latent, [&](ggml_tensor* t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - 
float value = ggml_ext_tensor_get_f32(t, i0, i1, i2, i3); - ggml_ext_tensor_set_f32(init_latent, value, i0, i1, i2, i3); - if (i3 == 0) { - ggml_ext_tensor_set_f32(denoise_mask, 0.f, i0, i1, i2, i3); - } - }); - - init_latent = sd_ctx->sd->first_stage_model->vae_to_diffuison_latents(work_ctx, init_latent); + latents.denoise_mask = sd::full({latents.init_latent.shape()[0], latents.init_latent.shape()[1], latents.init_latent.shape()[2], 1, 1}, 1.f); + sd::ops::fill_slice(&latents.denoise_mask, 2, 0, init_image_latent.shape()[2], 0.0f); int64_t t2 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1); } else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-VACE-1.3B" || sd_ctx->sd->diffusion_model->get_desc() == "Wan2.x-VACE-14B") { LOG_INFO("VACE"); - int64_t t1 = ggml_time_ms(); - ggml_tensor* ref_image_latent = nullptr; - if (sd_vid_gen_params->init_image.data) { - ggml_tensor* ref_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - sd_image_to_ggml_tensor(sd_vid_gen_params->init_image, ref_img); - ref_img = ggml_reshape_4d(work_ctx, ref_img, width, height, 1, 3); - - ref_image_latent = sd_ctx->sd->encode_first_stage(work_ctx, ref_img); // [b*c, 1, h/16, w/16] - auto zero_latent = ggml_dup_tensor(work_ctx, ref_image_latent); - ggml_set_f32(zero_latent, 0.f); - ref_image_latent = ggml_ext_tensor_concat(work_ctx, ref_image_latent, zero_latent, 3); // [b*2*c, 1, h/16, w/16] + int64_t t1 = ggml_time_ms(); + sd::Tensor ref_image_latent; + if (!start_image.empty()) { + auto ref_img = start_image.reshape({start_image.shape()[0], start_image.shape()[1], 1, start_image.shape()[2], 1}); + auto encoded_ref = sd_ctx->sd->encode_first_stage(ref_img); // [b, c, 1, h/vae_scale_factor, w/vae_scale_factor] + if (encoded_ref.empty()) { + LOG_ERROR("failed to encode VACE reference image"); + return std::nullopt; + } + ref_image_latent = sd::ops::concat(encoded_ref, sd::zeros(encoded_ref.shape()), 3); // [b, 2*c, 1, 
h/vae_scale_factor, w/vae_scale_factor] } - ggml_tensor* control_video = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 3); - ggml_ext_tensor_iter(control_video, [&](ggml_tensor* control_video, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = 0.5f; - if (i2 < sd_vid_gen_params->control_frames_size) { - value = sd_image_get_f32(sd_vid_gen_params->control_frames[i2], i0, i1, i3); - } - ggml_ext_tensor_set_f32(control_video, value, i0, i1, i2, i3); - }); - ggml_tensor* mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 1); - ggml_set_f32(mask, 1.0f); - ggml_tensor* inactive = ggml_dup_tensor(work_ctx, control_video); - ggml_tensor* reactive = ggml_dup_tensor(work_ctx, control_video); + sd::Tensor control_video = sd::full({request->width, request->height, request->frames, 3, 1}, 0.5f); + int64_t control_frame_count = std::min(request->frames, sd_vid_gen_params->control_frames_size); + for (int64_t i = 0; i < control_frame_count; ++i) { + auto control_frame = sd_image_to_tensor(sd_vid_gen_params->control_frames[i], request->width, request->height); + sd::ops::slice_assign(&control_video, 2, i, i + 1, control_frame.unsqueeze(2)); + } - ggml_ext_tensor_iter(control_video, [&](ggml_tensor* t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float control_video_value = ggml_ext_tensor_get_f32(t, i0, i1, i2, i3) - 0.5f; - float mask_value = ggml_ext_tensor_get_f32(mask, i0, i1, i2, 0); - float inactive_value = (control_video_value * (1.f - mask_value)) + 0.5f; - float reactive_value = (control_video_value * mask_value) + 0.5f; + sd::Tensor mask = sd::full({request->width, request->height, request->frames, 1, 1}, 1.0f); - ggml_ext_tensor_set_f32(inactive, inactive_value, i0, i1, i2, i3); - ggml_ext_tensor_set_f32(reactive, reactive_value, i0, i1, i2, i3); - }); + control_video = control_video - 0.5f; + sd::Tensor inactive = control_video * (1.0f - mask) + 0.5f; + sd::Tensor reactive = control_video * mask + 0.5f; 
- inactive = sd_ctx->sd->encode_first_stage(work_ctx, inactive); // [b*c, t, h/vae_scale_factor, w/vae_scale_factor] - reactive = sd_ctx->sd->encode_first_stage(work_ctx, reactive); // [b*c, t, h/vae_scale_factor, w/vae_scale_factor] + inactive = sd_ctx->sd->encode_first_stage(inactive); // [b, c, t, h/vae_scale_factor, w/vae_scale_factor] + if (inactive.empty()) { + LOG_ERROR("failed to encode VACE inactive context"); + return std::nullopt; + } - int64_t length = inactive->ne[2]; - if (ref_image_latent) { + reactive = sd_ctx->sd->encode_first_stage(reactive); // [b, c, t, h/vae_scale_factor, w/vae_scale_factor] + if (reactive.empty()) { + LOG_ERROR("failed to encode VACE reactive context"); + return std::nullopt; + } + + int64_t length = inactive.shape()[2]; + if (!ref_image_latent.empty()) { length += 1; - frames = static_cast((length - 1) * 4 + 1); - ref_image_num = 1; + request->frames = static_cast((length - 1) * 4 + 1); + latents.ref_image_num = 1; } - vace_context = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, inactive->ne[0], inactive->ne[1], length, 96); // [b*96, t, h/vae_scale_factor, w/vae_scale_factor] - ggml_ext_tensor_iter(vace_context, [&](ggml_tensor* vace_context, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value; - if (i3 < 32) { - if (ref_image_latent && i2 == 0) { - value = ggml_ext_tensor_get_f32(ref_image_latent, i0, i1, 0, i3); - } else { - if (i3 < 16) { - value = ggml_ext_tensor_get_f32(inactive, i0, i1, i2 - ref_image_num, i3); - } else { - value = ggml_ext_tensor_get_f32(reactive, i0, i1, i2 - ref_image_num, i3 - 16); - } - } - } else { // mask - if (ref_image_latent && i2 == 0) { - value = 0.f; - } else { - int64_t vae_stride = vae_scale_factor; - int64_t mask_height_index = i1 * vae_stride + (i3 - 32) / vae_stride; - int64_t mask_width_index = i0 * vae_stride + (i3 - 32) % vae_stride; - value = ggml_ext_tensor_get_f32(mask, mask_width_index, mask_height_index, i2 - ref_image_num, 0); - } - } - 
ggml_ext_tensor_set_f32(vace_context, value, i0, i1, i2, i3); - }); - int64_t t2 = ggml_time_ms(); + auto vace_context = sd::ops::concat(inactive, reactive, 3); // [b, 2*c, t, h/vae_scale_factor, w/vae_scale_factor] + + mask = sd::full({request->width, request->height, inactive.shape()[2], 1, 1}, 1.0f); + auto mask_context = mask.reshape({request->vae_scale_factor, + inactive.shape()[0], + request->vae_scale_factor, + inactive.shape()[1], + inactive.shape()[2]}); // [t, h/vae_scale_factor, vae_scale_factor, w/vae_scale_factor, vae_scale_factor] + mask_context = mask_context.permute({1, 3, 4, 0, 2}) // [vae_scale_factor, vae_scale_factor, t, h/vae_scale_factor, w/vae_scale_factor] + .reshape({inactive.shape()[0], + inactive.shape()[1], + inactive.shape()[2], + request->vae_scale_factor * request->vae_scale_factor}); // [vae_scale_factor*vae_scale_factor, t, h/vae_scale_factor, w/vae_scale_factor] + + if (!ref_image_latent.empty()) { + vace_context = sd::ops::concat(ref_image_latent, vace_context, 2); // [b, 2*c, t+1, h/vae_scale_factor, w/vae_scale_factor] + auto mask_pad = sd::zeros({mask_context.shape()[0], + mask_context.shape()[1], + 1, + mask_context.shape()[3]}); // [vae_scale_factor*vae_scale_factor, 1, h/vae_scale_factor, w/vae_scale_factor] + mask_context = sd::ops::concat(mask_pad, mask_context, 2); // [vae_scale_factor*vae_scale_factor, t + 1, h/vae_scale_factor, w/vae_scale_factor] + } + + mask_context.unsqueeze_(mask_context.dim()); // [b, vae_scale_factor*vae_scale_factor, t + 1 or t, h/vae_scale_factor, w/vae_scale_factor] + + latents.vace_context = sd::ops::concat(vace_context, mask_context, 3); // [b, 2*c + vae_scale_factor*vae_scale_factor, t + 1 or t, h/vae_scale_factor, w/vae_scale_factor] + int64_t t2 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1); } - if (init_latent == nullptr) { - init_latent = sd_ctx->sd->generate_init_latent(work_ctx, width, height, frames, true); + if 
(latents.init_latent.empty()) { + latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true); } - // Get learned condition + return latents; +} + +static ImageGenerationEmbeds prepare_video_generation_embeds(sd_ctx_t* sd_ctx, + const sd_vid_gen_params_t* sd_vid_gen_params, + const GenerationRequest& request, + const ImageGenerationLatents& latents) { + ImageGenerationEmbeds embeds; ConditionerParams condition_params; - condition_params.clip_skip = sd_vid_gen_params->clip_skip; + condition_params.clip_skip = request.clip_skip; + condition_params.text = request.prompt; condition_params.zero_out_masked = true; - condition_params.text = prompt; - int64_t t1 = ggml_time_ms(); - SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, - sd_ctx->sd->n_threads, - condition_params); - cond.c_concat = concat_latent; - cond.c_vector = clip_vision_output; - SDCondition uncond; - if (sd_vid_gen_params->sample_params.guidance.txt_cfg != 1.0 || sd_vid_gen_params->high_noise_sample_params.guidance.txt_cfg != 1.0) { - condition_params.text = negative_prompt; - uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, - sd_ctx->sd->n_threads, - condition_params); - uncond.c_concat = concat_latent; - uncond.c_vector = clip_vision_output; + int64_t prepare_start_ms = ggml_time_ms(); + embeds.cond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads, + condition_params); + embeds.cond.c_concat = latents.concat_latent; + embeds.cond.c_vector = latents.clip_vision_output; + if (request.use_uncond) { + condition_params.text = request.negative_prompt; + embeds.uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads, + condition_params); + embeds.uncond.c_concat = latents.concat_latent; + embeds.uncond.c_vector = latents.clip_vision_output; } - int64_t t2 = ggml_time_ms(); - LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t2 - t1); + + 
int64_t t1 = ggml_time_ms(); + LOG_INFO("get_learned_condition completed, taking %.2fs", (t1 - prepare_start_ms) * 1.0f / 1000); if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->cond_stage_model->free_params_buffer(); } + return embeds; +} - int W = width / vae_scale_factor; - int H = height / vae_scale_factor; - int T = static_cast(init_latent->ne[2]); - int C = sd_ctx->sd->get_latent_channel(); - - ggml_tensor* final_latent; - ggml_tensor* x_t = init_latent; - ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, T, C); - ggml_ext_im_set_randn_f32(noise, sd_ctx->sd->rng); - // High Noise Sample - if (high_noise_sample_steps > 0) { - LOG_DEBUG("sample(high noise) %dx%dx%d", W, H, T); - enum sample_method_t high_noise_sample_method = sd_vid_gen_params->high_noise_sample_params.sample_method; - if (high_noise_sample_method == SAMPLE_METHOD_COUNT) { - high_noise_sample_method = sd_get_default_sample_method(sd_ctx); - } - LOG_INFO("sampling(high noise) using %s method", sampling_methods_str[high_noise_sample_method]); - - int64_t sampling_start = ggml_time_ms(); - - std::vector high_noise_sigmas = std::vector(sigmas.begin(), sigmas.begin() + high_noise_sample_steps + 1); - sigmas = std::vector(sigmas.begin() + high_noise_sample_steps, sigmas.end()); - - x_t = sd_ctx->sd->sample(work_ctx, - sd_ctx->sd->high_noise_diffusion_model, - false, - x_t, - noise, - cond, - uncond, - {}, - nullptr, - 0, - sd_vid_gen_params->high_noise_sample_params.guidance, - sd_vid_gen_params->high_noise_sample_params.eta, - sd_vid_gen_params->high_noise_sample_params.shifted_timestep, - high_noise_sample_method, - high_noise_sigmas, - -1, - {}, - {}, - false, - denoise_mask, - vace_context, - sd_vid_gen_params->vace_strength, - &sd_vid_gen_params->cache); - - int64_t sampling_end = ggml_time_ms(); - LOG_INFO("sampling(high noise) completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately) { - 
sd_ctx->sd->high_noise_diffusion_model->free_params_buffer(); - } - noise = nullptr; +static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx, + const sd::Tensor& final_latent, + int* num_frames_out) { + if (final_latent.empty()) { + LOG_ERROR("no latent video to decode"); + return nullptr; } - - // Sample - { - LOG_DEBUG("sample %dx%dx%d", W, H, T); - int64_t sampling_start = ggml_time_ms(); - - final_latent = sd_ctx->sd->sample(work_ctx, - sd_ctx->sd->diffusion_model, - true, - x_t, - noise, - cond, - uncond, - {}, - nullptr, - 0, - sd_vid_gen_params->sample_params.guidance, - sd_vid_gen_params->sample_params.eta, - sd_vid_gen_params->sample_params.shifted_timestep, - sample_method, - sigmas, - -1, - {}, - {}, - false, - denoise_mask, - vace_context, - sd_vid_gen_params->vace_strength, - &sd_vid_gen_params->cache); - - int64_t sampling_end = ggml_time_ms(); - LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } - } - - if (ref_image_num > 0) { - ggml_tensor* trim_latent = ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - final_latent->ne[0], - final_latent->ne[1], - final_latent->ne[2] - ref_image_num, - final_latent->ne[3]); - ggml_ext_tensor_iter(trim_latent, [&](ggml_tensor* trim_latent, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = ggml_ext_tensor_get_f32(final_latent, i0, i1, i2 + ref_image_num, i3); - ggml_ext_tensor_set_f32(trim_latent, value, i0, i1, i2, i3); - }); - final_latent = trim_latent; - } - - int64_t t4 = ggml_time_ms(); - LOG_INFO("generating latent video completed, taking %.2fs", (t4 - t2) * 1.0f / 1000); - ggml_tensor* vid = sd_ctx->sd->decode_first_stage(work_ctx, final_latent, true); - int64_t t5 = ggml_time_ms(); + int64_t t4 = ggml_time_ms(); + sd::Tensor vid = sd_ctx->sd->decode_first_stage(final_latent, true); + int64_t t5 = ggml_time_ms(); LOG_INFO("decode_first_stage 
completed, taking %.2fs", (t5 - t4) * 1.0f / 1000); if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->first_stage_model->free_params_buffer(); } - - sd_ctx->sd->lora_stat(); - - sd_image_t* result_images = (sd_image_t*)calloc(vid->ne[2], sizeof(sd_image_t)); - if (result_images == nullptr) { - ggml_free(work_ctx); + if (vid.empty()) { + LOG_ERROR("decode_first_stage failed for video"); return nullptr; } - *num_frames_out = static_cast(vid->ne[2]); - for (int64_t i = 0; i < vid->ne[2]; i++) { - result_images[i].width = static_cast(vid->ne[0]); - result_images[i].height = static_cast(vid->ne[1]); - result_images[i].channel = 3; - result_images[i].data = ggml_tensor_to_sd_image(vid, static_cast(i), true); + sd_image_t* result_images = (sd_image_t*)calloc(vid.shape()[2], sizeof(sd_image_t)); + if (result_images == nullptr) { + return nullptr; + } + if (num_frames_out != nullptr) { + *num_frames_out = static_cast(vid.shape()[2]); } - ggml_free(work_ctx); - LOG_INFO("generate_video completed in %.2fs", (t5 - t0) * 1.0f / 1000); + for (int64_t i = 0; i < vid.shape()[2]; i++) { + result_images[i] = tensor_to_sd_image(vid, static_cast(i)); + } return result_images; } + +SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out) { + if (sd_ctx == nullptr || sd_vid_gen_params == nullptr) { + return nullptr; + } + if (num_frames_out != nullptr) { + *num_frames_out = 0; + } + int64_t t0 = ggml_time_ms(); + sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params; + GenerationRequest request(sd_ctx, sd_vid_gen_params); + sd_ctx->sd->rng->manual_seed(request.seed); + sd_ctx->sd->sampler_rng->manual_seed(request.seed); + sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift); + sd_ctx->sd->apply_loras(sd_vid_gen_params->loras, sd_vid_gen_params->lora_count); + + SamplePlan plan(sd_ctx, sd_vid_gen_params, request); + auto latent_inputs_opt = prepare_video_generation_latents(sd_ctx, 
sd_vid_gen_params, &request); + if (!latent_inputs_opt.has_value()) { + return nullptr; + } + ImageGenerationLatents latents = std::move(*latent_inputs_opt); + ImageGenerationEmbeds embeds = prepare_video_generation_embeds(sd_ctx, + sd_vid_gen_params, + request, + latents); + LOG_INFO("generate_video %dx%dx%d", + request.width, + request.height, + request.frames); + + int64_t latent_start = ggml_time_ms(); + int W = request.width / request.vae_scale_factor; + int H = request.height / request.vae_scale_factor; + int T = static_cast(latents.init_latent.shape()[2]); + + sd::Tensor x_t = latents.init_latent; + sd::Tensor noise = sd::Tensor::randn_like(x_t, sd_ctx->sd->rng); + + if (plan.high_noise_sample_steps > 0) { + LOG_DEBUG("sample(high noise) %dx%dx%d", W, H, T); + + int64_t sampling_start = ggml_time_ms(); + std::vector high_noise_sigmas(plan.sigmas.begin(), plan.sigmas.begin() + plan.high_noise_sample_steps + 1); + plan.sigmas = std::vector(plan.sigmas.begin() + plan.high_noise_sample_steps, plan.sigmas.end()); + + sd::Tensor x_t_sampled = sd_ctx->sd->sample(sd_ctx->sd->high_noise_diffusion_model, + false, + x_t, + std::move(noise), + embeds.cond, + request.use_high_noise_uncond ? 
embeds.uncond : SDCondition(), + embeds.img_cond, + embeds.id_cond, + sd::Tensor(), + 0.f, + request.high_noise_guidance, + sd_vid_gen_params->high_noise_sample_params.eta, + request.shifted_timestep, + plan.high_noise_sample_method, + high_noise_sigmas, + -1, + std::vector>{}, + false, + latents.denoise_mask, + latents.vace_context, + request.vace_strength, + request.cache_params); + int64_t sampling_end = ggml_time_ms(); + if (x_t_sampled.empty()) { + LOG_ERROR("sampling(high noise) failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->high_noise_diffusion_model->free_params_buffer(); + } + return nullptr; + } + + x_t = std::move(x_t_sampled); + noise = {}; + LOG_INFO("sampling(high noise) completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->high_noise_diffusion_model->free_params_buffer(); + } + } + + LOG_DEBUG("sample %dx%dx%d", W, H, T); + int64_t sampling_start = ggml_time_ms(); + sd::Tensor final_latent = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model, + true, + x_t, + std::move(noise), + embeds.cond, + request.use_uncond ? 
embeds.uncond : SDCondition(), + embeds.img_cond, + embeds.id_cond, + sd::Tensor(), + 0.f, + sd_vid_gen_params->sample_params.guidance, + sd_vid_gen_params->sample_params.eta, + sd_vid_gen_params->sample_params.shifted_timestep, + plan.sample_method, + plan.sigmas, + -1, + std::vector>{}, + false, + latents.denoise_mask, + latents.vace_context, + request.vace_strength, + request.cache_params); + + int64_t sampling_end = ggml_time_ms(); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->diffusion_model->free_params_buffer(); + } + if (final_latent.empty()) { + LOG_ERROR("sampling failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); + return nullptr; + } + LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); + + if (latents.ref_image_num > 0) { + final_latent = sd::ops::slice(final_latent, 2, latents.ref_image_num, final_latent.shape()[2]); + } + + int64_t latent_end = ggml_time_ms(); + LOG_INFO("generating latent video completed, taking %.2fs", (latent_end - latent_start) * 1.0f / 1000); + + auto result = decode_video_outputs(sd_ctx, final_latent, num_frames_out); + if (result == nullptr) { + return nullptr; + } + + sd_ctx->sd->lora_stat(); + + int64_t t1 = ggml_time_ms(); + LOG_INFO("generate_video completed in %.2fs", (t1 - t0) * 1.0f / 1000); + return result; +} diff --git a/src/t5.hpp b/src/t5.hpp index 5f8c99d..f64d0b6 100644 --- a/src/t5.hpp +++ b/src/t5.hpp @@ -1,1038 +1,1036 @@ -#ifndef __T5_HPP__ -#define __T5_HPP__ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "darts.h" -#include "ggml_extend.hpp" -#include "json.hpp" -#include "model.h" -#include "vocab/vocab.h" - -// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h -// and https://github.com/google/sentencepiece/blob/master/src/unigram_model.h. 
-// Original License: https://github.com/google/sentencepiece/blob/master/LICENSE -// -// Since tokenization is not the bottleneck in SD, performance was not a major consideration -// during the migration. -class MetaspacePreTokenizer { -private: - std::string replacement; - bool add_prefix_space; - -public: - MetaspacePreTokenizer(const std::string replacement = " ", bool add_prefix_space = true) - : replacement(replacement), add_prefix_space(add_prefix_space) {} - - std::string tokenize(const std::string& input) const { - std::string tokens; - std::stringstream ss(input); - - if (add_prefix_space) { - tokens += replacement; - } - - std::string token; - bool firstToken = true; - while (std::getline(ss, token, ' ')) { - if (!firstToken) - tokens += replacement + token; - else - tokens += token; - - firstToken = false; - } - - return tokens; - } -}; - -using EncodeResult = std::vector>; -class T5UniGramTokenizer { -public: - enum Status { - OK, - NO_PIECES_LOADED, - NO_ENTRY_FOUND, - BUILD_DOUBLE_ARRAY_FAILED, - PIECE_ALREADY_DEFINED, - INVLIAD_JSON - }; - -protected: - MetaspacePreTokenizer pre_tokenizer; - - // all pairs - std::vector> piece_score_pairs; - - float min_score_ = 0.0; - float max_score_ = 0.0; - std::unique_ptr trie_; - - // Maximum size of the return value of Trie, which corresponds - // to the maximum size of shared common prefix in the sentence pieces. - int trie_results_size_; - // unknown id. - int unk_id_ = 2; - std::string eos_token_ = ""; - int eos_id_ = 1; - int pad_id_ = 0; - // status. 
- Status status_ = OK; - - float kUnkPenalty = 10.0; - - std::string replacement; - bool add_prefix_space = true; - - void InitializePieces(const std::string& json_str) { - nlohmann::json data; - - try { - data = nlohmann::json::parse(json_str); - } catch (const nlohmann::json::parse_error&) { - status_ = INVLIAD_JSON; - return; - } - if (!data.contains("model")) { - status_ = INVLIAD_JSON; - return; - } - nlohmann::json model = data["model"]; - if (!model.contains("vocab")) { - status_ = INVLIAD_JSON; - return; - } - if (model.contains("unk_id")) { - unk_id_ = model["unk_id"]; - } - - replacement = data["pre_tokenizer"]["replacement"]; - add_prefix_space = data["pre_tokenizer"]["add_prefix_space"]; - - pre_tokenizer = MetaspacePreTokenizer(replacement, add_prefix_space); - - for (const auto& item : model["vocab"]) { - if (item.size() != 2 || !item[0].is_string() || !item[1].is_number_float()) { - status_ = INVLIAD_JSON; - return; - } - std::string piece = item[0]; - if (piece.empty()) { - piece = ""; - } - float score = item[1]; - piece_score_pairs.emplace_back(piece, score); - } - } - - // Builds a Trie index. - void BuildTrie(std::vector>* pieces) { - if (status_ != OK) - return; - - if (pieces->empty()) { - status_ = NO_PIECES_LOADED; - return; - } - - // sort by sentencepiece since DoubleArray::build() - // only accepts sorted strings. - sort(pieces->begin(), pieces->end()); - - // Makes key/value set for DoubleArrayTrie. - std::vector key(pieces->size()); - std::vector value(pieces->size()); - for (size_t i = 0; i < pieces->size(); ++i) { - // LOG_DEBUG("%s %d", (*pieces)[i].first.c_str(), (*pieces)[i].second); - key[i] = (*pieces)[i].first.data(); // sorted piece. 
- value[i] = (*pieces)[i].second; // vocab_id - } - - trie_ = std::unique_ptr(new Darts::DoubleArray()); - if (trie_->build(key.size(), const_cast(&key[0]), nullptr, - &value[0]) != 0) { - status_ = BUILD_DOUBLE_ARRAY_FAILED; - return; - } - - // Computes the maximum number of shared prefixes in the trie. - const int kMaxTrieResultsSize = 1024; - std::vector results( - kMaxTrieResultsSize); - trie_results_size_ = 0; - for (const auto& p : *pieces) { - const size_t num_nodes = trie_->commonPrefixSearch( - p.first.data(), results.data(), results.size(), p.first.size()); - trie_results_size_ = std::max(trie_results_size_, static_cast(num_nodes)); - } - - if (trie_results_size_ == 0) - status_ = NO_ENTRY_FOUND; - } - - // Non-virtual (inlined) implementation for faster execution. - inline float GetScoreInlined(int id) const { - return piece_score_pairs[id].second; - } - - inline bool IsUnusedInlined(int id) const { - return false; // TODO - } - - inline bool IsUserDefinedInlined(int id) const { - return false; // TODO - } - - inline size_t OneCharLen(const char* src) const { - return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4]; - } - - // The optimized Viterbi encode. - // Main differences from the original function: - // 1. Memorizes the best path at each postion so far, - // 2. No need to store the Lattice nodes, - // 3. Works in utf-8 directly, - // 4. Defines a new struct with fewer fields than Lattice, - // 5. Does not depend on `class Lattice` nor call `SetSentence()`, - // `PopulateNodes()`, or `Viterbi()`. It does everything in one function. - // For detailed explanations please see the comments inside the function body. - EncodeResult EncodeOptimized(const std::string& normalized) const { - // An optimized Viterbi algorithm for unigram language models. Benchmarking - // results show that it generates almost identical outputs and achieves 2.1x - // speedup on average for 102 languages compared to the original - // implementation. 
It's based on the following three ideas: - // - // 1. Because it uses the *unigram* model: - // best_score(x1, x2, �? xt) = best_score(x1, x2, �? x{t-1}) + score(xt) - // Deciding the best path (and score) can be decoupled into two isolated - // terms: (a) the best path ended before the last token `best_score(x1, x2, �? - // x{t-1})`, and (b) the last token and its `score(xt)`. The two terms are - // not related to each other at all. - // - // Therefore, we can compute once and store the *best_path ending at - // each character position*. In this way, when we know best_path_ends_at[M], - // we can reuse it to compute all the best_path_ends_at_[...] where the last - // token starts at the same character position M. - // - // This improves the time complexity from O(n*k*k) to O(n*k) because it - // eliminates the extra loop of recomputing the best path ending at the same - // position, where n is the input length and k is the maximum number of tokens - // that can be recognized starting at each position. - // - // 2. Again, because it uses the *unigram* model, we don’t need to actually - // store the lattice nodes. We still recognize all the tokens and lattice - // nodes from the input, but along identifying them, we use and discard them - // on the fly. There is no need to actually store them for best path Viterbi - // decoding. The only thing we need to store is the best_path ending at - // each character position. - // - // This improvement reduces the things needed to store in memory from O(n*k) - // to O(n), where n is the input length and k is the maximum number of tokens - // that can be recognized starting at each position. - // - // It also avoids the need of dynamic-size lattice node pool, because the - // number of things to store is fixed as n. - // - // 3. SentencePiece is designed to work with unicode, taking utf-8 encoding - // inputs. In the original implementation, the lattice positions are based on - // unicode positions. 
A mapping from unicode position to the utf-8 position is - // maintained to recover the utf-8 string piece. - // - // We found that it is sufficient and beneficial to directly work with utf-8 - // positions: - // - // Firstly, it saves the conversion and mapping between unicode positions and - // utf-8 positions. - // - // Secondly, it reduces the number of fields we need to maintain in the - // node/path structure. Specifically, there are 8 fields defined in - // `Lattice::Node` used by the original encoder, but here in the optimized - // encoder we only need to define 3 fields in `BestPathNode`. - - if (status() != OK || normalized.empty()) { - return {}; - } - // Represents the last node of the best path. - struct BestPathNode { - int id = -1; // The vocab id. (maybe -1 for UNK) - float best_path_score = - 0; // The total score of the best path ending at this node. - int starts_at = - -1; // The starting position (in utf-8) of this node. The entire best - // path can be constructed by backtracking along this link. - }; - const int size = static_cast(normalized.size()); - const float unk_score = min_score() - kUnkPenalty; - // The ends are exclusive. - std::vector best_path_ends_at(size + 1); - // Generate lattice on-the-fly (not stored) and update best_path_ends_at. - int starts_at = 0; - while (starts_at < size) { - std::size_t node_pos = 0; - std::size_t key_pos = starts_at; - const auto best_path_score_till_here = - best_path_ends_at[starts_at].best_path_score; - bool has_single_node = false; - const int mblen = - std::min(static_cast(OneCharLen(normalized.data() + starts_at)), - size - starts_at); - while (key_pos < size) { - const int ret = - trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1); - if (ret == -2) - break; - if (ret >= 0) { - if (IsUnusedInlined(ret)) - continue; - // Update the best path node. 
- auto& target_node = best_path_ends_at[key_pos]; - const auto length = (key_pos - starts_at); - // User defined symbol receives extra bonus to always be selected. - const auto score = IsUserDefinedInlined(ret) - ? (length * max_score_ - 0.1) - : GetScoreInlined(ret); - const auto candidate_best_path_score = - score + best_path_score_till_here; - if (target_node.starts_at == -1 || - candidate_best_path_score > target_node.best_path_score) { - target_node.best_path_score = static_cast(candidate_best_path_score); - target_node.starts_at = starts_at; - target_node.id = ret; - } - if (!has_single_node && length == mblen) { - has_single_node = true; - } - } - } - if (!has_single_node) { - auto& target_node = best_path_ends_at[starts_at + mblen]; - const auto candidate_best_path_score = - unk_score + best_path_score_till_here; - if (target_node.starts_at == -1 || - candidate_best_path_score > target_node.best_path_score) { - target_node.best_path_score = candidate_best_path_score; - target_node.starts_at = starts_at; - target_node.id = unk_id_; - } - } - // Move by one unicode character. - starts_at += mblen; - } - // Backtrack to identify the best path. 
- EncodeResult results; - int ends_at = size; - while (ends_at > 0) { - const auto& node = best_path_ends_at[ends_at]; - results.emplace_back( - normalized.substr(node.starts_at, ends_at - node.starts_at), node.id); - ends_at = node.starts_at; - } - std::reverse(results.begin(), results.end()); - return results; - } - -public: - explicit T5UniGramTokenizer(bool is_umt5 = false) { - if (is_umt5) { - InitializePieces(load_umt5_tokenizer_json()); - } else { - InitializePieces(load_t5_tokenizer_json()); - } - - min_score_ = FLT_MAX; - max_score_ = FLT_MIN; - - std::vector> pieces; - for (int i = 0; i < piece_score_pairs.size(); i++) { - const auto& sp = piece_score_pairs[i]; - - min_score_ = std::min(min_score_, sp.second); - max_score_ = std::max(max_score_, sp.second); - - pieces.emplace_back(sp.first, i); - } - - BuildTrie(&pieces); - } - ~T5UniGramTokenizer(){}; - - std::string Normalize(const std::string& input) const { - // Ref: https://github.com/huggingface/tokenizers/blob/1ff56c0c70b045f0cd82da1af9ac08cd4c7a6f9f/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py#L29 - // TODO: nmt-nfkc - std::string normalized = std::regex_replace(input, std::regex(" {2,}"), " "); - return normalized; - } - - std::vector Encode(const std::string& input, bool append_eos_if_not_present = true) const { - std::string normalized = Normalize(input); - normalized = pre_tokenizer.tokenize(normalized); - EncodeResult result = EncodeOptimized(normalized); - if (result.size() > 0 && append_eos_if_not_present) { - auto item = result[result.size() - 1]; - if (item.first != eos_token_) { - result.emplace_back(eos_token_, eos_id_); - } - } - std::vector tokens; - for (auto item : result) { - tokens.push_back(item.second); - } - return tokens; - } - - void pad_tokens(std::vector& tokens, - std::vector& weights, - std::vector* attention_mask, - size_t max_length = 0, - bool padding = false) { - if (max_length > 0 && padding) { - size_t orig_token_num = tokens.size() - 
1; - size_t n = static_cast(std::ceil(orig_token_num * 1.0 / (max_length - 1))); - if (n == 0) { - n = 1; - } - size_t length = max_length * n; - LOG_DEBUG("token length: %llu", length); - std::vector new_tokens; - std::vector new_weights; - std::vector new_attention_mask; - int token_idx = 0; - for (int i = 0; i < length; i++) { - if (token_idx >= orig_token_num) { - break; - } - if (attention_mask != nullptr) { - new_attention_mask.push_back(0.0); - } - if (i % max_length == max_length - 1) { - new_tokens.push_back(eos_id_); - new_weights.push_back(1.0); - } else { - new_tokens.push_back(tokens[token_idx]); - new_weights.push_back(weights[token_idx]); - token_idx++; - } - } - - new_tokens.push_back(eos_id_); - new_weights.push_back(1.0); - if (attention_mask != nullptr) { - new_attention_mask.push_back(0.0); - } - - tokens = new_tokens; - weights = new_weights; - if (attention_mask != nullptr) { - *attention_mask = new_attention_mask; - } - - if (padding) { - int pad_token_id = pad_id_; - tokens.insert(tokens.end(), length - tokens.size(), pad_token_id); - weights.insert(weights.end(), length - weights.size(), 1.0); - if (attention_mask != nullptr) { - // maybe keep some padding tokens unmasked? - attention_mask->insert(attention_mask->end(), length - attention_mask->size(), -HUGE_VALF); - } - } - } - } - - // Returns the minimum score in sentence pieces. - // min_score() - 10 is used for the cost of unknown sentence. - float min_score() const { return min_score_; } - - // Returns the maximum score in sentence pieces. - // max_score() is used for the cost of user defined symbols. 
- float max_score() const { return max_score_; } - - Status status() const { return status_; } -}; - -class T5LayerNorm : public UnaryBlock { -protected: - int64_t hidden_size; - float eps; - - void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { - enum ggml_type wtype = GGML_TYPE_F32; - params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); - } - -public: - T5LayerNorm(int64_t hidden_size, - float eps = 1e-06f) - : hidden_size(hidden_size), - eps(eps) {} - - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { - ggml_tensor* w = params["weight"]; - x = ggml_rms_norm(ctx->ggml_ctx, x, eps); - x = ggml_mul(ctx->ggml_ctx, x, w); - return x; - } -}; - -struct T5DenseActDense : public UnaryBlock { -public: - T5DenseActDense(int64_t model_dim, int64_t ff_dim) { - blocks["wi"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); - blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { - // x: [N, n_token, model_dim] - auto wi = std::dynamic_pointer_cast(blocks["wi"]); - auto wo = std::dynamic_pointer_cast(blocks["wo"]); - - x = wi->forward(ctx, x); - x = ggml_relu_inplace(ctx->ggml_ctx, x); - x = wo->forward(ctx, x); - return x; - } -}; - -struct T5DenseGatedActDense : public UnaryBlock { -public: - T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) { - blocks["wi_0"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); - blocks["wi_1"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); - float scale = 1.f / 32.f; - // The purpose of the scale here is to prevent NaN issues on some backends(CUDA, ...). 
- blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false, false, false, scale)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { - // x: [N, n_token, model_dim] - auto wi_0 = std::dynamic_pointer_cast(blocks["wi_0"]); - auto wi_1 = std::dynamic_pointer_cast(blocks["wi_1"]); - auto wo = std::dynamic_pointer_cast(blocks["wo"]); - - auto hidden_gelu = ggml_ext_gelu(ctx->ggml_ctx, wi_0->forward(ctx, x), true); - auto hidden_linear = wi_1->forward(ctx, x); - x = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear); - x = wo->forward(ctx, x); - return x; - } -}; - -struct T5LayerFF : public UnaryBlock { -public: - T5LayerFF(int64_t model_dim, int64_t ff_dim) { - blocks["DenseReluDense"] = std::shared_ptr(new T5DenseGatedActDense(model_dim, ff_dim)); - blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { - // x: [N, n_token, model_dim] - auto DenseReluDense = std::dynamic_pointer_cast(blocks["DenseReluDense"]); - auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); - - auto forwarded_states = layer_norm->forward(ctx, x); - forwarded_states = DenseReluDense->forward(ctx, forwarded_states); - x = ggml_add_inplace(ctx->ggml_ctx, forwarded_states, x); - return x; - } -}; - -class T5Attention : public GGMLBlock { -protected: - int64_t model_dim; - int64_t inner_dim; - int64_t num_heads; - bool using_relative_attention_bias; - int64_t relative_attention_num_buckets = 32; - int64_t relative_attention_max_distance = 128; - -public: - T5Attention(int64_t model_dim, - int64_t inner_dim, - int64_t num_heads, - bool using_relative_attention_bias = false) - : model_dim(model_dim), - inner_dim(inner_dim), - num_heads(num_heads), - using_relative_attention_bias(using_relative_attention_bias) { - blocks["q"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); - blocks["k"] = std::shared_ptr(new Linear(model_dim, inner_dim, 
false)); - blocks["v"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); - blocks["o"] = std::shared_ptr(new Linear(inner_dim, model_dim, false)); - if (using_relative_attention_bias) { - blocks["relative_attention_bias"] = std::shared_ptr(new Embedding(relative_attention_num_buckets, num_heads)); - } - } - - ggml_tensor* compute_bias(GGMLRunnerContext* ctx, - ggml_tensor* relative_position_bucket) { - auto relative_attention_bias = std::dynamic_pointer_cast(blocks["relative_attention_bias"]); - - auto values = relative_attention_bias->forward(ctx, relative_position_bucket); // shape (query_length, key_length, num_heads) - values = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, values, 2, 0, 1, 3)); // shape (1, num_heads, query_length, key_length) - return values; - } - - // x: [N, n_token, model_dim] - std::pair forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - auto q_proj = std::dynamic_pointer_cast(blocks["q"]); - auto k_proj = std::dynamic_pointer_cast(blocks["k"]); - auto v_proj = std::dynamic_pointer_cast(blocks["v"]); - auto out_proj = std::dynamic_pointer_cast(blocks["o"]); - - int64_t n_head = num_heads; - int64_t d_head = inner_dim / n_head; - - auto q = q_proj->forward(ctx, x); - auto k = k_proj->forward(ctx, x); - auto v = v_proj->forward(ctx, x); - - if (using_relative_attention_bias && relative_position_bucket != nullptr) { - past_bias = compute_bias(ctx, relative_position_bucket); - } - if (past_bias != nullptr) { - if (mask != nullptr) { - mask = ggml_repeat(ctx->ggml_ctx, mask, past_bias); - mask = ggml_add(ctx->ggml_ctx, mask, past_bias); - } else { - mask = past_bias; - } - } - - k = ggml_ext_scale(ctx->ggml_ctx, k, ::sqrtf(static_cast(d_head)), true); - - x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask); // [N, n_token, d_head * n_head] - - x = out_proj->forward(ctx, x); // 
[N, n_token, model_dim] - return {x, past_bias}; - } -}; - -struct T5LayerSelfAttention : public GGMLBlock { -public: - T5LayerSelfAttention(int64_t model_dim, - int64_t inner_dim, - int64_t ff_dim, - int64_t num_heads, - bool using_relative_attention_bias) { - blocks["SelfAttention"] = std::shared_ptr(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias)); - blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); - } - - std::pair forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - // x: [N, n_token, model_dim] - auto SelfAttention = std::dynamic_pointer_cast(blocks["SelfAttention"]); - auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); - - auto normed_hidden_state = layer_norm->forward(ctx, x); - auto ret = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket); - auto output = ret.first; - past_bias = ret.second; - - x = ggml_add_inplace(ctx->ggml_ctx, output, x); - return {x, past_bias}; - } -}; - -struct T5Block : public GGMLBlock { -public: - T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) { - blocks["layer.0"] = std::shared_ptr(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias)); - blocks["layer.1"] = std::shared_ptr(new T5LayerFF(model_dim, ff_dim)); - } - - std::pair forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - // x: [N, n_token, model_dim] - auto layer_0 = std::dynamic_pointer_cast(blocks["layer.0"]); - auto layer_1 = std::dynamic_pointer_cast(blocks["layer.1"]); - - auto ret = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket); - x = ret.first; - past_bias = ret.second; - x = 
layer_1->forward(ctx, x); - return {x, past_bias}; - } -}; - -struct T5Stack : public GGMLBlock { - int64_t num_layers; - -public: - T5Stack(int64_t num_layers, - int64_t model_dim, - int64_t inner_dim, - int64_t ff_dim, - int64_t num_heads, - bool relative_attention = true) - : num_layers(num_layers) { - for (int i = 0; i < num_layers; i++) { - blocks["block." + std::to_string(i)] = std::shared_ptr(new T5Block(model_dim, inner_dim, ff_dim, num_heads, (!relative_attention || i == 0))); - } - - blocks["final_layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* attention_mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - // x: [N, n_token, model_dim] - for (int i = 0; i < num_layers; i++) { - auto block = std::dynamic_pointer_cast(blocks["block." + std::to_string(i)]); - - auto ret = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); - x = ret.first; - past_bias = ret.second; - } - - auto final_layer_norm = std::dynamic_pointer_cast(blocks["final_layer_norm"]); - - x = final_layer_norm->forward(ctx, x); - return x; - } -}; - -struct T5Params { - int64_t num_layers = 24; - int64_t model_dim = 4096; - int64_t ff_dim = 10240; - int64_t num_heads = 64; - int64_t vocab_size = 32128; - bool relative_attention = true; -}; - -struct T5 : public GGMLBlock { - T5Params params; - -public: - T5() {} - T5(T5Params params) - : params(params) { - blocks["encoder"] = std::shared_ptr(new T5Stack(params.num_layers, - params.model_dim, - params.model_dim, - params.ff_dim, - params.num_heads, - params.relative_attention)); - blocks["shared"] = std::shared_ptr(new Embedding(params.vocab_size, - params.model_dim)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, - ggml_tensor* input_ids, - ggml_tensor* past_bias = nullptr, - ggml_tensor* attention_mask = nullptr, - ggml_tensor* relative_position_bucket = 
nullptr) { - // input_ids: [N, n_token] - - auto shared = std::dynamic_pointer_cast(blocks["shared"]); - auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); - - auto x = shared->forward(ctx, input_ids); - x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); - return x; - } -}; - -struct T5Runner : public GGMLRunner { - T5Params params; - T5 model; - std::vector relative_position_bucket_vec; - - T5Runner(ggml_backend_t backend, - bool offload_params_to_cpu, - const String2TensorStorage& tensor_storage_map, - const std::string prefix, - bool is_umt5 = false) - : GGMLRunner(backend, offload_params_to_cpu) { - if (is_umt5) { - params.vocab_size = 256384; - params.relative_attention = false; - } - model = T5(params); - model.init(params_ctx, tensor_storage_map, prefix); - } - - std::string get_desc() override { - return "t5"; - } - - void get_param_tensors(std::map& tensors, const std::string prefix) { - model.get_param_tensors(tensors, prefix); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, - ggml_tensor* input_ids, - ggml_tensor* relative_position_bucket, - ggml_tensor* attention_mask = nullptr) { - size_t N = input_ids->ne[1]; - size_t n_token = input_ids->ne[0]; - - auto hidden_states = model.forward(ctx, input_ids, nullptr, attention_mask, relative_position_bucket); // [N, n_token, model_dim] - return hidden_states; - } - - ggml_cgraph* build_graph(ggml_tensor* input_ids, - ggml_tensor* attention_mask = nullptr) { - ggml_cgraph* gf = ggml_new_graph(compute_ctx); - - input_ids = to_backend(input_ids); - attention_mask = to_backend(attention_mask); - - relative_position_bucket_vec = compute_relative_position_bucket(static_cast(input_ids->ne[0]), static_cast(input_ids->ne[0])); - - // for (int i = 0; i < relative_position_bucket_vec.size(); i++) { - // if (i % 77 == 0) { - // printf("\n"); - // } - // printf("%d ", relative_position_bucket_vec[i]); - // } - - auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx, - 
GGML_TYPE_I32, - input_ids->ne[0], - input_ids->ne[0]); - set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data()); - - auto runner_ctx = get_context(); - ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask); - - ggml_build_forward_expand(gf, hidden_states); - - return gf; - } - - bool compute(const int n_threads, - ggml_tensor* input_ids, - ggml_tensor* attention_mask, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { - auto get_graph = [&]() -> ggml_cgraph* { - return build_graph(input_ids, attention_mask); - }; - return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); - } - - static std::vector _relative_position_bucket(const std::vector& relative_position, - bool bidirectional = true, - int num_buckets = 32, - int max_distance = 128) { - std::vector relative_buckets(relative_position.size(), 0); - std::vector abs_relative_position = relative_position; - - if (bidirectional) { - num_buckets = num_buckets / 2; - for (size_t i = 0; i < relative_position.size(); ++i) { - if (relative_position[i] > 0) { - relative_buckets[i] += num_buckets; - } - abs_relative_position[i] = std::abs(relative_position[i]); - } - } else { - for (size_t i = 0; i < relative_position.size(); ++i) { - abs_relative_position[i] = std::max(-relative_position[i], 0); - } - } - - int max_exact = num_buckets / 2; - std::vector relative_position_if_large(relative_position.size(), 0); - - for (size_t i = 0; i < relative_position.size(); ++i) { - if (abs_relative_position[i] < max_exact) { - relative_buckets[i] += abs_relative_position[i]; - } else { - float log_pos = std::log(static_cast(abs_relative_position[i]) / max_exact); - float log_base = std::log(static_cast(max_distance) / max_exact); - relative_position_if_large[i] = max_exact + static_cast((log_pos / log_base) * (num_buckets - max_exact)); - relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets 
- 1); - relative_buckets[i] += relative_position_if_large[i]; - } - } - - return relative_buckets; - } - - std::vector compute_relative_position_bucket(int query_length, - int key_length) { - std::vector context_position(query_length); - std::vector memory_position(key_length); - - for (int i = 0; i < query_length; ++i) { - context_position[i] = i; - } - for (int i = 0; i < key_length; ++i) { - memory_position[i] = i; - } - - std::vector> relative_position(query_length, std::vector(key_length, 0)); - for (int i = 0; i < query_length; ++i) { - for (int j = 0; j < key_length; ++j) { - relative_position[i][j] = memory_position[j] - context_position[i]; - } - } - - std::vector relative_position_bucket; - for (int i = 0; i < query_length; ++i) { - std::vector result = _relative_position_bucket(relative_position[i], true); - relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end()); - } - - return relative_position_bucket; - } -}; - -struct T5Embedder { - T5UniGramTokenizer tokenizer; - T5Runner model; - - T5Embedder(ggml_backend_t backend, - bool offload_params_to_cpu, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "", - bool is_umt5 = false) - : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) { - } - - void get_param_tensors(std::map& tensors, const std::string prefix) { - model.get_param_tensors(tensors, prefix); - } - - void alloc_params_buffer() { - model.alloc_params_buffer(); - } - - std::tuple, std::vector, std::vector> tokenize(std::string text, - size_t max_length = 0, - bool padding = false) { - auto parsed_attention = parse_prompt_attention(text); - - { - std::stringstream ss; - ss << "["; - for (const auto& item : parsed_attention) { - ss << "['" << item.first << "', " << item.second << "], "; - } - ss << "]"; - LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); - } - - std::vector tokens; - std::vector weights; - for (const 
auto& item : parsed_attention) { - const std::string& curr_text = item.first; - float curr_weight = item.second; - std::vector curr_tokens = tokenizer.Encode(curr_text, false); - tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); - weights.insert(weights.end(), curr_tokens.size(), curr_weight); - } - - int EOS_TOKEN_ID = 1; - tokens.push_back(EOS_TOKEN_ID); - weights.push_back(1.0); - - std::vector attention_mask; - - tokenizer.pad_tokens(tokens, weights, &attention_mask, max_length, padding); - - // for (int i = 0; i < tokens.size(); i++) { - // std::cout << tokens[i] << ":" << weights[i] << ", "; - // } - // std::cout << std::endl; - - return {tokens, weights, attention_mask}; - } - - void test() { - ggml_init_params params; - params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB - params.mem_buffer = nullptr; - params.no_alloc = false; - - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); - - { - std::string text("a lovely cat"); - // std::string text("一只可爱的�?); // umt5 chinease test - auto tokens_and_weights = tokenize(text, 512, true); - std::vector& tokens = std::get<0>(tokens_and_weights); - std::vector& weights = std::get<1>(tokens_and_weights); - std::vector& masks = std::get<2>(tokens_and_weights); - for (auto token : tokens) { - printf("%d ", token); - } - printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - auto attention_mask = vector_to_ggml_tensor(work_ctx, masks); - ggml_tensor* out = nullptr; - - int64_t t0 = ggml_time_ms(); - model.compute(8, input_ids, attention_mask, &out, work_ctx); - int64_t t1 = ggml_time_ms(); - - print_ggml_tensor(out); - LOG_DEBUG("t5 test done in %lldms", t1 - t0); - } - } - - static void load_from_file_and_test(const std::string& file_path) { - // cpu f16: pass - // cpu f32: pass - // cuda f16: pass - // cuda f32: pass - // cuda q8_0: pass - // ggml_backend_t backend = ggml_backend_cuda_init(0); - ggml_backend_t backend = 
ggml_backend_cpu_init(); - ggml_type model_data_type = GGML_TYPE_F16; - - ModelLoader model_loader; - if (!model_loader.init_from_file_and_convert_name(file_path)) { - LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); - return; - } - - auto& tensor_storage_map = model_loader.get_tensor_storage_map(); - for (auto& [name, tensor_storage] : tensor_storage_map) { - if (ends_with(name, "weight")) { - tensor_storage.expected_type = model_data_type; - } - } - - std::shared_ptr t5 = std::make_shared(backend, false, tensor_storage_map, "", true); - - t5->alloc_params_buffer(); - std::map tensors; - t5->get_param_tensors(tensors, ""); - - bool success = model_loader.load_tensors(tensors); - - if (!success) { - LOG_ERROR("load tensors from model loader failed"); - return; - } - - LOG_INFO("t5 model loaded"); - t5->test(); - } -}; - -#endif // __T5_HPP__ \ No newline at end of file +#ifndef __T5_HPP__ +#define __T5_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "darts.h" +#include "ggml_extend.hpp" +#include "json.hpp" +#include "model.h" +#include "vocab/vocab.h" + +// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h +// and https://github.com/google/sentencepiece/blob/master/src/unigram_model.cc. +// Original License: https://github.com/google/sentencepiece/blob/master/LICENSE +// +// Since tokenization is not the bottleneck in SD, performance was not a major consideration +// during the migration. 
+class MetaspacePreTokenizer { +private: + std::string replacement; + bool add_prefix_space; + +public: + MetaspacePreTokenizer(const std::string replacement = " ", bool add_prefix_space = true) + : replacement(replacement), add_prefix_space(add_prefix_space) {} + + std::string tokenize(const std::string& input) const { + std::string tokens; + std::stringstream ss(input); + + if (add_prefix_space) { + tokens += replacement; + } + + std::string token; + bool firstToken = true; + while (std::getline(ss, token, ' ')) { + if (!firstToken) + tokens += replacement + token; + else + tokens += token; + + firstToken = false; + } + + return tokens; + } +}; + +using EncodeResult = std::vector>; +class T5UniGramTokenizer { +public: + enum Status { + OK, + NO_PIECES_LOADED, + NO_ENTRY_FOUND, + BUILD_DOUBLE_ARRAY_FAILED, + PIECE_ALREADY_DEFINED, + INVLIAD_JSON + }; + +protected: + MetaspacePreTokenizer pre_tokenizer; + + // all pairs + std::vector> piece_score_pairs; + + float min_score_ = 0.0; + float max_score_ = 0.0; + std::unique_ptr trie_; + + // Maximum size of the return value of Trie, which corresponds + // to the maximum size of shared common prefix in the sentence pieces. + int trie_results_size_; + // unknown id. + int unk_id_ = 2; + std::string eos_token_ = ""; + int eos_id_ = 1; + int pad_id_ = 0; + // status. 
+ Status status_ = OK; + + float kUnkPenalty = 10.0; + + std::string replacement; + bool add_prefix_space = true; + + void InitializePieces(const std::string& json_str) { + nlohmann::json data; + + try { + data = nlohmann::json::parse(json_str); + } catch (const nlohmann::json::parse_error&) { + status_ = INVLIAD_JSON; + return; + } + if (!data.contains("model")) { + status_ = INVLIAD_JSON; + return; + } + nlohmann::json model = data["model"]; + if (!model.contains("vocab")) { + status_ = INVLIAD_JSON; + return; + } + if (model.contains("unk_id")) { + unk_id_ = model["unk_id"]; + } + + replacement = data["pre_tokenizer"]["replacement"]; + add_prefix_space = data["pre_tokenizer"]["add_prefix_space"]; + + pre_tokenizer = MetaspacePreTokenizer(replacement, add_prefix_space); + + for (const auto& item : model["vocab"]) { + if (item.size() != 2 || !item[0].is_string() || !item[1].is_number_float()) { + status_ = INVLIAD_JSON; + return; + } + std::string piece = item[0]; + if (piece.empty()) { + piece = ""; + } + float score = item[1]; + piece_score_pairs.emplace_back(piece, score); + } + } + + // Builds a Trie index. + void BuildTrie(std::vector>* pieces) { + if (status_ != OK) + return; + + if (pieces->empty()) { + status_ = NO_PIECES_LOADED; + return; + } + + // sort by sentencepiece since DoubleArray::build() + // only accepts sorted strings. + sort(pieces->begin(), pieces->end()); + + // Makes key/value set for DoubleArrayTrie. + std::vector key(pieces->size()); + std::vector value(pieces->size()); + for (size_t i = 0; i < pieces->size(); ++i) { + // LOG_DEBUG("%s %d", (*pieces)[i].first.c_str(), (*pieces)[i].second); + key[i] = (*pieces)[i].first.data(); // sorted piece. 
+ value[i] = (*pieces)[i].second; // vocab_id + } + + trie_ = std::unique_ptr(new Darts::DoubleArray()); + if (trie_->build(key.size(), const_cast(&key[0]), nullptr, + &value[0]) != 0) { + status_ = BUILD_DOUBLE_ARRAY_FAILED; + return; + } + + // Computes the maximum number of shared prefixes in the trie. + const int kMaxTrieResultsSize = 1024; + std::vector results( + kMaxTrieResultsSize); + trie_results_size_ = 0; + for (const auto& p : *pieces) { + const size_t num_nodes = trie_->commonPrefixSearch( + p.first.data(), results.data(), results.size(), p.first.size()); + trie_results_size_ = std::max(trie_results_size_, static_cast(num_nodes)); + } + + if (trie_results_size_ == 0) + status_ = NO_ENTRY_FOUND; + } + + // Non-virtual (inlined) implementation for faster execution. + inline float GetScoreInlined(int id) const { + return piece_score_pairs[id].second; + } + + inline bool IsUnusedInlined(int id) const { + return false; // TODO + } + + inline bool IsUserDefinedInlined(int id) const { + return false; // TODO + } + + inline size_t OneCharLen(const char* src) const { + return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4]; + } + + // The optimized Viterbi encode. + // Main differences from the original function: + // 1. Memorizes the best path at each position so far, + // 2. No need to store the Lattice nodes, + // 3. Works in utf-8 directly, + // 4. Defines a new struct with fewer fields than Lattice, + // 5. Does not depend on `class Lattice` nor call `SetSentence()`, + // `PopulateNodes()`, or `Viterbi()`. It does everything in one function. + // For detailed explanations please see the comments inside the function body. + EncodeResult EncodeOptimized(const std::string& normalized) const { + // An optimized Viterbi algorithm for unigram language models. Benchmarking + // results show that it generates almost identical outputs and achieves 2.1x + // speedup on average for 102 languages compared to the original + // implementation. 
It's based on the following three ideas: + // + // 1. Because it uses the *unigram* model: + // best_score(x1, x2, ... xt) = best_score(x1, x2, ... x{t-1}) + score(xt) + // Deciding the best path (and score) can be decoupled into two isolated + // terms: (a) the best path ended before the last token `best_score(x1, x2, ... + // x{t-1})`, and (b) the last token and its `score(xt)`. The two terms are + // not related to each other at all. + // + // Therefore, we can compute once and store the *best_path ending at + // each character position*. In this way, when we know best_path_ends_at[M], + // we can reuse it to compute all the best_path_ends_at_[...] where the last + // token starts at the same character position M. + // + // This improves the time complexity from O(n*k*k) to O(n*k) because it + // eliminates the extra loop of recomputing the best path ending at the same + // position, where n is the input length and k is the maximum number of tokens + // that can be recognized starting at each position. + // + // 2. Again, because it uses the *unigram* model, we don't need to actually + // store the lattice nodes. We still recognize all the tokens and lattice + // nodes from the input, but along identifying them, we use and discard them + // on the fly. There is no need to actually store them for best path Viterbi + // decoding. The only thing we need to store is the best_path ending at + // each character position. + // + // This improvement reduces the things needed to store in memory from O(n*k) + // to O(n), where n is the input length and k is the maximum number of tokens + // that can be recognized starting at each position. + // + // It also avoids the need of dynamic-size lattice node pool, because the + // number of things to store is fixed as n. + // + // 3. SentencePiece is designed to work with unicode, taking utf-8 encoding + // inputs. In the original implementation, the lattice positions are based on + // unicode positions. 
A mapping from unicode position to the utf-8 position is + // maintained to recover the utf-8 string piece. + // + // We found that it is sufficient and beneficial to directly work with utf-8 + // positions: + // + // Firstly, it saves the conversion and mapping between unicode positions and + // utf-8 positions. + // + // Secondly, it reduces the number of fields we need to maintain in the + // node/path structure. Specifically, there are 8 fields defined in + // `Lattice::Node` used by the original encoder, but here in the optimized + // encoder we only need to define 3 fields in `BestPathNode`. + + if (status() != OK || normalized.empty()) { + return {}; + } + // Represents the last node of the best path. + struct BestPathNode { + int id = -1; // The vocab id. (maybe -1 for UNK) + float best_path_score = + 0; // The total score of the best path ending at this node. + int starts_at = + -1; // The starting position (in utf-8) of this node. The entire best + // path can be constructed by backtracking along this link. + }; + const int size = static_cast(normalized.size()); + const float unk_score = min_score() - kUnkPenalty; + // The ends are exclusive. + std::vector best_path_ends_at(size + 1); + // Generate lattice on-the-fly (not stored) and update best_path_ends_at. + int starts_at = 0; + while (starts_at < size) { + std::size_t node_pos = 0; + std::size_t key_pos = starts_at; + const auto best_path_score_till_here = + best_path_ends_at[starts_at].best_path_score; + bool has_single_node = false; + const int mblen = + std::min(static_cast(OneCharLen(normalized.data() + starts_at)), + size - starts_at); + while (key_pos < size) { + const int ret = + trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1); + if (ret == -2) + break; + if (ret >= 0) { + if (IsUnusedInlined(ret)) + continue; + // Update the best path node. 
+ auto& target_node = best_path_ends_at[key_pos]; + const auto length = (key_pos - starts_at); + // User defined symbol receives extra bonus to always be selected. + const auto score = IsUserDefinedInlined(ret) + ? (length * max_score_ - 0.1) + : GetScoreInlined(ret); + const auto candidate_best_path_score = + score + best_path_score_till_here; + if (target_node.starts_at == -1 || + candidate_best_path_score > target_node.best_path_score) { + target_node.best_path_score = static_cast(candidate_best_path_score); + target_node.starts_at = starts_at; + target_node.id = ret; + } + if (!has_single_node && length == mblen) { + has_single_node = true; + } + } + } + if (!has_single_node) { + auto& target_node = best_path_ends_at[starts_at + mblen]; + const auto candidate_best_path_score = + unk_score + best_path_score_till_here; + if (target_node.starts_at == -1 || + candidate_best_path_score > target_node.best_path_score) { + target_node.best_path_score = candidate_best_path_score; + target_node.starts_at = starts_at; + target_node.id = unk_id_; + } + } + // Move by one unicode character. + starts_at += mblen; + } + // Backtrack to identify the best path. 
+ EncodeResult results; + int ends_at = size; + while (ends_at > 0) { + const auto& node = best_path_ends_at[ends_at]; + results.emplace_back( + normalized.substr(node.starts_at, ends_at - node.starts_at), node.id); + ends_at = node.starts_at; + } + std::reverse(results.begin(), results.end()); + return results; + } + +public: + explicit T5UniGramTokenizer(bool is_umt5 = false) { + if (is_umt5) { + InitializePieces(load_umt5_tokenizer_json()); + } else { + InitializePieces(load_t5_tokenizer_json()); + } + + min_score_ = FLT_MAX; + max_score_ = FLT_MIN; + + std::vector> pieces; + for (int i = 0; i < piece_score_pairs.size(); i++) { + const auto& sp = piece_score_pairs[i]; + + min_score_ = std::min(min_score_, sp.second); + max_score_ = std::max(max_score_, sp.second); + + pieces.emplace_back(sp.first, i); + } + + BuildTrie(&pieces); + } + ~T5UniGramTokenizer(){}; + + std::string Normalize(const std::string& input) const { + // Ref: https://github.com/huggingface/tokenizers/blob/1ff56c0c70b045f0cd82da1af9ac08cd4c7a6f9f/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py#L29 + // TODO: nmt-nfkc + std::string normalized = std::regex_replace(input, std::regex(" {2,}"), " "); + return normalized; + } + + std::vector Encode(const std::string& input, bool append_eos_if_not_present = true) const { + std::string normalized = Normalize(input); + normalized = pre_tokenizer.tokenize(normalized); + EncodeResult result = EncodeOptimized(normalized); + if (result.size() > 0 && append_eos_if_not_present) { + auto item = result[result.size() - 1]; + if (item.first != eos_token_) { + result.emplace_back(eos_token_, eos_id_); + } + } + std::vector tokens; + for (auto item : result) { + tokens.push_back(item.second); + } + return tokens; + } + + void pad_tokens(std::vector& tokens, + std::vector& weights, + std::vector* attention_mask, + size_t max_length = 0, + bool padding = false) { + if (max_length > 0 && padding) { + size_t orig_token_num = tokens.size() - 
1; + size_t n = static_cast(std::ceil(orig_token_num * 1.0 / (max_length - 1))); + if (n == 0) { + n = 1; + } + size_t length = max_length * n; + LOG_DEBUG("token length: %llu", length); + std::vector new_tokens; + std::vector new_weights; + std::vector new_attention_mask; + int token_idx = 0; + for (int i = 0; i < length; i++) { + if (token_idx >= orig_token_num) { + break; + } + if (attention_mask != nullptr) { + new_attention_mask.push_back(0.0); + } + if (i % max_length == max_length - 1) { + new_tokens.push_back(eos_id_); + new_weights.push_back(1.0); + } else { + new_tokens.push_back(tokens[token_idx]); + new_weights.push_back(weights[token_idx]); + token_idx++; + } + } + + new_tokens.push_back(eos_id_); + new_weights.push_back(1.0); + if (attention_mask != nullptr) { + new_attention_mask.push_back(0.0); + } + + tokens = new_tokens; + weights = new_weights; + if (attention_mask != nullptr) { + *attention_mask = new_attention_mask; + } + + if (padding) { + int pad_token_id = pad_id_; + tokens.insert(tokens.end(), length - tokens.size(), pad_token_id); + weights.insert(weights.end(), length - weights.size(), 1.0); + if (attention_mask != nullptr) { + // maybe keep some padding tokens unmasked? + attention_mask->insert(attention_mask->end(), length - attention_mask->size(), -HUGE_VALF); + } + } + } + } + + // Returns the minimum score in sentence pieces. + // min_score() - 10 is used for the cost of unknown sentence. + float min_score() const { return min_score_; } + + // Returns the maximum score in sentence pieces. + // max_score() is used for the cost of user defined symbols. 
+ float max_score() const { return max_score_; } + + Status status() const { return status_; } +}; + +class T5LayerNorm : public UnaryBlock { +protected: + int64_t hidden_size; + float eps; + + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + enum ggml_type wtype = GGML_TYPE_F32; + params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); + } + +public: + T5LayerNorm(int64_t hidden_size, + float eps = 1e-06f) + : hidden_size(hidden_size), + eps(eps) {} + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + ggml_tensor* w = params["weight"]; + x = ggml_rms_norm(ctx->ggml_ctx, x, eps); + x = ggml_mul(ctx->ggml_ctx, x, w); + return x; + } +}; + +struct T5DenseActDense : public UnaryBlock { +public: + T5DenseActDense(int64_t model_dim, int64_t ff_dim) { + blocks["wi"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); + blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + // x: [N, n_token, model_dim] + auto wi = std::dynamic_pointer_cast(blocks["wi"]); + auto wo = std::dynamic_pointer_cast(blocks["wo"]); + + x = wi->forward(ctx, x); + x = ggml_relu_inplace(ctx->ggml_ctx, x); + x = wo->forward(ctx, x); + return x; + } +}; + +struct T5DenseGatedActDense : public UnaryBlock { +public: + T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) { + blocks["wi_0"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); + blocks["wi_1"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); + float scale = 1.f / 32.f; + // The purpose of the scale here is to prevent NaN issues on some backends(CUDA, ...). 
+ blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false, false, false, scale)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + // x: [N, n_token, model_dim] + auto wi_0 = std::dynamic_pointer_cast(blocks["wi_0"]); + auto wi_1 = std::dynamic_pointer_cast(blocks["wi_1"]); + auto wo = std::dynamic_pointer_cast(blocks["wo"]); + + auto hidden_gelu = ggml_ext_gelu(ctx->ggml_ctx, wi_0->forward(ctx, x), true); + auto hidden_linear = wi_1->forward(ctx, x); + x = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear); + x = wo->forward(ctx, x); + return x; + } +}; + +struct T5LayerFF : public UnaryBlock { +public: + T5LayerFF(int64_t model_dim, int64_t ff_dim) { + blocks["DenseReluDense"] = std::shared_ptr(new T5DenseGatedActDense(model_dim, ff_dim)); + blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + // x: [N, n_token, model_dim] + auto DenseReluDense = std::dynamic_pointer_cast(blocks["DenseReluDense"]); + auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); + + auto forwarded_states = layer_norm->forward(ctx, x); + forwarded_states = DenseReluDense->forward(ctx, forwarded_states); + x = ggml_add_inplace(ctx->ggml_ctx, forwarded_states, x); + return x; + } +}; + +class T5Attention : public GGMLBlock { +protected: + int64_t model_dim; + int64_t inner_dim; + int64_t num_heads; + bool using_relative_attention_bias; + int64_t relative_attention_num_buckets = 32; + int64_t relative_attention_max_distance = 128; + +public: + T5Attention(int64_t model_dim, + int64_t inner_dim, + int64_t num_heads, + bool using_relative_attention_bias = false) + : model_dim(model_dim), + inner_dim(inner_dim), + num_heads(num_heads), + using_relative_attention_bias(using_relative_attention_bias) { + blocks["q"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); + blocks["k"] = std::shared_ptr(new Linear(model_dim, inner_dim, 
false)); + blocks["v"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); + blocks["o"] = std::shared_ptr(new Linear(inner_dim, model_dim, false)); + if (using_relative_attention_bias) { + blocks["relative_attention_bias"] = std::shared_ptr(new Embedding(relative_attention_num_buckets, num_heads)); + } + } + + ggml_tensor* compute_bias(GGMLRunnerContext* ctx, + ggml_tensor* relative_position_bucket) { + auto relative_attention_bias = std::dynamic_pointer_cast(blocks["relative_attention_bias"]); + + auto values = relative_attention_bias->forward(ctx, relative_position_bucket); // shape (query_length, key_length, num_heads) + values = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, values, 2, 0, 1, 3)); // shape (1, num_heads, query_length, key_length) + return values; + } + + // x: [N, n_token, model_dim] + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + auto q_proj = std::dynamic_pointer_cast(blocks["q"]); + auto k_proj = std::dynamic_pointer_cast(blocks["k"]); + auto v_proj = std::dynamic_pointer_cast(blocks["v"]); + auto out_proj = std::dynamic_pointer_cast(blocks["o"]); + + int64_t n_head = num_heads; + int64_t d_head = inner_dim / n_head; + + auto q = q_proj->forward(ctx, x); + auto k = k_proj->forward(ctx, x); + auto v = v_proj->forward(ctx, x); + + if (using_relative_attention_bias && relative_position_bucket != nullptr) { + past_bias = compute_bias(ctx, relative_position_bucket); + } + if (past_bias != nullptr) { + if (mask != nullptr) { + mask = ggml_repeat(ctx->ggml_ctx, mask, past_bias); + mask = ggml_add(ctx->ggml_ctx, mask, past_bias); + } else { + mask = past_bias; + } + } + + k = ggml_ext_scale(ctx->ggml_ctx, k, ::sqrtf(static_cast(d_head)), true); + + x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask); // [N, n_token, d_head * n_head] + + x = out_proj->forward(ctx, x); // 
[N, n_token, model_dim] + return {x, past_bias}; + } +}; + +struct T5LayerSelfAttention : public GGMLBlock { +public: + T5LayerSelfAttention(int64_t model_dim, + int64_t inner_dim, + int64_t ff_dim, + int64_t num_heads, + bool using_relative_attention_bias) { + blocks["SelfAttention"] = std::shared_ptr(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias)); + blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); + } + + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + // x: [N, n_token, model_dim] + auto SelfAttention = std::dynamic_pointer_cast(blocks["SelfAttention"]); + auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); + + auto normed_hidden_state = layer_norm->forward(ctx, x); + auto ret = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket); + auto output = ret.first; + past_bias = ret.second; + + x = ggml_add_inplace(ctx->ggml_ctx, output, x); + return {x, past_bias}; + } +}; + +struct T5Block : public GGMLBlock { +public: + T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) { + blocks["layer.0"] = std::shared_ptr(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias)); + blocks["layer.1"] = std::shared_ptr(new T5LayerFF(model_dim, ff_dim)); + } + + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + // x: [N, n_token, model_dim] + auto layer_0 = std::dynamic_pointer_cast(blocks["layer.0"]); + auto layer_1 = std::dynamic_pointer_cast(blocks["layer.1"]); + + auto ret = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket); + x = ret.first; + past_bias = ret.second; + x = 
layer_1->forward(ctx, x); + return {x, past_bias}; + } +}; + +struct T5Stack : public GGMLBlock { + int64_t num_layers; + +public: + T5Stack(int64_t num_layers, + int64_t model_dim, + int64_t inner_dim, + int64_t ff_dim, + int64_t num_heads, + bool relative_attention = true) + : num_layers(num_layers) { + for (int i = 0; i < num_layers; i++) { + blocks["block." + std::to_string(i)] = std::shared_ptr(new T5Block(model_dim, inner_dim, ff_dim, num_heads, (!relative_attention || i == 0))); + } + + blocks["final_layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* attention_mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + // x: [N, n_token, model_dim] + for (int i = 0; i < num_layers; i++) { + auto block = std::dynamic_pointer_cast(blocks["block." + std::to_string(i)]); + + auto ret = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); + x = ret.first; + past_bias = ret.second; + } + + auto final_layer_norm = std::dynamic_pointer_cast(blocks["final_layer_norm"]); + + x = final_layer_norm->forward(ctx, x); + return x; + } +}; + +struct T5Params { + int64_t num_layers = 24; + int64_t model_dim = 4096; + int64_t ff_dim = 10240; + int64_t num_heads = 64; + int64_t vocab_size = 32128; + bool relative_attention = true; +}; + +struct T5 : public GGMLBlock { + T5Params params; + +public: + T5() {} + T5(T5Params params) + : params(params) { + blocks["encoder"] = std::shared_ptr(new T5Stack(params.num_layers, + params.model_dim, + params.model_dim, + params.ff_dim, + params.num_heads, + params.relative_attention)); + blocks["shared"] = std::shared_ptr(new Embedding(params.vocab_size, + params.model_dim)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* past_bias = nullptr, + ggml_tensor* attention_mask = nullptr, + ggml_tensor* relative_position_bucket = 
nullptr) { + // input_ids: [N, n_token] + + auto shared = std::dynamic_pointer_cast(blocks["shared"]); + auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); + + auto x = shared->forward(ctx, input_ids); + x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); + return x; + } +}; + +struct T5Runner : public GGMLRunner { + T5Params params; + T5 model; + std::vector relative_position_bucket_vec; + + T5Runner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map, + const std::string prefix, + bool is_umt5 = false) + : GGMLRunner(backend, offload_params_to_cpu) { + if (is_umt5) { + params.vocab_size = 256384; + params.relative_attention = false; + } + model = T5(params); + model.init(params_ctx, tensor_storage_map, prefix); + } + + std::string get_desc() override { + return "t5"; + } + + void get_param_tensors(std::map& tensors, const std::string prefix) { + model.get_param_tensors(tensors, prefix); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* relative_position_bucket, + ggml_tensor* attention_mask = nullptr) { + size_t N = input_ids->ne[1]; + size_t n_token = input_ids->ne[0]; + + auto hidden_states = model.forward(ctx, input_ids, nullptr, attention_mask, relative_position_bucket); // [N, n_token, model_dim] + return hidden_states; + } + + ggml_cgraph* build_graph(const sd::Tensor& input_ids_tensor, + const sd::Tensor& attention_mask_tensor = {}) { + ggml_cgraph* gf = ggml_new_graph(compute_ctx); + ggml_tensor* input_ids = make_input(input_ids_tensor); + ggml_tensor* attention_mask = attention_mask_tensor.empty() ? 
nullptr : make_input(attention_mask_tensor); + + relative_position_bucket_vec = compute_relative_position_bucket(static_cast(input_ids->ne[0]), static_cast(input_ids->ne[0])); + + // for (int i = 0; i < relative_position_bucket_vec.size(); i++) { + // if (i % 77 == 0) { + // printf("\n"); + // } + // printf("%d ", relative_position_bucket_vec[i]); + // } + + auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx, + GGML_TYPE_I32, + input_ids->ne[0], + input_ids->ne[0]); + set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data()); + + auto runner_ctx = get_context(); + ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask); + + ggml_build_forward_expand(gf, hidden_states); + + return gf; + } + + sd::Tensor compute(const int n_threads, + const sd::Tensor& input_ids, + const sd::Tensor& attention_mask) { + auto get_graph = [&]() -> ggml_cgraph* { + return build_graph(input_ids, attention_mask); + }; + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, true), 3); + } + + static std::vector _relative_position_bucket(const std::vector& relative_position, + bool bidirectional = true, + int num_buckets = 32, + int max_distance = 128) { + std::vector relative_buckets(relative_position.size(), 0); + std::vector abs_relative_position = relative_position; + + if (bidirectional) { + num_buckets = num_buckets / 2; + for (size_t i = 0; i < relative_position.size(); ++i) { + if (relative_position[i] > 0) { + relative_buckets[i] += num_buckets; + } + abs_relative_position[i] = std::abs(relative_position[i]); + } + } else { + for (size_t i = 0; i < relative_position.size(); ++i) { + abs_relative_position[i] = std::max(-relative_position[i], 0); + } + } + + int max_exact = num_buckets / 2; + std::vector relative_position_if_large(relative_position.size(), 0); + + for (size_t i = 0; i < relative_position.size(); ++i) { + if (abs_relative_position[i] < max_exact) { + 
relative_buckets[i] += abs_relative_position[i]; + } else { + float log_pos = std::log(static_cast(abs_relative_position[i]) / max_exact); + float log_base = std::log(static_cast(max_distance) / max_exact); + relative_position_if_large[i] = max_exact + static_cast((log_pos / log_base) * (num_buckets - max_exact)); + relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets - 1); + relative_buckets[i] += relative_position_if_large[i]; + } + } + + return relative_buckets; + } + + std::vector compute_relative_position_bucket(int query_length, + int key_length) { + std::vector context_position(query_length); + std::vector memory_position(key_length); + + for (int i = 0; i < query_length; ++i) { + context_position[i] = i; + } + for (int i = 0; i < key_length; ++i) { + memory_position[i] = i; + } + + std::vector> relative_position(query_length, std::vector(key_length, 0)); + for (int i = 0; i < query_length; ++i) { + for (int j = 0; j < key_length; ++j) { + relative_position[i][j] = memory_position[j] - context_position[i]; + } + } + + std::vector relative_position_bucket; + for (int i = 0; i < query_length; ++i) { + std::vector result = _relative_position_bucket(relative_position[i], true); + relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end()); + } + + return relative_position_bucket; + } +}; + +struct T5Embedder { + T5UniGramTokenizer tokenizer; + T5Runner model; + + T5Embedder(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + bool is_umt5 = false) + : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) { + } + + void get_param_tensors(std::map& tensors, const std::string prefix) { + model.get_param_tensors(tensors, prefix); + } + + void alloc_params_buffer() { + model.alloc_params_buffer(); + } + + std::tuple, std::vector, std::vector> tokenize(std::string text, + 
size_t max_length = 0, + bool padding = false) { + auto parsed_attention = parse_prompt_attention(text); + + { + std::stringstream ss; + ss << "["; + for (const auto& item : parsed_attention) { + ss << "['" << item.first << "', " << item.second << "], "; + } + ss << "]"; + LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); + } + + std::vector tokens; + std::vector weights; + for (const auto& item : parsed_attention) { + const std::string& curr_text = item.first; + float curr_weight = item.second; + std::vector curr_tokens = tokenizer.Encode(curr_text, false); + tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); + weights.insert(weights.end(), curr_tokens.size(), curr_weight); + } + + int EOS_TOKEN_ID = 1; + tokens.push_back(EOS_TOKEN_ID); + weights.push_back(1.0); + + std::vector attention_mask; + + tokenizer.pad_tokens(tokens, weights, &attention_mask, max_length, padding); + + // for (int i = 0; i < tokens.size(); i++) { + // std::cout << tokens[i] << ":" << weights[i] << ", "; + // } + // std::cout << std::endl; + + return {tokens, weights, attention_mask}; + } + + void test() { + ggml_init_params params; + params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB + params.mem_buffer = nullptr; + params.no_alloc = false; + + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); + + { + std::string text("a lovely cat"); + auto tokens_and_weights = tokenize(text, 512, true); + std::vector& tokens = std::get<0>(tokens_and_weights); + std::vector& weights = std::get<1>(tokens_and_weights); + std::vector& masks = std::get<2>(tokens_and_weights); + for (auto token : tokens) { + printf("%d ", token); + } + printf("\n"); + auto input_ids = sd::Tensor::from_vector(tokens); + auto attention_mask = sd::Tensor::from_vector(masks); + sd::Tensor out; + + int64_t t0 = ggml_time_ms(); + auto out_opt = model.compute(8, input_ids, attention_mask); + int64_t t1 = ggml_time_ms(); + + GGML_ASSERT(!out_opt.empty()); + out = 
std::move(out_opt); + print_sd_tensor(out); + LOG_DEBUG("t5 test done in %lldms", t1 - t0); + } + } + + static void load_from_file_and_test(const std::string& file_path) { + // cpu f16: pass + // cpu f32: pass + // cuda f16: pass + // cuda f32: pass + // cuda q8_0: pass + // ggml_backend_t backend = ggml_backend_cuda_init(0); + ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_type model_data_type = GGML_TYPE_F16; + + ModelLoader model_loader; + if (!model_loader.init_from_file_and_convert_name(file_path)) { + LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); + return; + } + + auto& tensor_storage_map = model_loader.get_tensor_storage_map(); + for (auto& [name, tensor_storage] : tensor_storage_map) { + if (ends_with(name, "weight")) { + tensor_storage.expected_type = model_data_type; + } + } + + std::shared_ptr t5 = std::make_shared(backend, false, tensor_storage_map, "", true); + + t5->alloc_params_buffer(); + std::map tensors; + t5->get_param_tensors(tensors, ""); + + bool success = model_loader.load_tensors(tensors); + + if (!success) { + LOG_ERROR("load tensors from model loader failed"); + return; + } + + LOG_INFO("t5 model loaded"); + t5->test(); + } +}; + +#endif // __T5_HPP__ diff --git a/src/tae.hpp b/src/tae.hpp index 3df09e4..0a0ca68 100644 --- a/src/tae.hpp +++ b/src/tae.hpp @@ -562,41 +562,40 @@ struct TinyImageAutoEncoder : public VAE { taesd.get_param_tensors(tensors, prefix); } - ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) { + sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override { + SD_UNUSED(rng); return vae_output; } - ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) { - return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) override { + return latents; } - ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, 
ggml_tensor* latents) { - return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) override { + return latents; } int get_encoder_output_channels(int input_channels) { return taesd.z_channels; } - ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) { + ggml_cgraph* build_graph(const sd::Tensor& z_tensor, bool decode_graph) { ggml_cgraph* gf = ggml_new_graph(compute_ctx); - z = to_backend(z); + ggml_tensor* z = make_input(z_tensor); auto runner_ctx = get_context(); ggml_tensor* out = decode_graph ? taesd.decode(&runner_ctx, z) : taesd.encode(&runner_ctx, z); ggml_build_forward_expand(gf, out); return gf; } - bool _compute(const int n_threads, - ggml_tensor* z, - bool decode_graph, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { + sd::Tensor _compute(const int n_threads, + const sd::Tensor& z_tensor, + bool decode_graph) override { auto get_graph = [&]() -> ggml_cgraph* { - return build_graph(z, decode_graph); + return build_graph(z_tensor, decode_graph); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), z_tensor.dim()); } }; @@ -625,42 +624,41 @@ struct TinyVideoAutoEncoder : public VAE { taehv.get_param_tensors(tensors, prefix); } - ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) { + sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override { + SD_UNUSED(rng); return vae_output; } - ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) { - return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) override { + return latents; } - ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) { - return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + 
sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) override { + return latents; } int get_encoder_output_channels(int input_channels) { return taehv.z_channels; } - ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) { + ggml_cgraph* build_graph(const sd::Tensor& z_tensor, bool decode_graph) { ggml_cgraph* gf = ggml_new_graph(compute_ctx); - z = to_backend(z); + ggml_tensor* z = make_input(z_tensor); auto runner_ctx = get_context(); ggml_tensor* out = decode_graph ? taehv.decode(&runner_ctx, z) : taehv.encode(&runner_ctx, z); ggml_build_forward_expand(gf, out); return gf; } - bool _compute(const int n_threads, - ggml_tensor* z, - bool decode_graph, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { + sd::Tensor _compute(const int n_threads, + const sd::Tensor& z_tensor, + bool decode_graph) override { auto get_graph = [&]() -> ggml_cgraph* { - return build_graph(z, decode_graph); + return build_graph(z_tensor, decode_graph); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), z_tensor.dim()); } }; -#endif // __TAE_HPP__ \ No newline at end of file +#endif // __TAE_HPP__ diff --git a/src/tensor.hpp b/src/tensor.hpp new file mode 100644 index 0000000..33a2bde --- /dev/null +++ b/src/tensor.hpp @@ -0,0 +1,1249 @@ +#ifndef __SD_TENSOR_HPP__ +#define __SD_TENSOR_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rng.hpp" + +namespace sd { + + template + class Tensor; + + inline std::vector tensor_unravel_index(int64_t flat, const std::vector& shape); + + [[noreturn]] inline void tensor_throw_invalid_argument(const std::string& message) { + std::fprintf(stderr, "sd::Tensor error: %s\n", message.c_str()); + std::fflush(stderr); + throw std::invalid_argument(message); + } + + inline std::string 
tensor_shape_to_string(const std::vector& shape) { + std::ostringstream oss; + oss << "["; + for (size_t i = 0; i < shape.size(); ++i) { + if (i != 0) { + oss << ", "; + } + oss << shape[i]; + } + oss << "]"; + return oss.str(); + } + + inline int64_t tensor_numel(const std::vector& shape) { + if (shape.empty()) { + return 0; + } + int64_t numel = 1; + for (int64_t dim : shape) { + if (dim < 0) { + tensor_throw_invalid_argument("Tensor shape must be non-negative, got shape=" + + tensor_shape_to_string(shape)); + } + numel *= dim; + } + return numel; + } + + template + class Tensor { + public: + Tensor() = default; + + explicit Tensor(std::vector shape) + : data_(static_cast(tensor_numel(shape))), shape_(std::move(shape)) { + } + + Tensor(std::vector shape, std::vector data) + : data_(std::move(data)), shape_(std::move(shape)) { + if (static_cast(data_.size()) != tensor_numel(shape_)) { + tensor_throw_invalid_argument("Tensor data size does not match shape: data.size()=" + + std::to_string(data_.size()) + ", shape=" + + tensor_shape_to_string(shape_) + ", numel=" + + std::to_string(tensor_numel(shape_))); + } + } + + const std::vector& shape() const { + return shape_; + } + + int64_t dim() const { + return static_cast(shape_.size()); + } + + int64_t numel() const { + return static_cast(data_.size()); + } + + bool empty() const { + return data_.empty(); + } + + T* data() { + return data_.data(); + } + + const T* data() const { + return data_.data(); + } + + std::vector& values() { + return data_; + } + + const std::vector& values() const { + return data_; + } + + void resize(std::vector shape) { + shape_ = std::move(shape); + data_.resize(static_cast(tensor_numel(shape_))); + } + + Tensor& reshape_(std::vector shape) { + if (tensor_numel(shape) != numel()) { + tensor_throw_invalid_argument("Tensor reshape changes element count: from shape=" + + tensor_shape_to_string(shape_) + " (numel=" + + std::to_string(numel()) + ") to shape=" + + tensor_shape_to_string(shape) + 
" (numel=" + + std::to_string(tensor_numel(shape)) + ")"); + } + shape_ = std::move(shape); + return *this; + } + + Tensor reshape(std::vector shape) const { + Tensor result = *this; + result.reshape_(std::move(shape)); + return result; + } + + Tensor& squeeze_() { + std::vector new_shape; + new_shape.reserve(shape_.size()); + for (int64_t dim : shape_) { + if (dim != 1) { + new_shape.push_back(dim); + } + } + shape_ = std::move(new_shape); + return *this; + } + + Tensor& squeeze_(size_t dim) { + if (dim >= shape_.size()) { + tensor_throw_invalid_argument("Tensor squeeze dimension out of range: dim=" + + std::to_string(dim) + ", shape=" + + tensor_shape_to_string(shape_)); + } + if (shape_[dim] != 1) { + tensor_throw_invalid_argument("Tensor squeeze requires dimension size 1: dim=" + + std::to_string(dim) + ", shape=" + + tensor_shape_to_string(shape_)); + } + shape_.erase(shape_.begin() + static_cast(dim)); + return *this; + } + + Tensor squeeze() const { + Tensor result = *this; + result.squeeze_(); + return result; + } + + Tensor squeeze(size_t dim) const { + Tensor result = *this; + result.squeeze_(dim); + return result; + } + + Tensor& unsqueeze_(size_t dim) { + if (dim > shape_.size()) { + tensor_throw_invalid_argument("Tensor unsqueeze dimension out of range: dim=" + + std::to_string(dim) + ", shape=" + + tensor_shape_to_string(shape_)); + } + shape_.insert(shape_.begin() + static_cast(dim), 1); + return *this; + } + + Tensor unsqueeze(size_t dim) const { + Tensor result = *this; + result.unsqueeze_(dim); + return result; + } + + Tensor permute(const std::vector& dims) const { + if (dims.size() != static_cast(dim())) { + tensor_throw_invalid_argument("Tensor permute requires one dimension index per axis: tensor_shape=" + + tensor_shape_to_string(shape_) + ", dims_size=" + + std::to_string(dims.size())); + } + + std::vector seen(dims.size(), false); + std::vector out_shape(dims.size(), 1); + for (size_t i = 0; i < dims.size(); ++i) { + size_t dim_index = 
dims[i]; + if (dim_index >= dims.size() || seen[dim_index]) { + tensor_throw_invalid_argument("Tensor permute dimensions must be a valid permutation: tensor_shape=" + + tensor_shape_to_string(shape_)); + } + seen[dim_index] = true; + out_shape[i] = shape_[dim_index]; + } + + Tensor result(out_shape); + if (result.numel() == 0) { + return result; + } + + for (int64_t flat = 0; flat < result.numel(); ++flat) { + std::vector out_coord = tensor_unravel_index(flat, out_shape); + std::vector src_coord(static_cast(dim()), 0); + for (size_t i = 0; i < dims.size(); ++i) { + src_coord[dims[i]] = out_coord[i]; + } + result[flat] = index(src_coord); + } + + return result; + } + + Tensor& permute_(const std::vector& dims) { + *this = permute(dims); + return *this; + } + + void fill_(const T& value) { + std::fill(data_.begin(), data_.end(), value); + } + + Tensor& masked_fill_(const Tensor& mask, const T& value); + + T mean() const; + + static Tensor zeros(std::vector shape) { + return Tensor(std::move(shape)); + } + + static Tensor zeros_like(const Tensor& other) { + return zeros(other.shape()); + } + + static Tensor ones(std::vector shape) { + return full(std::move(shape), static_cast(1)); + } + + static Tensor ones_like(const Tensor& other) { + return ones(other.shape()); + } + + static Tensor full(std::vector shape, const T& value) { + Tensor tensor(std::move(shape)); + tensor.fill_(value); + return tensor; + } + + static Tensor randn(std::vector shape, const std::shared_ptr& rng) { + static_assert(std::is_same_v, "Tensor::randn currently requires Tensor"); + if (!rng) { + tensor_throw_invalid_argument("Tensor randn requires a valid RNG"); + } + const uint32_t size = static_cast(tensor_numel(shape)); + return Tensor(std::move(shape), rng->randn(size)); + } + + static Tensor randn_like(const Tensor& other, const std::shared_ptr& rng) { + return randn(other.shape(), rng); + } + + static Tensor from_vector(std::vector data) { + const int64_t size = static_cast(data.size()); + 
return Tensor({size}, std::move(data)); + } + + T& index(const std::vector& coord) { + return data_.at(offset_of(coord)); + } + + const T& index(const std::vector& coord) const { + return data_.at(offset_of(coord)); + } + + template && ...)>> + T& index(Indices... indices) { + return index(std::vector{static_cast(indices)...}); + } + + template && ...)>> + const T& index(Indices... indices) const { + return index(std::vector{static_cast(indices)...}); + } + + T& operator[](int64_t index) { + return data_.at(static_cast(index)); + } + + const T& operator[](int64_t index) const { + return data_.at(static_cast(index)); + } + + private: + size_t offset_of(const std::vector& coord) const { + if (coord.size() != shape_.size()) { + tensor_throw_invalid_argument("Tensor index rank mismatch: coord_rank=" + + std::to_string(coord.size()) + ", shape=" + + tensor_shape_to_string(shape_)); + } + size_t offset = 0; + size_t stride = 1; + for (size_t i = 0; i < shape_.size(); ++i) { + if (coord[i] < 0 || coord[i] >= shape_[i]) { + tensor_throw_invalid_argument("Tensor index out of range: shape=" + + tensor_shape_to_string(shape_)); + } + offset += static_cast(coord[i]) * stride; + stride *= static_cast(shape_[i]); + } + return offset; + } + + std::vector data_; + std::vector shape_; + }; + + template + inline T Tensor::mean() const { + if (empty()) { + return T{}; + } + T sum = T{}; + for (const T& value : data_) { + sum += value; + } + return sum / static_cast(numel()); + } + + template <> + inline float Tensor::mean() const { + if (empty()) { + return 0.0f; + } + double sum = 0.0; + for (float value : data_) { + sum += static_cast(value); + } + return static_cast(sum / static_cast(numel())); + } + + template + inline void tensor_check_same_shape(const Tensor& lhs, const Tensor& rhs) { + if (lhs.shape() != rhs.shape()) { + tensor_throw_invalid_argument("Tensor shapes must match: lhs_shape=" + + tensor_shape_to_string(lhs.shape()) + ", rhs_shape=" + + 
tensor_shape_to_string(rhs.shape())); + } + } + + inline std::vector tensor_broadcast_shape(const std::vector& lhs, const std::vector& rhs) { + size_t ndim = std::max(lhs.size(), rhs.size()); + std::vector shape(ndim, 1); + for (size_t i = 0; i < ndim; ++i) { + int64_t lhs_dim = lhs.size() > i ? lhs[i] : 1; + int64_t rhs_dim = rhs.size() > i ? rhs[i] : 1; + if (lhs_dim != rhs_dim && lhs_dim != 1 && rhs_dim != 1) { + tensor_throw_invalid_argument("Tensor shapes are not broadcastable: lhs_shape=" + + tensor_shape_to_string(lhs) + ", rhs_shape=" + + tensor_shape_to_string(rhs)); + } + shape[i] = std::max(lhs_dim, rhs_dim); + } + return shape; + } + + inline std::vector tensor_unravel_index(int64_t flat, const std::vector& shape) { + std::vector coord(shape.size(), 0); + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] <= 0) { + tensor_throw_invalid_argument("Tensor unravel_index requires positive shape: shape=" + + tensor_shape_to_string(shape)); + } + coord[i] = flat % shape[i]; + flat /= shape[i]; + } + return coord; + } + + inline std::vector tensor_compute_strides(const std::vector& shape) { + std::vector strides(shape.size(), 1); + int64_t stride = 1; + for (size_t i = 0; i < shape.size(); ++i) { + strides[i] = stride; + stride *= shape[i]; + } + return strides; + } + + template + inline void tensor_for_each_broadcast_offset(const std::vector& out_shape, + const std::vector& lhs_shape_raw, + const std::vector& lhs_strides_raw, + const std::vector& rhs_shape_raw, + const std::vector& rhs_strides_raw, + F&& fn) { + const size_t ndim = out_shape.size(); + std::vector out_strides = tensor_compute_strides(out_shape); + std::vector lhs_shape(ndim, 1); + std::vector lhs_strides(ndim, 0); + std::vector rhs_shape(ndim, 1); + std::vector rhs_strides(ndim, 0); + + for (size_t i = 0; i < lhs_shape_raw.size(); ++i) { + lhs_shape[i] = lhs_shape_raw[i]; + lhs_strides[i] = lhs_strides_raw[i]; + } + for (size_t i = 0; i < rhs_shape_raw.size(); ++i) { + rhs_shape[i] = 
rhs_shape_raw[i]; + rhs_strides[i] = rhs_strides_raw[i]; + } + + const int64_t numel = tensor_numel(out_shape); + for (int64_t flat = 0; flat < numel; ++flat) { + int64_t remaining = flat; + int64_t lhs_offset = 0; + int64_t rhs_offset = 0; + for (size_t i = ndim; i-- > 0;) { + int64_t coord = remaining / out_strides[i]; + remaining %= out_strides[i]; + if (lhs_shape[i] != 1) { + lhs_offset += coord * lhs_strides[i]; + } + if (rhs_shape[i] != 1) { + rhs_offset += coord * rhs_strides[i]; + } + } + fn(flat, lhs_offset, rhs_offset); + } + } + + template + inline Tensor& Tensor::masked_fill_(const Tensor& mask, const T& value) { + if (empty()) { + return *this; + } + tensor_broadcast_shape(shape_, mask.shape()); + const std::vector data_strides = tensor_compute_strides(shape_); + const std::vector mask_strides = tensor_compute_strides(mask.shape()); + const uint8_t* mask_data = mask.data(); + tensor_for_each_broadcast_offset(shape_, + shape_, + data_strides, + mask.shape(), + mask_strides, + [&](int64_t, int64_t data_offset, int64_t mask_offset) { + if (mask_data[mask_offset] != 0) { + data_[static_cast(data_offset)] = value; + } + }); + return *this; + } + + template ::value>> + inline Tensor operator<(const Tensor& lhs, Scalar rhs) { + Tensor result(lhs.shape()); + const T value = static_cast(rhs); + for (int64_t i = 0; i < lhs.numel(); ++i) { + result[i] = lhs[i] < value ? 1 : 0; + } + return result; + } + + template ::value>> + inline Tensor operator<(Scalar lhs, const Tensor& rhs) { + Tensor result(rhs.shape()); + const T value = static_cast(lhs); + for (int64_t i = 0; i < rhs.numel(); ++i) { + result[i] = value < rhs[i] ? 
1 : 0; + } + return result; + } + + template + inline Tensor operator<(const Tensor& lhs, const Tensor& rhs) { + const std::vector out_shape = tensor_broadcast_shape(lhs.shape(), rhs.shape()); + Tensor result(out_shape); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* lhs_data = lhs.data(); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(out_shape, + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t flat, int64_t lhs_offset, int64_t rhs_offset) { + result[flat] = lhs_data[lhs_offset] < rhs_data[rhs_offset] ? 1 : 0; + }); + return result; + } + + template + inline Tensor& operator+=(Tensor& lhs, const Tensor& rhs) { + if (lhs.shape() == rhs.shape()) { + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] += rhs[i]; + } + return lhs; + } + tensor_broadcast_shape(lhs.shape(), rhs.shape()); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(lhs.shape(), + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t, int64_t lhs_offset, int64_t rhs_offset) { + lhs[static_cast(lhs_offset)] += rhs_data[rhs_offset]; + }); + return lhs; + } + + template ::value>> + inline Tensor& operator+=(Tensor& lhs, Scalar rhs) { + const T value = static_cast(rhs); + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] += value; + } + return lhs; + } + + template + inline Tensor& operator-=(Tensor& lhs, const Tensor& rhs) { + if (lhs.shape() == rhs.shape()) { + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] -= rhs[i]; + } + return lhs; + } + tensor_broadcast_shape(lhs.shape(), rhs.shape()); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* rhs_data = rhs.data(); + 
tensor_for_each_broadcast_offset(lhs.shape(), + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t, int64_t lhs_offset, int64_t rhs_offset) { + lhs[static_cast(lhs_offset)] -= rhs_data[rhs_offset]; + }); + return lhs; + } + + template ::value>> + inline Tensor& operator-=(Tensor& lhs, Scalar rhs) { + const T value = static_cast(rhs); + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] -= value; + } + return lhs; + } + + template + inline Tensor& operator*=(Tensor& lhs, const Tensor& rhs) { + if (lhs.shape() == rhs.shape()) { + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] *= rhs[i]; + } + return lhs; + } + tensor_broadcast_shape(lhs.shape(), rhs.shape()); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(lhs.shape(), + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t, int64_t lhs_offset, int64_t rhs_offset) { + lhs[static_cast(lhs_offset)] *= rhs_data[rhs_offset]; + }); + return lhs; + } + + template ::value>> + inline Tensor& operator*=(Tensor& lhs, Scalar rhs) { + const T value = static_cast(rhs); + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] *= value; + } + return lhs; + } + + template + inline Tensor& operator/=(Tensor& lhs, const Tensor& rhs) { + if (lhs.shape() == rhs.shape()) { + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] /= rhs[i]; + } + return lhs; + } + tensor_broadcast_shape(lhs.shape(), rhs.shape()); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(lhs.shape(), + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t, int64_t lhs_offset, int64_t rhs_offset) { + lhs[static_cast(lhs_offset)] /= rhs_data[rhs_offset]; + }); + return lhs; + } + + 
template ::value>> + inline Tensor& operator/=(Tensor& lhs, Scalar rhs) { + const T value = static_cast(rhs); + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] /= value; + } + return lhs; + } + + template + inline Tensor operator+(Tensor lhs, const Tensor& rhs) { + if (lhs.shape() != rhs.shape()) { + const std::vector out_shape = tensor_broadcast_shape(lhs.shape(), rhs.shape()); + Tensor result(out_shape); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* lhs_data = lhs.data(); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(out_shape, + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t flat, int64_t lhs_offset, int64_t rhs_offset) { + result[flat] = lhs_data[lhs_offset] + rhs_data[rhs_offset]; + }); + return result; + } + lhs += rhs; + return lhs; + } + + template ::value>> + inline Tensor operator+(Tensor lhs, Scalar rhs) { + lhs += rhs; + return lhs; + } + + template ::value>> + inline Tensor operator+(Scalar lhs, Tensor rhs) { + rhs += lhs; + return rhs; + } + + template + inline Tensor operator-(Tensor lhs, const Tensor& rhs) { + if (lhs.shape() != rhs.shape()) { + const std::vector out_shape = tensor_broadcast_shape(lhs.shape(), rhs.shape()); + Tensor result(out_shape); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* lhs_data = lhs.data(); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(out_shape, + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t flat, int64_t lhs_offset, int64_t rhs_offset) { + result[flat] = lhs_data[lhs_offset] - rhs_data[rhs_offset]; + }); + return result; + } + lhs -= rhs; + return lhs; + } + + template ::value>> + inline Tensor operator-(Tensor lhs, Scalar rhs) { + lhs -= rhs; + return lhs; + } + + template ::value>> + inline Tensor 
operator-(Scalar lhs, const Tensor& rhs) { + Tensor result = rhs; + const T value = static_cast(lhs); + for (int64_t i = 0; i < result.numel(); ++i) { + result[i] = value - result[i]; + } + return result; + } + + template + inline Tensor operator*(Tensor lhs, const Tensor& rhs) { + if (lhs.shape() != rhs.shape()) { + const std::vector out_shape = tensor_broadcast_shape(lhs.shape(), rhs.shape()); + Tensor result(out_shape); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* lhs_data = lhs.data(); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(out_shape, + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t flat, int64_t lhs_offset, int64_t rhs_offset) { + result[flat] = lhs_data[lhs_offset] * rhs_data[rhs_offset]; + }); + return result; + } + lhs *= rhs; + return lhs; + } + + template ::value>> + inline Tensor operator*(Tensor lhs, Scalar rhs) { + lhs *= rhs; + return lhs; + } + + template ::value>> + inline Tensor operator*(Scalar lhs, Tensor rhs) { + rhs *= lhs; + return rhs; + } + + template + inline Tensor operator/(Tensor lhs, const Tensor& rhs) { + if (lhs.shape() != rhs.shape()) { + const std::vector out_shape = tensor_broadcast_shape(lhs.shape(), rhs.shape()); + Tensor result(out_shape); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* lhs_data = lhs.data(); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(out_shape, + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t flat, int64_t lhs_offset, int64_t rhs_offset) { + result[flat] = lhs_data[lhs_offset] / rhs_data[rhs_offset]; + }); + return result; + } + lhs /= rhs; + return lhs; + } + + template ::value>> + inline Tensor operator/(Tensor lhs, Scalar rhs) { + lhs /= rhs; + return lhs; + } + + template ::value>> + inline 
Tensor operator/(Scalar lhs, const Tensor& rhs) { + Tensor result = rhs; + const T value = static_cast(lhs); + for (int64_t i = 0; i < result.numel(); ++i) { + result[i] = value / result[i]; + } + return result; + } + + template + inline Tensor operator-(const Tensor& tensor) { + Tensor result = tensor; + for (int64_t i = 0; i < result.numel(); ++i) { + result[i] = -result[i]; + } + return result; + } + + template + inline Tensor zeros(std::vector shape) { + return Tensor::zeros(std::move(shape)); + } + + template + inline Tensor full(std::vector shape, const T& value) { + return Tensor::full(std::move(shape), value); + } + + template + inline Tensor randn(std::vector shape, const std::shared_ptr& rng) { + return Tensor::randn(std::move(shape), rng); + } + + template + inline Tensor randn_like(const Tensor& tensor, const std::shared_ptr& rng) { + return Tensor::randn(tensor.shape(), rng); + } + + template + inline std::vector tensor_to_vector(const Tensor& tensor) { + return tensor.values(); + } + + namespace ops { + enum class InterpolateMode { + Nearest, + }; + + inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) { + if (index < 0) { + index += dim_size; + } + return index; + } + + template + inline std::pair resolve_slice_bounds(const Tensor& input, + size_t dim, + int64_t start, + int64_t end) { + if (dim >= static_cast(input.dim())) { + tensor_throw_invalid_argument("Tensor slice dimension out of range: dim=" + + std::to_string(dim) + ", rank=" + + std::to_string(input.dim()) + ", input_shape=" + + tensor_shape_to_string(input.shape())); + } + + int64_t dim_size = input.shape()[dim]; + start = normalize_slice_bound(start, dim_size); + end = normalize_slice_bound(end, dim_size); + + if (start < 0 || start > dim_size || end < 0 || end > dim_size || start > end) { + tensor_throw_invalid_argument("Tensor slice bounds out of range: dim=" + + std::to_string(dim) + ", start=" + + std::to_string(start) + ", end=" + + std::to_string(end) + ", 
input_shape=" + + tensor_shape_to_string(input.shape())); + } + + return {start, end}; + } + + template + inline Tensor exp(const Tensor& input) { + Tensor output(input.shape()); + for (int64_t i = 0; i < input.numel(); ++i) { + output[i] = static_cast(std::exp(static_cast(input[i]))); + } + return output; + } + + template + inline Tensor clamp(const Tensor& input, const T& min_value, const T& max_value) { + if (min_value > max_value) { + tensor_throw_invalid_argument("Tensor clamp requires min_value <= max_value"); + } + Tensor output(input.shape()); + for (int64_t i = 0; i < input.numel(); ++i) { + output[i] = std::clamp(input[i], min_value, max_value); + } + return output; + } + + template + inline Tensor round(const Tensor& input) { + Tensor output(input.shape()); + for (int64_t i = 0; i < input.numel(); ++i) { + output[i] = static_cast(std::round(static_cast(input[i]))); + } + return output; + } + + template + inline Tensor slice(const Tensor& input, + size_t dim, + int64_t start, + int64_t end) { + auto [resolved_start, resolved_end] = resolve_slice_bounds(input, dim, start, end); + std::vector out_shape = input.shape(); + out_shape[dim] = resolved_end - resolved_start; + + Tensor output(out_shape); + if (output.numel() == 0) { + return output; + } + + int64_t inner = 1; + for (size_t i = 0; i < dim; ++i) { + inner *= input.shape()[i]; + } + + int64_t outer = 1; + for (size_t i = dim + 1; i < static_cast(input.dim()); ++i) { + outer *= input.shape()[i]; + } + + int64_t src_chunk = (resolved_end - resolved_start) * inner; + int64_t src_stride = input.shape()[dim] * inner; + for (int64_t i = 0; i < outer; ++i) { + const int64_t src_offset = i * src_stride + resolved_start * inner; + const int64_t dst_offset = i * src_chunk; + std::copy_n(input.data() + src_offset, src_chunk, output.data() + dst_offset); + } + + return output; + } + + template + inline Tensor narrow(const Tensor& input, + size_t dim, + int64_t start, + int64_t length) { + if (length < 0) { + 
tensor_throw_invalid_argument("Tensor narrow requires non-negative length: length=" + + std::to_string(length) + ", input_shape=" + + tensor_shape_to_string(input.shape())); + } + return slice(input, dim, start, start + length); + } + + template + inline void slice_assign(Tensor* dst, + size_t dim, + int64_t start, + int64_t end, + const Tensor& src) { + if (dst == nullptr) { + tensor_throw_invalid_argument("Tensor slice_assign requires non-null dst"); + } + + auto [resolved_start, resolved_end] = resolve_slice_bounds(*dst, dim, start, end); + if (src.dim() != dst->dim()) { + tensor_throw_invalid_argument("Tensor slice_assign requires matching rank: dst_shape=" + + tensor_shape_to_string(dst->shape()) + ", src_shape=" + + tensor_shape_to_string(src.shape())); + } + + std::vector expected_shape = dst->shape(); + expected_shape[dim] = resolved_end - resolved_start; + if (src.shape() != expected_shape) { + tensor_throw_invalid_argument("Tensor slice_assign requires matching source shape: dst_shape=" + + tensor_shape_to_string(dst->shape()) + ", src_shape=" + + tensor_shape_to_string(src.shape()) + ", expected_src_shape=" + + tensor_shape_to_string(expected_shape)); + } + + if (src.numel() == 0) { + return; + } + + int64_t inner = 1; + for (size_t i = 0; i < dim; ++i) { + inner *= dst->shape()[i]; + } + + int64_t outer = 1; + for (size_t i = dim + 1; i < static_cast(dst->dim()); ++i) { + outer *= dst->shape()[i]; + } + + int64_t dst_chunk = (resolved_end - resolved_start) * inner; + int64_t dst_stride = dst->shape()[dim] * inner; + for (int64_t i = 0; i < outer; ++i) { + const int64_t dst_offset = i * dst_stride + resolved_start * inner; + const int64_t src_offset = i * dst_chunk; + std::copy_n(src.data() + src_offset, dst_chunk, dst->data() + dst_offset); + } + } + + template + inline void fill_slice(Tensor* dst, + size_t dim, + int64_t start, + int64_t end, + const T& value) { + if (dst == nullptr) { + tensor_throw_invalid_argument("Tensor fill_slice requires 
non-null dst"); + } + + auto [resolved_start, resolved_end] = resolve_slice_bounds(*dst, dim, start, end); + int64_t inner = 1; + for (size_t i = 0; i < dim; ++i) { + inner *= dst->shape()[i]; + } + + int64_t outer = 1; + for (size_t i = dim + 1; i < static_cast(dst->dim()); ++i) { + outer *= dst->shape()[i]; + } + + int64_t chunk = (resolved_end - resolved_start) * inner; + int64_t stride = dst->shape()[dim] * inner; + for (int64_t i = 0; i < outer; ++i) { + const int64_t offset = i * stride + resolved_start * inner; + std::fill_n(dst->data() + offset, chunk, value); + } + } + + template + inline Tensor interpolate(const Tensor& input, + std::vector output_shape, + InterpolateMode mode = InterpolateMode::Nearest, + bool align_corners = false) { + if (mode != InterpolateMode::Nearest) { + tensor_throw_invalid_argument("Only nearest interpolate mode is implemented, got mode=" + + std::to_string(static_cast(mode))); + } + if (align_corners) { + tensor_throw_invalid_argument("align_corners is not supported for nearest interpolate: input_shape=" + + tensor_shape_to_string(input.shape()) + ", output_shape=" + + tensor_shape_to_string(output_shape)); + } + if (input.shape() == output_shape) { + return input; + } + if (input.dim() != static_cast(output_shape.size())) { + tensor_throw_invalid_argument("Tensor interpolate requires matching rank: input_dim=" + + std::to_string(input.dim()) + ", output_dim=" + + std::to_string(output_shape.size()) + ", input_shape=" + + tensor_shape_to_string(input.shape()) + ", output_shape=" + + tensor_shape_to_string(output_shape)); + } + for (size_t i = 0; i < output_shape.size(); ++i) { + if (output_shape[i] <= 0) { + tensor_throw_invalid_argument("Tensor interpolate output shape must be positive: input_shape=" + + tensor_shape_to_string(input.shape()) + ", output_shape=" + + tensor_shape_to_string(output_shape)); + } + if (input.shape()[i] <= 0) { + tensor_throw_invalid_argument("Tensor interpolate input shape must be positive: 
input_shape=" + + tensor_shape_to_string(input.shape()) + ", output_shape=" + + tensor_shape_to_string(output_shape)); + } + } + + Tensor output(std::move(output_shape)); + for (int64_t flat = 0; flat < output.numel(); ++flat) { + std::vector output_coord = tensor_unravel_index(flat, output.shape()); + std::vector input_coord(static_cast(input.dim()), 0); + for (size_t i = 0; i < static_cast(input.dim()); ++i) { + input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i]; + } + output[flat] = input.index(input_coord); + } + + return output; + } + + template + inline Tensor interpolate(const Tensor& input, + const std::optional>& size, + const std::optional>& scale_factor, + InterpolateMode mode = InterpolateMode::Nearest, + bool align_corners = false) { + if (mode != InterpolateMode::Nearest) { + tensor_throw_invalid_argument("Only nearest interpolate mode is implemented, got mode=" + + std::to_string(static_cast(mode))); + } + if (align_corners) { + tensor_throw_invalid_argument("align_corners is not supported for nearest interpolate: input_shape=" + + tensor_shape_to_string(input.shape())); + } + if (size.has_value() == scale_factor.has_value()) { + tensor_throw_invalid_argument("Tensor interpolate requires exactly one of size or scale_factor: input_shape=" + + tensor_shape_to_string(input.shape())); + } + + std::vector output_shape = input.shape(); + if (size.has_value()) { + if (size->empty() || size->size() > output_shape.size()) { + tensor_throw_invalid_argument("Tensor interpolate size must target low dimensions: input_shape=" + + tensor_shape_to_string(input.shape()) + ", size_rank=" + + std::to_string(size->size())); + } + for (size_t i = 0; i < size->size(); ++i) { + if ((*size)[i] <= 0) { + tensor_throw_invalid_argument("Tensor interpolate size must be positive: input_shape=" + + tensor_shape_to_string(input.shape()) + ", size=" + + tensor_shape_to_string(*size)); + } + output_shape[i] = (*size)[i]; + } + } else { + if 
(scale_factor->empty() || scale_factor->size() > output_shape.size()) { + tensor_throw_invalid_argument("Tensor interpolate scale_factor must target low dimensions: input_shape=" + + tensor_shape_to_string(input.shape()) + ", scale_factor_rank=" + + std::to_string(scale_factor->size())); + } + /* scale_factor[i] scales dimension i; dimensions at or beyond scale_factor->size() keep the input extent */for (size_t i = 0; i < scale_factor->size(); ++i) { + if ((*scale_factor)[i] <= 0.0) { + tensor_throw_invalid_argument("Tensor interpolate scale_factor must be positive: input_shape=" + + tensor_shape_to_string(input.shape())); + } + /* the scaled extent is rounded down with std::floor, and a result that is not positive is rejected just below */output_shape[i] = static_cast( + std::floor(static_cast(output_shape[i]) * (*scale_factor)[i])); + if (output_shape[i] <= 0) { + tensor_throw_invalid_argument("Tensor interpolate output shape must be positive: input_shape=" + + tensor_shape_to_string(input.shape()) + ", output_shape=" + + tensor_shape_to_string(output_shape)); + } + } + } + + /* hand the resolved shape to the overload that takes an explicit output_shape */return interpolate(input, std::move(output_shape), mode, align_corners); + } + + /* Overload taking a single scalar scale_factor: the scalar is repeated into a vector (one entry per size entry when size is set, otherwise one per input dimension) and forwarded to the overload that takes an optional size and an optional scale_factor. NOTE(review): the forwarded vector always carries a value, so when size is also set the callee check that exactly one of size or scale_factor is present would presumably reject the call - confirm that this is intended. */template + inline Tensor interpolate(const Tensor& input, + const std::optional>& size, + double scale_factor, + InterpolateMode mode = InterpolateMode::Nearest, + bool align_corners = false) { + return interpolate(input, + size, + std::vector(size.has_value() ? 
size->size() : input.dim(), scale_factor), + mode, + align_corners); + } + + /* concat: returns a new tensor that holds lhs followed by rhs along dim. Both inputs must have the same rank, dim must be below that rank, and every extent other than the one at dim must match; violations are reported through tensor_throw_invalid_argument. The inputs are not modified. */template + inline Tensor concat(const Tensor& lhs, const Tensor& rhs, size_t dim) { + if (lhs.dim() != rhs.dim()) { + tensor_throw_invalid_argument("Tensor concat requires same rank: lhs_dim=" + + std::to_string(lhs.dim()) + ", rhs_dim=" + + std::to_string(rhs.dim()) + ", lhs_shape=" + + tensor_shape_to_string(lhs.shape()) + ", rhs_shape=" + + tensor_shape_to_string(rhs.shape())); + } + if (dim >= static_cast(lhs.dim())) { + tensor_throw_invalid_argument("Tensor concat dimension out of range: dim=" + + std::to_string(dim) + ", rank=" + + std::to_string(lhs.dim()) + ", lhs_shape=" + + tensor_shape_to_string(lhs.shape())); + } + std::vector out_shape = lhs.shape(); + for (size_t i = 0; i < static_cast(lhs.dim()); ++i) { + if (i == dim) { + continue; + } + if (lhs.shape()[i] != rhs.shape()[i]) { + tensor_throw_invalid_argument("Tensor concat requires matching non-concat dimensions: dim=" + + std::to_string(dim) + ", lhs_shape=" + + tensor_shape_to_string(lhs.shape()) + ", rhs_shape=" + + tensor_shape_to_string(rhs.shape())); + } + } + /* the output extent at dim is the sum of both input extents; all other extents come from lhs */out_shape[dim] += rhs.shape()[dim]; + + Tensor out(out_shape); + /* inner: product of the extents below dim; outer: product of the extents above dim */int64_t inner = 1; + for (size_t i = 0; i < dim; ++i) { + inner *= lhs.shape()[i]; + } + + int64_t outer = 1; + for (size_t i = dim + 1; i < static_cast(lhs.dim()); ++i) { + outer *= lhs.shape()[i]; + } + + int64_t lhs_chunk = lhs.shape()[dim] * inner; + int64_t rhs_chunk = rhs.shape()[dim] * inner; + int64_t out_chunk = lhs_chunk + rhs_chunk; + + /* for each outer index the lhs run is written first and the rhs run directly after it */for (int64_t i = 0; i < outer; ++i) { + int64_t lhs_offset = i * lhs_chunk; + int64_t rhs_offset = i * rhs_chunk; + int64_t out_offset = i * out_chunk; + + std::copy_n(lhs.data() + lhs_offset, lhs_chunk, out.data() + out_offset); + std::copy_n(rhs.data() + rhs_offset, rhs_chunk, out.data() + out_offset + lhs_chunk); + } + return out; + } + + /* chunk: splits tensor along dim into equally sized parts and returns them as separate tensors. chunks must be positive, dim must be below the rank, and the extent at dim must be divisible by chunks; an extent of zero yields an empty result. */template + inline std::vector> chunk(const Tensor& tensor, int64_t chunks, size_t dim) { + if (chunks <= 0) { + 
tensor_throw_invalid_argument("Tensor chunk requires chunks > 0: chunks=" + + std::to_string(chunks) + ", tensor_shape=" + + tensor_shape_to_string(tensor.shape())); + } + if (dim >= static_cast(tensor.dim())) { + tensor_throw_invalid_argument("Tensor chunk dimension out of range: dim=" + + std::to_string(dim) + ", rank=" + + std::to_string(tensor.dim()) + ", tensor_shape=" + + tensor_shape_to_string(tensor.shape())); + } + + const int64_t dim_size = tensor.shape()[dim]; + /* an extent of zero at dim returns an empty list instead of a list of empty tensors */if (dim_size == 0) { + return {}; + } + /* only even splits are accepted; a remainder is reported as an invalid argument */if (dim_size % chunks != 0) { + tensor_throw_invalid_argument("Tensor chunk requires the dimension size to be divisible by chunks: dim=" + + std::to_string(dim) + ", dim_size=" + + std::to_string(dim_size) + ", chunks=" + + std::to_string(chunks) + ", tensor_shape=" + + tensor_shape_to_string(tensor.shape())); + } + + const int64_t chunk_size = dim_size / chunks; + /* inner: product of the extents below dim; outer: product of the extents above dim */int64_t inner = 1; + for (size_t i = 0; i < dim; ++i) { + inner *= tensor.shape()[i]; + } + + int64_t outer = 1; + for (size_t i = dim + 1; i < static_cast(tensor.dim()); ++i) { + outer *= tensor.shape()[i]; + } + + std::vector> parts; + parts.reserve(static_cast(chunks)); + + /* one part per window of chunk_size positions along dim, in increasing start order */for (int64_t start = 0; start < dim_size; start += chunk_size) { + std::vector part_shape = tensor.shape(); + part_shape[dim] = chunk_size; + Tensor part(part_shape); + + const int64_t src_chunk = chunk_size * inner; + const int64_t dst_chunk = src_chunk; + /* for each outer index, copy the contiguous run that belongs to this window */for (int64_t i = 0; i < outer; ++i) { + const int64_t src_offset = (i * dim_size + start) * inner; + const int64_t dst_offset = i * dst_chunk; + std::copy_n(tensor.data() + src_offset, src_chunk, part.data() + dst_offset); + } + + parts.push_back(std::move(part)); + } + + return parts; + } + + } // namespace ops + +} // namespace sd + +#endif diff --git a/src/tensor_ggml.hpp b/src/tensor_ggml.hpp new file mode 100644 index 0000000..493a958 --- /dev/null +++ b/src/tensor_ggml.hpp @@ -0,0 +1,127 @@ +#ifndef __SD_TENSOR_GGML_HPP__ +#define __SD_TENSOR_GGML_HPP__ + +#include 
+#include +#include +#include +#include +#include + +#include "ggml.h" +#include "tensor.hpp" + +namespace sd { + + /* GGMLTypeTraits maps a tensor element type to the matching ggml_type constant. The primary template is only declared; the four specializations below supply GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I32 and GGML_TYPE_I64. */template + struct GGMLTypeTraits; + + template <> + struct GGMLTypeTraits { + static constexpr ggml_type type = GGML_TYPE_F32; + }; + + template <> + struct GGMLTypeTraits { + static constexpr ggml_type type = GGML_TYPE_F16; + }; + + template <> + struct GGMLTypeTraits { + static constexpr ggml_type type = GGML_TYPE_I32; + }; + + template <> + struct GGMLTypeTraits { + static constexpr ggml_type type = GGML_TYPE_I64; + }; + + /* shape_from_ggml: returns the first ggml_n_dims(tensor) entries of tensor->ne as a shape vector, in ne order. The pointer is dereferenced without a null check. */inline std::vector shape_from_ggml(const ggml_tensor* tensor) { + std::vector shape; + shape.reserve(static_cast(ggml_n_dims(tensor))); + for (int i = 0; i < ggml_n_dims(tensor); ++i) { + shape.push_back(tensor->ne[i]); + } + return shape; + } + + /* make_sd_tensor_from_ggml: copies a ggml tensor into a newly created sd::Tensor. A null tensor yields a default-constructed result; a ggml element type that differs from the GGMLTypeTraits constant for the requested type aborts through GGML_ABORT. */template + inline Tensor make_sd_tensor_from_ggml(const ggml_tensor* tensor) { + if (tensor == nullptr) { + return {}; + } + if (tensor->type != GGMLTypeTraits::type) { + GGML_ABORT("ggml tensor type does not match sd::Tensor type"); + } + Tensor result(shape_from_ggml(tensor)); + /* a tensor attached to a backend buffer is read through ggml_backend_tensor_get; otherwise the bytes are copied straight from tensor->data */if (tensor->buffer != nullptr) { + ggml_backend_tensor_get(tensor, result.data(), 0, ggml_nbytes(tensor)); + } else { + std::memcpy(result.data(), tensor->data, ggml_nbytes(tensor)); + } + return result; + } + + /* make_ggml_tensor: creates a ggml tensor in ctx with the shape of an sd::Tensor of rank 1 to 5 (asserted) and, when copy_data is true and the tensor is not empty, copies the element data into it. */template + inline ggml_tensor* make_ggml_tensor(ggml_context* ctx, const Tensor& tensor, bool copy_data = true) { + GGML_ASSERT(tensor.dim() > 0 && tensor.dim() <= 5); + + int n_dims = std::min(static_cast(tensor.dim()), GGML_MAX_DIMS); + + std::array ne = {1, 1, 1, 1}; + for (int64_t i = 0; i < n_dims; ++i) { + ne[static_cast(i)] = tensor.shape()[static_cast(i)]; + } + + /* ne is initialised with four entries, so a fifth dimension is folded into ne[3] by multiplication */if (tensor.dim() == 5) { + ne[3] *= tensor.shape()[4]; + } + + ggml_tensor* result = ggml_new_tensor(ctx, GGMLTypeTraits::type, n_dims, ne.data()); + /* NOTE(review): the copy writes to result->data directly, which presumably requires ctx to allocate tensor data at creation time - confirm against the callers */if (copy_data && tensor.numel() > 0) { + std::memcpy(result->data, tensor.data(), static_cast(ggml_nbytes(result))); + } + return result; + } + + template + inline Tensor 
load_tensor_from_file_as_tensor(const std::string& file_path) { + /* Reads a tensor from a binary file laid out as three int32 header fields (n_dims, name length, type code), then n_dims int32 extents, then the name bytes, then the raw element data. Throws std::runtime_error when the file cannot be opened or when the header or data read fails, and std::invalid_argument when the stored type code differs from the GGMLTypeTraits constant for the requested element type. */std::ifstream file(file_path, std::ios::binary); + if (!file.is_open()) { + throw std::runtime_error("failed to open tensor file: " + file_path); + } + + int32_t n_dims = 0; + int32_t length = 0; + int32_t ttype = 0; + file.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + file.read(reinterpret_cast(&length), sizeof(length)); + file.read(reinterpret_cast(&ttype), sizeof(ttype)); + if (!file.good()) { + throw std::runtime_error("incomplete tensor file header: " + file_path); + } + if (static_cast(ttype) != GGMLTypeTraits::type) { + throw std::invalid_argument("tensor file type does not match requested sd::Tensor type"); + } + + /* NOTE(review): n_dims and length come from the file and are used unchecked - shape has four slots while the loop below writes n_dims of them, and length sizes the name buffer. Presumably only trusted dump files are loaded; confirm, or add range checks in the source header. */std::vector shape(4, 1); + for (int i = 0; i < n_dims; ++i) { + int32_t dim = 1; + file.read(reinterpret_cast(&dim), sizeof(dim)); + shape[static_cast(i)] = dim; + } + /* the stored name is read to advance the stream past it and is not used afterwards */std::string name(static_cast(length), '\0'); + file.read(name.data(), length); + + /* keep only the n_dims extents that were actually read */shape.resize(static_cast(n_dims)); + Tensor tensor(shape); + file.read(reinterpret_cast(tensor.data()), static_cast(tensor.numel() * sizeof(T))); + if (!file.good()) { + throw std::runtime_error("incomplete tensor file data: " + file_path); + } + return tensor; + } + +} // namespace sd + +#endif diff --git a/src/tokenize_util.cpp b/src/tokenize_util.cpp index 22cf8ae..33fdad2 100644 --- a/src/tokenize_util.cpp +++ b/src/tokenize_util.cpp @@ -1,993 +1,993 @@ -#include -#include -#include -#include - -#include "tokenize_util.h" - -bool is_number(char32_t ch) { - return (ch >= U'0' && ch <= U'9'); -} - -bool is_letter(char32_t ch) { - static const struct { char32_t start, end; } ranges[] = { - {0x41, 0x5A}, - {0x61, 0x7A}, - {0xAA, 0xAA}, - {0xB5, 0xB5}, - {0xBA, 0xBA}, - {0xC0, 0xD6}, - {0xD8, 0xF6}, - {0xF8, 0x2C1}, - {0x2C6, 0x2D1}, - {0x2E0, 0x2E4}, - {0x2EC, 0x2EC}, - {0x2EE, 0x2EE}, - {0x370, 0x374}, - {0x376, 0x377}, - {0x37A, 0x37D}, - {0x37F, 0x37F}, - {0x386, 0x386}, - {0x388, 0x38A}, - {0x38C, 0x38C}, - {0x38E, 0x3A1}, - 
{0x3A3, 0x3F5}, - {0x3F7, 0x481}, - {0x48A, 0x52F}, - {0x531, 0x556}, - {0x559, 0x559}, - {0x560, 0x588}, - {0x5D0, 0x5EA}, - {0x5EF, 0x5F2}, - {0x620, 0x64A}, - {0x66E, 0x66F}, - {0x671, 0x6D3}, - {0x6D5, 0x6D5}, - {0x6E5, 0x6E6}, - {0x6EE, 0x6EF}, - {0x6FA, 0x6FC}, - {0x6FF, 0x6FF}, - {0x710, 0x710}, - {0x712, 0x72F}, - {0x74D, 0x7A5}, - {0x7B1, 0x7B1}, - {0x7CA, 0x7EA}, - {0x7F4, 0x7F5}, - {0x7FA, 0x7FA}, - {0x800, 0x815}, - {0x81A, 0x81A}, - {0x824, 0x824}, - {0x828, 0x828}, - {0x840, 0x858}, - {0x860, 0x86A}, - {0x870, 0x887}, - {0x889, 0x88F}, - {0x8A0, 0x8C9}, - {0x904, 0x939}, - {0x93D, 0x93D}, - {0x950, 0x950}, - {0x958, 0x961}, - {0x971, 0x980}, - {0x985, 0x98C}, - {0x98F, 0x990}, - {0x993, 0x9A8}, - {0x9AA, 0x9B0}, - {0x9B2, 0x9B2}, - {0x9B6, 0x9B9}, - {0x9BD, 0x9BD}, - {0x9CE, 0x9CE}, - {0x9DC, 0x9DD}, - {0x9DF, 0x9E1}, - {0x9F0, 0x9F1}, - {0x9FC, 0x9FC}, - {0xA05, 0xA0A}, - {0xA0F, 0xA10}, - {0xA13, 0xA28}, - {0xA2A, 0xA30}, - {0xA32, 0xA33}, - {0xA35, 0xA36}, - {0xA38, 0xA39}, - {0xA59, 0xA5C}, - {0xA5E, 0xA5E}, - {0xA72, 0xA74}, - {0xA85, 0xA8D}, - {0xA8F, 0xA91}, - {0xA93, 0xAA8}, - {0xAAA, 0xAB0}, - {0xAB2, 0xAB3}, - {0xAB5, 0xAB9}, - {0xABD, 0xABD}, - {0xAD0, 0xAD0}, - {0xAE0, 0xAE1}, - {0xAF9, 0xAF9}, - {0xB05, 0xB0C}, - {0xB0F, 0xB10}, - {0xB13, 0xB28}, - {0xB2A, 0xB30}, - {0xB32, 0xB33}, - {0xB35, 0xB39}, - {0xB3D, 0xB3D}, - {0xB5C, 0xB5D}, - {0xB5F, 0xB61}, - {0xB71, 0xB71}, - {0xB83, 0xB83}, - {0xB85, 0xB8A}, - {0xB8E, 0xB90}, - {0xB92, 0xB95}, - {0xB99, 0xB9A}, - {0xB9C, 0xB9C}, - {0xB9E, 0xB9F}, - {0xBA3, 0xBA4}, - {0xBA8, 0xBAA}, - {0xBAE, 0xBB9}, - {0xBD0, 0xBD0}, - {0xC05, 0xC0C}, - {0xC0E, 0xC10}, - {0xC12, 0xC28}, - {0xC2A, 0xC39}, - {0xC3D, 0xC3D}, - {0xC58, 0xC5A}, - {0xC5C, 0xC5D}, - {0xC60, 0xC61}, - {0xC80, 0xC80}, - {0xC85, 0xC8C}, - {0xC8E, 0xC90}, - {0xC92, 0xCA8}, - {0xCAA, 0xCB3}, - {0xCB5, 0xCB9}, - {0xCBD, 0xCBD}, - {0xCDC, 0xCDE}, - {0xCE0, 0xCE1}, - {0xCF1, 0xCF2}, - {0xD04, 0xD0C}, - {0xD0E, 0xD10}, - {0xD12, 0xD3A}, - 
{0xD3D, 0xD3D}, - {0xD4E, 0xD4E}, - {0xD54, 0xD56}, - {0xD5F, 0xD61}, - {0xD7A, 0xD7F}, - {0xD85, 0xD96}, - {0xD9A, 0xDB1}, - {0xDB3, 0xDBB}, - {0xDBD, 0xDBD}, - {0xDC0, 0xDC6}, - {0xE01, 0xE30}, - {0xE32, 0xE33}, - {0xE40, 0xE46}, - {0xE81, 0xE82}, - {0xE84, 0xE84}, - {0xE86, 0xE8A}, - {0xE8C, 0xEA3}, - {0xEA5, 0xEA5}, - {0xEA7, 0xEB0}, - {0xEB2, 0xEB3}, - {0xEBD, 0xEBD}, - {0xEC0, 0xEC4}, - {0xEC6, 0xEC6}, - {0xEDC, 0xEDF}, - {0xF00, 0xF00}, - {0xF40, 0xF47}, - {0xF49, 0xF6C}, - {0xF88, 0xF8C}, - {0x1000, 0x102A}, - {0x103F, 0x103F}, - {0x1050, 0x1055}, - {0x105A, 0x105D}, - {0x1061, 0x1061}, - {0x1065, 0x1066}, - {0x106E, 0x1070}, - {0x1075, 0x1081}, - {0x108E, 0x108E}, - {0x10A0, 0x10C5}, - {0x10C7, 0x10C7}, - {0x10CD, 0x10CD}, - {0x10D0, 0x10FA}, - {0x10FC, 0x1248}, - {0x124A, 0x124D}, - {0x1250, 0x1256}, - {0x1258, 0x1258}, - {0x125A, 0x125D}, - {0x1260, 0x1288}, - {0x128A, 0x128D}, - {0x1290, 0x12B0}, - {0x12B2, 0x12B5}, - {0x12B8, 0x12BE}, - {0x12C0, 0x12C0}, - {0x12C2, 0x12C5}, - {0x12C8, 0x12D6}, - {0x12D8, 0x1310}, - {0x1312, 0x1315}, - {0x1318, 0x135A}, - {0x1380, 0x138F}, - {0x13A0, 0x13F5}, - {0x13F8, 0x13FD}, - {0x1401, 0x166C}, - {0x166F, 0x167F}, - {0x1681, 0x169A}, - {0x16A0, 0x16EA}, - {0x16F1, 0x16F8}, - {0x1700, 0x1711}, - {0x171F, 0x1731}, - {0x1740, 0x1751}, - {0x1760, 0x176C}, - {0x176E, 0x1770}, - {0x1780, 0x17B3}, - {0x17D7, 0x17D7}, - {0x17DC, 0x17DC}, - {0x1820, 0x1878}, - {0x1880, 0x1884}, - {0x1887, 0x18A8}, - {0x18AA, 0x18AA}, - {0x18B0, 0x18F5}, - {0x1900, 0x191E}, - {0x1950, 0x196D}, - {0x1970, 0x1974}, - {0x1980, 0x19AB}, - {0x19B0, 0x19C9}, - {0x1A00, 0x1A16}, - {0x1A20, 0x1A54}, - {0x1AA7, 0x1AA7}, - {0x1B05, 0x1B33}, - {0x1B45, 0x1B4C}, - {0x1B83, 0x1BA0}, - {0x1BAE, 0x1BAF}, - {0x1BBA, 0x1BE5}, - {0x1C00, 0x1C23}, - {0x1C4D, 0x1C4F}, - {0x1C5A, 0x1C7D}, - {0x1C80, 0x1C8A}, - {0x1C90, 0x1CBA}, - {0x1CBD, 0x1CBF}, - {0x1CE9, 0x1CEC}, - {0x1CEE, 0x1CF3}, - {0x1CF5, 0x1CF6}, - {0x1CFA, 0x1CFA}, - {0x1D00, 0x1DBF}, - {0x1E00, 
0x1F15}, - {0x1F18, 0x1F1D}, - {0x1F20, 0x1F45}, - {0x1F48, 0x1F4D}, - {0x1F50, 0x1F57}, - {0x1F59, 0x1F59}, - {0x1F5B, 0x1F5B}, - {0x1F5D, 0x1F5D}, - {0x1F5F, 0x1F7D}, - {0x1F80, 0x1FB4}, - {0x1FB6, 0x1FBC}, - {0x1FBE, 0x1FBE}, - {0x1FC2, 0x1FC4}, - {0x1FC6, 0x1FCC}, - {0x1FD0, 0x1FD3}, - {0x1FD6, 0x1FDB}, - {0x1FE0, 0x1FEC}, - {0x1FF2, 0x1FF4}, - {0x1FF6, 0x1FFC}, - {0x2071, 0x2071}, - {0x207F, 0x207F}, - {0x2090, 0x209C}, - {0x2102, 0x2102}, - {0x2107, 0x2107}, - {0x210A, 0x2113}, - {0x2115, 0x2115}, - {0x2119, 0x211D}, - {0x2124, 0x2124}, - {0x2126, 0x2126}, - {0x2128, 0x2128}, - {0x212A, 0x212D}, - {0x212F, 0x2139}, - {0x213C, 0x213F}, - {0x2145, 0x2149}, - {0x214E, 0x214E}, - {0x2183, 0x2184}, - {0x2C00, 0x2CE4}, - {0x2CEB, 0x2CEE}, - {0x2CF2, 0x2CF3}, - {0x2D00, 0x2D25}, - {0x2D27, 0x2D27}, - {0x2D2D, 0x2D2D}, - {0x2D30, 0x2D67}, - {0x2D6F, 0x2D6F}, - {0x2D80, 0x2D96}, - {0x2DA0, 0x2DA6}, - {0x2DA8, 0x2DAE}, - {0x2DB0, 0x2DB6}, - {0x2DB8, 0x2DBE}, - {0x2DC0, 0x2DC6}, - {0x2DC8, 0x2DCE}, - {0x2DD0, 0x2DD6}, - {0x2DD8, 0x2DDE}, - {0x2E2F, 0x2E2F}, - {0x3005, 0x3006}, - {0x3031, 0x3035}, - {0x303B, 0x303C}, - {0x3041, 0x3096}, - {0x309D, 0x309F}, - {0x30A1, 0x30FA}, - {0x30FC, 0x30FF}, - {0x3105, 0x312F}, - {0x3131, 0x318E}, - {0x31A0, 0x31BF}, - {0x31F0, 0x31FF}, - {0x3400, 0x4DBF}, - {0x4E00, 0xA48C}, - {0xA4D0, 0xA4FD}, - {0xA500, 0xA60C}, - {0xA610, 0xA61F}, - {0xA62A, 0xA62B}, - {0xA640, 0xA66E}, - {0xA67F, 0xA69D}, - {0xA6A0, 0xA6E5}, - {0xA717, 0xA71F}, - {0xA722, 0xA788}, - {0xA78B, 0xA7DC}, - {0xA7F1, 0xA801}, - {0xA803, 0xA805}, - {0xA807, 0xA80A}, - {0xA80C, 0xA822}, - {0xA840, 0xA873}, - {0xA882, 0xA8B3}, - {0xA8F2, 0xA8F7}, - {0xA8FB, 0xA8FB}, - {0xA8FD, 0xA8FE}, - {0xA90A, 0xA925}, - {0xA930, 0xA946}, - {0xA960, 0xA97C}, - {0xA984, 0xA9B2}, - {0xA9CF, 0xA9CF}, - {0xA9E0, 0xA9E4}, - {0xA9E6, 0xA9EF}, - {0xA9FA, 0xA9FE}, - {0xAA00, 0xAA28}, - {0xAA40, 0xAA42}, - {0xAA44, 0xAA4B}, - {0xAA60, 0xAA76}, - {0xAA7A, 0xAA7A}, - {0xAA7E, 0xAAAF}, - {0xAAB1, 
0xAAB1}, - {0xAAB5, 0xAAB6}, - {0xAAB9, 0xAABD}, - {0xAAC0, 0xAAC0}, - {0xAAC2, 0xAAC2}, - {0xAADB, 0xAADD}, - {0xAAE0, 0xAAEA}, - {0xAAF2, 0xAAF4}, - {0xAB01, 0xAB06}, - {0xAB09, 0xAB0E}, - {0xAB11, 0xAB16}, - {0xAB20, 0xAB26}, - {0xAB28, 0xAB2E}, - {0xAB30, 0xAB5A}, - {0xAB5C, 0xAB69}, - {0xAB70, 0xABE2}, - {0xAC00, 0xD7A3}, - {0xD7B0, 0xD7C6}, - {0xD7CB, 0xD7FB}, - {0xF900, 0xFA6D}, - {0xFA70, 0xFAD9}, - {0xFB00, 0xFB06}, - {0xFB13, 0xFB17}, - {0xFB1D, 0xFB1D}, - {0xFB1F, 0xFB28}, - {0xFB2A, 0xFB36}, - {0xFB38, 0xFB3C}, - {0xFB3E, 0xFB3E}, - {0xFB40, 0xFB41}, - {0xFB43, 0xFB44}, - {0xFB46, 0xFBB1}, - {0xFBD3, 0xFD3D}, - {0xFD50, 0xFD8F}, - {0xFD92, 0xFDC7}, - {0xFDF0, 0xFDFB}, - {0xFE70, 0xFE74}, - {0xFE76, 0xFEFC}, - {0xFF21, 0xFF3A}, - {0xFF41, 0xFF5A}, - {0xFF66, 0xFFBE}, - {0xFFC2, 0xFFC7}, - {0xFFCA, 0xFFCF}, - {0xFFD2, 0xFFD7}, - {0xFFDA, 0xFFDC}, - {0x10000, 0x1000B}, - {0x1000D, 0x10026}, - {0x10028, 0x1003A}, - {0x1003C, 0x1003D}, - {0x1003F, 0x1004D}, - {0x10050, 0x1005D}, - {0x10080, 0x100FA}, - {0x10280, 0x1029C}, - {0x102A0, 0x102D0}, - {0x10300, 0x1031F}, - {0x1032D, 0x10340}, - {0x10342, 0x10349}, - {0x10350, 0x10375}, - {0x10380, 0x1039D}, - {0x103A0, 0x103C3}, - {0x103C8, 0x103CF}, - {0x10400, 0x1049D}, - {0x104B0, 0x104D3}, - {0x104D8, 0x104FB}, - {0x10500, 0x10527}, - {0x10530, 0x10563}, - {0x10570, 0x1057A}, - {0x1057C, 0x1058A}, - {0x1058C, 0x10592}, - {0x10594, 0x10595}, - {0x10597, 0x105A1}, - {0x105A3, 0x105B1}, - {0x105B3, 0x105B9}, - {0x105BB, 0x105BC}, - {0x105C0, 0x105F3}, - {0x10600, 0x10736}, - {0x10740, 0x10755}, - {0x10760, 0x10767}, - {0x10780, 0x10785}, - {0x10787, 0x107B0}, - {0x107B2, 0x107BA}, - {0x10800, 0x10805}, - {0x10808, 0x10808}, - {0x1080A, 0x10835}, - {0x10837, 0x10838}, - {0x1083C, 0x1083C}, - {0x1083F, 0x10855}, - {0x10860, 0x10876}, - {0x10880, 0x1089E}, - {0x108E0, 0x108F2}, - {0x108F4, 0x108F5}, - {0x10900, 0x10915}, - {0x10920, 0x10939}, - {0x10940, 0x10959}, - {0x10980, 0x109B7}, - {0x109BE, 0x109BF}, - 
{0x10A00, 0x10A00}, - {0x10A10, 0x10A13}, - {0x10A15, 0x10A17}, - {0x10A19, 0x10A35}, - {0x10A60, 0x10A7C}, - {0x10A80, 0x10A9C}, - {0x10AC0, 0x10AC7}, - {0x10AC9, 0x10AE4}, - {0x10B00, 0x10B35}, - {0x10B40, 0x10B55}, - {0x10B60, 0x10B72}, - {0x10B80, 0x10B91}, - {0x10C00, 0x10C48}, - {0x10C80, 0x10CB2}, - {0x10CC0, 0x10CF2}, - {0x10D00, 0x10D23}, - {0x10D4A, 0x10D65}, - {0x10D6F, 0x10D85}, - {0x10E80, 0x10EA9}, - {0x10EB0, 0x10EB1}, - {0x10EC2, 0x10EC7}, - {0x10F00, 0x10F1C}, - {0x10F27, 0x10F27}, - {0x10F30, 0x10F45}, - {0x10F70, 0x10F81}, - {0x10FB0, 0x10FC4}, - {0x10FE0, 0x10FF6}, - {0x11003, 0x11037}, - {0x11071, 0x11072}, - {0x11075, 0x11075}, - {0x11083, 0x110AF}, - {0x110D0, 0x110E8}, - {0x11103, 0x11126}, - {0x11144, 0x11144}, - {0x11147, 0x11147}, - {0x11150, 0x11172}, - {0x11176, 0x11176}, - {0x11183, 0x111B2}, - {0x111C1, 0x111C4}, - {0x111DA, 0x111DA}, - {0x111DC, 0x111DC}, - {0x11200, 0x11211}, - {0x11213, 0x1122B}, - {0x1123F, 0x11240}, - {0x11280, 0x11286}, - {0x11288, 0x11288}, - {0x1128A, 0x1128D}, - {0x1128F, 0x1129D}, - {0x1129F, 0x112A8}, - {0x112B0, 0x112DE}, - {0x11305, 0x1130C}, - {0x1130F, 0x11310}, - {0x11313, 0x11328}, - {0x1132A, 0x11330}, - {0x11332, 0x11333}, - {0x11335, 0x11339}, - {0x1133D, 0x1133D}, - {0x11350, 0x11350}, - {0x1135D, 0x11361}, - {0x11380, 0x11389}, - {0x1138B, 0x1138B}, - {0x1138E, 0x1138E}, - {0x11390, 0x113B5}, - {0x113B7, 0x113B7}, - {0x113D1, 0x113D1}, - {0x113D3, 0x113D3}, - {0x11400, 0x11434}, - {0x11447, 0x1144A}, - {0x1145F, 0x11461}, - {0x11480, 0x114AF}, - {0x114C4, 0x114C5}, - {0x114C7, 0x114C7}, - {0x11580, 0x115AE}, - {0x115D8, 0x115DB}, - {0x11600, 0x1162F}, - {0x11644, 0x11644}, - {0x11680, 0x116AA}, - {0x116B8, 0x116B8}, - {0x11700, 0x1171A}, - {0x11740, 0x11746}, - {0x11800, 0x1182B}, - {0x118A0, 0x118DF}, - {0x118FF, 0x11906}, - {0x11909, 0x11909}, - {0x1190C, 0x11913}, - {0x11915, 0x11916}, - {0x11918, 0x1192F}, - {0x1193F, 0x1193F}, - {0x11941, 0x11941}, - {0x119A0, 0x119A7}, - {0x119AA, 0x119D0}, 
- {0x119E1, 0x119E1}, - {0x119E3, 0x119E3}, - {0x11A00, 0x11A00}, - {0x11A0B, 0x11A32}, - {0x11A3A, 0x11A3A}, - {0x11A50, 0x11A50}, - {0x11A5C, 0x11A89}, - {0x11A9D, 0x11A9D}, - {0x11AB0, 0x11AF8}, - {0x11BC0, 0x11BE0}, - {0x11C00, 0x11C08}, - {0x11C0A, 0x11C2E}, - {0x11C40, 0x11C40}, - {0x11C72, 0x11C8F}, - {0x11D00, 0x11D06}, - {0x11D08, 0x11D09}, - {0x11D0B, 0x11D30}, - {0x11D46, 0x11D46}, - {0x11D60, 0x11D65}, - {0x11D67, 0x11D68}, - {0x11D6A, 0x11D89}, - {0x11D98, 0x11D98}, - {0x11DB0, 0x11DDB}, - {0x11EE0, 0x11EF2}, - {0x11F02, 0x11F02}, - {0x11F04, 0x11F10}, - {0x11F12, 0x11F33}, - {0x11FB0, 0x11FB0}, - {0x12000, 0x12399}, - {0x12480, 0x12543}, - {0x12F90, 0x12FF0}, - {0x13000, 0x1342F}, - {0x13441, 0x13446}, - {0x13460, 0x143FA}, - {0x14400, 0x14646}, - {0x16100, 0x1611D}, - {0x16800, 0x16A38}, - {0x16A40, 0x16A5E}, - {0x16A70, 0x16ABE}, - {0x16AD0, 0x16AED}, - {0x16B00, 0x16B2F}, - {0x16B40, 0x16B43}, - {0x16B63, 0x16B77}, - {0x16B7D, 0x16B8F}, - {0x16D40, 0x16D6C}, - {0x16E40, 0x16E7F}, - {0x16EA0, 0x16EB8}, - {0x16EBB, 0x16ED3}, - {0x16F00, 0x16F4A}, - {0x16F50, 0x16F50}, - {0x16F93, 0x16F9F}, - {0x16FE0, 0x16FE1}, - {0x16FE3, 0x16FE3}, - {0x16FF2, 0x16FF3}, - {0x17000, 0x18CD5}, - {0x18CFF, 0x18D1E}, - {0x18D80, 0x18DF2}, - {0x1AFF0, 0x1AFF3}, - {0x1AFF5, 0x1AFFB}, - {0x1AFFD, 0x1AFFE}, - {0x1B000, 0x1B122}, - {0x1B132, 0x1B132}, - {0x1B150, 0x1B152}, - {0x1B155, 0x1B155}, - {0x1B164, 0x1B167}, - {0x1B170, 0x1B2FB}, - {0x1BC00, 0x1BC6A}, - {0x1BC70, 0x1BC7C}, - {0x1BC80, 0x1BC88}, - {0x1BC90, 0x1BC99}, - {0x1D400, 0x1D454}, - {0x1D456, 0x1D49C}, - {0x1D49E, 0x1D49F}, - {0x1D4A2, 0x1D4A2}, - {0x1D4A5, 0x1D4A6}, - {0x1D4A9, 0x1D4AC}, - {0x1D4AE, 0x1D4B9}, - {0x1D4BB, 0x1D4BB}, - {0x1D4BD, 0x1D4C3}, - {0x1D4C5, 0x1D505}, - {0x1D507, 0x1D50A}, - {0x1D50D, 0x1D514}, - {0x1D516, 0x1D51C}, - {0x1D51E, 0x1D539}, - {0x1D53B, 0x1D53E}, - {0x1D540, 0x1D544}, - {0x1D546, 0x1D546}, - {0x1D54A, 0x1D550}, - {0x1D552, 0x1D6A5}, - {0x1D6A8, 0x1D6C0}, - {0x1D6C2, 
0x1D6DA}, - {0x1D6DC, 0x1D6FA}, - {0x1D6FC, 0x1D714}, - {0x1D716, 0x1D734}, - {0x1D736, 0x1D74E}, - {0x1D750, 0x1D76E}, - {0x1D770, 0x1D788}, - {0x1D78A, 0x1D7A8}, - {0x1D7AA, 0x1D7C2}, - {0x1D7C4, 0x1D7CB}, - {0x1DF00, 0x1DF1E}, - {0x1DF25, 0x1DF2A}, - {0x1E030, 0x1E06D}, - {0x1E100, 0x1E12C}, - {0x1E137, 0x1E13D}, - {0x1E14E, 0x1E14E}, - {0x1E290, 0x1E2AD}, - {0x1E2C0, 0x1E2EB}, - {0x1E4D0, 0x1E4EB}, - {0x1E5D0, 0x1E5ED}, - {0x1E5F0, 0x1E5F0}, - {0x1E6C0, 0x1E6DE}, - {0x1E6E0, 0x1E6E2}, - {0x1E6E4, 0x1E6E5}, - {0x1E6E7, 0x1E6ED}, - {0x1E6F0, 0x1E6F4}, - {0x1E6FE, 0x1E6FF}, - {0x1E7E0, 0x1E7E6}, - {0x1E7E8, 0x1E7EB}, - {0x1E7ED, 0x1E7EE}, - {0x1E7F0, 0x1E7FE}, - {0x1E800, 0x1E8C4}, - {0x1E900, 0x1E943}, - {0x1E94B, 0x1E94B}, - {0x1EE00, 0x1EE03}, - {0x1EE05, 0x1EE1F}, - {0x1EE21, 0x1EE22}, - {0x1EE24, 0x1EE24}, - {0x1EE27, 0x1EE27}, - {0x1EE29, 0x1EE32}, - {0x1EE34, 0x1EE37}, - {0x1EE39, 0x1EE39}, - {0x1EE3B, 0x1EE3B}, - {0x1EE42, 0x1EE42}, - {0x1EE47, 0x1EE47}, - {0x1EE49, 0x1EE49}, - {0x1EE4B, 0x1EE4B}, - {0x1EE4D, 0x1EE4F}, - {0x1EE51, 0x1EE52}, - {0x1EE54, 0x1EE54}, - {0x1EE57, 0x1EE57}, - {0x1EE59, 0x1EE59}, - {0x1EE5B, 0x1EE5B}, - {0x1EE5D, 0x1EE5D}, - {0x1EE5F, 0x1EE5F}, - {0x1EE61, 0x1EE62}, - {0x1EE64, 0x1EE64}, - {0x1EE67, 0x1EE6A}, - {0x1EE6C, 0x1EE72}, - {0x1EE74, 0x1EE77}, - {0x1EE79, 0x1EE7C}, - {0x1EE7E, 0x1EE7E}, - {0x1EE80, 0x1EE89}, - {0x1EE8B, 0x1EE9B}, - {0x1EEA1, 0x1EEA3}, - {0x1EEA5, 0x1EEA9}, - {0x1EEAB, 0x1EEBB}, - {0x20000, 0x2A6DF}, - {0x2A700, 0x2B81D}, - {0x2B820, 0x2CEAD}, - {0x2CEB0, 0x2EBE0}, - {0x2EBF0, 0x2EE5D}, - {0x2F800, 0x2FA1D}, - {0x30000, 0x3134A}, - {0x31350, 0x33479}, - }; - - for (const auto& r : ranges) { - if (ch >= r.start && ch <= r.end) - return true; - } - return false; -} - -bool is_space(char32_t cp) { - switch (cp) { - case 0x0009: // TAB \t - case 0x000A: // LF \n - case 0x000B: // VT - case 0x000C: // FF - case 0x000D: // CR \r - case 0x0020: // Space - case 0x00A0: // No-Break Space - case 0x1680: // Ogham 
Space Mark - case 0x2000: // En Quad - case 0x2001: // Em Quad - case 0x2002: // En Space - case 0x2003: // Em Space - case 0x2004: // Three-Per-Em Space - case 0x2005: // Four-Per-Em Space - case 0x2006: // Six-Per-Em Space - case 0x2007: // Figure Space - case 0x2008: // Punctuation Space - case 0x2009: // Thin Space - case 0x200A: // Hair Space - case 0x202F: // Narrow No-Break Space - case 0x205F: // Medium Mathematical Space - case 0x3000: // Ideographic Space - return true; - default: - return false; - } -} - -std::string str_to_lower(const std::string& input) { - std::string result = input; - std::transform(result.begin(), result.end(), result.begin(), - [](unsigned char c) { return std::tolower(c); }); - return result; -} - -// UTF-8 -> Unicode code points -std::vector utf8_to_codepoints(const std::string& str) { - std::vector codepoints; - size_t i = 0; - while (i < str.size()) { - unsigned char c = str[i]; - char32_t cp = 0; - size_t extra_bytes = 0; - - if ((c & 0x80) == 0) - cp = c; - else if ((c & 0xE0) == 0xC0) { - cp = c & 0x1F; - extra_bytes = 1; - } else if ((c & 0xF0) == 0xE0) { - cp = c & 0x0F; - extra_bytes = 2; - } else if ((c & 0xF8) == 0xF0) { - cp = c & 0x07; - extra_bytes = 3; - } else { - ++i; - continue; - } // Invalid UTF-8 - - if (i + extra_bytes >= str.size()) - break; - - for (size_t j = 1; j <= extra_bytes; ++j) - cp = (cp << 6) | (str[i + j] & 0x3F); - - codepoints.push_back(cp); - i += 1 + extra_bytes; - } - return codepoints; -} - -// Unicode code point -> UTF-8 -std::string codepoint_to_utf8(char32_t cp) { - std::string out; - if (cp <= 0x7F) - out.push_back(static_cast(cp)); - else if (cp <= 0x7FF) { - out.push_back(static_cast(0xC0 | (cp >> 6))); - out.push_back(static_cast(0x80 | (cp & 0x3F))); - } else if (cp <= 0xFFFF) { - out.push_back(static_cast(0xE0 | (cp >> 12))); - out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); - out.push_back(static_cast(0x80 | (cp & 0x3F))); - } else { - out.push_back(static_cast(0xF0 | (cp 
>> 18))); - out.push_back(static_cast(0x80 | ((cp >> 12) & 0x3F))); - out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); - out.push_back(static_cast(0x80 | (cp & 0x3F))); - } - return out; -} - -bool starts_with(const std::vector& text, - const std::vector& prefix, - std::size_t index) { - if (index > text.size()) { - return false; - } - if (prefix.size() > text.size() - index) { - return false; - } - return std::equal(prefix.begin(), prefix.end(), text.begin() + index); -} - -// mistral: [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+ -// qwen2: (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+ -std::vector token_split(const std::string& text) { - std::vector tokens; - auto cps = utf8_to_codepoints(text); - size_t i = 0; - - while (i < cps.size()) { - char32_t cp = cps[i]; - - // `(?i:'s|'t|'re|'ve|'m|'ll|'d)` - if (cp == U'\'' && i + 1 < cps.size()) { - std::string next = str_to_lower(codepoint_to_utf8(cps[i + 1])); - if (next == "s" || next == "t" || next == "m") { - tokens.push_back("'" + next); - i += 2; - continue; - } - if (i + 2 < cps.size()) { - next += str_to_lower(codepoint_to_utf8(cps[i + 2])); - if (next == "re" || next == "ve" || next == "ll" || next == "d") { - tokens.push_back("'" + next); - i += 3; - continue; - } - } - } - - // `\p{N}` - if (is_number(cp)) { - tokens.push_back(codepoint_to_utf8(cp)); - ++i; - continue; - } - - // `[^\r\n\p{L}\p{N}]?\p{L}+` - { - // `[^\r\n\p{L}\p{N}]\p{L}+` - if (!is_letter(cp) && cp != U'\r' && cp != U'\n' && i + 1 < cps.size() && is_letter(cps[i + 1])) { - std::string token = codepoint_to_utf8(cp); - ++i; - - while (i < cps.size() && is_letter(cps[i])) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - tokens.push_back(token); - continue; - } - - // `\p{L}+` - if (is_letter(cp)) { - 
std::string token = codepoint_to_utf8(cp); - ++i; - while (i < cps.size() && is_letter(cps[i])) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - tokens.push_back(token); - continue; - } - } - - // ` ?[^\s\p{L}\p{N}]+[\r\n]*` - { - // ` [^\s\p{L}\p{N}]+[\r\n]*` - if (cp == U' ' && i + 1 < cps.size() && !isspace(cps[i + 1]) && !is_letter(cps[i + 1]) && !is_number(cps[i + 1])) { - std::string token = codepoint_to_utf8(cp); - token += codepoint_to_utf8(cps[i + 1]); - i += 2; - - while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - - while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - - tokens.push_back(token); - continue; - } - - // `[^\s\p{L}\p{N}]+[\r\n]*` - std::string token; - if (!is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { - std::string token = codepoint_to_utf8(cp); - ++i; - - while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - - while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - - tokens.push_back(token); - continue; - } - } - - // `\s*[\r\n]+|\s+(?!\S)|\s+` - if (is_space(cp)) { - std::string token; - bool saw_new_line = false; - - while (i < cps.size() && is_space(cps[i])) { - token += codepoint_to_utf8(cps[i]); - - if (cps[i] == U'\r' || cps[i] == U'\n') { - saw_new_line = true; - } else { - if (saw_new_line) { - break; - } - } - - ++i; - } - - tokens.push_back(token); - continue; - } - - // skip - ++i; - } - - return tokens; -} - -std::vector split_with_special_tokens( - const std::string& text, - const std::vector& special_tokens) { - std::vector result; - size_t pos = 0; - size_t text_len = text.size(); - - while (pos < text_len) { - size_t next_pos = text_len; - std::string matched_token; - - for (const auto& token : special_tokens) 
{ - size_t token_pos = text.find(token, pos); - if (token_pos != std::string::npos && token_pos < next_pos) { - next_pos = token_pos; - matched_token = token; - } - } - - if (next_pos > pos) { - result.push_back(text.substr(pos, next_pos - pos)); - } - - if (!matched_token.empty()) { - result.push_back(matched_token); - pos = next_pos + matched_token.size(); - } else { - break; - } - } - - return result; -} - -// int main() { -// std::string text = "I'm testing C++ token_split function. 你好,世界! 123"; -// auto tokens = token_split(text); - -// for (const auto& t : tokens) { -// std::cout << "[" << t << "] "; -// } -// std::cout << "\n"; -// return 0; -// } +#include +#include +#include +#include + +#include "tokenize_util.h" + +bool is_number(char32_t ch) { + return (ch >= U'0' && ch <= U'9'); +} + +bool is_letter(char32_t ch) { + static const struct { char32_t start, end; } ranges[] = { + {0x41, 0x5A}, + {0x61, 0x7A}, + {0xAA, 0xAA}, + {0xB5, 0xB5}, + {0xBA, 0xBA}, + {0xC0, 0xD6}, + {0xD8, 0xF6}, + {0xF8, 0x2C1}, + {0x2C6, 0x2D1}, + {0x2E0, 0x2E4}, + {0x2EC, 0x2EC}, + {0x2EE, 0x2EE}, + {0x370, 0x374}, + {0x376, 0x377}, + {0x37A, 0x37D}, + {0x37F, 0x37F}, + {0x386, 0x386}, + {0x388, 0x38A}, + {0x38C, 0x38C}, + {0x38E, 0x3A1}, + {0x3A3, 0x3F5}, + {0x3F7, 0x481}, + {0x48A, 0x52F}, + {0x531, 0x556}, + {0x559, 0x559}, + {0x560, 0x588}, + {0x5D0, 0x5EA}, + {0x5EF, 0x5F2}, + {0x620, 0x64A}, + {0x66E, 0x66F}, + {0x671, 0x6D3}, + {0x6D5, 0x6D5}, + {0x6E5, 0x6E6}, + {0x6EE, 0x6EF}, + {0x6FA, 0x6FC}, + {0x6FF, 0x6FF}, + {0x710, 0x710}, + {0x712, 0x72F}, + {0x74D, 0x7A5}, + {0x7B1, 0x7B1}, + {0x7CA, 0x7EA}, + {0x7F4, 0x7F5}, + {0x7FA, 0x7FA}, + {0x800, 0x815}, + {0x81A, 0x81A}, + {0x824, 0x824}, + {0x828, 0x828}, + {0x840, 0x858}, + {0x860, 0x86A}, + {0x870, 0x887}, + {0x889, 0x88F}, + {0x8A0, 0x8C9}, + {0x904, 0x939}, + {0x93D, 0x93D}, + {0x950, 0x950}, + {0x958, 0x961}, + {0x971, 0x980}, + {0x985, 0x98C}, + {0x98F, 0x990}, + {0x993, 0x9A8}, + {0x9AA, 0x9B0}, + {0x9B2, 0x9B2}, 
+ {0x9B6, 0x9B9}, + {0x9BD, 0x9BD}, + {0x9CE, 0x9CE}, + {0x9DC, 0x9DD}, + {0x9DF, 0x9E1}, + {0x9F0, 0x9F1}, + {0x9FC, 0x9FC}, + {0xA05, 0xA0A}, + {0xA0F, 0xA10}, + {0xA13, 0xA28}, + {0xA2A, 0xA30}, + {0xA32, 0xA33}, + {0xA35, 0xA36}, + {0xA38, 0xA39}, + {0xA59, 0xA5C}, + {0xA5E, 0xA5E}, + {0xA72, 0xA74}, + {0xA85, 0xA8D}, + {0xA8F, 0xA91}, + {0xA93, 0xAA8}, + {0xAAA, 0xAB0}, + {0xAB2, 0xAB3}, + {0xAB5, 0xAB9}, + {0xABD, 0xABD}, + {0xAD0, 0xAD0}, + {0xAE0, 0xAE1}, + {0xAF9, 0xAF9}, + {0xB05, 0xB0C}, + {0xB0F, 0xB10}, + {0xB13, 0xB28}, + {0xB2A, 0xB30}, + {0xB32, 0xB33}, + {0xB35, 0xB39}, + {0xB3D, 0xB3D}, + {0xB5C, 0xB5D}, + {0xB5F, 0xB61}, + {0xB71, 0xB71}, + {0xB83, 0xB83}, + {0xB85, 0xB8A}, + {0xB8E, 0xB90}, + {0xB92, 0xB95}, + {0xB99, 0xB9A}, + {0xB9C, 0xB9C}, + {0xB9E, 0xB9F}, + {0xBA3, 0xBA4}, + {0xBA8, 0xBAA}, + {0xBAE, 0xBB9}, + {0xBD0, 0xBD0}, + {0xC05, 0xC0C}, + {0xC0E, 0xC10}, + {0xC12, 0xC28}, + {0xC2A, 0xC39}, + {0xC3D, 0xC3D}, + {0xC58, 0xC5A}, + {0xC5C, 0xC5D}, + {0xC60, 0xC61}, + {0xC80, 0xC80}, + {0xC85, 0xC8C}, + {0xC8E, 0xC90}, + {0xC92, 0xCA8}, + {0xCAA, 0xCB3}, + {0xCB5, 0xCB9}, + {0xCBD, 0xCBD}, + {0xCDC, 0xCDE}, + {0xCE0, 0xCE1}, + {0xCF1, 0xCF2}, + {0xD04, 0xD0C}, + {0xD0E, 0xD10}, + {0xD12, 0xD3A}, + {0xD3D, 0xD3D}, + {0xD4E, 0xD4E}, + {0xD54, 0xD56}, + {0xD5F, 0xD61}, + {0xD7A, 0xD7F}, + {0xD85, 0xD96}, + {0xD9A, 0xDB1}, + {0xDB3, 0xDBB}, + {0xDBD, 0xDBD}, + {0xDC0, 0xDC6}, + {0xE01, 0xE30}, + {0xE32, 0xE33}, + {0xE40, 0xE46}, + {0xE81, 0xE82}, + {0xE84, 0xE84}, + {0xE86, 0xE8A}, + {0xE8C, 0xEA3}, + {0xEA5, 0xEA5}, + {0xEA7, 0xEB0}, + {0xEB2, 0xEB3}, + {0xEBD, 0xEBD}, + {0xEC0, 0xEC4}, + {0xEC6, 0xEC6}, + {0xEDC, 0xEDF}, + {0xF00, 0xF00}, + {0xF40, 0xF47}, + {0xF49, 0xF6C}, + {0xF88, 0xF8C}, + {0x1000, 0x102A}, + {0x103F, 0x103F}, + {0x1050, 0x1055}, + {0x105A, 0x105D}, + {0x1061, 0x1061}, + {0x1065, 0x1066}, + {0x106E, 0x1070}, + {0x1075, 0x1081}, + {0x108E, 0x108E}, + {0x10A0, 0x10C5}, + {0x10C7, 0x10C7}, + {0x10CD, 0x10CD}, + {0x10D0, 
0x10FA}, + {0x10FC, 0x1248}, + {0x124A, 0x124D}, + {0x1250, 0x1256}, + {0x1258, 0x1258}, + {0x125A, 0x125D}, + {0x1260, 0x1288}, + {0x128A, 0x128D}, + {0x1290, 0x12B0}, + {0x12B2, 0x12B5}, + {0x12B8, 0x12BE}, + {0x12C0, 0x12C0}, + {0x12C2, 0x12C5}, + {0x12C8, 0x12D6}, + {0x12D8, 0x1310}, + {0x1312, 0x1315}, + {0x1318, 0x135A}, + {0x1380, 0x138F}, + {0x13A0, 0x13F5}, + {0x13F8, 0x13FD}, + {0x1401, 0x166C}, + {0x166F, 0x167F}, + {0x1681, 0x169A}, + {0x16A0, 0x16EA}, + {0x16F1, 0x16F8}, + {0x1700, 0x1711}, + {0x171F, 0x1731}, + {0x1740, 0x1751}, + {0x1760, 0x176C}, + {0x176E, 0x1770}, + {0x1780, 0x17B3}, + {0x17D7, 0x17D7}, + {0x17DC, 0x17DC}, + {0x1820, 0x1878}, + {0x1880, 0x1884}, + {0x1887, 0x18A8}, + {0x18AA, 0x18AA}, + {0x18B0, 0x18F5}, + {0x1900, 0x191E}, + {0x1950, 0x196D}, + {0x1970, 0x1974}, + {0x1980, 0x19AB}, + {0x19B0, 0x19C9}, + {0x1A00, 0x1A16}, + {0x1A20, 0x1A54}, + {0x1AA7, 0x1AA7}, + {0x1B05, 0x1B33}, + {0x1B45, 0x1B4C}, + {0x1B83, 0x1BA0}, + {0x1BAE, 0x1BAF}, + {0x1BBA, 0x1BE5}, + {0x1C00, 0x1C23}, + {0x1C4D, 0x1C4F}, + {0x1C5A, 0x1C7D}, + {0x1C80, 0x1C8A}, + {0x1C90, 0x1CBA}, + {0x1CBD, 0x1CBF}, + {0x1CE9, 0x1CEC}, + {0x1CEE, 0x1CF3}, + {0x1CF5, 0x1CF6}, + {0x1CFA, 0x1CFA}, + {0x1D00, 0x1DBF}, + {0x1E00, 0x1F15}, + {0x1F18, 0x1F1D}, + {0x1F20, 0x1F45}, + {0x1F48, 0x1F4D}, + {0x1F50, 0x1F57}, + {0x1F59, 0x1F59}, + {0x1F5B, 0x1F5B}, + {0x1F5D, 0x1F5D}, + {0x1F5F, 0x1F7D}, + {0x1F80, 0x1FB4}, + {0x1FB6, 0x1FBC}, + {0x1FBE, 0x1FBE}, + {0x1FC2, 0x1FC4}, + {0x1FC6, 0x1FCC}, + {0x1FD0, 0x1FD3}, + {0x1FD6, 0x1FDB}, + {0x1FE0, 0x1FEC}, + {0x1FF2, 0x1FF4}, + {0x1FF6, 0x1FFC}, + {0x2071, 0x2071}, + {0x207F, 0x207F}, + {0x2090, 0x209C}, + {0x2102, 0x2102}, + {0x2107, 0x2107}, + {0x210A, 0x2113}, + {0x2115, 0x2115}, + {0x2119, 0x211D}, + {0x2124, 0x2124}, + {0x2126, 0x2126}, + {0x2128, 0x2128}, + {0x212A, 0x212D}, + {0x212F, 0x2139}, + {0x213C, 0x213F}, + {0x2145, 0x2149}, + {0x214E, 0x214E}, + {0x2183, 0x2184}, + {0x2C00, 0x2CE4}, + {0x2CEB, 0x2CEE}, + {0x2CF2, 
0x2CF3}, + {0x2D00, 0x2D25}, + {0x2D27, 0x2D27}, + {0x2D2D, 0x2D2D}, + {0x2D30, 0x2D67}, + {0x2D6F, 0x2D6F}, + {0x2D80, 0x2D96}, + {0x2DA0, 0x2DA6}, + {0x2DA8, 0x2DAE}, + {0x2DB0, 0x2DB6}, + {0x2DB8, 0x2DBE}, + {0x2DC0, 0x2DC6}, + {0x2DC8, 0x2DCE}, + {0x2DD0, 0x2DD6}, + {0x2DD8, 0x2DDE}, + {0x2E2F, 0x2E2F}, + {0x3005, 0x3006}, + {0x3031, 0x3035}, + {0x303B, 0x303C}, + {0x3041, 0x3096}, + {0x309D, 0x309F}, + {0x30A1, 0x30FA}, + {0x30FC, 0x30FF}, + {0x3105, 0x312F}, + {0x3131, 0x318E}, + {0x31A0, 0x31BF}, + {0x31F0, 0x31FF}, + {0x3400, 0x4DBF}, + {0x4E00, 0xA48C}, + {0xA4D0, 0xA4FD}, + {0xA500, 0xA60C}, + {0xA610, 0xA61F}, + {0xA62A, 0xA62B}, + {0xA640, 0xA66E}, + {0xA67F, 0xA69D}, + {0xA6A0, 0xA6E5}, + {0xA717, 0xA71F}, + {0xA722, 0xA788}, + {0xA78B, 0xA7DC}, + {0xA7F1, 0xA801}, + {0xA803, 0xA805}, + {0xA807, 0xA80A}, + {0xA80C, 0xA822}, + {0xA840, 0xA873}, + {0xA882, 0xA8B3}, + {0xA8F2, 0xA8F7}, + {0xA8FB, 0xA8FB}, + {0xA8FD, 0xA8FE}, + {0xA90A, 0xA925}, + {0xA930, 0xA946}, + {0xA960, 0xA97C}, + {0xA984, 0xA9B2}, + {0xA9CF, 0xA9CF}, + {0xA9E0, 0xA9E4}, + {0xA9E6, 0xA9EF}, + {0xA9FA, 0xA9FE}, + {0xAA00, 0xAA28}, + {0xAA40, 0xAA42}, + {0xAA44, 0xAA4B}, + {0xAA60, 0xAA76}, + {0xAA7A, 0xAA7A}, + {0xAA7E, 0xAAAF}, + {0xAAB1, 0xAAB1}, + {0xAAB5, 0xAAB6}, + {0xAAB9, 0xAABD}, + {0xAAC0, 0xAAC0}, + {0xAAC2, 0xAAC2}, + {0xAADB, 0xAADD}, + {0xAAE0, 0xAAEA}, + {0xAAF2, 0xAAF4}, + {0xAB01, 0xAB06}, + {0xAB09, 0xAB0E}, + {0xAB11, 0xAB16}, + {0xAB20, 0xAB26}, + {0xAB28, 0xAB2E}, + {0xAB30, 0xAB5A}, + {0xAB5C, 0xAB69}, + {0xAB70, 0xABE2}, + {0xAC00, 0xD7A3}, + {0xD7B0, 0xD7C6}, + {0xD7CB, 0xD7FB}, + {0xF900, 0xFA6D}, + {0xFA70, 0xFAD9}, + {0xFB00, 0xFB06}, + {0xFB13, 0xFB17}, + {0xFB1D, 0xFB1D}, + {0xFB1F, 0xFB28}, + {0xFB2A, 0xFB36}, + {0xFB38, 0xFB3C}, + {0xFB3E, 0xFB3E}, + {0xFB40, 0xFB41}, + {0xFB43, 0xFB44}, + {0xFB46, 0xFBB1}, + {0xFBD3, 0xFD3D}, + {0xFD50, 0xFD8F}, + {0xFD92, 0xFDC7}, + {0xFDF0, 0xFDFB}, + {0xFE70, 0xFE74}, + {0xFE76, 0xFEFC}, + {0xFF21, 0xFF3A}, + {0xFF41, 
0xFF5A}, + {0xFF66, 0xFFBE}, + {0xFFC2, 0xFFC7}, + {0xFFCA, 0xFFCF}, + {0xFFD2, 0xFFD7}, + {0xFFDA, 0xFFDC}, + {0x10000, 0x1000B}, + {0x1000D, 0x10026}, + {0x10028, 0x1003A}, + {0x1003C, 0x1003D}, + {0x1003F, 0x1004D}, + {0x10050, 0x1005D}, + {0x10080, 0x100FA}, + {0x10280, 0x1029C}, + {0x102A0, 0x102D0}, + {0x10300, 0x1031F}, + {0x1032D, 0x10340}, + {0x10342, 0x10349}, + {0x10350, 0x10375}, + {0x10380, 0x1039D}, + {0x103A0, 0x103C3}, + {0x103C8, 0x103CF}, + {0x10400, 0x1049D}, + {0x104B0, 0x104D3}, + {0x104D8, 0x104FB}, + {0x10500, 0x10527}, + {0x10530, 0x10563}, + {0x10570, 0x1057A}, + {0x1057C, 0x1058A}, + {0x1058C, 0x10592}, + {0x10594, 0x10595}, + {0x10597, 0x105A1}, + {0x105A3, 0x105B1}, + {0x105B3, 0x105B9}, + {0x105BB, 0x105BC}, + {0x105C0, 0x105F3}, + {0x10600, 0x10736}, + {0x10740, 0x10755}, + {0x10760, 0x10767}, + {0x10780, 0x10785}, + {0x10787, 0x107B0}, + {0x107B2, 0x107BA}, + {0x10800, 0x10805}, + {0x10808, 0x10808}, + {0x1080A, 0x10835}, + {0x10837, 0x10838}, + {0x1083C, 0x1083C}, + {0x1083F, 0x10855}, + {0x10860, 0x10876}, + {0x10880, 0x1089E}, + {0x108E0, 0x108F2}, + {0x108F4, 0x108F5}, + {0x10900, 0x10915}, + {0x10920, 0x10939}, + {0x10940, 0x10959}, + {0x10980, 0x109B7}, + {0x109BE, 0x109BF}, + {0x10A00, 0x10A00}, + {0x10A10, 0x10A13}, + {0x10A15, 0x10A17}, + {0x10A19, 0x10A35}, + {0x10A60, 0x10A7C}, + {0x10A80, 0x10A9C}, + {0x10AC0, 0x10AC7}, + {0x10AC9, 0x10AE4}, + {0x10B00, 0x10B35}, + {0x10B40, 0x10B55}, + {0x10B60, 0x10B72}, + {0x10B80, 0x10B91}, + {0x10C00, 0x10C48}, + {0x10C80, 0x10CB2}, + {0x10CC0, 0x10CF2}, + {0x10D00, 0x10D23}, + {0x10D4A, 0x10D65}, + {0x10D6F, 0x10D85}, + {0x10E80, 0x10EA9}, + {0x10EB0, 0x10EB1}, + {0x10EC2, 0x10EC7}, + {0x10F00, 0x10F1C}, + {0x10F27, 0x10F27}, + {0x10F30, 0x10F45}, + {0x10F70, 0x10F81}, + {0x10FB0, 0x10FC4}, + {0x10FE0, 0x10FF6}, + {0x11003, 0x11037}, + {0x11071, 0x11072}, + {0x11075, 0x11075}, + {0x11083, 0x110AF}, + {0x110D0, 0x110E8}, + {0x11103, 0x11126}, + {0x11144, 0x11144}, + {0x11147, 
0x11147}, + {0x11150, 0x11172}, + {0x11176, 0x11176}, + {0x11183, 0x111B2}, + {0x111C1, 0x111C4}, + {0x111DA, 0x111DA}, + {0x111DC, 0x111DC}, + {0x11200, 0x11211}, + {0x11213, 0x1122B}, + {0x1123F, 0x11240}, + {0x11280, 0x11286}, + {0x11288, 0x11288}, + {0x1128A, 0x1128D}, + {0x1128F, 0x1129D}, + {0x1129F, 0x112A8}, + {0x112B0, 0x112DE}, + {0x11305, 0x1130C}, + {0x1130F, 0x11310}, + {0x11313, 0x11328}, + {0x1132A, 0x11330}, + {0x11332, 0x11333}, + {0x11335, 0x11339}, + {0x1133D, 0x1133D}, + {0x11350, 0x11350}, + {0x1135D, 0x11361}, + {0x11380, 0x11389}, + {0x1138B, 0x1138B}, + {0x1138E, 0x1138E}, + {0x11390, 0x113B5}, + {0x113B7, 0x113B7}, + {0x113D1, 0x113D1}, + {0x113D3, 0x113D3}, + {0x11400, 0x11434}, + {0x11447, 0x1144A}, + {0x1145F, 0x11461}, + {0x11480, 0x114AF}, + {0x114C4, 0x114C5}, + {0x114C7, 0x114C7}, + {0x11580, 0x115AE}, + {0x115D8, 0x115DB}, + {0x11600, 0x1162F}, + {0x11644, 0x11644}, + {0x11680, 0x116AA}, + {0x116B8, 0x116B8}, + {0x11700, 0x1171A}, + {0x11740, 0x11746}, + {0x11800, 0x1182B}, + {0x118A0, 0x118DF}, + {0x118FF, 0x11906}, + {0x11909, 0x11909}, + {0x1190C, 0x11913}, + {0x11915, 0x11916}, + {0x11918, 0x1192F}, + {0x1193F, 0x1193F}, + {0x11941, 0x11941}, + {0x119A0, 0x119A7}, + {0x119AA, 0x119D0}, + {0x119E1, 0x119E1}, + {0x119E3, 0x119E3}, + {0x11A00, 0x11A00}, + {0x11A0B, 0x11A32}, + {0x11A3A, 0x11A3A}, + {0x11A50, 0x11A50}, + {0x11A5C, 0x11A89}, + {0x11A9D, 0x11A9D}, + {0x11AB0, 0x11AF8}, + {0x11BC0, 0x11BE0}, + {0x11C00, 0x11C08}, + {0x11C0A, 0x11C2E}, + {0x11C40, 0x11C40}, + {0x11C72, 0x11C8F}, + {0x11D00, 0x11D06}, + {0x11D08, 0x11D09}, + {0x11D0B, 0x11D30}, + {0x11D46, 0x11D46}, + {0x11D60, 0x11D65}, + {0x11D67, 0x11D68}, + {0x11D6A, 0x11D89}, + {0x11D98, 0x11D98}, + {0x11DB0, 0x11DDB}, + {0x11EE0, 0x11EF2}, + {0x11F02, 0x11F02}, + {0x11F04, 0x11F10}, + {0x11F12, 0x11F33}, + {0x11FB0, 0x11FB0}, + {0x12000, 0x12399}, + {0x12480, 0x12543}, + {0x12F90, 0x12FF0}, + {0x13000, 0x1342F}, + {0x13441, 0x13446}, + {0x13460, 0x143FA}, + 
{0x14400, 0x14646}, + {0x16100, 0x1611D}, + {0x16800, 0x16A38}, + {0x16A40, 0x16A5E}, + {0x16A70, 0x16ABE}, + {0x16AD0, 0x16AED}, + {0x16B00, 0x16B2F}, + {0x16B40, 0x16B43}, + {0x16B63, 0x16B77}, + {0x16B7D, 0x16B8F}, + {0x16D40, 0x16D6C}, + {0x16E40, 0x16E7F}, + {0x16EA0, 0x16EB8}, + {0x16EBB, 0x16ED3}, + {0x16F00, 0x16F4A}, + {0x16F50, 0x16F50}, + {0x16F93, 0x16F9F}, + {0x16FE0, 0x16FE1}, + {0x16FE3, 0x16FE3}, + {0x16FF2, 0x16FF3}, + {0x17000, 0x18CD5}, + {0x18CFF, 0x18D1E}, + {0x18D80, 0x18DF2}, + {0x1AFF0, 0x1AFF3}, + {0x1AFF5, 0x1AFFB}, + {0x1AFFD, 0x1AFFE}, + {0x1B000, 0x1B122}, + {0x1B132, 0x1B132}, + {0x1B150, 0x1B152}, + {0x1B155, 0x1B155}, + {0x1B164, 0x1B167}, + {0x1B170, 0x1B2FB}, + {0x1BC00, 0x1BC6A}, + {0x1BC70, 0x1BC7C}, + {0x1BC80, 0x1BC88}, + {0x1BC90, 0x1BC99}, + {0x1D400, 0x1D454}, + {0x1D456, 0x1D49C}, + {0x1D49E, 0x1D49F}, + {0x1D4A2, 0x1D4A2}, + {0x1D4A5, 0x1D4A6}, + {0x1D4A9, 0x1D4AC}, + {0x1D4AE, 0x1D4B9}, + {0x1D4BB, 0x1D4BB}, + {0x1D4BD, 0x1D4C3}, + {0x1D4C5, 0x1D505}, + {0x1D507, 0x1D50A}, + {0x1D50D, 0x1D514}, + {0x1D516, 0x1D51C}, + {0x1D51E, 0x1D539}, + {0x1D53B, 0x1D53E}, + {0x1D540, 0x1D544}, + {0x1D546, 0x1D546}, + {0x1D54A, 0x1D550}, + {0x1D552, 0x1D6A5}, + {0x1D6A8, 0x1D6C0}, + {0x1D6C2, 0x1D6DA}, + {0x1D6DC, 0x1D6FA}, + {0x1D6FC, 0x1D714}, + {0x1D716, 0x1D734}, + {0x1D736, 0x1D74E}, + {0x1D750, 0x1D76E}, + {0x1D770, 0x1D788}, + {0x1D78A, 0x1D7A8}, + {0x1D7AA, 0x1D7C2}, + {0x1D7C4, 0x1D7CB}, + {0x1DF00, 0x1DF1E}, + {0x1DF25, 0x1DF2A}, + {0x1E030, 0x1E06D}, + {0x1E100, 0x1E12C}, + {0x1E137, 0x1E13D}, + {0x1E14E, 0x1E14E}, + {0x1E290, 0x1E2AD}, + {0x1E2C0, 0x1E2EB}, + {0x1E4D0, 0x1E4EB}, + {0x1E5D0, 0x1E5ED}, + {0x1E5F0, 0x1E5F0}, + {0x1E6C0, 0x1E6DE}, + {0x1E6E0, 0x1E6E2}, + {0x1E6E4, 0x1E6E5}, + {0x1E6E7, 0x1E6ED}, + {0x1E6F0, 0x1E6F4}, + {0x1E6FE, 0x1E6FF}, + {0x1E7E0, 0x1E7E6}, + {0x1E7E8, 0x1E7EB}, + {0x1E7ED, 0x1E7EE}, + {0x1E7F0, 0x1E7FE}, + {0x1E800, 0x1E8C4}, + {0x1E900, 0x1E943}, + {0x1E94B, 0x1E94B}, + {0x1EE00, 0x1EE03}, 
+ {0x1EE05, 0x1EE1F}, + {0x1EE21, 0x1EE22}, + {0x1EE24, 0x1EE24}, + {0x1EE27, 0x1EE27}, + {0x1EE29, 0x1EE32}, + {0x1EE34, 0x1EE37}, + {0x1EE39, 0x1EE39}, + {0x1EE3B, 0x1EE3B}, + {0x1EE42, 0x1EE42}, + {0x1EE47, 0x1EE47}, + {0x1EE49, 0x1EE49}, + {0x1EE4B, 0x1EE4B}, + {0x1EE4D, 0x1EE4F}, + {0x1EE51, 0x1EE52}, + {0x1EE54, 0x1EE54}, + {0x1EE57, 0x1EE57}, + {0x1EE59, 0x1EE59}, + {0x1EE5B, 0x1EE5B}, + {0x1EE5D, 0x1EE5D}, + {0x1EE5F, 0x1EE5F}, + {0x1EE61, 0x1EE62}, + {0x1EE64, 0x1EE64}, + {0x1EE67, 0x1EE6A}, + {0x1EE6C, 0x1EE72}, + {0x1EE74, 0x1EE77}, + {0x1EE79, 0x1EE7C}, + {0x1EE7E, 0x1EE7E}, + {0x1EE80, 0x1EE89}, + {0x1EE8B, 0x1EE9B}, + {0x1EEA1, 0x1EEA3}, + {0x1EEA5, 0x1EEA9}, + {0x1EEAB, 0x1EEBB}, + {0x20000, 0x2A6DF}, + {0x2A700, 0x2B81D}, + {0x2B820, 0x2CEAD}, + {0x2CEB0, 0x2EBE0}, + {0x2EBF0, 0x2EE5D}, + {0x2F800, 0x2FA1D}, + {0x30000, 0x3134A}, + {0x31350, 0x33479}, + }; + + for (const auto& r : ranges) { + if (ch >= r.start && ch <= r.end) + return true; + } + return false; +} + +bool is_space(char32_t cp) { + switch (cp) { + case 0x0009: // TAB \t + case 0x000A: // LF \n + case 0x000B: // VT + case 0x000C: // FF + case 0x000D: // CR \r + case 0x0020: // Space + case 0x00A0: // No-Break Space + case 0x1680: // Ogham Space Mark + case 0x2000: // En Quad + case 0x2001: // Em Quad + case 0x2002: // En Space + case 0x2003: // Em Space + case 0x2004: // Three-Per-Em Space + case 0x2005: // Four-Per-Em Space + case 0x2006: // Six-Per-Em Space + case 0x2007: // Figure Space + case 0x2008: // Punctuation Space + case 0x2009: // Thin Space + case 0x200A: // Hair Space + case 0x202F: // Narrow No-Break Space + case 0x205F: // Medium Mathematical Space + case 0x3000: // Ideographic Space + return true; + default: + return false; + } +} + +std::string str_to_lower(const std::string& input) { + std::string result = input; + std::transform(result.begin(), result.end(), result.begin(), + [](unsigned char c) { return std::tolower(c); }); + return result; +} + +// UTF-8 -> 
Unicode code points +std::vector utf8_to_codepoints(const std::string& str) { + std::vector codepoints; + size_t i = 0; + while (i < str.size()) { + unsigned char c = str[i]; + char32_t cp = 0; + size_t extra_bytes = 0; + + if ((c & 0x80) == 0) + cp = c; + else if ((c & 0xE0) == 0xC0) { + cp = c & 0x1F; + extra_bytes = 1; + } else if ((c & 0xF0) == 0xE0) { + cp = c & 0x0F; + extra_bytes = 2; + } else if ((c & 0xF8) == 0xF0) { + cp = c & 0x07; + extra_bytes = 3; + } else { + ++i; + continue; + } // Invalid UTF-8 + + if (i + extra_bytes >= str.size()) + break; + + for (size_t j = 1; j <= extra_bytes; ++j) + cp = (cp << 6) | (str[i + j] & 0x3F); + + codepoints.push_back(cp); + i += 1 + extra_bytes; + } + return codepoints; +} + +// Unicode code point -> UTF-8 +std::string codepoint_to_utf8(char32_t cp) { + std::string out; + if (cp <= 0x7F) + out.push_back(static_cast(cp)); + else if (cp <= 0x7FF) { + out.push_back(static_cast(0xC0 | (cp >> 6))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } else if (cp <= 0xFFFF) { + out.push_back(static_cast(0xE0 | (cp >> 12))); + out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } else { + out.push_back(static_cast(0xF0 | (cp >> 18))); + out.push_back(static_cast(0x80 | ((cp >> 12) & 0x3F))); + out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } + return out; +} + +bool starts_with(const std::vector& text, + const std::vector& prefix, + std::size_t index) { + if (index > text.size()) { + return false; + } + if (prefix.size() > text.size() - index) { + return false; + } + return std::equal(prefix.begin(), prefix.end(), text.begin() + index); +} + +// mistral: [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+ +// qwen2: 
(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+ +std::vector token_split(const std::string& text) { + std::vector tokens; + auto cps = utf8_to_codepoints(text); + size_t i = 0; + + while (i < cps.size()) { + char32_t cp = cps[i]; + + // `(?i:'s|'t|'re|'ve|'m|'ll|'d)` + if (cp == U'\'' && i + 1 < cps.size()) { + std::string next = str_to_lower(codepoint_to_utf8(cps[i + 1])); + if (next == "s" || next == "t" || next == "m") { + tokens.push_back("'" + next); + i += 2; + continue; + } + if (i + 2 < cps.size()) { + next += str_to_lower(codepoint_to_utf8(cps[i + 2])); + if (next == "re" || next == "ve" || next == "ll" || next == "d") { + tokens.push_back("'" + next); + i += 3; + continue; + } + } + } + + // `\p{N}` + if (is_number(cp)) { + tokens.push_back(codepoint_to_utf8(cp)); + ++i; + continue; + } + + // `[^\r\n\p{L}\p{N}]?\p{L}+` + { + // `[^\r\n\p{L}\p{N}]\p{L}+` + if (!is_letter(cp) && cp != U'\r' && cp != U'\n' && i + 1 < cps.size() && is_letter(cps[i + 1])) { + std::string token = codepoint_to_utf8(cp); + ++i; + + while (i < cps.size() && is_letter(cps[i])) { + token += codepoint_to_utf8(cps[i]); + ++i; + } + tokens.push_back(token); + continue; + } + + // `\p{L}+` + if (is_letter(cp)) { + std::string token = codepoint_to_utf8(cp); + ++i; + while (i < cps.size() && is_letter(cps[i])) { + token += codepoint_to_utf8(cps[i]); + ++i; + } + tokens.push_back(token); + continue; + } + } + + // ` ?[^\s\p{L}\p{N}]+[\r\n]*` + { + // ` [^\s\p{L}\p{N}]+[\r\n]*` + if (cp == U' ' && i + 1 < cps.size() && !isspace(cps[i + 1]) && !is_letter(cps[i + 1]) && !is_number(cps[i + 1])) { + std::string token = codepoint_to_utf8(cp); + token += codepoint_to_utf8(cps[i + 1]); + i += 2; + + while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { + token += codepoint_to_utf8(cps[i]); + ++i; + } + + while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) { + token += 
codepoint_to_utf8(cps[i]); + ++i; + } + + tokens.push_back(token); + continue; + } + + // `[^\s\p{L}\p{N}]+[\r\n]*` + std::string token; + if (!is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { + std::string token = codepoint_to_utf8(cp); + ++i; + + while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { + token += codepoint_to_utf8(cps[i]); + ++i; + } + + while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) { + token += codepoint_to_utf8(cps[i]); + ++i; + } + + tokens.push_back(token); + continue; + } + } + + // `\s*[\r\n]+|\s+(?!\S)|\s+` + if (is_space(cp)) { + std::string token; + bool saw_new_line = false; + + while (i < cps.size() && is_space(cps[i])) { + token += codepoint_to_utf8(cps[i]); + + if (cps[i] == U'\r' || cps[i] == U'\n') { + saw_new_line = true; + } else { + if (saw_new_line) { + break; + } + } + + ++i; + } + + tokens.push_back(token); + continue; + } + + // skip + ++i; + } + + return tokens; +} + +std::vector split_with_special_tokens( + const std::string& text, + const std::vector& special_tokens) { + std::vector result; + size_t pos = 0; + size_t text_len = text.size(); + + while (pos < text_len) { + size_t next_pos = text_len; + std::string matched_token; + + for (const auto& token : special_tokens) { + size_t token_pos = text.find(token, pos); + if (token_pos != std::string::npos && token_pos < next_pos) { + next_pos = token_pos; + matched_token = token; + } + } + + if (next_pos > pos) { + result.push_back(text.substr(pos, next_pos - pos)); + } + + if (!matched_token.empty()) { + result.push_back(matched_token); + pos = next_pos + matched_token.size(); + } else { + break; + } + } + + return result; +} + +// int main() { +// std::string text = "I'm testing C++ token_split function. 
Hello world 123"; +// auto tokens = token_split(text); + +// for (const auto& t : tokens) { +// std::cout << "[" << t << "] "; +// } +// std::cout << "\n"; +// return 0; +// } diff --git a/src/ucache.hpp b/src/ucache.hpp index d324761..3d785c5 100644 --- a/src/ucache.hpp +++ b/src/ucache.hpp @@ -6,8 +6,10 @@ #include #include +#include "condition_cache_utils.hpp" #include "denoiser.hpp" #include "ggml_extend.hpp" +#include "tensor.hpp" struct UCacheConfig { bool enabled = false; @@ -29,15 +31,15 @@ struct UCacheCacheEntry { struct UCacheState { UCacheConfig config; - Denoiser* denoiser = nullptr; - float start_sigma = std::numeric_limits::max(); - float end_sigma = 0.0f; - bool initialized = false; - bool initial_step = true; - bool skip_current_step = false; - bool step_active = false; - const SDCondition* anchor_condition = nullptr; - std::unordered_map cache_diffs; + Denoiser* denoiser = nullptr; + float start_sigma = std::numeric_limits::max(); + float end_sigma = 0.0f; + bool initialized = false; + bool initial_step = true; + bool skip_current_step = false; + bool step_active = false; + const void* anchor_condition = nullptr; + std::unordered_map cache_diffs; std::vector prev_input; std::vector prev_output; float output_prev_norm = 0.0f; @@ -233,43 +235,30 @@ struct UCacheState { return base_threshold * multiplier; } - bool has_cache(const SDCondition* cond) const { + bool has_cache(const void* cond) const { auto it = cache_diffs.find(cond); return it != cache_diffs.end() && !it->second.diff.empty(); } - void update_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) { + void update_cache(const void* cond, const sd::Tensor& input, const sd::Tensor& output) { UCacheCacheEntry& entry = cache_diffs[cond]; - size_t ne = static_cast(ggml_nelements(output)); - entry.diff.resize(ne); - float* out_data = (float*)output->data; - float* in_data = (float*)input->data; - - for (size_t i = 0; i < ne; ++i) { - entry.diff[i] = out_data[i] - in_data[i]; - 
} + sd::store_condition_cache_diff(&entry.diff, input, output); } - void apply_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) { + void apply_cache(const void* cond, const sd::Tensor& input, sd::Tensor* output) { auto it = cache_diffs.find(cond); if (it == cache_diffs.end() || it->second.diff.empty()) { return; } - - copy_ggml_tensor(output, input); - float* out_data = (float*)output->data; - const std::vector& diff = it->second.diff; - for (size_t i = 0; i < diff.size(); ++i) { - out_data[i] += diff[i]; - } + sd::apply_condition_cache_diff(it->second.diff, input, output); } - bool before_condition(const SDCondition* cond, - ggml_tensor* input, - ggml_tensor* output, + bool before_condition(const void* cond, + const sd::Tensor& input, + sd::Tensor* output, float sigma, int step_index) { - if (!enabled() || step_index < 0) { + if (!enabled() || step_index < 0 || output == nullptr) { return false; } if (step_index != current_step_index) { @@ -302,13 +291,13 @@ struct UCacheState { return false; } - size_t ne = static_cast(ggml_nelements(input)); + size_t ne = static_cast(input.numel()); if (prev_input.size() != ne) { return false; } - float* input_data = (float*)input->data; - last_input_change = 0.0f; + const float* input_data = input.data(); + last_input_change = 0.0f; for (size_t i = 0; i < ne; ++i) { last_input_change += std::fabs(input_data[i] - prev_input[i]); } @@ -354,7 +343,7 @@ struct UCacheState { return false; } - void after_condition(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) { + void after_condition(const void* cond, const sd::Tensor& input, const sd::Tensor& output) { if (!step_is_active()) { return; } @@ -367,16 +356,16 @@ struct UCacheState { steps_computed_since_active++; consecutive_skipped_steps = 0; - size_t ne = static_cast(ggml_nelements(input)); - float* in_data = (float*)input->data; + size_t ne = static_cast(input.numel()); + const float* in_data = input.data(); prev_input.resize(ne); for 
(size_t i = 0; i < ne; ++i) { prev_input[i] = in_data[i]; } has_prev_input = true; - float* out_data = (float*)output->data; - float output_change = 0.0f; + const float* out_data = output.data(); + float output_change = 0.0f; if (has_prev_output && prev_output.size() == ne) { for (size_t i = 0; i < ne; ++i) { output_change += std::fabs(out_data[i] - prev_output[i]); diff --git a/src/unet.hpp b/src/unet.hpp index f7aa3f0..63e23eb 100644 --- a/src/unet.hpp +++ b/src/unet.hpp @@ -609,30 +609,31 @@ struct UNetModelRunner : public GGMLRunner { unet.get_param_tensors(tensors, prefix); } - ggml_cgraph* build_graph(ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* c_concat = nullptr, - ggml_tensor* y = nullptr, - int num_video_frames = -1, - std::vector controls = {}, - float control_strength = 0.f) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor = {}, + const sd::Tensor& c_concat_tensor = {}, + const sd::Tensor& y_tensor = {}, + int num_video_frames = -1, + const std::vector>& controls_tensor = {}, + float control_strength = 0.f) { ggml_cgraph* gf = new_graph_custom(UNET_GRAPH_SIZE); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); + ggml_tensor* context = make_optional_input(context_tensor); + ggml_tensor* c_concat = make_optional_input(c_concat_tensor); + ggml_tensor* y = make_optional_input(y_tensor); + std::vector controls; + controls.reserve(controls_tensor.size()); + for (const auto& control_tensor : controls_tensor) { + controls.push_back(make_input(control_tensor)); + } + if (num_video_frames == -1) { num_video_frames = static_cast(x->ne[3]); } - x = to_backend(x); - context = to_backend(context); - y = to_backend(y); - timesteps = to_backend(timesteps); - c_concat = to_backend(c_concat); - - for (int i = 0; i < controls.size(); i++) { - controls[i] = to_backend(controls[i]); - } - auto runner_ctx = 
get_context(); ggml_tensor* out = unet.forward(&runner_ctx, @@ -650,17 +651,15 @@ struct UNetModelRunner : public GGMLRunner { return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* c_concat, - ggml_tensor* y, - int num_video_frames = -1, - std::vector controls = {}, - float control_strength = 0.f, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) { + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context = {}, + const sd::Tensor& c_concat = {}, + const sd::Tensor& y = {}, + int num_video_frames = -1, + const std::vector>& controls = {}, + float control_strength = 0.f) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size] @@ -670,7 +669,7 @@ struct UNetModelRunner : public GGMLRunner { return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); } void test() { @@ -679,8 +678,8 @@ struct UNetModelRunner : public GGMLRunner { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); { // CPU, num_video_frames = 1, x{num_video_frames, 8, 8, 8}: Pass @@ -689,27 +688,37 @@ struct UNetModelRunner : public GGMLRunner { // CUDA, num_video_frames = 3, x{num_video_frames, 8, 8, 8}: nan int num_video_frames = 3; - auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 8, num_video_frames); + sd::Tensor x({8, 8, 8, num_video_frames}); std::vector timesteps_vec(num_video_frames, 999.f); - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); - 
ggml_set_f32(x, 0.5f); + auto timesteps = sd::Tensor::from_vector(timesteps_vec); + x.fill_(0.5f); // print_ggml_tensor(x); - auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 1024, 1, num_video_frames); - ggml_set_f32(context, 0.5f); + sd::Tensor context({1024, 1, num_video_frames}); + context.fill_(0.5f); // print_ggml_tensor(context); - auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, num_video_frames); - ggml_set_f32(y, 0.5f); + sd::Tensor y({768, num_video_frames}); + y.fill_(0.5f); // print_ggml_tensor(y); - ggml_tensor* out = nullptr; + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - compute(8, x, timesteps, context, nullptr, y, num_video_frames, {}, 0.f, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = compute(8, + x, + timesteps, + context, + {}, + y, + num_video_frames, + {}, + 0.f); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("unet test done in %lldms", t1 - t0); } } diff --git a/src/upscaler.cpp b/src/upscaler.cpp index 18e185d..03f7714 100644 --- a/src/upscaler.cpp +++ b/src/upscaler.cpp @@ -2,6 +2,7 @@ #include "ggml_extend.hpp" #include "model.h" #include "stable-diffusion.h" +#include "util.h" struct UpscalerGGML { ggml_backend_t backend = nullptr; // general backend @@ -64,6 +65,39 @@ struct UpscalerGGML { return true; } + sd::Tensor upscale_tensor(const sd::Tensor& input_tensor) { + sd::Tensor upscaled; + if (tile_size <= 0 || (input_tensor.shape()[0] <= tile_size && input_tensor.shape()[1] <= tile_size)) { + upscaled = esrgan_upscaler->compute(n_threads, input_tensor); + } else { + auto on_processing = [&](const sd::Tensor& input_tile) -> sd::Tensor { + auto output_tile = esrgan_upscaler->compute(n_threads, input_tile); + if (output_tile.empty()) { + LOG_ERROR("esrgan compute failed while processing a tile"); + return {}; + } + return output_tile; + }; + + upscaled = 
process_tiles_2d(input_tensor, + static_cast(input_tensor.shape()[0] * esrgan_upscaler->scale), + static_cast(input_tensor.shape()[1] * esrgan_upscaler->scale), + esrgan_upscaler->scale, + tile_size, + tile_size, + 0.25f, + false, + false, + on_processing); + } + esrgan_upscaler->free_compute_buffer(); + if (upscaled.empty()) { + LOG_ERROR("esrgan compute failed"); + return {}; + } + return upscaled; + } + sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor) { // upscale_factor, unused for RealESRGAN_x4plus_anime_6B.pth sd_image_t upscaled_image = {0, 0, 0, nullptr}; @@ -72,40 +106,17 @@ struct UpscalerGGML { LOG_INFO("upscaling from (%i x %i) to (%i x %i)", input_image.width, input_image.height, output_width, output_height); - ggml_init_params params; - params.mem_size = static_cast(1024 * 1024) * 1024; // 1G - params.mem_buffer = nullptr; - params.no_alloc = false; - - // draft context - ggml_context* upscale_ctx = ggml_init(params); - if (!upscale_ctx) { - LOG_ERROR("ggml_init() failed"); + sd::Tensor input_tensor = sd_image_to_tensor(input_image); + sd::Tensor upscaled; + int64_t t0 = ggml_time_ms(); + upscaled = upscale_tensor(input_tensor); + if (upscaled.empty()) { return upscaled_image; } - // LOG_DEBUG("upscale work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f); - ggml_tensor* input_image_tensor = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, input_image.width, input_image.height, 3, 1); - sd_image_to_ggml_tensor(input_image, input_image_tensor); - - ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1); - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - return esrgan_upscaler->compute(n_threads, in, &out); - }; - int64_t t0 = ggml_time_ms(); - // TODO: circular upscaling? 
- sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, false, false, on_tiling); - esrgan_upscaler->free_compute_buffer(); - ggml_ext_tensor_clamp_inplace(upscaled, 0.f, 1.f); - uint8_t* upscaled_data = ggml_tensor_to_sd_image(upscaled); - ggml_free(upscale_ctx); - int64_t t3 = ggml_time_ms(); + sd_image_t upscaled_data = tensor_to_sd_image(upscaled); + int64_t t3 = ggml_time_ms(); LOG_INFO("input_image_tensor upscaled, taking %.2fs", (t3 - t0) / 1000.0f); - upscaled_image = { - (uint32_t)output_width, - (uint32_t)output_height, - 3, - upscaled_data, - }; + upscaled_image = upscaled_data; return upscaled_image; } }; diff --git a/src/util.cpp b/src/util.cpp index a94cfd9..2d330a2 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -479,158 +479,96 @@ const char* sd_get_system_info() { return buffer; } -sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image) { - sd_image_f32_t converted_image; - converted_image.width = image.width; - converted_image.height = image.height; - converted_image.channel = image.channel; +sd_image_t tensor_to_sd_image(const sd::Tensor& tensor, int frame_index) { + const auto& shape = tensor.shape(); + GGML_ASSERT(shape.size() == 4 || shape.size() == 5); + int width = static_cast(shape[0]); + int height = static_cast(shape[1]); + int channel = static_cast(shape[shape.size() == 5 ? 
3 : 2]); + uint8_t* data = (uint8_t*)malloc(static_cast(width * height * channel)); + GGML_ASSERT(data != nullptr); - // Allocate memory for float data - converted_image.data = (float*)malloc(image.width * image.height * image.channel * sizeof(float)); - - for (uint32_t i = 0; i < image.width * image.height * image.channel; i++) { - // Convert uint8_t to float - converted_image.data[i] = (float)image.data[i]; - } - - return converted_image; -} - -// Function to perform double linear interpolation -float interpolate(float v1, float v2, float v3, float v4, float x_ratio, float y_ratio) { - return v1 * (1 - x_ratio) * (1 - y_ratio) + v2 * x_ratio * (1 - y_ratio) + v3 * (1 - x_ratio) * y_ratio + v4 * x_ratio * y_ratio; -} - -sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height) { - sd_image_f32_t resized_image; - resized_image.width = target_width; - resized_image.height = target_height; - resized_image.channel = image.channel; - - // Allocate memory for resized float data - resized_image.data = (float*)malloc(target_width * target_height * image.channel * sizeof(float)); - - for (int y = 0; y < target_height; y++) { - for (int x = 0; x < target_width; x++) { - float original_x = (float)x * image.width / target_width; - float original_y = (float)y * image.height / target_height; - - uint32_t x1 = (uint32_t)original_x; - uint32_t y1 = (uint32_t)original_y; - uint32_t x2 = std::min(x1 + 1, image.width - 1); - uint32_t y2 = std::min(y1 + 1, image.height - 1); - - for (uint32_t k = 0; k < image.channel; k++) { - float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k); - float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k); - float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k); - float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k); - - float x_ratio = original_x - x1; - float y_ratio = original_y - y1; - - 
float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio); - - *(resized_image.data + y * target_width * image.channel + x * image.channel + k) = value; + for (int iw = 0; iw < width; ++iw) { + for (int ih = 0; ih < height; ++ih) { + for (int ic = 0; ic < channel; ++ic) { + float value = shape.size() == 5 ? tensor.index(iw, ih, frame_index, ic, 0) + : tensor.index(iw, ih, ic, frame_index); + value = std::clamp(value, 0.0f, 1.0f); + data[(ih * width + iw) * channel + ic] = static_cast(std::round(value * 255.0f)); } } } - - return resized_image; + return { + static_cast(width), + static_cast(height), + static_cast(channel), + data, + }; } -void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]) { - for (uint32_t y = 0; y < image.height; y++) { - for (uint32_t x = 0; x < image.width; x++) { - for (uint32_t k = 0; k < image.channel; k++) { - int index = (y * image.width + x) * image.channel + k; - image.data[index] = (image.data[index] - means[k]) / stds[k]; +sd::Tensor sd_image_to_tensor(sd_image_t image, + int target_width, + int target_height, + bool scale) { + sd::Tensor tensor = sd::zeros({static_cast(image.width), + static_cast(image.height), + static_cast(image.channel), + 1}); + for (uint32_t iw = 0; iw < image.width; ++iw) { + for (uint32_t ih = 0; ih < image.height; ++ih) { + for (uint32_t ic = 0; ic < image.channel; ++ic) { + tensor.index(iw, ih, ic, 0) = sd_image_get_f32(image, iw, ih, ic, scale); } } } + if (target_width >= 0 && target_height >= 0 && + (tensor.shape()[0] != target_width || tensor.shape()[1] != target_height)) { + tensor = sd::ops::interpolate(tensor, + {target_width, + target_height, + tensor.shape()[2], + tensor.shape()[3]}); + } + return tensor; } // Constants for means and std float means[3] = {0.48145466f, 0.4578275f, 0.40821073f}; float stds[3] = {0.26862954f, 0.26130258f, 0.27577711f}; -// Function to clip and preprocess sd_image_f32_t -sd_image_f32_t clip_preprocess(sd_image_f32_t image, int 
target_width, int target_height) { - float width_scale = (float)target_width / image.width; - float height_scale = (float)target_height / image.height; +sd::Tensor clip_preprocess(const sd::Tensor& image, int target_width, int target_height) { + GGML_ASSERT(image.dim() == 4); + GGML_ASSERT(image.shape()[2] == 3); + GGML_ASSERT(image.shape()[3] == 1); + GGML_ASSERT(target_width > 0 && target_height > 0); - float scale = std::fmax(width_scale, height_scale); + float width_scale = static_cast(target_width) / static_cast(image.shape()[0]); + float height_scale = static_cast(target_height) / static_cast(image.shape()[1]); + float scale = std::fmax(width_scale, height_scale); - // Interpolation - int resized_width = (int)(scale * image.width); - int resized_height = (int)(scale * image.height); - float* resized_data = (float*)malloc(resized_width * resized_height * image.channel * sizeof(float)); + int64_t resized_width = static_cast(scale * static_cast(image.shape()[0])); + int64_t resized_height = static_cast(scale * static_cast(image.shape()[1])); - for (int y = 0; y < resized_height; y++) { - for (int x = 0; x < resized_width; x++) { - float original_x = (float)x * image.width / resized_width; - float original_y = (float)y * image.height / resized_height; + sd::Tensor resized = sd::ops::interpolate( + image, + {resized_width, resized_height, image.shape()[2], image.shape()[3]}); - uint32_t x1 = (uint32_t)original_x; - uint32_t y1 = (uint32_t)original_y; - uint32_t x2 = std::min(x1 + 1, image.width - 1); - uint32_t y2 = std::min(y1 + 1, image.height - 1); + int64_t h_offset = std::max((resized_height - target_height) / 2, 0); + int64_t w_offset = std::max((resized_width - target_width) / 2, 0); - for (uint32_t k = 0; k < image.channel; k++) { - float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k); - float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k); - float v3 = *(image.data + y2 * image.width * 
image.channel + x1 * image.channel + k); - float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k); - - float x_ratio = original_x - x1; - float y_ratio = original_y - y1; - - float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio); - - *(resized_data + y * resized_width * image.channel + x * image.channel + k) = value; + sd::Tensor cropped({target_width, target_height, image.shape()[2], image.shape()[3]}); + for (int64_t y = 0; y < target_height; ++y) { + for (int64_t x = 0; x < target_width; ++x) { + for (int64_t c = 0; c < image.shape()[2]; ++c) { + cropped.index(x, y, c, 0) = resized.index(x + w_offset, y + h_offset, c, 0); } } } - // Clip and preprocess - int h_offset = std::max((int)(resized_height - target_height) / 2, 0); - int w_offset = std::max((int)(resized_width - target_width) / 2, 0); - - sd_image_f32_t result; - result.width = target_width; - result.height = target_height; - result.channel = image.channel; - result.data = (float*)malloc(target_height * target_width * image.channel * sizeof(float)); - - for (uint32_t k = 0; k < image.channel; k++) { - for (uint32_t i = 0; i < result.height; i++) { - for (uint32_t j = 0; j < result.width; j++) { - int src_y = std::min(static_cast(i + h_offset), resized_height - 1); - int src_x = std::min(static_cast(j + w_offset), resized_width - 1); - *(result.data + i * result.width * image.channel + j * image.channel + k) = - fmin(fmax(*(resized_data + src_y * resized_width * image.channel + src_x * image.channel + k), 0.0f), 255.0f) / 255.0f; - } - } - } - - // Free allocated memory - free(resized_data); - - // Normalize - for (uint32_t k = 0; k < image.channel; k++) { - for (uint32_t i = 0; i < result.height; i++) { - for (uint32_t j = 0; j < result.width; j++) { - // *(result.data + i * size * image.channel + j * image.channel + k) = 0.5f; - int offset = i * result.width * image.channel + j * image.channel + k; - float value = *(result.data + offset); - value = (value - means[k]) 
/ stds[k]; - // value = 0.5f; - *(result.data + offset) = value; - } - } - } - - return result; + sd::Tensor normalized = sd::ops::clamp(cropped, 0.0f, 1.0f); + sd::Tensor mean({1, 1, 3, 1}, {means[0], means[1], means[2]}); + sd::Tensor std({1, 1, 3, 1}, {stds[0], stds[1], stds[2]}); + return (normalized - mean) / std; } // Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/prompt_parser.py#L345 diff --git a/src/util.h b/src/util.h index 7dee7bf..24ce4cf 100644 --- a/src/util.h +++ b/src/util.h @@ -7,6 +7,7 @@ #include #include "stable-diffusion.h" +#include "tensor.hpp" #define SAFE_STR(s) ((s) ? (s) : "") #define BOOL_STR(b) ((b) ? "true" : "false") @@ -29,20 +30,14 @@ std::string utf32_to_utf8(const std::u32string& utf32_str); std::u32string unicode_value_to_utf32(int unicode_value); // std::string sd_basename(const std::string& path); -typedef struct { - uint32_t width; - uint32_t height; - uint32_t channel; - float* data; -} sd_image_f32_t; +sd_image_t tensor_to_sd_image(const sd::Tensor& tensor, int frame_index = 0); -void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]); +sd::Tensor sd_image_to_tensor(sd_image_t image, + int target_width = -1, + int target_height = -1, + bool scale = true); -sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image); - -sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height); - -sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height); +sd::Tensor clip_preprocess(const sd::Tensor& image, int target_width, int target_height); class MmapWrapper { public: diff --git a/src/vae.hpp b/src/vae.hpp index dafc0d4..22be886 100644 --- a/src/vae.hpp +++ b/src/vae.hpp @@ -2,16 +2,64 @@ #define __VAE_HPP__ #include "common_block.hpp" +#include "tensor_ggml.hpp" struct VAE : public GGMLRunner { protected: SDVersion version; - bool scale_input = true; - virtual bool 
_compute(const int n_threads, - ggml_tensor* z, - bool decode_graph, - ggml_tensor** output, - ggml_context* output_ctx) = 0; + bool scale_input = true; + virtual sd::Tensor _compute(const int n_threads, + const sd::Tensor& z, + bool decode_graph) = 0; + + static inline void scale_tensor_to_minus1_1(sd::Tensor* tensor) { + GGML_ASSERT(tensor != nullptr); + for (int64_t i = 0; i < tensor->numel(); ++i) { + (*tensor)[i] = (*tensor)[i] * 2.0f - 1.0f; + } + } + + static inline void scale_tensor_to_0_1(sd::Tensor* tensor) { + GGML_ASSERT(tensor != nullptr); + for (int64_t i = 0; i < tensor->numel(); ++i) { + float value = ((*tensor)[i] + 1.0f) * 0.5f; + (*tensor)[i] = std::max(0.0f, std::min(1.0f, value)); + } + } + + sd::Tensor tiled_compute(const sd::Tensor& input, + int n_threads, + int output_width, + int output_height, + int scale, + int p_tile_size_x, + int p_tile_size_y, + float tile_overlap_factor, + bool circular_x, + bool circular_y, + bool decode_graph, + const char* error_message, + bool silent = false) { + auto on_processing = [&](const sd::Tensor& input_tile) { + auto output_tile = _compute(n_threads, input_tile, decode_graph); + if (output_tile.empty()) { + LOG_ERROR("%s", error_message); + return sd::Tensor(); + } + return output_tile; + }; + return ::process_tiles_2d(input, + output_width, + output_height, + scale, + p_tile_size_x, + p_tile_size_y, + tile_overlap_factor, + circular_x, + circular_y, + on_processing, + silent); + } public: VAE(SDVersion version, ggml_backend_t backend, bool offload_params_to_cpu) @@ -60,133 +108,109 @@ public: tile_size_y = get_tile_size(params.tile_size_y, params.rel_size_y, latent_y); } - ggml_tensor* encode(int n_threads, - ggml_context* work_ctx, - ggml_tensor* x, - sd_tiling_params_t tiling_params, - bool circular_x = false, - bool circular_y = false) { - int64_t t0 = ggml_time_ms(); - ggml_tensor* result = nullptr; - const int scale_factor = get_scale_factor(); - int64_t W = x->ne[0] / scale_factor; - int64_t H = 
x->ne[1] / scale_factor; - int channel_dim = sd_version_is_wan(version) ? 3 : 2; - int64_t C = get_encoder_output_channels(static_cast(x->ne[channel_dim])); - int64_t ne2; - int64_t ne3; - if (sd_version_is_wan(version)) { - int64_t T = x->ne[2]; - ne2 = (T - 1) / 4 + 1; - ne3 = C; - } else { - ne2 = C; - ne3 = x->ne[3]; - } - result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3); - + sd::Tensor encode(int n_threads, + const sd::Tensor& x, + sd_tiling_params_t tiling_params, + bool circular_x = false, + bool circular_y = false) { + int64_t t0 = ggml_time_ms(); + sd::Tensor input = x; + sd::Tensor output; if (scale_input) { - scale_to_minus1_1(x); - } - - if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { - x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]); + scale_tensor_to_minus1_1(&input); } if (tiling_params.enabled) { + const int scale_factor = get_scale_factor(); + int64_t W = input.shape()[0] / scale_factor; + int64_t H = input.shape()[1] / scale_factor; float tile_overlap; int tile_size_x, tile_size_y; - // multiply tile size for encode to keep the compute buffer size consistent get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, tiling_params, W, H, 1.30539f); - LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y); - - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - return _compute(n_threads, in, false, &out, work_ctx); - }; - sd_tiling_non_square(x, result, scale_factor, tile_size_x, tile_size_y, tile_overlap, circular_x, circular_y, on_tiling); + output = tiled_compute(input, + n_threads, + static_cast(W), + static_cast(H), + scale_factor, + tile_size_x, + tile_size_y, + tile_overlap, + circular_x, + circular_y, + false, + "vae encode compute failed while processing a tile"); } else { - _compute(n_threads, x, false, &result, work_ctx); + output = _compute(n_threads, input, false); + free_compute_buffer(); } - free_compute_buffer(); + if (output.empty()) { + 
LOG_ERROR("vae encode compute failed"); + return {}; + } int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing vae encode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - return result; + return std::move(output); } - ggml_tensor* decode(int n_threads, - ggml_context* work_ctx, - ggml_tensor* x, - sd_tiling_params_t tiling_params, - bool decode_video = false, - bool circular_x = false, - bool circular_y = false, - ggml_tensor* result = nullptr, - bool silent = false) { - const int scale_factor = get_scale_factor(); - int64_t W = x->ne[0] * scale_factor; - int64_t H = x->ne[1] * scale_factor; - int64_t C = 3; - if (result == nullptr) { - if (decode_video) { - int64_t T = x->ne[2]; - if (sd_version_is_wan(version)) { - T = ((T - 1) * 4) + 1; - } - result = ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - W, - H, - T, - 3); - } else { - result = ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - W, - H, - C, - x->ne[3]); - } - } - int64_t t0 = ggml_time_ms(); - if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { - x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]); - } + sd::Tensor decode(int n_threads, + const sd::Tensor& x, + sd_tiling_params_t tiling_params, + bool decode_video = false, + bool circular_x = false, + bool circular_y = false, + bool silent = false) { + int64_t t0 = ggml_time_ms(); + sd::Tensor input = x; + sd::Tensor output; + if (tiling_params.enabled) { + const int scale_factor = get_scale_factor(); + int64_t W = input.shape()[0] * scale_factor; + int64_t H = input.shape()[1] * scale_factor; float tile_overlap; int tile_size_x, tile_size_y; - get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, tiling_params, x->ne[0], x->ne[1]); - + get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, tiling_params, input.shape()[0], input.shape()[1]); if (!silent) { LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y); } - - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - return 
_compute(n_threads, in, true, &out, nullptr); - }; - sd_tiling_non_square(x, result, scale_factor, tile_size_x, tile_size_y, tile_overlap, circular_x, circular_y, on_tiling, silent); + output = tiled_compute( + input, + n_threads, + static_cast(W), + static_cast(H), + scale_factor, + tile_size_x, + tile_size_y, + tile_overlap, + circular_x, + circular_y, + true, + "vae decode compute failed while processing a tile", + silent); } else { - if (!_compute(n_threads, x, true, &result, work_ctx)) { - LOG_ERROR("Failed to decode latetnts"); - free_compute_buffer(); - return nullptr; - } + output = _compute(n_threads, input, true); } + free_compute_buffer(); + + if (output.empty()) { + LOG_ERROR("vae decode compute failed"); + return {}; + } if (scale_input) { - scale_to_0_1(result); + scale_tensor_to_0_1(&output); } int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing vae decode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f); - return result; + return std::move(output); } - virtual ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) = 0; - virtual ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) = 0; - virtual ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) = 0; - virtual void get_param_tensors(std::map& tensors, const std::string prefix) = 0; + virtual sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) = 0; + virtual sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) = 0; + virtual sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) = 0; + virtual void get_param_tensors(std::map& tensors, const std::string prefix) = 0; virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); }; }; @@ -198,31 +222,25 @@ struct FakeVAE : public VAE { return input_channels; } - bool _compute(const int n_threads, - ggml_tensor* z, - bool 
decode_graph, - ggml_tensor** output, - ggml_context* output_ctx) override { - if (*output == nullptr && output_ctx != nullptr) { - *output = ggml_dup_tensor(output_ctx, z); - } - ggml_ext_tensor_iter(z, [&](ggml_tensor* z, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = ggml_ext_tensor_get_f32(z, i0, i1, i2, i3); - ggml_ext_tensor_set_f32(*output, value, i0, i1, i2, i3); - }); - return true; + sd::Tensor _compute(const int n_threads, + const sd::Tensor& z, + bool decode_graph) override { + SD_UNUSED(n_threads); + SD_UNUSED(decode_graph); + return z; } - ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) { + sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override { + SD_UNUSED(rng); return vae_output; } - ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) { - return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) override { + return latents; } - ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) { - return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) override { + return latents; } void get_param_tensors(std::map& tensors, const std::string prefix) override {} diff --git a/src/wan.hpp b/src/wan.hpp index af8acbf..6860262 100644 --- a/src/wan.hpp +++ b/src/wan.hpp @@ -1131,105 +1131,66 @@ namespace WAN { ae.get_param_tensors(tensors, prefix); } - ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) { + sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override { + SD_UNUSED(rng); return vae_output; } - void get_latents_mean_std_vec(ggml_tensor* latents, int channel_dim, std::vector& latents_mean_vec, std::vector& latents_std_vec) { - GGML_ASSERT(latents->ne[channel_dim] == 16 || 
latents->ne[channel_dim] == 48); - if (latents->ne[channel_dim] == 16) { // Wan2.1 VAE - latents_mean_vec = {-0.7571f, -0.7089f, -0.9113f, 0.1075f, -0.1745f, 0.9653f, -0.1517f, 1.5508f, - 0.4134f, -0.0715f, 0.5517f, -0.3632f, -0.1922f, -0.9497f, 0.2503f, -0.2921f}; - latents_std_vec = {2.8184f, 1.4541f, 2.3275f, 2.6558f, 1.2196f, 1.7708f, 2.6052f, 2.0743f, - 3.2687f, 2.1526f, 2.8652f, 1.5579f, 1.6382f, 1.1253f, 2.8251f, 1.9160f}; - } else if (latents->ne[channel_dim] == 48) { // Wan2.2 VAE - latents_mean_vec = {-0.2289f, -0.0052f, -0.1323f, -0.2339f, -0.2799f, 0.0174f, 0.1838f, 0.1557f, - -0.1382f, 0.0542f, 0.2813f, 0.0891f, 0.1570f, -0.0098f, 0.0375f, -0.1825f, - -0.2246f, -0.1207f, -0.0698f, 0.5109f, 0.2665f, -0.2108f, -0.2158f, 0.2502f, - -0.2055f, -0.0322f, 0.1109f, 0.1567f, -0.0729f, 0.0899f, -0.2799f, -0.1230f, - -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f, - 0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f}; - latents_std_vec = { - 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, - 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, - 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, - 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f, - 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, - 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}; + std::pair, sd::Tensor> get_latents_mean_std(const sd::Tensor& latents) { + int channel_dim = latents.dim() == 5 ? 
3 : 2; + std::vector stats_shape(static_cast(latents.dim()), 1); + if (latents.shape()[channel_dim] == 16) { // Wan2.1 VAE + stats_shape[static_cast(channel_dim)] = 16; + + auto mean_tensor = sd::Tensor::from_vector({-0.7571f, -0.7089f, -0.9113f, 0.1075f, -0.1745f, 0.9653f, -0.1517f, 1.5508f, + 0.4134f, -0.0715f, 0.5517f, -0.3632f, -0.1922f, -0.9497f, 0.2503f, -0.2921f}); + mean_tensor.reshape_(stats_shape); + auto std_tensor = sd::Tensor::from_vector({2.8184f, 1.4541f, 2.3275f, 2.6558f, 1.2196f, 1.7708f, 2.6052f, 2.0743f, + 3.2687f, 2.1526f, 2.8652f, 1.5579f, 1.6382f, 1.1253f, 2.8251f, 1.9160f}); + std_tensor.reshape_(stats_shape); + return {std::move(mean_tensor), std::move(std_tensor)}; } + if (latents.shape()[channel_dim] == 48) { // Wan2.2 VAE + stats_shape[static_cast(channel_dim)] = 48; + + auto mean_tensor = sd::Tensor::from_vector({-0.2289f, -0.0052f, -0.1323f, -0.2339f, -0.2799f, 0.0174f, 0.1838f, 0.1557f, + -0.1382f, 0.0542f, 0.2813f, 0.0891f, 0.1570f, -0.0098f, 0.0375f, -0.1825f, + -0.2246f, -0.1207f, -0.0698f, 0.5109f, 0.2665f, -0.2108f, -0.2158f, 0.2502f, + -0.2055f, -0.0322f, 0.1109f, 0.1567f, -0.0729f, 0.0899f, -0.2799f, -0.1230f, + -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f, + 0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f}); + mean_tensor.reshape_(stats_shape); + auto std_tensor = sd::Tensor::from_vector({0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, + 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, + 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, + 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f, + 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, + 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}); + std_tensor.reshape_(stats_shape); + return {std::move(mean_tensor), std::move(std_tensor)}; + } + GGML_ABORT("unexpected latent channel dimension 
%lld for version %d", + (long long)latents.shape()[channel_dim], + version); } - ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) { - ggml_tensor* vae_latents = ggml_dup(work_ctx, latents); - int channel_dim = sd_version_is_wan(version) ? 3 : 2; - std::vector latents_mean_vec; - std::vector latents_std_vec; - get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec); - - float mean; - float std_; - for (int i = 0; i < latents->ne[3]; i++) { - if (channel_dim == 3) { - mean = latents_mean_vec[i]; - std_ = latents_std_vec[i]; - } - for (int j = 0; j < latents->ne[2]; j++) { - if (channel_dim == 2) { - mean = latents_mean_vec[j]; - std_ = latents_std_vec[j]; - } - for (int k = 0; k < latents->ne[1]; k++) { - for (int l = 0; l < latents->ne[0]; l++) { - float value = ggml_ext_tensor_get_f32(latents, l, k, j, i); - value = value * std_ / scale_factor + mean; - ggml_ext_tensor_set_f32(vae_latents, value, l, k, j, i); - } - } - } - } - - return vae_latents; + sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) override { + auto [mean_tensor, std_tensor] = get_latents_mean_std(latents); + return (latents * std_tensor) / scale_factor + mean_tensor; } - ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) { - ggml_tensor* diffusion_latents = ggml_dup(work_ctx, latents); - int channel_dim = sd_version_is_wan(version) ? 
3 : 2; - std::vector latents_mean_vec; - std::vector latents_std_vec; - get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec); - - float mean; - float std_; - for (int i = 0; i < latents->ne[3]; i++) { - if (channel_dim == 3) { - mean = latents_mean_vec[i]; - std_ = latents_std_vec[i]; - } - for (int j = 0; j < latents->ne[2]; j++) { - if (channel_dim == 2) { - mean = latents_mean_vec[j]; - std_ = latents_std_vec[j]; - } - for (int k = 0; k < latents->ne[1]; k++) { - for (int l = 0; l < latents->ne[0]; l++) { - float value = ggml_ext_tensor_get_f32(latents, l, k, j, i); - value = (value - mean) * scale_factor / std_; - ggml_ext_tensor_set_f32(diffusion_latents, value, l, k, j, i); - } - } - } - } - return diffusion_latents; + sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) override { + auto [mean_tensor, std_tensor] = get_latents_mean_std(latents); + return ((latents - mean_tensor) * scale_factor) / std_tensor; } int get_encoder_output_channels(int input_channels) { return static_cast(ae.z_dim); } - ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) { - ggml_cgraph* gf = new_graph_custom(10240 * z->ne[2]); - - z = to_backend(z); + ggml_cgraph* build_graph(const sd::Tensor& z_tensor, bool decode_graph) { + ggml_cgraph* gf = new_graph_custom(10240 * z_tensor.shape()[2]); + ggml_tensor* z = make_input(z_tensor); auto runner_ctx = get_context(); @@ -1240,7 +1201,7 @@ namespace WAN { return gf; } - ggml_cgraph* build_graph_partial(ggml_tensor* z, bool decode_graph, int i) { + ggml_cgraph* build_graph_partial(const sd::Tensor& z_tensor, bool decode_graph, int i) { ggml_cgraph* gf = new_graph_custom(20480); ae.clear_cache(); @@ -1250,7 +1211,7 @@ namespace WAN { ae._feat_map[feat_idx] = feat_cache; } - z = to_backend(z); + ggml_tensor* z = make_input(z_tensor); auto runner_ctx = get_context(); @@ -1269,58 +1230,57 @@ namespace WAN { return gf; } - bool _compute(const int n_threads, - ggml_tensor* z, - bool decode_graph, 
- ggml_tensor** output, - ggml_context* output_ctx = nullptr) override { + sd::Tensor _compute(const int n_threads, + const sd::Tensor& z, + bool decode_graph) override { if (true) { + sd::Tensor input; + if (z.dim() == 4) { + input = z.unsqueeze(2); + } auto get_graph = [&]() -> ggml_cgraph* { - return build_graph(z, decode_graph); + if (input.empty()) { + return build_graph(z, decode_graph); + } else { + return build_graph(input, decode_graph); + } }; - return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); + auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, true), + input.empty() ? z.dim() : input.dim()); + if (!result.empty() && z.dim() == 4) { + result.squeeze_(2); + } + return result; } else { // chunk 1 result is weird ae.clear_cache(); - int64_t t = z->ne[2]; + int64_t t = z.shape()[2]; int i = 0; auto get_graph = [&]() -> ggml_cgraph* { return build_graph_partial(z, decode_graph, i); }; - ggml_tensor* out = nullptr; - bool res = GGMLRunner::compute(get_graph, n_threads, true, &out, output_ctx); + auto out_opt = GGMLRunner::compute(get_graph, n_threads, true); + if (!out_opt.has_value()) { + return {}; + } + sd::Tensor out = std::move(*out_opt); ae.clear_cache(); if (t == 1) { - *output = out; - return res; + return out; } - *output = ggml_new_tensor_4d(output_ctx, GGML_TYPE_F32, out->ne[0], out->ne[1], (t - 1) * 4 + 1, out->ne[3]); - - auto copy_to_output = [&]() { - for (int64_t i3 = 0; i3 < out->ne[3]; i3++) { - for (int64_t i2 = 0; i2 < out->ne[2]; i2++) { - for (int64_t i1 = 0; i1 < out->ne[1]; i1++) { - for (int64_t i0 = 0; i0 < out->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(out, i0, i1, i2, i3); - int64_t offset = (i == 0) ? 
0 : (1 + (i - 1) * 4); - ggml_ext_tensor_set_f32(*output, value, i0, i1, offset + i2, i3); - } - } - } - } - }; - - copy_to_output(); - - out = ggml_new_tensor_4d(output_ctx, GGML_TYPE_F32, out->ne[0], out->ne[1], 4, out->ne[3]); + sd::Tensor output = std::move(out); for (i = 1; i < t; i++) { - res = res || GGMLRunner::compute(get_graph, n_threads, true, &out); + auto chunk_opt = GGMLRunner::compute(get_graph, n_threads, true); + if (!chunk_opt.has_value()) { + return {}; + } + out = std::move(*chunk_opt); ae.clear_cache(); - copy_to_output(); + output = sd::ops::concat(output, out, 2); } free_cache_ctx_and_buffer(); - return res; + return output; } } @@ -1330,25 +1290,25 @@ namespace WAN { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); if (true) { // cpu f32, pass // cpu f16, pass // cuda f16, pass // cuda f32, pass - auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 104, 60, 2, 16); - ggml_set_f32(z, 0.5f); - z = load_tensor_from_file(work_ctx, "wan_vae_z.bin"); - print_ggml_tensor(z); - ggml_tensor* out = nullptr; + auto z = sd::load_tensor_from_file_as_tensor("wan_vae_z.bin"); + print_sd_tensor(z); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - _compute(8, z, true, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = _compute(8, z, true); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("decode test done in %ldms", t1 - t0); } }; @@ -2229,23 +2189,23 @@ namespace WAN { wan.get_param_tensors(tensors, prefix); } - ggml_cgraph* build_graph(ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* clip_fea = nullptr, - ggml_tensor* c_concat = nullptr, - ggml_tensor* time_dim_concat = nullptr, - ggml_tensor* vace_context = nullptr, - 
float vace_strength = 1.f) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor = {}, + const sd::Tensor& clip_fea_tensor = {}, + const sd::Tensor& c_concat_tensor = {}, + const sd::Tensor& time_dim_concat_tensor = {}, + const sd::Tensor& vace_context_tensor = {}, + float vace_strength = 1.f) { ggml_cgraph* gf = new_graph_custom(WAN_GRAPH_SIZE); - x = to_backend(x); - timesteps = to_backend(timesteps); - context = to_backend(context); - clip_fea = to_backend(clip_fea); - c_concat = to_backend(c_concat); - time_dim_concat = to_backend(time_dim_concat); - vace_context = to_backend(vace_context); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); + ggml_tensor* context = make_optional_input(context_tensor); + ggml_tensor* clip_fea = make_optional_input(clip_fea_tensor); + ggml_tensor* c_concat = make_optional_input(c_concat_tensor); + ggml_tensor* time_dim_concat = make_optional_input(time_dim_concat_tensor); + ggml_tensor* vace_context = make_optional_input(vace_context_tensor); pe_vec = Rope::gen_wan_pe(static_cast(x->ne[2]), static_cast(x->ne[1]), @@ -2285,22 +2245,20 @@ namespace WAN { return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* clip_fea = nullptr, - ggml_tensor* c_concat = nullptr, - ggml_tensor* time_dim_concat = nullptr, - ggml_tensor* vace_context = nullptr, - float vace_strength = 1.f, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) { + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context = {}, + const sd::Tensor& clip_fea = {}, + const sd::Tensor& c_concat = {}, + const sd::Tensor& time_dim_concat = {}, + const sd::Tensor& vace_context = {}, + float vace_strength = 1.f) { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, clip_fea, 
c_concat, time_dim_concat, vace_context, vace_strength); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); } void test() { @@ -2309,36 +2267,38 @@ namespace WAN { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); { // cpu f16: pass // cuda f16: pass // cpu q8_0: pass - // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 104, 60, 1, 16); + // auto x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 104, 60, 1, 16); // ggml_set_f32(x, 0.01f); - auto x = load_tensor_from_file(work_ctx, "wan_dit_x.bin"); - print_ggml_tensor(x); + auto x = sd::load_tensor_from_file_as_tensor("wan_dit_x.bin"); + print_sd_tensor(x); std::vector timesteps_vec(3, 1000.f); timesteps_vec[0] = 0.f; - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); + auto timesteps = sd::Tensor::from_vector(timesteps_vec); - // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 4096, 512, 1); + // auto context = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 4096, 512, 1); // ggml_set_f32(context, 0.01f); - auto context = load_tensor_from_file(work_ctx, "wan_dit_context.bin"); - print_ggml_tensor(context); - // auto clip_fea = load_tensor_from_file(work_ctx, "wan_dit_clip_fea.bin"); + auto context = sd::load_tensor_from_file_as_tensor("wan_dit_context.bin"); + print_sd_tensor(context); + // auto clip_fea = load_tensor_from_file(ctx, "wan_dit_clip_fea.bin"); // print_ggml_tensor(clip_fea); - ggml_tensor* out = nullptr; + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - compute(8, x, timesteps, context, nullptr, nullptr, nullptr, nullptr, 1.f, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = compute(8, x, timesteps, context, {}, {}, {}, {}, 1.f); + int64_t t1 = 
ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("wan test done in %lldms", t1 - t0); } } diff --git a/src/z_image.hpp b/src/z_image.hpp index 53a7cf8..363ce5f 100644 --- a/src/z_image.hpp +++ b/src/z_image.hpp @@ -481,20 +481,21 @@ namespace ZImage { z_image.get_param_tensors(tensors, prefix); } - ggml_cgraph* build_graph(ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - std::vector ref_latents = {}, - bool increase_ref_index = false) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor, + const std::vector>& ref_latents_tensor = {}, + bool increase_ref_index = false) { + ggml_cgraph* gf = new_graph_custom(Z_IMAGE_GRAPH_SIZE); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); GGML_ASSERT(x->ne[3] == 1); - ggml_cgraph* gf = new_graph_custom(Z_IMAGE_GRAPH_SIZE); - - x = to_backend(x); - context = to_backend(context); - timesteps = to_backend(timesteps); - - for (int i = 0; i < ref_latents.size(); i++) { - ref_latents[i] = to_backend(ref_latents[i]); + GGML_ASSERT(!context_tensor.empty()); + ggml_tensor* context = make_input(context_tensor); + std::vector ref_latents; + ref_latents.reserve(ref_latents_tensor.size()); + for (const auto& ref_latent_tensor : ref_latents_tensor) { + ref_latents.push_back(make_input(ref_latent_tensor)); } pe_vec = Rope::gen_z_image_pe(static_cast(x->ne[1]), @@ -530,14 +531,12 @@ namespace ZImage { return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - std::vector ref_latents = {}, - bool increase_ref_index = false, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) { + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context, + const std::vector>& ref_latents = {}, + bool 
increase_ref_index = false) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size] @@ -545,7 +544,7 @@ namespace ZImage { return build_graph(x, timesteps, context, ref_latents, increase_ref_index); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); } void test() { @@ -554,30 +553,37 @@ namespace ZImage { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); { - // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1); + // auto x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 16, 1); // ggml_set_f32(x, 0.01f); - auto x = load_tensor_from_file(work_ctx, "./z_image_x.bin"); - print_ggml_tensor(x); + auto x = sd::load_tensor_from_file_as_tensor("./z_image_x.bin"); + print_sd_tensor(x); std::vector timesteps_vec(1, 0.f); - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); + auto timesteps = sd::Tensor::from_vector(timesteps_vec); - // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 2560, 256, 1); + // auto context = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 2560, 256, 1); // ggml_set_f32(context, 0.01f); - auto context = load_tensor_from_file(work_ctx, "./z_image_context.bin"); - print_ggml_tensor(context); + auto context = sd::load_tensor_from_file_as_tensor("./z_image_context.bin"); + print_sd_tensor(context); - ggml_tensor* out = nullptr; + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - compute(8, x, timesteps, context, {}, false, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = compute(8, + x, + timesteps, + context, + {}, + false); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + 
print_sd_tensor(out); LOG_DEBUG("z_image test done in %lldms", t1 - t0); } }