From 9b424db0f4ed90197c0189d6de0f7fcd2e6eb499 Mon Sep 17 00:00:00 2001 From: WinkelCode <29005772+WinkelCode@users.noreply.github.com> Date: Sat, 7 Mar 2026 17:23:23 +0100 Subject: [PATCH 01/20] ci: change workflow owner of "actions-commit-hash" from "pr-mpt" to "prompt" (#1323) --- .github/workflows/build.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 666887d9..b2da00dc 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -70,7 +70,7 @@ jobs: - name: Get commit hash id: commit if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: pr-mpt/actions-commit-hash@v2 + uses: prompt/actions-commit-hash@v2 - name: Fetch system info id: system-info @@ -123,7 +123,7 @@ jobs: - name: Get commit hash id: commit if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: pr-mpt/actions-commit-hash@v2 + uses: prompt/actions-commit-hash@v2 - name: Fetch system info id: system-info @@ -177,7 +177,7 @@ jobs: - name: Get commit hash id: commit if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: pr-mpt/actions-commit-hash@v2 + uses: prompt/actions-commit-hash@v2 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -240,7 +240,7 @@ jobs: - name: Get commit hash id: commit if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: pr-mpt/actions-commit-hash@v2 + uses: prompt/actions-commit-hash@v2 - name: Fetch system info id: system-info @@ -340,7 +340,7 @@ jobs: - name: Get commit hash id: commit if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: 
pr-mpt/actions-commit-hash@v2 + uses: prompt/actions-commit-hash@v2 - name: Pack artifacts id: pack_artifacts @@ -463,7 +463,7 @@ jobs: - name: Get commit hash id: commit if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: pr-mpt/actions-commit-hash@v2 + uses: prompt/actions-commit-hash@v2 - name: Pack artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} @@ -581,7 +581,7 @@ jobs: - name: Get commit hash id: commit if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: pr-mpt/actions-commit-hash@v2 + uses: prompt/actions-commit-hash@v2 - name: Prepare artifacts id: prepare_artifacts @@ -660,7 +660,7 @@ jobs: - name: Get commit hash id: commit - uses: pr-mpt/actions-commit-hash@v2 + uses: prompt/actions-commit-hash@v2 - name: Create release id: create_release From 3d33caaef8b1f1f0460abca4f8133f3bee56d73b Mon Sep 17 00:00:00 2001 From: stduhpf Date: Sat, 7 Mar 2026 17:25:07 +0100 Subject: [PATCH 02/20] fix: make tiling work better when using circular (#1299) --- src/ggml_extend.hpp | 61 ++++++++++++++++++++++++++++---------- src/stable-diffusion.cpp | 64 ++++++++++++++++++++++++++++++++-------- src/upscaler.cpp | 3 +- 3 files changed, 99 insertions(+), 29 deletions(-) diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 131d66fb..954aee2b 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -491,12 +491,16 @@ __STATIC_INLINE__ void ggml_ext_tensor_split_2d(struct ggml_tensor* input, int64_t height = output->ne[1]; int64_t channels = output->ne[2]; int64_t ne3 = output->ne[3]; + + int64_t input_width = input->ne[0]; + int64_t input_height = input->ne[1]; + GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32); for (int iy = 0; iy < height; iy++) { for (int ix = 0; ix < width; ix++) { 
for (int k = 0; k < channels; k++) { for (int l = 0; l < ne3; l++) { - float value = ggml_ext_tensor_get_f32(input, ix + x, iy + y, k, l); + float value = ggml_ext_tensor_get_f32(input, (ix + x) % input_width, (iy + y) % input_height, k, l); ggml_ext_tensor_set_f32(output, value, ix, iy, k, l); } } @@ -516,6 +520,8 @@ __STATIC_INLINE__ void ggml_ext_tensor_merge_2d(struct ggml_tensor* input, int y, int overlap_x, int overlap_y, + bool circular_x, + bool circular_y, int x_skip = 0, int y_skip = 0) { int64_t width = input->ne[0]; @@ -533,12 +539,12 @@ __STATIC_INLINE__ void ggml_ext_tensor_merge_2d(struct ggml_tensor* input, for (int l = 0; l < ne3; l++) { float new_value = ggml_ext_tensor_get_f32(input, ix, iy, k, l); if (overlap_x > 0 || overlap_y > 0) { // blend colors in overlapped area - float old_value = ggml_ext_tensor_get_f32(output, x + ix, y + iy, k, l); + float old_value = ggml_ext_tensor_get_f32(output, (x + ix) % img_width, (y + iy) % img_height, k, l); - const float x_f_0 = (overlap_x > 0 && x > 0) ? (ix - x_skip) / float(overlap_x) : 1; - const float x_f_1 = (overlap_x > 0 && x < (img_width - width)) ? (width - ix) / float(overlap_x) : 1; - const float y_f_0 = (overlap_y > 0 && y > 0) ? (iy - y_skip) / float(overlap_y) : 1; - const float y_f_1 = (overlap_y > 0 && y < (img_height - height)) ? (height - iy) / float(overlap_y) : 1; + const float x_f_0 = (circular_x || (overlap_x > 0 && x > 0)) ? (ix - x_skip) / float(overlap_x) : 1; + const float x_f_1 = (circular_x || (overlap_x > 0 && x < (img_width - width))) ? (width - ix) / float(overlap_x) : 1; + const float y_f_0 = (circular_y || (overlap_y > 0 && y > 0)) ? (iy - y_skip) / float(overlap_y) : 1; + const float y_f_1 = (circular_y || (overlap_y > 0 && y < (img_height - height))) ? 
(height - iy) / float(overlap_y) : 1; const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f); const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f); @@ -546,9 +552,9 @@ __STATIC_INLINE__ void ggml_ext_tensor_merge_2d(struct ggml_tensor* input, ggml_ext_tensor_set_f32( output, old_value + new_value * smootherstep_f32(y_f) * smootherstep_f32(x_f), - x + ix, y + iy, k, l); + (x + ix) % img_width, (y + iy) % img_height, k, l); } else { - ggml_ext_tensor_set_f32(output, new_value, x + ix, y + iy, k, l); + ggml_ext_tensor_set_f32(output, new_value, (x + ix) % img_width, (y + iy) % img_height, k, l); } } } @@ -773,10 +779,31 @@ __STATIC_INLINE__ void sd_tiling_calc_tiles(int& num_tiles_dim, float& tile_overlap_factor_dim, int small_dim, int tile_size, - const float tile_overlap_factor) { + const float tile_overlap_factor, + bool circular) { int tile_overlap = static_cast(tile_size * tile_overlap_factor); int non_tile_overlap = tile_size - tile_overlap; + if (circular) { + // circular means the last and first tile are overlapping (wrapping around) + num_tiles_dim = small_dim / non_tile_overlap; + + if (num_tiles_dim < 1) { + num_tiles_dim = 1; + } + + tile_overlap_factor_dim = (tile_size - small_dim / num_tiles_dim) / (float)tile_size; + + // if single tile and tile_overlap_factor is not 0, add one to ensure we have at least two overlapping tiles + if (num_tiles_dim == 1 && tile_overlap_factor_dim > 0) { + num_tiles_dim++; + tile_overlap_factor_dim = 0.5; + } + + return; + } + // else, non-circular means the last and first tile are not overlapping + num_tiles_dim = (small_dim - tile_overlap) / non_tile_overlap; int overshoot_dim = ((num_tiles_dim + 1) * non_tile_overlap + tile_overlap) % small_dim; @@ -805,6 +832,8 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, const int p_tile_size_x, const int p_tile_size_y, const float tile_overlap_factor, + const bool circular_x, + const bool circular_y, on_tile_process on_processing) { output = 
ggml_set_f32(output, 0); @@ -829,11 +858,11 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, int num_tiles_x; float tile_overlap_factor_x; - sd_tiling_calc_tiles(num_tiles_x, tile_overlap_factor_x, small_width, p_tile_size_x, tile_overlap_factor); + sd_tiling_calc_tiles(num_tiles_x, tile_overlap_factor_x, small_width, p_tile_size_x, tile_overlap_factor, circular_x); int num_tiles_y; float tile_overlap_factor_y; - sd_tiling_calc_tiles(num_tiles_y, tile_overlap_factor_y, small_height, p_tile_size_y, tile_overlap_factor); + sd_tiling_calc_tiles(num_tiles_y, tile_overlap_factor_y, small_height, p_tile_size_y, tile_overlap_factor, circular_y); LOG_DEBUG("num tiles : %d, %d ", num_tiles_x, num_tiles_y); LOG_DEBUG("optimal overlap : %f, %f (targeting %f)", tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor); @@ -887,7 +916,7 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, float last_time = 0.0f; for (int y = 0; y < small_height && !last_y; y += non_tile_overlap_y) { int dy = 0; - if (y + tile_size_y >= small_height) { + if (!circular_y && y + tile_size_y >= small_height) { int _y = y; y = small_height - tile_size_y; dy = _y - y; @@ -898,7 +927,7 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, } for (int x = 0; x < small_width && !last_x; x += non_tile_overlap_x) { int dx = 0; - if (x + tile_size_x >= small_width) { + if (!circular_x && x + tile_size_x >= small_width) { int _x = x; x = small_width - tile_size_x; dx = _x - x; @@ -919,7 +948,7 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, int64_t t1 = ggml_time_ms(); ggml_ext_tensor_split_2d(input, input_tile, x_in, y_in); if (on_processing(input_tile, output_tile, false)) { - ggml_ext_tensor_merge_2d(output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, dx, dy); + ggml_ext_tensor_merge_2d(output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, circular_x, circular_y, dx, dy); int64_t t2 = ggml_time_ms(); last_time 
= (t2 - t1) / 1000.0f; @@ -942,8 +971,10 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, const int scale, const int tile_size, const float tile_overlap_factor, + const bool circular_x, + const bool circular_y, on_tile_process on_processing) { - sd_tiling_non_square(input, output, scale, tile_size, tile_size, tile_overlap_factor, on_processing); + sd_tiling_non_square(input, output, scale, tile_size, tile_size, tile_overlap_factor, circular_x, circular_y, on_processing); } __STATIC_INLINE__ struct ggml_tensor* ggml_ext_group_norm_32(struct ggml_context* ctx, diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 717fec18..d905011a 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -111,6 +111,9 @@ public: bool external_vae_is_invalid = false; bool free_params_immediately = false; + bool circular_x = false; + bool circular_y = false; + std::shared_ptr rng = std::make_shared(); std::shared_ptr sampler_rng = nullptr; int n_threads = -1; @@ -759,12 +762,8 @@ public: if (control_net) { control_net->set_circular_axes(sd_ctx_params->circular_x, sd_ctx_params->circular_y); } - if (first_stage_model) { - first_stage_model->set_circular_axes(sd_ctx_params->circular_x, sd_ctx_params->circular_y); - } - if (tae_first_stage) { - tae_first_stage->set_circular_axes(sd_ctx_params->circular_x, sd_ctx_params->circular_y); - } + circular_x = sd_ctx_params->circular_x; + circular_y = sd_ctx_params->circular_y; } struct ggml_init_params params; @@ -1479,7 +1478,7 @@ public: sd_progress_cb_t cb = sd_get_progress_callback(); void* cbd = sd_get_progress_callback_data(); sd_set_progress_callback((sd_progress_cb_t)suppress_pp, nullptr); - sd_tiling(input, output, scale, tile_size, tile_overlap_factor, on_processing); + sd_tiling(input, output, scale, tile_size, tile_overlap_factor, circular_x, circular_y, on_processing); sd_set_progress_callback(cb, cbd); } @@ -2573,7 +2572,7 @@ public: auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool 
init) { return first_stage_model->compute(n_threads, in, false, &out, work_ctx); }; - sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling); + sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, circular_x, circular_y, on_tiling); } else { first_stage_model->compute(n_threads, x, false, &result, work_ctx); } @@ -2584,7 +2583,7 @@ public: auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { return tae_first_stage->compute(n_threads, in, false, &out, nullptr); }; - sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling); + sd_tiling(x, result, vae_scale_factor, 64, 0.5f, circular_x, circular_y, on_tiling); } else { tae_first_stage->compute(n_threads, x, false, &result, work_ctx); } @@ -2703,7 +2702,7 @@ public: auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { return first_stage_model->compute(n_threads, in, true, &out, nullptr); }; - sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling); + sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, circular_x, circular_y, on_tiling); } else { if (!first_stage_model->compute(n_threads, x, true, &result, work_ctx)) { LOG_ERROR("Failed to decode latetnts"); @@ -2719,7 +2718,7 @@ public: auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { return tae_first_stage->compute(n_threads, in, true, &out); }; - sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling); + sd_tiling(x, result, vae_scale_factor, 64, 0.5f, circular_x, circular_y, on_tiling); } else { if (!tae_first_stage->compute(n_threads, x, true, &result)) { LOG_ERROR("Failed to decode latetnts"); @@ -3522,8 +3521,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) { sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params; - int width = 
sd_img_gen_params->width; - int height = sd_img_gen_params->height; + + int width = sd_img_gen_params->width; + int height = sd_img_gen_params->height; int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor(); int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor(); @@ -3537,6 +3537,40 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_img_gen_params->width, sd_img_gen_params->height, width, height, spatial_multiple); } + bool circular_x = sd_ctx->sd->circular_x; + bool circular_y = sd_ctx->sd->circular_y; + + if (!sd_img_gen_params->vae_tiling_params.enabled) { + if (sd_ctx->sd->first_stage_model) { + sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + } + if (sd_ctx->sd->tae_first_stage) { + sd_ctx->sd->tae_first_stage->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + } + } else { + int tile_size_x, tile_size_y; + float _overlap; + int latent_size_x = width / sd_ctx->sd->get_vae_scale_factor(); + int latent_size_y = height / sd_ctx->sd->get_vae_scale_factor(); + sd_ctx->sd->get_tile_sizes(tile_size_x, tile_size_y, _overlap, sd_img_gen_params->vae_tiling_params, latent_size_x, latent_size_y); + + // force disable circular padding for vae if tiling is enabled unless latent is smaller than tile size + // otherwise it will cause artifacts at the edges of the tiles + sd_ctx->sd->circular_x = sd_ctx->sd->circular_x && (tile_size_x >= latent_size_x); + sd_ctx->sd->circular_y = sd_ctx->sd->circular_y && (tile_size_y >= latent_size_y); + + if (sd_ctx->sd->first_stage_model) { + sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + } + if (sd_ctx->sd->tae_first_stage) { + sd_ctx->sd->tae_first_stage->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + } + + // disable circular tiling if it's enabled for the VAE + sd_ctx->sd->circular_x 
= circular_x && (tile_size_x < latent_size_x); + sd_ctx->sd->circular_y = circular_y && (tile_size_y < latent_size_y); + } + LOG_DEBUG("generate_image %dx%d", width, height); if (sd_ctx == nullptr || sd_img_gen_params == nullptr) { return nullptr; @@ -3806,6 +3840,10 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g denoise_mask, &sd_img_gen_params->cache); + // restore circular params + sd_ctx->sd->circular_x = circular_x; + sd_ctx->sd->circular_y = circular_y; + size_t t2 = ggml_time_ms(); LOG_INFO("generate_image completed in %.2fs", (t2 - t0) * 1.0f / 1000); diff --git a/src/upscaler.cpp b/src/upscaler.cpp index fd0dc824..41825ee5 100644 --- a/src/upscaler.cpp +++ b/src/upscaler.cpp @@ -92,7 +92,8 @@ struct UpscalerGGML { return esrgan_upscaler->compute(n_threads, in, &out); }; int64_t t0 = ggml_time_ms(); - sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, on_tiling); + // TODO: circular upscaling? + sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, false, false, on_tiling); esrgan_upscaler->free_compute_buffer(); ggml_ext_tensor_clamp_inplace(upscaled, 0.f, 1.f); uint8_t* upscaled_data = ggml_tensor_to_sd_image(upscaled); From c8fb3d245858d495be1f140efdcfaa0d49de41e5 Mon Sep 17 00:00:00 2001 From: leejet Date: Sun, 8 Mar 2026 00:28:05 +0800 Subject: [PATCH 03/20] fix: resolve SD1 Pix2Pix issue (#1329) --- src/stable-diffusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index d905011a..2a770eca 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -2645,7 +2645,7 @@ public: } else { latent = gaussian_latent_sample(work_ctx, vae_output); } - if (!use_tiny_autoencoder) { + if (!use_tiny_autoencoder && version != VERSION_SD1_PIX2PIX) { process_latent_in(latent); } if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { 
From dea4980f4e27dd2caeaf817c56ecd4645afd30da Mon Sep 17 00:00:00 2001 From: rmatif Date: Mon, 9 Mar 2026 17:35:32 +0100 Subject: [PATCH 04/20] feat: add spectrum caching method (#1322) --- docs/caching.md | 23 +++++ examples/cli/README.md | 8 +- examples/common/common.hpp | 31 +++++- include/stable-diffusion.h | 8 ++ src/spectrum.hpp | 195 +++++++++++++++++++++++++++++++++++++ src/stable-diffusion.cpp | 67 ++++++++++++- 6 files changed, 325 insertions(+), 7 deletions(-) create mode 100644 src/spectrum.hpp diff --git a/docs/caching.md b/docs/caching.md index 7b4be3ce..559b26a9 100644 --- a/docs/caching.md +++ b/docs/caching.md @@ -11,6 +11,7 @@ Caching methods accelerate diffusion inference by reusing intermediate computati | `dbcache` | DiT models | Block-level L1 residual threshold | | `taylorseer` | DiT models | Taylor series approximation | | `cache-dit` | DiT models | Combined DBCache + TaylorSeer | +| `spectrum` | UNET models | Chebyshev + Taylor output forecasting | ### UCache (UNET Models) @@ -118,6 +119,28 @@ Mask values: `1` = compute, `0` = can cache. --scm-policy dynamic ``` +### Spectrum (UNET Models) + +Spectrum uses Chebyshev polynomial fitting blended with Taylor extrapolation to predict denoised outputs, skipping entire UNet forward passes. Based on the paper [Spectrum: Adaptive Spectral Feature Forecasting for Efficient Diffusion Sampling](https://github.com/tingyu215/Spectrum). 
+ +```bash +sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum +``` + +#### Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `w` | Chebyshev vs Taylor blend weight (0=Taylor, 1=Chebyshev) | 0.40 | +| `m` | Chebyshev polynomial degree | 3 | +| `lam` | Ridge regression regularization | 1.0 | +| `window` | Initial window size (compute every N steps) | 2 | +| `flex` | Window growth per computed step after warmup | 0.50 | +| `warmup` | Steps to always compute before caching starts | 4 | +| `stop` | Stop caching at this fraction of total steps | 0.9 | + + + ### Performance Tips - Start with default thresholds and adjust based on output quality diff --git a/examples/cli/README.md b/examples/cli/README.md index 564e5ce0..6c2ef1eb 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -138,10 +138,12 @@ Generation Options: --skip-layers layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) -r, --ref-image reference image for Flux Kontext models (can be used multiple times) - --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level) + --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), + 'spectrum' (UNET Chebyshev+Taylor forecasting) --cache-option named cache params (key=value format, comma-separated). easycache/ucache: - threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples: - "threshold=0.25" or "threshold=1.5,reset=0" + threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; + spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. 
Examples: + "threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2" --cache-preset cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u' --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache --scm-policy SCM policy: 'dynamic' (default) or 'static' diff --git a/examples/common/common.hpp b/examples/common/common.hpp index 369c1f07..9c50c159 100644 --- a/examples/common/common.hpp +++ b/examples/common/common.hpp @@ -1422,8 +1422,8 @@ struct SDGenerationParams { } cache_mode = argv_to_utf8(index, argv); if (cache_mode != "easycache" && cache_mode != "ucache" && - cache_mode != "dbcache" && cache_mode != "taylorseer" && cache_mode != "cache-dit") { - fprintf(stderr, "error: invalid cache mode '%s', must be 'easycache', 'ucache', 'dbcache', 'taylorseer', or 'cache-dit'\n", cache_mode.c_str()); + cache_mode != "dbcache" && cache_mode != "taylorseer" && cache_mode != "cache-dit" && cache_mode != "spectrum") { + fprintf(stderr, "error: invalid cache mode '%s', must be 'easycache', 'ucache', 'dbcache', 'taylorseer', 'cache-dit', or 'spectrum'\n", cache_mode.c_str()); return -1; } return 1; @@ -1779,7 +1779,23 @@ struct SDGenerationParams { } else if (key == "Bn" || key == "bn") { cache_params.Bn_compute_blocks = std::stoi(val); } else if (key == "warmup") { - cache_params.max_warmup_steps = std::stoi(val); + if (cache_mode == "spectrum") { + cache_params.spectrum_warmup_steps = std::stoi(val); + } else { + cache_params.max_warmup_steps = std::stoi(val); + } + } else if (key == "w") { + cache_params.spectrum_w = std::stof(val); + } else if (key == "m") { + cache_params.spectrum_m = std::stoi(val); + } else if (key == "lam") { + cache_params.spectrum_lam = std::stof(val); + } else if (key == "window") { + cache_params.spectrum_window_size = std::stoi(val); + } else if (key == "flex") { + cache_params.spectrum_flex_window = std::stof(val); + } else if (key == "stop") { + 
cache_params.spectrum_stop_percent = std::stof(val); } else { LOG_ERROR("error: unknown cache parameter '%s'", key.c_str()); return false; @@ -1827,6 +1843,15 @@ struct SDGenerationParams { cache_params.Bn_compute_blocks = 0; cache_params.residual_diff_threshold = 0.08f; cache_params.max_warmup_steps = 8; + } else if (cache_mode == "spectrum") { + cache_params.mode = SD_CACHE_SPECTRUM; + cache_params.spectrum_w = 0.40f; + cache_params.spectrum_m = 3; + cache_params.spectrum_lam = 1.0f; + cache_params.spectrum_window_size = 2; + cache_params.spectrum_flex_window = 0.50f; + cache_params.spectrum_warmup_steps = 4; + cache_params.spectrum_stop_percent = 0.9f; } if (!cache_option.empty()) { diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 51b2b329..029c2ab1 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -251,6 +251,7 @@ enum sd_cache_mode_t { SD_CACHE_DBCACHE, SD_CACHE_TAYLORSEER, SD_CACHE_CACHE_DIT, + SD_CACHE_SPECTRUM, }; typedef struct { @@ -271,6 +272,13 @@ typedef struct { int taylorseer_skip_interval; const char* scm_mask; bool scm_policy_dynamic; + float spectrum_w; + int spectrum_m; + float spectrum_lam; + int spectrum_window_size; + float spectrum_flex_window; + int spectrum_warmup_steps; + float spectrum_stop_percent; } sd_cache_params_t; typedef struct { diff --git a/src/spectrum.hpp b/src/spectrum.hpp new file mode 100644 index 00000000..0b206c18 --- /dev/null +++ b/src/spectrum.hpp @@ -0,0 +1,195 @@ +#ifndef __SPECTRUM_HPP__ +#define __SPECTRUM_HPP__ + +#include +#include +#include + +#include "ggml_extend.hpp" + +struct SpectrumConfig { + float w = 0.40f; + int m = 3; + float lam = 1.0f; + int window_size = 2; + float flex_window = 0.50f; + int warmup_steps = 4; + float stop_percent = 0.9f; +}; + +struct SpectrumState { + SpectrumConfig config; + int cnt = 0; + int num_cached = 0; + float curr_ws = 2.0f; + int K = 6; + int stop_step = 0; + int total_steps_skipped = 0; + + std::vector> H_buf; + 
std::vector T_buf; + + void init(const SpectrumConfig& cfg, size_t total_steps) { + config = cfg; + cnt = 0; + num_cached = 0; + curr_ws = (float)cfg.window_size; + K = std::max(cfg.m + 1, 6); + stop_step = (int)(cfg.stop_percent * (float)total_steps); + total_steps_skipped = 0; + H_buf.clear(); + T_buf.clear(); + } + + float taus(int step_cnt) const { + return (step_cnt / 50.0f) * 2.0f - 1.0f; + } + + bool should_predict() { + if (cnt < config.warmup_steps) + return false; + if (stop_step > 0 && cnt >= stop_step) + return false; + if ((int)H_buf.size() < 2) + return false; + + int ws = std::max(1, (int)std::floor(curr_ws)); + return (num_cached + 1) % ws != 0; + } + + void update(const struct ggml_tensor* denoised) { + int64_t ne = ggml_nelements(denoised); + const float* data = (const float*)denoised->data; + + H_buf.emplace_back(data, data + ne); + T_buf.push_back(taus(cnt)); + + while ((int)H_buf.size() > K) { + H_buf.erase(H_buf.begin()); + T_buf.erase(T_buf.begin()); + } + + if (cnt >= config.warmup_steps) + curr_ws += config.flex_window; + + num_cached = 0; + cnt++; + } + + void predict(struct ggml_tensor* denoised) { + int64_t F = (int64_t)H_buf[0].size(); + int K_curr = (int)H_buf.size(); + int M1 = config.m + 1; + float tau_at = taus(cnt); + + // Design matrix X: K_curr x M1 (Chebyshev basis) + std::vector X(K_curr * M1); + for (int i = 0; i < K_curr; i++) { + X[i * M1] = 1.0f; + if (M1 > 1) + X[i * M1 + 1] = T_buf[i]; + for (int j = 2; j < M1; j++) + X[i * M1 + j] = 2.0f * T_buf[i] * X[i * M1 + j - 1] - X[i * M1 + j - 2]; + } + + // x_star: Chebyshev basis at current tau + std::vector x_star(M1); + x_star[0] = 1.0f; + if (M1 > 1) + x_star[1] = tau_at; + for (int j = 2; j < M1; j++) + x_star[j] = 2.0f * tau_at * x_star[j - 1] - x_star[j - 2]; + + // XtX = X^T X + lambda I + std::vector XtX(M1 * M1, 0.0f); + for (int i = 0; i < M1; i++) { + for (int j = 0; j < M1; j++) { + float sum = 0.0f; + for (int k = 0; k < K_curr; k++) + sum += X[k * M1 + i] * X[k * 
M1 + j]; + XtX[i * M1 + j] = sum + (i == j ? config.lam : 0.0f); + } + } + + // Cholesky decomposition + std::vector L(M1 * M1, 0.0f); + if (!cholesky_decompose(XtX.data(), L.data(), M1)) { + float trace = 0.0f; + for (int i = 0; i < M1; i++) + trace += XtX[i * M1 + i]; + for (int i = 0; i < M1; i++) + XtX[i * M1 + i] += 1e-4f * trace / M1; + cholesky_decompose(XtX.data(), L.data(), M1); + } + + // Solve XtX v = x_star + std::vector v(M1); + cholesky_solve(L.data(), x_star.data(), v.data(), M1); + + // Prediction weights per history entry + std::vector weights(K_curr, 0.0f); + for (int k = 0; k < K_curr; k++) + for (int j = 0; j < M1; j++) + weights[k] += X[k * M1 + j] * v[j]; + + // Blend Chebyshev and Taylor predictions + float* out = (float*)denoised->data; + float w_cheb = config.w; + float w_taylor = 1.0f - w_cheb; + const float* h_last = H_buf.back().data(); + const float* h_prev = H_buf[H_buf.size() - 2].data(); + + for (int64_t f = 0; f < F; f++) { + float pred_cheb = 0.0f; + for (int k = 0; k < K_curr; k++) + pred_cheb += weights[k] * H_buf[k][f]; + + float pred_taylor = h_last[f] + 0.5f * (h_last[f] - h_prev[f]); + + out[f] = w_taylor * pred_taylor + w_cheb * pred_cheb; + } + + num_cached++; + total_steps_skipped++; + cnt++; + } + +private: + static bool cholesky_decompose(const float* A, float* L, int n) { + std::memset(L, 0, n * n * sizeof(float)); + for (int i = 0; i < n; i++) { + for (int j = 0; j <= i; j++) { + float sum = 0.0f; + for (int k = 0; k < j; k++) + sum += L[i * n + k] * L[j * n + k]; + if (i == j) { + float diag = A[i * n + i] - sum; + if (diag <= 0.0f) + return false; + L[i * n + j] = std::sqrt(diag); + } else { + L[i * n + j] = (A[i * n + j] - sum) / L[j * n + j]; + } + } + } + return true; + } + + static void cholesky_solve(const float* L, const float* b, float* x, int n) { + std::vector y(n); + for (int i = 0; i < n; i++) { + float sum = 0.0f; + for (int j = 0; j < i; j++) + sum += L[i * n + j] * y[j]; + y[i] = (b[i] - sum) / L[i * n 
+ i]; + } + for (int i = n - 1; i >= 0; i--) { + float sum = 0.0f; + for (int j = i + 1; j < n; j++) + sum += L[j * n + i] * x[j]; + x[i] = (y[i] - sum) / L[i * n + i]; + } + } +}; + +#endif // __SPECTRUM_HPP__ diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 2a770eca..08dfd01c 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -16,6 +16,7 @@ #include "esrgan.hpp" #include "lora.hpp" #include "pmid.hpp" +#include "spectrum.hpp" #include "tae.hpp" #include "ucache.hpp" #include "vae.hpp" @@ -1687,9 +1688,11 @@ public: EasyCacheState easycache_state; UCacheState ucache_state; CacheDitConditionState cachedit_state; + SpectrumState spectrum_state; bool easycache_enabled = false; bool ucache_enabled = false; bool cachedit_enabled = false; + bool spectrum_enabled = false; if (cache_params != nullptr && cache_params->mode != SD_CACHE_DISABLED) { bool percent_valid = true; @@ -1793,6 +1796,27 @@ public: LOG_WARN("CacheDIT requested but could not be initialized for this run"); } } + } else if (cache_params->mode == SD_CACHE_SPECTRUM) { + bool spectrum_supported = sd_version_is_unet(version); + if (!spectrum_supported) { + LOG_WARN("Spectrum requested but not supported for this model type (only UNET models)"); + } else { + SpectrumConfig spectrum_config; + spectrum_config.w = cache_params->spectrum_w; + spectrum_config.m = cache_params->spectrum_m; + spectrum_config.lam = cache_params->spectrum_lam; + spectrum_config.window_size = cache_params->spectrum_window_size; + spectrum_config.flex_window = cache_params->spectrum_flex_window; + spectrum_config.warmup_steps = cache_params->spectrum_warmup_steps; + spectrum_config.stop_percent = cache_params->spectrum_stop_percent; + size_t total_steps = sigmas.size() > 0 ? 
sigmas.size() - 1 : 0; + spectrum_state.init(spectrum_config, total_steps); + spectrum_enabled = true; + LOG_INFO("Spectrum enabled - w: %.2f, m: %d, lam: %.2f, window: %d, flex: %.2f, warmup: %d, stop: %.0f%%", + spectrum_config.w, spectrum_config.m, spectrum_config.lam, + spectrum_config.window_size, spectrum_config.flex_window, + spectrum_config.warmup_steps, spectrum_config.stop_percent * 100.0f); + } } } @@ -2015,7 +2039,29 @@ public: timesteps_vec.assign(1, t); } - timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask); + timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask); + + if (spectrum_enabled && spectrum_state.should_predict()) { + spectrum_state.predict(denoised); + + if (denoise_mask != nullptr) { + apply_mask(denoised, init_latent, denoise_mask); + } + + if (sd_preview_cb != nullptr && sd_should_preview_denoised()) { + if (step % sd_get_preview_interval() == 0) { + preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, false); + } + } + + int64_t t1 = ggml_time_us(); + if (step > 0 || step == -(int)steps) { + int showstep = std::abs(step); + pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep); + } + return denoised; + } + auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); std::vector guidance_vec(1, guidance.distilled_guidance); auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); @@ -2189,6 +2235,10 @@ public: vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; } + if (spectrum_enabled) { + spectrum_state.update(denoised); + } + if (denoise_mask != nullptr) { apply_mask(denoised, init_latent, denoise_mask); } @@ -2280,6 +2330,14 @@ public: } } + if (spectrum_enabled && spectrum_state.total_steps_skipped > 0) { + size_t total_steps = sigmas.size() > 0 ? 
sigmas.size() - 1 : 0; + double speedup = static_cast(total_steps) / + static_cast(total_steps - spectrum_state.total_steps_skipped); + LOG_INFO("Spectrum skipped %d/%zu steps (%.2fx estimated speedup)", + spectrum_state.total_steps_skipped, total_steps, speedup); + } + if (inverse_noise_scaling) { x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); } @@ -2941,6 +2999,13 @@ void sd_cache_params_init(sd_cache_params_t* cache_params) { cache_params->taylorseer_skip_interval = 1; cache_params->scm_mask = nullptr; cache_params->scm_policy_dynamic = true; + cache_params->spectrum_w = 0.40f; + cache_params->spectrum_m = 3; + cache_params->spectrum_lam = 1.0f; + cache_params->spectrum_window_size = 2; + cache_params->spectrum_flex_window = 0.50f; + cache_params->spectrum_warmup_steps = 4; + cache_params->spectrum_stop_percent = 0.9f; } void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { From d6dd6d7b555c233bb9bc9f20b4751eb8c9269743 Mon Sep 17 00:00:00 2001 From: leejet Date: Tue, 10 Mar 2026 00:36:09 +0800 Subject: [PATCH 05/20] refactor: remove ununsed encode_video (#1332) --- src/stable-diffusion.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 08dfd01c..613ebb00 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -2584,14 +2584,14 @@ public: tile_size_y = get_tile_size(params.tile_size_y, params.rel_size_y, latent_y); } - ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) { + ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x) { int64_t t0 = ggml_time_ms(); ggml_tensor* result = nullptr; const int vae_scale_factor = get_vae_scale_factor(); int64_t W = x->ne[0] / vae_scale_factor; int64_t H = x->ne[1] / vae_scale_factor; int64_t C = get_latent_channel(); - if (vae_tiling_params.enabled && !encode_video) { + if (vae_tiling_params.enabled) { // TODO wan2.2 vae support? 
int64_t ne2; int64_t ne3; @@ -2619,7 +2619,7 @@ public: if (!use_tiny_autoencoder) { process_vae_input_tensor(x); - if (vae_tiling_params.enabled && !encode_video) { + if (vae_tiling_params.enabled) { float tile_overlap; int tile_size_x, tile_size_y; // multiply tile size for encode to keep the compute buffer size consistent @@ -2636,7 +2636,7 @@ public: } first_stage_model->free_compute_buffer(); } else { - if (vae_tiling_params.enabled && !encode_video) { + if (vae_tiling_params.enabled) { // split latent in 32x32 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { return tae_first_stage->compute(n_threads, in, false, &out, nullptr); @@ -2712,8 +2712,8 @@ public: return latent; } - ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) { - ggml_tensor* vae_output = vae_encode(work_ctx, x, encode_video); + ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { + ggml_tensor* vae_output = vae_encode(work_ctx, x); return get_first_stage_encoding(work_ctx, vae_output); } From 6fa7ca9317beca4786b191a636e6f849611b9e53 Mon Sep 17 00:00:00 2001 From: JusteLeo Date: Sun, 15 Mar 2026 09:40:14 +0100 Subject: [PATCH 06/20] docs: add Anima2 gguf download link to anima.md (#1335) --- docs/anima.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/anima.md b/docs/anima.md index 9c941785..debc370b 100644 --- a/docs/anima.md +++ b/docs/anima.md @@ -5,6 +5,7 @@ - Download Anima - safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/diffusion_models - gguf: https://huggingface.co/Bedovyy/Anima-GGUF/tree/main + - gguf Anima2: https://huggingface.co/JusteLeo/Anima2-GGUF/tree/main - Download vae - safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/vae - Download Qwen3-0.6B-Base @@ -17,4 +18,4 @@ .\bin\Release\sd-cli.exe --diffusion-model 
..\..\ComfyUI\models\diffusion_models\anima-preview.safetensors --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_06b_base.safetensors -p "a lovely cat holding a sign says 'anima.cpp'" --cfg-scale 6.0 --sampling-method euler -v --offload-to-cpu --diffusion-fa ``` -anima image example \ No newline at end of file +anima image example From adfef629009ac68e5cd9316ad40f63a2ab10b174 Mon Sep 17 00:00:00 2001 From: rmatif Date: Sun, 15 Mar 2026 09:41:05 +0100 Subject: [PATCH 07/20] feat: add generic DiT support to spectrum cache (#1336) --- examples/cli/README.md | 2 +- examples/common/common.hpp | 4 ++-- examples/server/README.md | 2 +- src/stable-diffusion.cpp | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/cli/README.md b/examples/cli/README.md index 6c2ef1eb..0450be92 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -139,7 +139,7 @@ Generation Options: --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) -r, --ref-image reference image for Flux Kontext models (can be used multiple times) --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), - 'spectrum' (UNET Chebyshev+Taylor forecasting) + 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting) --cache-option named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. 
Examples: diff --git a/examples/common/common.hpp b/examples/common/common.hpp index 9c50c159..896edc3b 100644 --- a/examples/common/common.hpp +++ b/examples/common/common.hpp @@ -1513,11 +1513,11 @@ struct SDGenerationParams { on_ref_image_arg}, {"", "--cache-mode", - "caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)", + "caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)", on_cache_mode_arg}, {"", "--cache-option", - "named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples: \"threshold=0.25\" or \"threshold=1.5,reset=0\"", + "named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples: \"threshold=0.25\" or \"threshold=1.5,reset=0\"", on_cache_option_arg}, {"", "--cache-preset", diff --git a/examples/server/README.md b/examples/server/README.md index 75544364..8ed3baa4 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -129,7 +129,7 @@ Default Generation Options: --skip-layers layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) -r, --ref-image reference image for Flux Kontext models (can be used multiple times) - --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level) + --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting) --cache-option named cache params (key=value format, comma-separated). 
easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples: "threshold=0.25" or "threshold=1.5,reset=0" diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 613ebb00..2c80c9e3 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -1797,9 +1797,9 @@ public: } } } else if (cache_params->mode == SD_CACHE_SPECTRUM) { - bool spectrum_supported = sd_version_is_unet(version); + bool spectrum_supported = sd_version_is_unet(version) || sd_version_is_dit(version); if (!spectrum_supported) { - LOG_WARN("Spectrum requested but not supported for this model type (only UNET models)"); + LOG_WARN("Spectrum requested but not supported for this model type (only UNET and DiT models)"); } else { SpectrumConfig spectrum_config; spectrum_config.w = cache_params->spectrum_w; From f6968bc58949bfd407a003e7bdac249c3b242cad Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sun, 15 Mar 2026 05:42:47 -0300 Subject: [PATCH 08/20] chore: remove SD_FAST_SOFTMAX build flag (#1338) --- CMakeLists.txt | 7 ------- 1 file changed, 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b90086ea..bad1ba4c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,7 +36,6 @@ option(SD_VULKAN "sd: vulkan backend" OFF) option(SD_OPENCL "sd: opencl backend" OFF) option(SD_SYCL "sd: sycl backend" OFF) option(SD_MUSA "sd: musa backend" OFF) -option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF) option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF) option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF) option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF) @@ -70,18 +69,12 @@ if (SD_HIPBLAS) message("-- Use HIPBLAS as backend stable-diffusion") set(GGML_HIP ON) add_definitions(-DSD_USE_CUDA) - if(SD_FAST_SOFTMAX) - set(GGML_CUDA_FAST_SOFTMAX ON) - endif() endif () if(SD_MUSA) 
message("-- Use MUSA as backend stable-diffusion") set(GGML_MUSA ON) add_definitions(-DSD_USE_CUDA) - if(SD_FAST_SOFTMAX) - set(GGML_CUDA_FAST_SOFTMAX ON) - endif() endif() set(SD_LIB stable-diffusion) From 630ee03f23bd9947f610dd9fe038c56c0ff9c2de Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sun, 15 Mar 2026 05:43:46 -0300 Subject: [PATCH 09/20] refactor: move all cache parameter defaults to the library (#1327) --- docs/caching.md | 10 +---- examples/cli/README.md | 1 - examples/common/common.hpp | 64 +++--------------------------- examples/server/README.md | 1 - src/cache_dit.hpp | 81 -------------------------------------- src/stable-diffusion.cpp | 21 ++++++++-- 6 files changed, 24 insertions(+), 154 deletions(-) diff --git a/docs/caching.md b/docs/caching.md index 559b26a9..cb103aee 100644 --- a/docs/caching.md +++ b/docs/caching.md @@ -80,7 +80,7 @@ Uses Taylor series approximation to predict block outputs: Combines DBCache and TaylorSeer: ```bash ---cache-mode cache-dit --cache-preset fast +--cache-mode cache-dit ``` #### Parameters @@ -92,14 +92,6 @@ Combines DBCache and TaylorSeer: | `threshold` | L1 residual difference threshold | 0.08 | | `warmup` | Steps before caching starts | 8 | -#### Presets - -Available presets: `slow`, `medium`, `fast`, `ultra` (or `s`, `m`, `f`, `u`). - -```bash ---cache-mode cache-dit --cache-preset fast -``` - #### SCM Options Steps Computation Mask controls which steps can be cached: diff --git a/examples/cli/README.md b/examples/cli/README.md index 0450be92..904f3c44 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -144,7 +144,6 @@ Generation Options: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. 
Examples: "threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2" - --cache-preset cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u' --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache --scm-policy SCM policy: 'dynamic' (default) or 'static' ``` diff --git a/examples/common/common.hpp b/examples/common/common.hpp index 896edc3b..9389b03a 100644 --- a/examples/common/common.hpp +++ b/examples/common/common.hpp @@ -1047,7 +1047,6 @@ struct SDGenerationParams { std::string cache_mode; std::string cache_option; - std::string cache_preset; std::string scm_mask; bool scm_policy_dynamic = true; sd_cache_params_t cache_params{}; @@ -1461,21 +1460,6 @@ struct SDGenerationParams { return 1; }; - auto on_cache_preset_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - cache_preset = argv_to_utf8(index, argv); - if (cache_preset != "slow" && cache_preset != "s" && cache_preset != "S" && - cache_preset != "medium" && cache_preset != "m" && cache_preset != "M" && - cache_preset != "fast" && cache_preset != "f" && cache_preset != "F" && - cache_preset != "ultra" && cache_preset != "u" && cache_preset != "U") { - fprintf(stderr, "error: invalid cache preset '%s', must be 'slow'/'s', 'medium'/'m', 'fast'/'f', or 'ultra'/'u'\n", cache_preset.c_str()); - return -1; - } - return 1; - }; - options.manual_options = { {"-s", "--seed", @@ -1519,10 +1503,6 @@ struct SDGenerationParams { "--cache-option", "named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. 
Examples: \"threshold=0.25\" or \"threshold=1.5,reset=0\"", on_cache_option_arg}, - {"", - "--cache-preset", - "cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'", - on_cache_preset_arg}, {"", "--scm-mask", "SCM steps mask for cache-dit: comma-separated 0/1 (e.g., \"1,1,1,0,0,1,0,0,1,0\") - 1=compute, 0=can cache", @@ -1575,7 +1555,6 @@ struct SDGenerationParams { load_if_exists("negative_prompt", negative_prompt); load_if_exists("cache_mode", cache_mode); load_if_exists("cache_option", cache_option); - load_if_exists("cache_preset", cache_preset); load_if_exists("scm_mask", scm_mask); load_if_exists("clip_skip", clip_skip); @@ -1810,48 +1789,17 @@ struct SDGenerationParams { if (!cache_mode.empty()) { if (cache_mode == "easycache") { - cache_params.mode = SD_CACHE_EASYCACHE; - cache_params.reuse_threshold = 0.2f; - cache_params.start_percent = 0.15f; - cache_params.end_percent = 0.95f; - cache_params.error_decay_rate = 1.0f; - cache_params.use_relative_threshold = true; - cache_params.reset_error_on_compute = true; + cache_params.mode = SD_CACHE_EASYCACHE; } else if (cache_mode == "ucache") { - cache_params.mode = SD_CACHE_UCACHE; - cache_params.reuse_threshold = 1.0f; - cache_params.start_percent = 0.15f; - cache_params.end_percent = 0.95f; - cache_params.error_decay_rate = 1.0f; - cache_params.use_relative_threshold = true; - cache_params.reset_error_on_compute = true; + cache_params.mode = SD_CACHE_UCACHE; } else if (cache_mode == "dbcache") { - cache_params.mode = SD_CACHE_DBCACHE; - cache_params.Fn_compute_blocks = 8; - cache_params.Bn_compute_blocks = 0; - cache_params.residual_diff_threshold = 0.08f; - cache_params.max_warmup_steps = 8; + cache_params.mode = SD_CACHE_DBCACHE; } else if (cache_mode == "taylorseer") { - cache_params.mode = SD_CACHE_TAYLORSEER; - cache_params.Fn_compute_blocks = 8; - cache_params.Bn_compute_blocks = 0; - cache_params.residual_diff_threshold = 0.08f; - cache_params.max_warmup_steps = 8; + cache_params.mode = 
SD_CACHE_TAYLORSEER; } else if (cache_mode == "cache-dit") { - cache_params.mode = SD_CACHE_CACHE_DIT; - cache_params.Fn_compute_blocks = 8; - cache_params.Bn_compute_blocks = 0; - cache_params.residual_diff_threshold = 0.08f; - cache_params.max_warmup_steps = 8; + cache_params.mode = SD_CACHE_CACHE_DIT; } else if (cache_mode == "spectrum") { - cache_params.mode = SD_CACHE_SPECTRUM; - cache_params.spectrum_w = 0.40f; - cache_params.spectrum_m = 3; - cache_params.spectrum_lam = 1.0f; - cache_params.spectrum_window_size = 2; - cache_params.spectrum_flex_window = 0.50f; - cache_params.spectrum_warmup_steps = 4; - cache_params.spectrum_stop_percent = 0.9f; + cache_params.mode = SD_CACHE_SPECTRUM; } if (!cache_option.empty()) { diff --git a/examples/server/README.md b/examples/server/README.md index 8ed3baa4..38deff61 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -133,7 +133,6 @@ Default Generation Options: --cache-option named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. 
Examples: "threshold=0.25" or "threshold=1.5,reset=0" - --cache-preset cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u' --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache --scm-policy SCM policy: 'dynamic' (default) or 'static' ``` diff --git a/src/cache_dit.hpp b/src/cache_dit.hpp index 6fe104da..4e3cf693 100644 --- a/src/cache_dit.hpp +++ b/src/cache_dit.hpp @@ -603,87 +603,6 @@ inline std::vector generate_scm_mask( return mask; } -inline std::vector get_scm_preset(const std::string& preset, int total_steps) { - struct Preset { - std::vector compute_bins; - std::vector cache_bins; - }; - - Preset slow = {{8, 3, 3, 2, 1, 1}, {1, 2, 2, 2, 3}}; - Preset medium = {{6, 2, 2, 2, 2, 1}, {1, 3, 3, 3, 3}}; - Preset fast = {{6, 1, 1, 1, 1, 1}, {1, 3, 4, 5, 4}}; - Preset ultra = {{4, 1, 1, 1, 1}, {2, 5, 6, 7}}; - - Preset* p = nullptr; - if (preset == "slow" || preset == "s" || preset == "S") - p = &slow; - else if (preset == "medium" || preset == "m" || preset == "M") - p = &medium; - else if (preset == "fast" || preset == "f" || preset == "F") - p = &fast; - else if (preset == "ultra" || preset == "u" || preset == "U") - p = &ultra; - else - return {}; - - if (total_steps != 28 && total_steps > 0) { - float scale = static_cast(total_steps) / 28.0f; - std::vector scaled_compute, scaled_cache; - - for (int v : p->compute_bins) { - scaled_compute.push_back(std::max(1, static_cast(v * scale + 0.5f))); - } - for (int v : p->cache_bins) { - scaled_cache.push_back(std::max(1, static_cast(v * scale + 0.5f))); - } - - return generate_scm_mask(scaled_compute, scaled_cache, total_steps); - } - - return generate_scm_mask(p->compute_bins, p->cache_bins, total_steps); -} - -inline float get_preset_threshold(const std::string& preset) { - if (preset == "slow" || preset == "s" || preset == "S") - return 0.20f; - if (preset == "medium" || preset == "m" || preset == "M") - return 0.25f; - if (preset == 
"fast" || preset == "f" || preset == "F") - return 0.30f; - if (preset == "ultra" || preset == "u" || preset == "U") - return 0.34f; - return 0.08f; -} - -inline int get_preset_warmup(const std::string& preset) { - if (preset == "slow" || preset == "s" || preset == "S") - return 8; - if (preset == "medium" || preset == "m" || preset == "M") - return 6; - if (preset == "fast" || preset == "f" || preset == "F") - return 6; - if (preset == "ultra" || preset == "u" || preset == "U") - return 4; - return 8; -} - -inline int get_preset_Fn(const std::string& preset) { - if (preset == "slow" || preset == "s" || preset == "S") - return 8; - if (preset == "medium" || preset == "m" || preset == "M") - return 8; - if (preset == "fast" || preset == "f" || preset == "F") - return 6; - if (preset == "ultra" || preset == "u" || preset == "U") - return 4; - return 8; -} - -inline int get_preset_Bn(const std::string& preset) { - (void)preset; - return 0; -} - inline void parse_dbcache_options(const std::string& opts, DBCacheConfig& cfg) { if (opts.empty()) return; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 2c80c9e3..d4b64ee8 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -98,6 +98,19 @@ void suppress_pp(int step, int steps, float time, void* data) { return; } +static float get_cache_reuse_threshold(const sd_cache_params_t& params) { + float reuse_threshold = params.reuse_threshold; + if (reuse_threshold == INFINITY) { + if (params.mode == SD_CACHE_EASYCACHE) { + reuse_threshold = 0.2; + } + else if (params.mode == SD_CACHE_UCACHE) { + reuse_threshold = 1.0; + } + } + return std::max(0.0f, reuse_threshold); +} + /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { @@ -1715,7 +1728,7 @@ public: } else { EasyCacheConfig easycache_config; easycache_config.enabled = true; - easycache_config.reuse_threshold = std::max(0.0f, 
cache_params->reuse_threshold); + easycache_config.reuse_threshold = get_cache_reuse_threshold(*cache_params); easycache_config.start_percent = cache_params->start_percent; easycache_config.end_percent = cache_params->end_percent; easycache_state.init(easycache_config, denoiser.get()); @@ -1736,7 +1749,7 @@ public: } else { UCacheConfig ucache_config; ucache_config.enabled = true; - ucache_config.reuse_threshold = std::max(0.0f, cache_params->reuse_threshold); + ucache_config.reuse_threshold = get_cache_reuse_threshold(*cache_params); ucache_config.start_percent = cache_params->start_percent; ucache_config.end_percent = cache_params->end_percent; ucache_config.error_decay_rate = std::max(0.0f, std::min(1.0f, cache_params->error_decay_rate)); @@ -2983,7 +2996,7 @@ enum lora_apply_mode_t str_to_lora_apply_mode(const char* str) { void sd_cache_params_init(sd_cache_params_t* cache_params) { *cache_params = {}; cache_params->mode = SD_CACHE_DISABLED; - cache_params->reuse_threshold = 1.0f; + cache_params->reuse_threshold = INFINITY; cache_params->start_percent = 0.15f; cache_params->end_percent = 0.95f; cache_params->error_decay_rate = 1.0f; @@ -3229,7 +3242,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { snprintf(buf + strlen(buf), 4096 - strlen(buf), "cache: %s (threshold=%.3f, start=%.2f, end=%.2f)\n", cache_mode_str, - sd_img_gen_params->cache.reuse_threshold, + get_cache_reuse_threshold(sd_img_gen_params->cache), sd_img_gen_params->cache.start_percent, sd_img_gen_params->cache.end_percent); free(sample_params_str); From 83eabd7c0123eeb8cf4b96588c059dfacc4883e6 Mon Sep 17 00:00:00 2001 From: Kevin Nause Date: Sun, 15 Mar 2026 04:46:01 -0400 Subject: [PATCH 10/20] ci: add CUDA Dockerfile (#1314) --- .github/workflows/build.yml | 2 +- Dockerfile.cuda | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 Dockerfile.cuda diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml 
index b2da00dc..9816e424 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -162,7 +162,7 @@ jobs: strategy: matrix: - variant: [musa, sycl, vulkan] + variant: [musa, sycl, vulkan, cuda] env: REGISTRY: ghcr.io diff --git a/Dockerfile.cuda b/Dockerfile.cuda new file mode 100644 index 00000000..13fef89a --- /dev/null +++ b/Dockerfile.cuda @@ -0,0 +1,25 @@ +ARG CUDA_VERSION=12.6.3 +ARG UBUNTU_VERSION=24.04 + +FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build + +RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake + +WORKDIR /sd.cpp + +COPY . . + +ARG CUDACXX=/usr/local/cuda/bin/nvcc +RUN cmake . -B ./build -DSD_CUDA=ON +RUN cmake --build ./build --config Release --parallel + +FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime + +RUN apt-get update && \ + apt-get install --yes --no-install-recommends libgomp1 && \ + apt-get clean + +COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli +COPY --from=build /sd.cpp/build/bin/sd-server /sd-server + +ENTRYPOINT [ "/sd-cli" ] From acc3bf1fdcb682c30b4ff9a80f19b891012d4ebc Mon Sep 17 00:00:00 2001 From: leejet Date: Sun, 15 Mar 2026 16:57:42 +0800 Subject: [PATCH 11/20] refactor: optimize the VAE architecture (#1345) --- src/auto_encoder_kl.hpp | 930 ++++++++++++++++++++++++++++++++++++++ src/ggml_extend.hpp | 35 +- src/model.cpp | 6 +- src/name_conversion.cpp | 6 +- src/stable-diffusion.cpp | 637 +++++---------------------- src/tae.hpp | 139 +++--- src/vae.hpp | 931 +++++++++------------------------------ src/wan.hpp | 112 ++++- 8 files changed, 1437 insertions(+), 1359 deletions(-) create mode 100644 src/auto_encoder_kl.hpp diff --git a/src/auto_encoder_kl.hpp b/src/auto_encoder_kl.hpp new file mode 100644 index 00000000..581bc59d --- /dev/null +++ b/src/auto_encoder_kl.hpp @@ -0,0 +1,930 @@ +#ifndef __AUTO_ENCODER_KL_HPP__ +#define __AUTO_ENCODER_KL_HPP__ + +#include "vae.hpp" + 
+/*================================================== AutoEncoderKL ===================================================*/ + +#define VAE_GRAPH_SIZE 20480 + +class ResnetBlock : public UnaryBlock { +protected: + int64_t in_channels; + int64_t out_channels; + +public: + ResnetBlock(int64_t in_channels, + int64_t out_channels) + : in_channels(in_channels), + out_channels(out_channels) { + // temb_channels is always 0 + blocks["norm1"] = std::shared_ptr(new GroupNorm32(in_channels)); + blocks["conv1"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + + blocks["norm2"] = std::shared_ptr(new GroupNorm32(out_channels)); + blocks["conv2"] = std::shared_ptr(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + + if (out_channels != in_channels) { + blocks["nin_shortcut"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {1, 1})); + } + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + // x: [N, in_channels, h, w] + // t_emb is always None + auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); + auto conv1 = std::dynamic_pointer_cast(blocks["conv1"]); + auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]); + auto conv2 = std::dynamic_pointer_cast(blocks["conv2"]); + + auto h = x; + h = norm1->forward(ctx, h); + h = ggml_silu_inplace(ctx->ggml_ctx, h); // swish + h = conv1->forward(ctx, h); + // return h; + + h = norm2->forward(ctx, h); + h = ggml_silu_inplace(ctx->ggml_ctx, h); // swish + // dropout, skip for inference + h = conv2->forward(ctx, h); + + // skip connection + if (out_channels != in_channels) { + auto nin_shortcut = std::dynamic_pointer_cast(blocks["nin_shortcut"]); + + x = nin_shortcut->forward(ctx, x); // [N, out_channels, h, w] + } + + h = ggml_add(ctx->ggml_ctx, h, x); + return h; // [N, out_channels, h, w] + } +}; + +class AttnBlock : public UnaryBlock { +protected: + int64_t in_channels; + bool use_linear; + + void init_params(struct ggml_context* ctx, 
const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") { + auto iter = tensor_storage_map.find(prefix + "proj_out.weight"); + if (iter != tensor_storage_map.end()) { + if (iter->second.n_dims == 4 && use_linear) { + use_linear = false; + blocks["q"] = std::make_shared(in_channels, in_channels, std::pair{1, 1}); + blocks["k"] = std::make_shared(in_channels, in_channels, std::pair{1, 1}); + blocks["v"] = std::make_shared(in_channels, in_channels, std::pair{1, 1}); + blocks["proj_out"] = std::make_shared(in_channels, in_channels, std::pair{1, 1}); + } else if (iter->second.n_dims == 2 && !use_linear) { + use_linear = true; + blocks["q"] = std::make_shared(in_channels, in_channels); + blocks["k"] = std::make_shared(in_channels, in_channels); + blocks["v"] = std::make_shared(in_channels, in_channels); + blocks["proj_out"] = std::make_shared(in_channels, in_channels); + } + } + } + +public: + AttnBlock(int64_t in_channels, bool use_linear) + : in_channels(in_channels), use_linear(use_linear) { + blocks["norm"] = std::shared_ptr(new GroupNorm32(in_channels)); + if (use_linear) { + blocks["q"] = std::shared_ptr(new Linear(in_channels, in_channels)); + blocks["k"] = std::shared_ptr(new Linear(in_channels, in_channels)); + blocks["v"] = std::shared_ptr(new Linear(in_channels, in_channels)); + blocks["proj_out"] = std::shared_ptr(new Linear(in_channels, in_channels)); + } else { + blocks["q"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); + blocks["k"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); + blocks["v"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); + blocks["proj_out"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); + } + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + // x: [N, in_channels, h, w] + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + auto q_proj = std::dynamic_pointer_cast(blocks["q"]); + auto k_proj 
= std::dynamic_pointer_cast(blocks["k"]); + auto v_proj = std::dynamic_pointer_cast(blocks["v"]); + auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]); + + auto h_ = norm->forward(ctx, x); + + const int64_t n = h_->ne[3]; + const int64_t c = h_->ne[2]; + const int64_t h = h_->ne[1]; + const int64_t w = h_->ne[0]; + + ggml_tensor* q; + ggml_tensor* k; + ggml_tensor* v; + if (use_linear) { + h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 2, 0, 3)); // [N, h, w, in_channels] + h_ = ggml_reshape_3d(ctx->ggml_ctx, h_, c, h * w, n); // [N, h * w, in_channels] + + q = q_proj->forward(ctx, h_); // [N, h * w, in_channels] + k = k_proj->forward(ctx, h_); // [N, h * w, in_channels] + v = v_proj->forward(ctx, h_); // [N, h * w, in_channels] + } else { + q = q_proj->forward(ctx, h_); // [N, in_channels, h, w] + q = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, q, 1, 2, 0, 3)); // [N, h, w, in_channels] + q = ggml_reshape_3d(ctx->ggml_ctx, q, c, h * w, n); // [N, h * w, in_channels] + + k = k_proj->forward(ctx, h_); // [N, in_channels, h, w] + k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 1, 2, 0, 3)); // [N, h, w, in_channels] + k = ggml_reshape_3d(ctx->ggml_ctx, k, c, h * w, n); // [N, h * w, in_channels] + + v = v_proj->forward(ctx, h_); // [N, in_channels, h, w] + v = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, v, 1, 2, 0, 3)); // [N, h, w, in_channels] + v = ggml_reshape_3d(ctx->ggml_ctx, v, c, h * w, n); // [N, h * w, in_channels] + } + + h_ = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, 1, nullptr, false, ctx->flash_attn_enabled); + + if (use_linear) { + h_ = proj_out->forward(ctx, h_); // [N, h * w, in_channels] + + h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w] + h_ = ggml_reshape_4d(ctx->ggml_ctx, h_, w, h, c, n); // [N, in_channels, h, w] + } else { + h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 0, 2, 3)); // [N, 
in_channels, h * w] + h_ = ggml_reshape_4d(ctx->ggml_ctx, h_, w, h, c, n); // [N, in_channels, h, w] + + h_ = proj_out->forward(ctx, h_); // [N, in_channels, h, w] + } + + h_ = ggml_add(ctx->ggml_ctx, h_, x); + return h_; + } +}; + +class AE3DConv : public Conv2d { +public: + AE3DConv(int64_t in_channels, + int64_t out_channels, + std::pair kernel_size, + int video_kernel_size = 3, + std::pair stride = {1, 1}, + std::pair padding = {0, 0}, + std::pair dilation = {1, 1}, + bool bias = true) + : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) { + int kernel_padding = video_kernel_size / 2; + blocks["time_mix_conv"] = std::shared_ptr(new Conv3d(out_channels, + out_channels, + {video_kernel_size, 1, 1}, + {1, 1, 1}, + {kernel_padding, 0, 0})); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* x) override { + // timesteps always None + // skip_video always False + // x: [N, IC, IH, IW] + // result: [N, OC, OH, OW] + auto time_mix_conv = std::dynamic_pointer_cast(blocks["time_mix_conv"]); + + x = Conv2d::forward(ctx, x); + // timesteps = x.shape[0] + // x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps) + // x = conv3d(x) + // return rearrange(x, "b c t h w -> (b t) c h w") + int64_t T = x->ne[3]; + int64_t B = x->ne[3] / T; + int64_t C = x->ne[2]; + int64_t H = x->ne[1]; + int64_t W = x->ne[0]; + + x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w) + x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w) + x = time_mix_conv->forward(ctx, x); // [B, OC, T, OH * OW] + x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w) + x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w + return x; // [B*T, OC, OH, OW] + } +}; + +class VideoResnetBlock : public ResnetBlock { +protected: + void init_params(struct ggml_context* ctx, const 
String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_storage_map, GGML_TYPE_F32); + params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); + } + + float get_alpha() { + float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]); + return sigmoid(alpha); + } + +public: + VideoResnetBlock(int64_t in_channels, + int64_t out_channels, + int video_kernel_size = 3) + : ResnetBlock(in_channels, out_channels) { + // merge_strategy is always learned + blocks["time_stack"] = std::shared_ptr(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true)); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + // x: [N, in_channels, h, w] aka [b*t, in_channels, h, w] + // return: [N, out_channels, h, w] aka [b*t, out_channels, h, w] + // t_emb is always None + // skip_video is always False + // timesteps is always None + auto time_stack = std::dynamic_pointer_cast(blocks["time_stack"]); + + x = ResnetBlock::forward(ctx, x); // [N, out_channels, h, w] + // return x; + + int64_t T = x->ne[3]; + int64_t B = x->ne[3] / T; + int64_t C = x->ne[2]; + int64_t H = x->ne[1]; + int64_t W = x->ne[0]; + + x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w) + x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w) + auto x_mix = x; + + x = time_stack->forward(ctx, x); // b t c (h w) + + float alpha = get_alpha(); + x = ggml_add(ctx->ggml_ctx, + ggml_ext_scale(ctx->ggml_ctx, x, alpha), + ggml_ext_scale(ctx->ggml_ctx, x_mix, 1.0f - alpha)); + + x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w) + x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w + + return x; + } +}; + +// ldm.modules.diffusionmodules.model.Encoder +class Encoder : public GGMLBlock { 
+protected: + int ch = 128; + std::vector ch_mult = {1, 2, 4, 4}; + int num_res_blocks = 2; + int in_channels = 3; + int z_channels = 4; + bool double_z = true; + +public: + Encoder(int ch, + std::vector ch_mult, + int num_res_blocks, + int in_channels, + int z_channels, + bool double_z = true, + bool use_linear_projection = false) + : ch(ch), + ch_mult(ch_mult), + num_res_blocks(num_res_blocks), + in_channels(in_channels), + z_channels(z_channels), + double_z(double_z) { + blocks["conv_in"] = std::shared_ptr(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1})); + + size_t num_resolutions = ch_mult.size(); + + int block_in = 1; + for (int i = 0; i < num_resolutions; i++) { + if (i == 0) { + block_in = ch; + } else { + block_in = ch * ch_mult[i - 1]; + } + int block_out = ch * ch_mult[i]; + for (int j = 0; j < num_res_blocks; j++) { + std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j); + blocks[name] = std::shared_ptr(new ResnetBlock(block_in, block_out)); + block_in = block_out; + } + if (i != num_resolutions - 1) { + std::string name = "down." + std::to_string(i) + ".downsample"; + blocks[name] = std::shared_ptr(new DownSampleBlock(block_in, block_in, true)); + } + } + + blocks["mid.block_1"] = std::shared_ptr(new ResnetBlock(block_in, block_in)); + blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in, use_linear_projection)); + blocks["mid.block_2"] = std::shared_ptr(new ResnetBlock(block_in, block_in)); + + blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in)); + blocks["conv_out"] = std::shared_ptr(new Conv2d(block_in, double_z ? 
z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1})); + } + + virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + // x: [N, in_channels, h, w] + + auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); + auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]); + auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]); + auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]); + auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]); + auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); + + auto h = conv_in->forward(ctx, x); // [N, ch, h, w] + + // downsampling + size_t num_resolutions = ch_mult.size(); + for (int i = 0; i < num_resolutions; i++) { + for (int j = 0; j < num_res_blocks; j++) { + std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j); + auto down_block = std::dynamic_pointer_cast(blocks[name]); + + h = down_block->forward(ctx, h); + } + if (i != num_resolutions - 1) { + std::string name = "down." 
+ std::to_string(i) + ".downsample"; + auto down_sample = std::dynamic_pointer_cast(blocks[name]); + + h = down_sample->forward(ctx, h); + } + } + + // middle + h = mid_block_1->forward(ctx, h); + h = mid_attn_1->forward(ctx, h); + h = mid_block_2->forward(ctx, h); // [N, block_in, h, w] + + // end + h = norm_out->forward(ctx, h); + h = ggml_silu_inplace(ctx->ggml_ctx, h); // nonlinearity/swish + h = conv_out->forward(ctx, h); // [N, z_channels*2, h, w] + return h; + } +}; + +// ldm.modules.diffusionmodules.model.Decoder +class Decoder : public GGMLBlock { +protected: + int ch = 128; + int out_ch = 3; + std::vector ch_mult = {1, 2, 4, 4}; + int num_res_blocks = 2; + int z_channels = 4; + bool video_decoder = false; + int video_kernel_size = 3; + + virtual std::shared_ptr get_conv_out(int64_t in_channels, + int64_t out_channels, + std::pair kernel_size, + std::pair stride = {1, 1}, + std::pair padding = {0, 0}) { + if (video_decoder) { + return std::shared_ptr(new AE3DConv(in_channels, out_channels, kernel_size, video_kernel_size, stride, padding)); + } else { + return std::shared_ptr(new Conv2d(in_channels, out_channels, kernel_size, stride, padding)); + } + } + + virtual std::shared_ptr get_resnet_block(int64_t in_channels, + int64_t out_channels) { + if (video_decoder) { + return std::shared_ptr(new VideoResnetBlock(in_channels, out_channels, video_kernel_size)); + } else { + return std::shared_ptr(new ResnetBlock(in_channels, out_channels)); + } + } + +public: + Decoder(int ch, + int out_ch, + std::vector ch_mult, + int num_res_blocks, + int z_channels, + bool use_linear_projection = false, + bool video_decoder = false, + int video_kernel_size = 3) + : ch(ch), + out_ch(out_ch), + ch_mult(ch_mult), + num_res_blocks(num_res_blocks), + z_channels(z_channels), + video_decoder(video_decoder), + video_kernel_size(video_kernel_size) { + int num_resolutions = static_cast(ch_mult.size()); + int block_in = ch * ch_mult[num_resolutions - 1]; + + blocks["conv_in"] = 
std::shared_ptr(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1})); + + blocks["mid.block_1"] = get_resnet_block(block_in, block_in); + blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in, use_linear_projection)); + blocks["mid.block_2"] = get_resnet_block(block_in, block_in); + + for (int i = num_resolutions - 1; i >= 0; i--) { + int mult = ch_mult[i]; + int block_out = ch * mult; + for (int j = 0; j < num_res_blocks + 1; j++) { + std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j); + blocks[name] = get_resnet_block(block_in, block_out); + + block_in = block_out; + } + if (i != 0) { + std::string name = "up." + std::to_string(i) + ".upsample"; + blocks[name] = std::shared_ptr(new UpSampleBlock(block_in, block_in)); + } + } + + blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in)); + blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1}); + } + + virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* z) { + // z: [N, z_channels, h, w] + // alpha is always 0 + // merge_strategy is always learned + // time_mode is always conv-only, so we need to replace conv_out_op/resnet_op to AE3DConv/VideoResBlock + // AttnVideoBlock will not be used + auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); + auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]); + auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]); + auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]); + auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]); + auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); + + // conv_in + auto h = conv_in->forward(ctx, z); // [N, block_in, h, w] + + // middle + h = mid_block_1->forward(ctx, h); + // return h; + + h = mid_attn_1->forward(ctx, h); + h = mid_block_2->forward(ctx, h); // [N, block_in, h, w] + + // upsampling + int num_resolutions = static_cast(ch_mult.size()); + for (int i = num_resolutions - 1; i 
>= 0; i--) { + for (int j = 0; j < num_res_blocks + 1; j++) { + std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j); + auto up_block = std::dynamic_pointer_cast(blocks[name]); + + h = up_block->forward(ctx, h); + } + if (i != 0) { + std::string name = "up." + std::to_string(i) + ".upsample"; + auto up_sample = std::dynamic_pointer_cast(blocks[name]); + + h = up_sample->forward(ctx, h); + } + } + + h = norm_out->forward(ctx, h); + h = ggml_silu_inplace(ctx->ggml_ctx, h); // nonlinearity/swish + h = conv_out->forward(ctx, h); // [N, out_ch, h*8, w*8] + return h; + } +}; + +// ldm.models.autoencoder.AutoencoderKL +class AutoEncoderKLModel : public GGMLBlock { +protected: + SDVersion version; + bool decode_only = true; + bool use_video_decoder = false; + bool use_quant = true; + int embed_dim = 4; + struct { + int z_channels = 4; + int resolution = 256; + int in_channels = 3; + int out_ch = 3; + int ch = 128; + std::vector ch_mult = {1, 2, 4, 4}; + int num_res_blocks = 2; + bool double_z = true; + } dd_config; + +public: + AutoEncoderKLModel(SDVersion version = VERSION_SD1, + bool decode_only = true, + bool use_linear_projection = false, + bool use_video_decoder = false) + : version(version), decode_only(decode_only), use_video_decoder(use_video_decoder) { + if (sd_version_is_dit(version)) { + if (sd_version_is_flux2(version)) { + dd_config.z_channels = 32; + embed_dim = 32; + } else { + use_quant = false; + dd_config.z_channels = 16; + } + } + if (use_video_decoder) { + use_quant = false; + } + blocks["decoder"] = std::shared_ptr(new Decoder(dd_config.ch, + dd_config.out_ch, + dd_config.ch_mult, + dd_config.num_res_blocks, + dd_config.z_channels, + use_linear_projection, + use_video_decoder)); + if (use_quant) { + blocks["post_quant_conv"] = std::shared_ptr(new Conv2d(dd_config.z_channels, + embed_dim, + {1, 1})); + } + if (!decode_only) { + blocks["encoder"] = std::shared_ptr(new Encoder(dd_config.ch, + dd_config.ch_mult, + 
dd_config.num_res_blocks, + dd_config.in_channels, + dd_config.z_channels, + dd_config.double_z, + use_linear_projection)); + if (use_quant) { + int factor = dd_config.double_z ? 2 : 1; + + blocks["quant_conv"] = std::shared_ptr(new Conv2d(embed_dim * factor, + dd_config.z_channels * factor, + {1, 1})); + } + } + } + + struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) { + // z: [N, z_channels, h, w] + if (sd_version_is_flux2(version)) { + // [N, C*p*p, h, w] -> [N, C, h*p, w*p] + int64_t p = 2; + + int64_t N = z->ne[3]; + int64_t C = z->ne[2] / p / p; + int64_t h = z->ne[1]; + int64_t w = z->ne[0]; + int64_t H = h * p; + int64_t W = w * p; + + z = ggml_reshape_4d(ctx->ggml_ctx, z, w * h, p * p, C, N); // [N, C, p*p, h*w] + z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 1, 0, 2, 3)); // [N, C, h*w, p*p] + z = ggml_reshape_4d(ctx->ggml_ctx, z, p, p, w, h * C * N); // [N*C*h, w, p, p] + z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 0, 2, 1, 3)); // [N*C*h, p, w, p] + z = ggml_reshape_4d(ctx->ggml_ctx, z, W, H, C, N); // [N, C, h*p, w*p] + } + + if (use_quant) { + auto post_quant_conv = std::dynamic_pointer_cast(blocks["post_quant_conv"]); + z = post_quant_conv->forward(ctx, z); // [N, z_channels, h, w] + } + auto decoder = std::dynamic_pointer_cast(blocks["decoder"]); + + ggml_set_name(z, "bench-start"); + auto h = decoder->forward(ctx, z); + ggml_set_name(h, "bench-end"); + return h; + } + + struct ggml_tensor* encode(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + // x: [N, in_channels, h, w] + auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); + + auto z = encoder->forward(ctx, x); // [N, 2*z_channels, h/8, w/8] + if (use_quant) { + auto quant_conv = std::dynamic_pointer_cast(blocks["quant_conv"]); + z = quant_conv->forward(ctx, z); // [N, 2*embed_dim, h/8, w/8] + } + if (sd_version_is_flux2(version)) { + z = ggml_ext_chunk(ctx->ggml_ctx, z, 2, 2)[0]; + + // [N, C, H, W] -> [N, 
C*p*p, H/p, W/p] + int64_t p = 2; + int64_t N = z->ne[3]; + int64_t C = z->ne[2]; + int64_t H = z->ne[1]; + int64_t W = z->ne[0]; + int64_t h = H / p; + int64_t w = W / p; + + z = ggml_reshape_4d(ctx->ggml_ctx, z, p, w, p, h * C * N); // [N*C*h, p, w, p] + z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 0, 2, 1, 3)); // [N*C*h, w, p, p] + z = ggml_reshape_4d(ctx->ggml_ctx, z, p * p, w * h, C, N); // [N, C, h*w, p*p] + z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 1, 0, 2, 3)); // [N, C, p*p, h*w] + z = ggml_reshape_4d(ctx->ggml_ctx, z, w, h, p * p * C, N); // [N, C*p*p, h*w] + } + return z; + } + + int get_encoder_output_channels() { + int factor = dd_config.double_z ? 2 : 1; + return dd_config.z_channels * factor; + } +}; + +struct AutoEncoderKL : public VAE { + float scale_factor = 1.f; + float shift_factor = 0.f; + bool decode_only = true; + AutoEncoderKLModel ae; + + AutoEncoderKL(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map, + const std::string prefix, + bool decode_only = false, + bool use_video_decoder = false, + SDVersion version = VERSION_SD1) + : decode_only(decode_only), VAE(version, backend, offload_params_to_cpu) { + if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) { + scale_factor = 0.18215f; + shift_factor = 0.f; + } else if (sd_version_is_sdxl(version)) { + scale_factor = 0.13025f; + shift_factor = 0.f; + } else if (sd_version_is_sd3(version)) { + scale_factor = 1.5305f; + shift_factor = 0.0609f; + } else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) { + scale_factor = 0.3611f; + shift_factor = 0.1159f; + } else if (sd_version_is_flux2(version)) { + scale_factor = 1.0f; + shift_factor = 0.f; + } + bool use_linear_projection = false; + for (const auto& [name, tensor_storage] : tensor_storage_map) { + if (!starts_with(name, prefix)) { + continue; + } + if (ends_with(name, "attn_1.proj_out.weight")) { + if (tensor_storage.n_dims == 2) { 
+ use_linear_projection = true; + } + break; + } + } + ae = AutoEncoderKLModel(version, decode_only, use_linear_projection, use_video_decoder); + ae.init(params_ctx, tensor_storage_map, prefix); + } + + void set_conv2d_scale(float scale) override { + std::vector blocks; + ae.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + auto conv_block = (Conv2d*)block; + conv_block->set_scale(scale); + } + } + } + + std::string get_desc() override { + return "vae"; + } + + void get_param_tensors(std::map& tensors, const std::string prefix) override { + ae.get_param_tensors(tensors, prefix); + } + + struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { + struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); + + z = to_backend(z); + + auto runner_ctx = get_context(); + + struct ggml_tensor* out = decode_graph ? ae.decode(&runner_ctx, z) : ae.encode(&runner_ctx, z); + + ggml_build_forward_expand(gf, out); + + return gf; + } + + bool _compute(const int n_threads, + struct ggml_tensor* z, + bool decode_graph, + struct ggml_tensor** output, + struct ggml_context* output_ctx = nullptr) override { + GGML_ASSERT(!decode_only || decode_graph); + auto get_graph = [&]() -> struct ggml_cgraph* { + return build_graph(z, decode_graph); + }; + // ggml_set_f32(z, 0.5f); + // print_ggml_tensor(z); + return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + } + + ggml_tensor* gaussian_latent_sample(ggml_context* work_ctx, ggml_tensor* moments, std::shared_ptr rng) { + // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample + ggml_tensor* latents = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latents); + ggml_ext_im_set_randn_f32(noise, rng); + { + float mean = 0; + float logvar = 0; + float value = 0; + float std_ = 0; + for (int i = 0; i < latents->ne[3]; i++) { + 
    // Turns the raw encoder output into the latent tensor handed to the rest of the pipeline.
    //
    // - flux2 versions: the encoder output is returned unchanged.
    // - VERSION_SD1_PIX2PIX: returns a view (offset 0) that keeps ne[0] and ne[1] and the
    //   first half of dimension 2; no sampling is performed.
    // - everything else: draws a sample via gaussian_latent_sample().
    //
    // NOTE(review): the template argument of std::shared_ptr on `rng` appears to have been
    // lost in this copy of the source; confirm the element type against the declaration of
    // gaussian_latent_sample() in the real file.
    ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) {
        if (sd_version_is_flux2(version)) {
            return vae_output;
        } else if (version == VERSION_SD1_PIX2PIX) {
            return ggml_view_3d(work_ctx,
                                vae_output,
                                vae_output->ne[0],
                                vae_output->ne[1],
                                vae_output->ne[2] / 2,
                                vae_output->nb[1],
                                vae_output->nb[2],
                                0);
        } else {
            return gaussian_latent_sample(work_ctx, vae_output, rng);
        }
    }
0.0510f, 0.0445f, 0.0578f, 0.0458f, -0.0412f, -0.0458f, -0.0487f, -0.0467f, + -0.0088f, -0.0106f, -0.0088f, -0.0046f, -0.0376f, -0.0432f, -0.0436f, -0.0499f, + 0.0118f, 0.0166f, 0.0203f, 0.0279f, 0.0113f, 0.0129f, 0.0016f, 0.0072f, + -0.0118f, -0.0018f, -0.0141f, -0.0054f, -0.0091f, -0.0138f, -0.0145f, -0.0187f, + 0.0323f, 0.0305f, 0.0259f, 0.0300f, 0.0540f, 0.0614f, 0.0495f, 0.0590f, + -0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f, + -0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 0.0054f, 0.0163f, 0.0104f}; + latents_std_vec = { + 1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f, + 1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f, + 1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f, + 1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f, + 1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f, + 1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f, + 1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f, + 1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f, + 1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f, + 1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f, + 1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f, + 1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f, + 1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f, + 1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f, + 1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f, + 1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f}; + } else { + GGML_ABORT("unknown version %d", version); + } + } + + ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) { + ggml_tensor* vae_latents = ggml_dup(work_ctx, latents); + if 
(sd_version_is_flux2(version)) { + int channel_dim = 2; + std::vector latents_mean_vec; + std::vector latents_std_vec; + get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec); + + float mean; + float std_; + for (int i = 0; i < latents->ne[3]; i++) { + if (channel_dim == 3) { + mean = latents_mean_vec[i]; + std_ = latents_std_vec[i]; + } + for (int j = 0; j < latents->ne[2]; j++) { + if (channel_dim == 2) { + mean = latents_mean_vec[j]; + std_ = latents_std_vec[j]; + } + for (int k = 0; k < latents->ne[1]; k++) { + for (int l = 0; l < latents->ne[0]; l++) { + float value = ggml_ext_tensor_get_f32(latents, l, k, j, i); + value = value * std_ / scale_factor + mean; + ggml_ext_tensor_set_f32(vae_latents, value, l, k, j, i); + } + } + } + } + } else { + ggml_ext_tensor_iter(latents, [&](ggml_tensor* latents, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + float value = ggml_ext_tensor_get_f32(latents, i0, i1, i2, i3); + value = (value / scale_factor) + shift_factor; + ggml_ext_tensor_set_f32(vae_latents, value, i0, i1, i2, i3); + }); + } + return vae_latents; + } + + ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) { + ggml_tensor* diffusion_latents = ggml_dup(work_ctx, latents); + if (sd_version_is_flux2(version)) { + int channel_dim = 2; + std::vector latents_mean_vec; + std::vector latents_std_vec; + get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec); + + float mean; + float std_; + for (int i = 0; i < latents->ne[3]; i++) { + if (channel_dim == 3) { + mean = latents_mean_vec[i]; + std_ = latents_std_vec[i]; + } + for (int j = 0; j < latents->ne[2]; j++) { + if (channel_dim == 2) { + mean = latents_mean_vec[j]; + std_ = latents_std_vec[j]; + } + for (int k = 0; k < latents->ne[1]; k++) { + for (int l = 0; l < latents->ne[0]; l++) { + float value = ggml_ext_tensor_get_f32(latents, l, k, j, i); + value = (value - mean) * scale_factor / std_; + 
ggml_ext_tensor_set_f32(diffusion_latents, value, l, k, j, i); + } + } + } + } + } else { + ggml_ext_tensor_iter(latents, [&](ggml_tensor* latents, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + float value = ggml_ext_tensor_get_f32(latents, i0, i1, i2, i3); + value = (value - shift_factor) * scale_factor; + ggml_ext_tensor_set_f32(diffusion_latents, value, i0, i1, i2, i3); + }); + } + return diffusion_latents; + } + + int get_encoder_output_channels(int input_channels) { + return ae.get_encoder_output_channels(); + } + + void test() { + struct ggml_init_params params; + params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB + params.mem_buffer = nullptr; + params.no_alloc = false; + + struct ggml_context* work_ctx = ggml_init(params); + GGML_ASSERT(work_ctx != nullptr); + + { + // CPU, x{1, 3, 64, 64}: Pass + // CUDA, x{1, 3, 64, 64}: Pass, but sill get wrong result for some image, may be due to interlnal nan + // CPU, x{2, 3, 64, 64}: Wrong result + // CUDA, x{2, 3, 64, 64}: Wrong result, and different from CPU result + auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 64, 64, 3, 2); + ggml_set_f32(x, 0.5f); + print_ggml_tensor(x); + struct ggml_tensor* out = nullptr; + + int64_t t0 = ggml_time_ms(); + _compute(8, x, false, &out, work_ctx); + int64_t t1 = ggml_time_ms(); + + print_ggml_tensor(out); + LOG_DEBUG("encode test done in %lldms", t1 - t0); + } + + if (false) { + // CPU, z{1, 4, 8, 8}: Pass + // CUDA, z{1, 4, 8, 8}: Pass + // CPU, z{3, 4, 8, 8}: Wrong result + // CUDA, z{3, 4, 8, 8}: Wrong result, and different from CPU result + auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); + ggml_set_f32(z, 0.5f); + print_ggml_tensor(z); + struct ggml_tensor* out = nullptr; + + int64_t t0 = ggml_time_ms(); + _compute(8, z, true, &out, work_ctx); + int64_t t1 = ggml_time_ms(); + + print_ggml_tensor(out); + LOG_DEBUG("decode test done in %lldms", t1 - t0); + } + }; +}; + +#endif // __AUTO_ENCODER_KL_HPP__ \ No newline at end of file diff 
// Creates a new tensor in `ctx` with ggml_dup_tensor(src), fills it from `src` via
// copy_ggml_tensor(), and returns the new tensor. `src` itself is not modified here.
__STATIC_INLINE__ ggml_tensor* ggml_ext_dup_and_cpy_tensor(ggml_context* ctx, ggml_tensor* src) {
    ggml_tensor* dup = ggml_dup_tensor(ctx, src);
    copy_ggml_tensor(dup, src);
    return dup;
}
tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor); + if (!slient) { + LOG_DEBUG("num tiles : %d, %d ", num_tiles_x, num_tiles_y); + LOG_DEBUG("optimal overlap : %f, %f (targeting %f)", tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor); + } int tile_overlap_x = (int32_t)(p_tile_size_x * tile_overlap_factor_x); int non_tile_overlap_x = p_tile_size_x - tile_overlap_x; @@ -896,7 +905,9 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, params.mem_buffer = nullptr; params.no_alloc = false; - LOG_DEBUG("tile work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f); + if (!slient) { + LOG_DEBUG("tile work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f); + } // draft context struct ggml_context* tiles_ctx = ggml_init(params); @@ -909,8 +920,10 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, ggml_tensor* input_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], input->ne[3]); ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne[2], output->ne[3]); int num_tiles = num_tiles_x * num_tiles_y; - LOG_DEBUG("processing %i tiles", num_tiles); - pretty_progress(0, num_tiles, 0.0f); + if (!slient) { + LOG_DEBUG("processing %i tiles", num_tiles); + pretty_progress(0, num_tiles, 0.0f); + } int tile_count = 1; bool last_y = false, last_x = false; float last_time = 0.0f; @@ -960,8 +973,10 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, } last_x = false; } - if (tile_count < num_tiles) { - pretty_progress(num_tiles, num_tiles, last_time); + if (!slient) { + if (tile_count < num_tiles) { + pretty_progress(num_tiles, num_tiles, last_time); + } } ggml_free(tiles_ctx); } diff --git a/src/model.cpp b/src/model.cpp index 77b032c2..87b65455 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -1104,10 +1104,12 @@ SDVersion ModelLoader::get_sd_version() { 
tensor_storage.name.find("unet.mid_block.resnets.1.") != std::string::npos) { has_middle_block_1 = true; } - if (tensor_storage.name.find("model.diffusion_model.output_blocks.3.1.transformer_blocks.1") != std::string::npos) { + if (tensor_storage.name.find("model.diffusion_model.output_blocks.3.1.transformer_blocks.1") != std::string::npos || + tensor_storage.name.find("unet.up_blocks.1.attentions.0.transformer_blocks.1") != std::string::npos) { has_output_block_311 = true; } - if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos) { + if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos || + tensor_storage.name.find("unet.up_blocks.2.attentions.1") != std::string::npos) { has_output_block_71 = true; } if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" || diff --git a/src/name_conversion.cpp b/src/name_conversion.cpp index 3b3abfb6..d5d5e052 100644 --- a/src/name_conversion.cpp +++ b/src/name_conversion.cpp @@ -1120,7 +1120,11 @@ std::string convert_tensor_name(std::string name, SDVersion version) { for (const auto& prefix : first_stage_model_prefix_vec) { if (starts_with(name, prefix)) { name = convert_first_stage_model_name(name.substr(prefix.size()), prefix); - name = prefix + name; + if (version == VERSION_SDXS) { + name = "tae." 
+ name; + } else { + name = prefix + name; + } break; } } diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index d4b64ee8..b1243d69 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -7,6 +7,7 @@ #include "stable-diffusion.h" #include "util.h" +#include "auto_encoder_kl.hpp" #include "cache_dit.hpp" #include "conditioner.hpp" #include "control.hpp" @@ -90,14 +91,6 @@ void calculate_alphas_cumprod(float* alphas_cumprod, } } -void suppress_pp(int step, int steps, float time, void* data) { - (void)step; - (void)steps; - (void)time; - (void)data; - return; -} - static float get_cache_reuse_threshold(const sd_cache_params_t& params) { float reuse_threshold = params.reuse_threshold; if (reuse_threshold == INFINITY) { @@ -131,8 +124,6 @@ public: std::shared_ptr rng = std::make_shared(); std::shared_ptr sampler_rng = nullptr; int n_threads = -1; - float scale_factor = 0.18215f; - float shift_factor = 0.f; float default_flow_shift = INFINITY; std::shared_ptr cond_stage_model; @@ -140,7 +131,7 @@ public: std::shared_ptr diffusion_model; std::shared_ptr high_noise_diffusion_model; std::shared_ptr first_stage_model; - std::shared_ptr tae_first_stage; + std::shared_ptr preview_vae; std::shared_ptr control_net; std::shared_ptr pmid_model; std::shared_ptr pmid_lora; @@ -151,7 +142,6 @@ public: bool apply_lora_immediately = false; std::string taesd_path; - bool use_tiny_autoencoder = false; sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0, 0}; bool offload_params_to_cpu = false; bool use_pmid = false; @@ -252,10 +242,10 @@ public: n_threads = sd_ctx_params->n_threads; vae_decode_only = sd_ctx_params->vae_decode_only; free_params_immediately = sd_ctx_params->free_params_immediately; - taesd_path = SAFE_STR(sd_ctx_params->taesd_path); - use_tiny_autoencoder = taesd_path.size() > 0; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; + bool use_tae = false; + rng = get_rng(sd_ctx_params->rng_type); if 
(sd_ctx_params->sampler_rng_type != RNG_TYPE_COUNT && sd_ctx_params->sampler_rng_type != sd_ctx_params->rng_type) { sampler_rng = get_rng(sd_ctx_params->sampler_rng_type); @@ -345,6 +335,14 @@ public: } } + if (strlen(SAFE_STR(sd_ctx_params->taesd_path)) > 0) { + LOG_INFO("loading tae from '%s'", sd_ctx_params->taesd_path); + if (!model_loader.init_from_file(sd_ctx_params->taesd_path, "tae.")) { + LOG_WARN("loading tae from '%s' failed", sd_ctx_params->taesd_path); + } + use_tae = true; + } + model_loader.convert_tensors_name(); version = model_loader.get_sd_version(); @@ -413,22 +411,6 @@ public: apply_lora_immediately = false; } - if (sd_version_is_sdxl(version)) { - scale_factor = 0.13025f; - } else if (sd_version_is_sd3(version)) { - scale_factor = 1.5305f; - shift_factor = 0.0609f; - } else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) { - scale_factor = 0.3611f; - shift_factor = 0.1159f; - } else if (sd_version_is_wan(version) || - sd_version_is_qwen_image(version) || - sd_version_is_anima(version) || - sd_version_is_flux2(version)) { - scale_factor = 1.0f; - shift_factor = 0.f; - } - if (sd_version_is_control(version)) { // Might need vae encode for control cond vae_decode_only = false; @@ -437,6 +419,7 @@ public: bool tae_preview_only = sd_ctx_params->tae_preview_only; if (version == VERSION_SDXS) { tae_preview_only = false; + use_tae = true; } if (sd_ctx_params->circular_x || sd_ctx_params->circular_y) { @@ -623,31 +606,46 @@ public: vae_backend = backend; } - if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) { - if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { - first_stage_model = std::make_shared(vae_backend, - offload_params_to_cpu, - tensor_storage_map, - "first_stage_model", - vae_decode_only, - version); - first_stage_model->alloc_params_buffer(); - first_stage_model->get_param_tensors(tensors, "first_stage_model"); - } else if (version == 
VERSION_CHROMA_RADIANCE) { - first_stage_model = std::make_shared(vae_backend, - offload_params_to_cpu); + auto create_tae = [&]() -> std::shared_ptr { + if (sd_version_is_wan(version) || + sd_version_is_qwen_image(version) || + sd_version_is_anima(version)) { + return std::make_shared(vae_backend, + offload_params_to_cpu, + tensor_storage_map, + "decoder", + vae_decode_only, + version); + } else { - first_stage_model = std::make_shared(vae_backend, + auto model = std::make_shared(vae_backend, offload_params_to_cpu, tensor_storage_map, - "first_stage_model", + "decoder.layers", vae_decode_only, - false, version); - if (sd_ctx_params->vae_conv_direct) { - LOG_INFO("Using Conv2d direct in the vae model"); - first_stage_model->set_conv2d_direct_enabled(true); - } + return model; + } + }; + + auto create_vae = [&]() -> std::shared_ptr { + if (sd_version_is_wan(version) || + sd_version_is_qwen_image(version) || + sd_version_is_anima(version)) { + return std::make_shared(vae_backend, + offload_params_to_cpu, + tensor_storage_map, + "first_stage_model", + vae_decode_only, + version); + } else { + auto model = std::make_shared(vae_backend, + offload_params_to_cpu, + tensor_storage_map, + "first_stage_model", + vae_decode_only, + false, + version); if (sd_version_is_sdxl(version) && (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 || sd_ctx_params->force_sdxl_vae_conv_scale || external_vae_is_invalid)) { float vae_conv_2d_scale = 1.f / 32.f; @@ -655,35 +653,40 @@ public: "No valid VAE specified with --vae or --force-sdxl-vae-conv-scale flag set, " "using Conv2D scale %.3f", vae_conv_2d_scale); - first_stage_model->set_conv2d_scale(vae_conv_2d_scale); + model->set_conv2d_scale(vae_conv_2d_scale); } - first_stage_model->alloc_params_buffer(); - first_stage_model->get_param_tensors(tensors, "first_stage_model"); + return model; + } + }; + + if (version == VERSION_CHROMA_RADIANCE) { + LOG_INFO("using FakeVAE"); + first_stage_model = std::make_shared(version, + vae_backend, + 
offload_params_to_cpu); + } else if (use_tae && !tae_preview_only) { + LOG_INFO("using TAE for encoding / decoding"); + first_stage_model = create_tae(); + first_stage_model->alloc_params_buffer(); + first_stage_model->get_param_tensors(tensors, "tae"); + } else { + LOG_INFO("using VAE for encoding / decoding"); + first_stage_model = create_vae(); + first_stage_model->alloc_params_buffer(); + first_stage_model->get_param_tensors(tensors, "first_stage_model"); + if (use_tae && tae_preview_only) { + LOG_INFO("using TAE for preview"); + preview_vae = create_tae(); + preview_vae->alloc_params_buffer(); + preview_vae->get_param_tensors(tensors, "tae"); } } - if (use_tiny_autoencoder || version == VERSION_SDXS) { - if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { - tae_first_stage = std::make_shared(vae_backend, - offload_params_to_cpu, - tensor_storage_map, - "decoder", - vae_decode_only, - version); - } else { - tae_first_stage = std::make_shared(vae_backend, - offload_params_to_cpu, - tensor_storage_map, - "decoder.layers", - vae_decode_only, - version); - if (version == VERSION_SDXS) { - tae_first_stage->alloc_params_buffer(); - tae_first_stage->get_param_tensors(tensors, "first_stage_model"); - } - } - if (sd_ctx_params->vae_conv_direct) { - LOG_INFO("Using Conv2d direct in the tae model"); - tae_first_stage->set_conv2d_direct_enabled(true); + + if (sd_ctx_params->vae_conv_direct) { + LOG_INFO("Using Conv2d direct in the vae model"); + first_stage_model->set_conv2d_direct_enabled(true); + if (preview_vae) { + preview_vae->set_conv2d_direct_enabled(true); } } @@ -756,8 +759,8 @@ public: if (first_stage_model) { first_stage_model->set_flash_attention_enabled(true); } - if (tae_first_stage) { - tae_first_stage->set_flash_attention_enabled(true); + if (preview_vae) { + preview_vae->set_flash_attention_enabled(true); } } @@ -795,7 +798,7 @@ public: std::set ignore_tensors; tensors["alphas_cumprod"] = 
alphas_cumprod_tensor; - if (use_tiny_autoencoder) { + if (use_tae && !tae_preview_only) { ignore_tensors.insert("first_stage_model."); } if (use_pmid) { @@ -809,6 +812,7 @@ public: ignore_tensors.insert("first_stage_model.encoder"); ignore_tensors.insert("first_stage_model.conv1"); ignore_tensors.insert("first_stage_model.quant"); + ignore_tensors.insert("tae.encoder"); ignore_tensors.insert("text_encoders.llm.visual."); } if (version == VERSION_OVIS_IMAGE) { @@ -835,15 +839,9 @@ public: unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size(); } size_t vae_params_mem_size = 0; - if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) { - vae_params_mem_size = first_stage_model->get_params_buffer_size(); - } - if (use_tiny_autoencoder || version == VERSION_SDXS) { - if (use_tiny_autoencoder && !tae_first_stage->load_from_file(taesd_path, n_threads)) { - return false; - } - use_tiny_autoencoder = true; // now the processing is identical for VERSION_SDXS - vae_params_mem_size = tae_first_stage->get_params_buffer_size(); + vae_params_mem_size = first_stage_model->get_params_buffer_size(); + if (preview_vae) { + vae_params_mem_size += preview_vae->get_params_buffer_size(); } size_t control_net_params_mem_size = 0; if (control_net) { @@ -996,7 +994,6 @@ public: } ggml_free(ctx); - use_tiny_autoencoder = use_tiny_autoencoder && !tae_preview_only; return true; } @@ -1435,8 +1432,7 @@ public: ggml_ext_tensor_scale_inplace(noise, augmentation_level); ggml_ext_tensor_add_inplace(init_img, noise); } - ggml_tensor* moments = vae_encode(work_ctx, init_img); - c_concat = get_first_stage_encoding(work_ctx, moments); + c_concat = encode_first_stage(work_ctx, init_img); } } @@ -1488,14 +1484,6 @@ public: } } - void silent_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) { - sd_progress_cb_t cb = sd_get_progress_callback(); - void* cbd = 
sd_get_progress_callback_data(); - sd_set_progress_callback((sd_progress_cb_t)suppress_pp, nullptr); - sd_tiling(input, output, scale, tile_size, tile_overlap_factor, circular_x, circular_y, on_processing); - sd_set_progress_callback(cb, cbd); - } - void preview_image(ggml_context* work_ctx, int step, struct ggml_tensor* latents, @@ -1588,37 +1576,14 @@ public: free(data); free(images); } else { - if (preview_mode == PREVIEW_VAE) { - process_latent_out(latents); - if (vae_tiling_params.enabled) { - // split latent in 32x32 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - return first_stage_model->compute(n_threads, in, true, &out, nullptr); - }; - silent_tiling(latents, result, get_vae_scale_factor(), 32, 0.5f, on_tiling); - + if (preview_mode == PREVIEW_VAE || preview_mode == PREVIEW_TAE) { + if (preview_vae) { + latents = preview_vae->diffusion_to_vae_latents(work_ctx, latents); + result = preview_vae->decode(n_threads, work_ctx, latents, vae_tiling_params, false, circular_x, circular_y, result, true); } else { - first_stage_model->compute(n_threads, latents, true, &result, work_ctx); + latents = first_stage_model->diffusion_to_vae_latents(work_ctx, latents); + result = first_stage_model->decode(n_threads, work_ctx, latents, vae_tiling_params, false, circular_x, circular_y, result, true); } - - first_stage_model->free_compute_buffer(); - process_vae_output_tensor(result); - process_latent_in(latents); - } else if (preview_mode == PREVIEW_TAE) { - if (tae_first_stage == nullptr) { - LOG_WARN("TAE not found for preview"); - return; - } - if (vae_tiling_params.enabled) { - // split latent in 64x64 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - return tae_first_stage->compute(n_threads, in, true, &out, nullptr); - }; - silent_tiling(latents, result, get_vae_scale_factor(), 64, 0.5f, on_tiling); - } else { - tae_first_stage->compute(n_threads, 
latents, true, &result, work_ctx); - } - tae_first_stage->free_compute_buffer(); } else { return; } @@ -1842,8 +1807,7 @@ public: } size_t steps = sigmas.size() - 1; - struct ggml_tensor* x = ggml_dup_tensor(work_ctx, init_latent); - copy_ggml_tensor(x, init_latent); + struct ggml_tensor* x = ggml_ext_dup_and_cpy_tensor(work_ctx, init_latent); if (noise) { x = denoiser->noise_scaling(sigmas[0], noise, x); @@ -2364,15 +2328,7 @@ public: } int get_vae_scale_factor() { - int vae_scale_factor = 8; - if (version == VERSION_WAN2_2_TI2V) { - vae_scale_factor = 16; - } else if (sd_version_is_flux2(version)) { - vae_scale_factor = 16; - } else if (version == VERSION_CHROMA_RADIANCE) { - vae_scale_factor = 1; - } - return vae_scale_factor; + return first_stage_model->get_scale_factor(); } int get_diffusion_model_down_factor() { @@ -2427,383 +2383,28 @@ public: } else { init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); } - ggml_set_f32(init_latent, shift_factor); + ggml_set_f32(init_latent, 0.f); return init_latent; } - void get_latents_mean_std_vec(ggml_tensor* latent, int channel_dim, std::vector& latents_mean_vec, std::vector& latents_std_vec) { - GGML_ASSERT(latent->ne[channel_dim] == 16 || latent->ne[channel_dim] == 48 || latent->ne[channel_dim] == 128); - if (latent->ne[channel_dim] == 16) { - latents_mean_vec = {-0.7571f, -0.7089f, -0.9113f, 0.1075f, -0.1745f, 0.9653f, -0.1517f, 1.5508f, - 0.4134f, -0.0715f, 0.5517f, -0.3632f, -0.1922f, -0.9497f, 0.2503f, -0.2921f}; - latents_std_vec = {2.8184f, 1.4541f, 2.3275f, 2.6558f, 1.2196f, 1.7708f, 2.6052f, 2.0743f, - 3.2687f, 2.1526f, 2.8652f, 1.5579f, 1.6382f, 1.1253f, 2.8251f, 1.9160f}; - } else if (latent->ne[channel_dim] == 48) { - latents_mean_vec = {-0.2289f, -0.0052f, -0.1323f, -0.2339f, -0.2799f, 0.0174f, 0.1838f, 0.1557f, - -0.1382f, 0.0542f, 0.2813f, 0.0891f, 0.1570f, -0.0098f, 0.0375f, -0.1825f, - -0.2246f, -0.1207f, -0.0698f, 0.5109f, 0.2665f, -0.2108f, -0.2158f, 0.2502f, - -0.2055f, -0.0322f, 
0.1109f, 0.1567f, -0.0729f, 0.0899f, -0.2799f, -0.1230f, - -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f, - 0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f}; - latents_std_vec = { - 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, - 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, - 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, - 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f, - 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, - 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}; - } else if (latent->ne[channel_dim] == 128) { - // flux2 - latents_mean_vec = {-0.0676f, -0.0715f, -0.0753f, -0.0745f, 0.0223f, 0.0180f, 0.0142f, 0.0184f, - -0.0001f, -0.0063f, -0.0002f, -0.0031f, -0.0272f, -0.0281f, -0.0276f, -0.0290f, - -0.0769f, -0.0672f, -0.0902f, -0.0892f, 0.0168f, 0.0152f, 0.0079f, 0.0086f, - 0.0083f, 0.0015f, 0.0003f, -0.0043f, -0.0439f, -0.0419f, -0.0438f, -0.0431f, - -0.0102f, -0.0132f, -0.0066f, -0.0048f, -0.0311f, -0.0306f, -0.0279f, -0.0180f, - 0.0030f, 0.0015f, 0.0126f, 0.0145f, 0.0347f, 0.0338f, 0.0337f, 0.0283f, - 0.0020f, 0.0047f, 0.0047f, 0.0050f, 0.0123f, 0.0081f, 0.0081f, 0.0146f, - 0.0681f, 0.0679f, 0.0767f, 0.0732f, -0.0462f, -0.0474f, -0.0392f, -0.0511f, - -0.0528f, -0.0477f, -0.0470f, -0.0517f, -0.0317f, -0.0316f, -0.0345f, -0.0283f, - 0.0510f, 0.0445f, 0.0578f, 0.0458f, -0.0412f, -0.0458f, -0.0487f, -0.0467f, - -0.0088f, -0.0106f, -0.0088f, -0.0046f, -0.0376f, -0.0432f, -0.0436f, -0.0499f, - 0.0118f, 0.0166f, 0.0203f, 0.0279f, 0.0113f, 0.0129f, 0.0016f, 0.0072f, - -0.0118f, -0.0018f, -0.0141f, -0.0054f, -0.0091f, -0.0138f, -0.0145f, -0.0187f, - 0.0323f, 0.0305f, 0.0259f, 0.0300f, 0.0540f, 0.0614f, 0.0495f, 0.0590f, - -0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f, - -0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 0.0054f, 
0.0163f, 0.0104f}; - latents_std_vec = { - 1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f, - 1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f, - 1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f, - 1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f, - 1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f, - 1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f, - 1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f, - 1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f, - 1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f, - 1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f, - 1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f, - 1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f, - 1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f, - 1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f, - 1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f, - 1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f}; - } - } - - void process_latent_in(ggml_tensor* latent) { - if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_flux2(version)) { - int channel_dim = sd_version_is_flux2(version) ? 
2 : 3; - std::vector latents_mean_vec; - std::vector latents_std_vec; - get_latents_mean_std_vec(latent, channel_dim, latents_mean_vec, latents_std_vec); - - float mean; - float std_; - for (int i = 0; i < latent->ne[3]; i++) { - if (channel_dim == 3) { - mean = latents_mean_vec[i]; - std_ = latents_std_vec[i]; - } - for (int j = 0; j < latent->ne[2]; j++) { - if (channel_dim == 2) { - mean = latents_mean_vec[i]; - std_ = latents_std_vec[i]; - } - for (int k = 0; k < latent->ne[1]; k++) { - for (int l = 0; l < latent->ne[0]; l++) { - float value = ggml_ext_tensor_get_f32(latent, l, k, j, i); - value = (value - mean) * scale_factor / std_; - ggml_ext_tensor_set_f32(latent, value, l, k, j, i); - } - } - } - } - } else if (version == VERSION_CHROMA_RADIANCE) { - // pass - } else { - ggml_ext_tensor_iter(latent, [&](ggml_tensor* latent, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = ggml_ext_tensor_get_f32(latent, i0, i1, i2, i3); - value = (value - shift_factor) * scale_factor; - ggml_ext_tensor_set_f32(latent, value, i0, i1, i2, i3); - }); - } - } - - void process_latent_out(ggml_tensor* latent) { - if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_flux2(version)) { - int channel_dim = sd_version_is_flux2(version) ? 
2 : 3; - std::vector latents_mean_vec; - std::vector latents_std_vec; - get_latents_mean_std_vec(latent, channel_dim, latents_mean_vec, latents_std_vec); - - float mean; - float std_; - for (int i = 0; i < latent->ne[3]; i++) { - if (channel_dim == 3) { - mean = latents_mean_vec[i]; - std_ = latents_std_vec[i]; - } - for (int j = 0; j < latent->ne[2]; j++) { - if (channel_dim == 2) { - mean = latents_mean_vec[i]; - std_ = latents_std_vec[i]; - } - for (int k = 0; k < latent->ne[1]; k++) { - for (int l = 0; l < latent->ne[0]; l++) { - float value = ggml_ext_tensor_get_f32(latent, l, k, j, i); - value = value * std_ / scale_factor + mean; - ggml_ext_tensor_set_f32(latent, value, l, k, j, i); - } - } - } - } - } else if (version == VERSION_CHROMA_RADIANCE) { - // pass - } else { - ggml_ext_tensor_iter(latent, [&](ggml_tensor* latent, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = ggml_ext_tensor_get_f32(latent, i0, i1, i2, i3); - value = (value / scale_factor) + shift_factor; - ggml_ext_tensor_set_f32(latent, value, i0, i1, i2, i3); - }); - } - } - - void get_tile_sizes(int& tile_size_x, - int& tile_size_y, - float& tile_overlap, - const sd_tiling_params_t& params, - int64_t latent_x, - int64_t latent_y, - float encoding_factor = 1.0f) { - tile_overlap = std::max(std::min(params.target_overlap, 0.5f), 0.0f); - auto get_tile_size = [&](int requested_size, float factor, int64_t latent_size) { - const int default_tile_size = 32; - const int min_tile_dimension = 4; - int tile_size = default_tile_size; - // factor <= 1 means simple fraction of the latent dimension - // factor > 1 means number of tiles across that dimension - if (factor > 0.f) { - if (factor > 1.0) - factor = 1 / (factor - factor * tile_overlap + tile_overlap); - tile_size = static_cast(std::round(latent_size * factor)); - } else if (requested_size >= min_tile_dimension) { - tile_size = requested_size; - } - tile_size = static_cast(tile_size * encoding_factor); - return 
std::max(std::min(tile_size, static_cast(latent_size)), min_tile_dimension); - }; - - tile_size_x = get_tile_size(params.tile_size_x, params.rel_size_x, latent_x); - tile_size_y = get_tile_size(params.tile_size_y, params.rel_size_y, latent_y); - } - - ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x) { - int64_t t0 = ggml_time_ms(); - ggml_tensor* result = nullptr; - const int vae_scale_factor = get_vae_scale_factor(); - int64_t W = x->ne[0] / vae_scale_factor; - int64_t H = x->ne[1] / vae_scale_factor; - int64_t C = get_latent_channel(); - if (vae_tiling_params.enabled) { - // TODO wan2.2 vae support? - int64_t ne2; - int64_t ne3; - if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { - ne2 = 1; - ne3 = C * x->ne[3]; - } else { - int64_t out_channels = C; - bool encode_outputs_mu = use_tiny_autoencoder || - sd_version_is_wan(version) || - sd_version_is_flux2(version) || - version == VERSION_CHROMA_RADIANCE; - if (!encode_outputs_mu) { - out_channels *= 2; - } - ne2 = out_channels; - ne3 = x->ne[3]; - } - result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3); - } - - if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { - x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]); - } - - if (!use_tiny_autoencoder) { - process_vae_input_tensor(x); - if (vae_tiling_params.enabled) { - float tile_overlap; - int tile_size_x, tile_size_y; - // multiply tile size for encode to keep the compute buffer size consistent - get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, vae_tiling_params, W, H, 1.30539f); - - LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y); - - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - return first_stage_model->compute(n_threads, in, false, &out, work_ctx); - }; - sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, circular_x, circular_y, on_tiling); - } else { - 
first_stage_model->compute(n_threads, x, false, &result, work_ctx); - } - first_stage_model->free_compute_buffer(); - } else { - if (vae_tiling_params.enabled) { - // split latent in 32x32 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - return tae_first_stage->compute(n_threads, in, false, &out, nullptr); - }; - sd_tiling(x, result, vae_scale_factor, 64, 0.5f, circular_x, circular_y, on_tiling); - } else { - tae_first_stage->compute(n_threads, x, false, &result, work_ctx); - } - tae_first_stage->free_compute_buffer(); - } - - int64_t t1 = ggml_time_ms(); - LOG_DEBUG("computing vae encode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - return result; - } - - ggml_tensor* gaussian_latent_sample(ggml_context* work_ctx, ggml_tensor* moments) { - // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample - ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); - ggml_ext_im_set_randn_f32(noise, rng); - { - float mean = 0; - float logvar = 0; - float value = 0; - float std_ = 0; - for (int i = 0; i < latent->ne[3]; i++) { - for (int j = 0; j < latent->ne[2]; j++) { - for (int k = 0; k < latent->ne[1]; k++) { - for (int l = 0; l < latent->ne[0]; l++) { - mean = ggml_ext_tensor_get_f32(moments, l, k, j, i); - logvar = ggml_ext_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i); - logvar = std::max(-30.0f, std::min(logvar, 20.0f)); - std_ = std::exp(0.5f * logvar); - value = mean + std_ * ggml_ext_tensor_get_f32(noise, l, k, j, i); - // printf("%d %d %d %d -> %f\n", i, j, k, l, value); - ggml_ext_tensor_set_f32(latent, value, l, k, j, i); - } - } - } - } - } - return latent; - } - - ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* vae_output) { - ggml_tensor* latent; - if (use_tiny_autoencoder || - 
sd_version_is_qwen_image(version) || - sd_version_is_anima(version) || - sd_version_is_wan(version) || - sd_version_is_flux2(version) || - version == VERSION_CHROMA_RADIANCE) { - latent = vae_output; - } else if (version == VERSION_SD1_PIX2PIX) { - latent = ggml_view_3d(work_ctx, - vae_output, - vae_output->ne[0], - vae_output->ne[1], - vae_output->ne[2] / 2, - vae_output->nb[1], - vae_output->nb[2], - 0); - } else { - latent = gaussian_latent_sample(work_ctx, vae_output); - } - if (!use_tiny_autoencoder && version != VERSION_SD1_PIX2PIX) { - process_latent_in(latent); - } - if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { - latent = ggml_reshape_4d(work_ctx, latent, latent->ne[0], latent->ne[1], latent->ne[3], 1); - } - return latent; + ggml_tensor* encode_to_vae_latents(ggml_context* work_ctx, ggml_tensor* x) { + ggml_tensor* vae_output = first_stage_model->encode(n_threads, work_ctx, x, vae_tiling_params, circular_x, circular_y); + ggml_tensor* latents = first_stage_model->vae_output_to_latents(work_ctx, vae_output, rng); + return latents; } ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { - ggml_tensor* vae_output = vae_encode(work_ctx, x); - return get_first_stage_encoding(work_ctx, vae_output); + ggml_tensor* latents = encode_to_vae_latents(work_ctx, x); + if (version != VERSION_SD1_PIX2PIX) { + latents = first_stage_model->vae_to_diffuison_latents(work_ctx, latents); + } + return latents; } ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode_video = false) { - const int vae_scale_factor = get_vae_scale_factor(); - int64_t W = x->ne[0] * vae_scale_factor; - int64_t H = x->ne[1] * vae_scale_factor; - int64_t C = 3; - ggml_tensor* result = nullptr; - if (decode_video) { - int64_t T = x->ne[2]; - if (sd_version_is_wan(version)) { - T = ((T - 1) * 4) + 1; - } - result = ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - W, - H, - T, - 3); - } else { - result = 
ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - W, - H, - C, - x->ne[3]); - } - int64_t t0 = ggml_time_ms(); - if (!use_tiny_autoencoder) { - if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { - x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]); - } - process_latent_out(x); - // x = load_tensor_from_file(work_ctx, "wan_vae_z.bin"); - if (vae_tiling_params.enabled) { - float tile_overlap; - int tile_size_x, tile_size_y; - get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, vae_tiling_params, x->ne[0], x->ne[1]); - - LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y); - - // split latent in 32x32 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - return first_stage_model->compute(n_threads, in, true, &out, nullptr); - }; - sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, circular_x, circular_y, on_tiling); - } else { - if (!first_stage_model->compute(n_threads, x, true, &result, work_ctx)) { - LOG_ERROR("Failed to decode latetnts"); - first_stage_model->free_compute_buffer(); - return nullptr; - } - } - first_stage_model->free_compute_buffer(); - process_vae_output_tensor(result); - } else { - if (vae_tiling_params.enabled) { - // split latent in 64x64 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - return tae_first_stage->compute(n_threads, in, true, &out); - }; - sd_tiling(x, result, vae_scale_factor, 64, 0.5f, circular_x, circular_y, on_tiling); - } else { - if (!tae_first_stage->compute(n_threads, x, true, &result)) { - LOG_ERROR("Failed to decode latetnts"); - tae_first_stage->free_compute_buffer(); - return nullptr; - } - } - tae_first_stage->free_compute_buffer(); - } - - int64_t t1 = ggml_time_ms(); - LOG_DEBUG("computing vae decode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f); - 
return result; + x = first_stage_model->diffusion_to_vae_latents(work_ctx, x); + x = first_stage_model->decode(n_threads, work_ctx, x, vae_tiling_params, decode_video, circular_x, circular_y); + return x; } void set_flow_shift(float flow_shift = INFINITY) { @@ -3573,7 +3174,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, int64_t t4 = ggml_time_ms(); LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { + if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->first_stage_model->free_params_buffer(); } @@ -3622,15 +3223,15 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g if (sd_ctx->sd->first_stage_model) { sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); } - if (sd_ctx->sd->tae_first_stage) { - sd_ctx->sd->tae_first_stage->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + if (sd_ctx->sd->preview_vae) { + sd_ctx->sd->preview_vae->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); } } else { int tile_size_x, tile_size_y; float _overlap; int latent_size_x = width / sd_ctx->sd->get_vae_scale_factor(); int latent_size_y = height / sd_ctx->sd->get_vae_scale_factor(); - sd_ctx->sd->get_tile_sizes(tile_size_x, tile_size_y, _overlap, sd_img_gen_params->vae_tiling_params, latent_size_x, latent_size_y); + sd_ctx->sd->first_stage_model->get_tile_sizes(tile_size_x, tile_size_y, _overlap, sd_img_gen_params->vae_tiling_params, latent_size_x, latent_size_y); // force disable circular padding for vae if tiling is enabled unless latent is smaller than tile size // otherwise it will cause artifacts at the edges of the tiles @@ -3640,8 +3241,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g if (sd_ctx->sd->first_stage_model) { sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, 
sd_ctx->sd->circular_y); } - if (sd_ctx->sd->tae_first_stage) { - sd_ctx->sd->tae_first_stage->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + if (sd_ctx->sd->preview_vae) { + sd_ctx->sd->preview_vae->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); } // disable circular tiling if it's enabled for the VAE @@ -4118,14 +3719,13 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s sd_image_to_ggml_tensor(sd_vid_gen_params->init_image, init_img); init_img = ggml_reshape_4d(work_ctx, init_img, width, height, 1, 3); - auto init_image_latent = sd_ctx->sd->vae_encode(work_ctx, init_img); // [b*c, 1, h/16, w/16] + auto init_image_latent = sd_ctx->sd->encode_to_vae_latents(work_ctx, init_img); // [b*c, 1, h/16, w/16] init_latent = sd_ctx->sd->generate_init_latent(work_ctx, width, height, frames, true); denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], 1); ggml_set_f32(denoise_mask, 1.f); - if (!sd_ctx->sd->use_tiny_autoencoder) - sd_ctx->sd->process_latent_out(init_latent); + init_latent = sd_ctx->sd->first_stage_model->diffusion_to_vae_latents(work_ctx, init_latent); ggml_ext_tensor_iter(init_image_latent, [&](ggml_tensor* t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { float value = ggml_ext_tensor_get_f32(t, i0, i1, i2, i3); @@ -4135,8 +3735,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } }); - if (!sd_ctx->sd->use_tiny_autoencoder) - sd_ctx->sd->process_latent_in(init_latent); + init_latent = sd_ctx->sd->first_stage_model->vae_to_diffuison_latents(work_ctx, init_latent); int64_t t2 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1); @@ -4359,7 +3958,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s struct ggml_tensor* vid = sd_ctx->sd->decode_first_stage(work_ctx, final_latent, true); int64_t t5 = ggml_time_ms(); 
LOG_INFO("decode_first_stage completed, taking %.2fs", (t5 - t4) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { + if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->first_stage_model->free_params_buffer(); } diff --git a/src/tae.hpp b/src/tae.hpp index 83152578..60df7b29 100644 --- a/src/tae.hpp +++ b/src/tae.hpp @@ -442,11 +442,13 @@ protected: bool decode_only; SDVersion version; +public: + int z_channels = 16; + public: TAEHV(bool decode_only = true, SDVersion version = VERSION_WAN2) : decode_only(decode_only), version(version) { - int z_channels = 16; - int patch = 1; + int patch = 1; if (version == VERSION_WAN2_2_TI2V) { z_channels = 48; patch = 2; @@ -494,10 +496,12 @@ protected: bool decode_only; bool taef2 = false; +public: + int z_channels = 4; + public: TAESD(bool decode_only = true, SDVersion version = VERSION_SD1) : decode_only(decode_only) { - int z_channels = 4; bool use_midblock_gn = false; taef2 = sd_version_is_flux2(version); @@ -533,20 +537,7 @@ public: } }; -struct TinyAutoEncoder : public GGMLRunner { - TinyAutoEncoder(ggml_backend_t backend, bool offload_params_to_cpu) - : GGMLRunner(backend, offload_params_to_cpu) {} - virtual bool compute(const int n_threads, - struct ggml_tensor* z, - bool decode_graph, - struct ggml_tensor** output, - struct ggml_context* output_ctx = nullptr) = 0; - - virtual bool load_from_file(const std::string& file_path, int n_threads) = 0; - virtual void get_param_tensors(std::map& tensors, const std::string prefix) = 0; -}; - -struct TinyImageAutoEncoder : public TinyAutoEncoder { +struct TinyImageAutoEncoder : public VAE { TAESD taesd; bool decode_only = false; @@ -558,7 +549,8 @@ struct TinyImageAutoEncoder : public TinyAutoEncoder { SDVersion version = VERSION_SD1) : decode_only(decoder_only), taesd(decoder_only, version), - TinyAutoEncoder(backend, offload_params_to_cpu) { + VAE(version, backend, offload_params_to_cpu) { + scale_input = false; 
taesd.init(params_ctx, tensor_storage_map, prefix); } @@ -566,37 +558,26 @@ struct TinyImageAutoEncoder : public TinyAutoEncoder { return "taesd"; } - bool load_from_file(const std::string& file_path, int n_threads) { - LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false"); - alloc_params_buffer(); - std::map taesd_tensors; - taesd.get_param_tensors(taesd_tensors); - std::set ignore_tensors; - if (decode_only) { - ignore_tensors.insert("encoder."); - } - - ModelLoader model_loader; - if (!model_loader.init_from_file_and_convert_name(file_path)) { - LOG_ERROR("init taesd model loader from file failed: '%s'", file_path.c_str()); - return false; - } - - bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors, n_threads); - - if (!success) { - LOG_ERROR("load tae tensors from model loader failed"); - return false; - } - - LOG_INFO("taesd model loaded"); - return success; - } - void get_param_tensors(std::map& tensors, const std::string prefix) { taesd.get_param_tensors(tensors, prefix); } + ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) { + return vae_output; + } + + ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) { + return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + } + + ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) { + return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + } + + int get_encoder_output_channels(int input_channels) { + return taesd.z_channels; + } + struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); z = to_backend(z); @@ -606,11 +587,11 @@ struct TinyImageAutoEncoder : public TinyAutoEncoder { return gf; } - bool compute(const int n_threads, - struct ggml_tensor* z, - bool decode_graph, - struct ggml_tensor** output, - struct ggml_context* output_ctx = nullptr) { + bool 
_compute(const int n_threads, + struct ggml_tensor* z, + bool decode_graph, + struct ggml_tensor** output, + struct ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(z, decode_graph); }; @@ -619,7 +600,7 @@ struct TinyImageAutoEncoder : public TinyAutoEncoder { } }; -struct TinyVideoAutoEncoder : public TinyAutoEncoder { +struct TinyVideoAutoEncoder : public VAE { TAEHV taehv; bool decode_only = false; @@ -631,7 +612,8 @@ struct TinyVideoAutoEncoder : public TinyAutoEncoder { SDVersion version = VERSION_WAN2) : decode_only(decoder_only), taehv(decoder_only, version), - TinyAutoEncoder(backend, offload_params_to_cpu) { + VAE(version, backend, offload_params_to_cpu) { + scale_input = false; taehv.init(params_ctx, tensor_storage_map, prefix); } @@ -639,37 +621,26 @@ struct TinyVideoAutoEncoder : public TinyAutoEncoder { return "taehv"; } - bool load_from_file(const std::string& file_path, int n_threads) { - LOG_INFO("loading taehv from '%s', decode_only = %s", file_path.c_str(), decode_only ? 
"true" : "false"); - alloc_params_buffer(); - std::map taehv_tensors; - taehv.get_param_tensors(taehv_tensors); - std::set ignore_tensors; - if (decode_only) { - ignore_tensors.insert("encoder."); - } - - ModelLoader model_loader; - if (!model_loader.init_from_file(file_path)) { - LOG_ERROR("init taehv model loader from file failed: '%s'", file_path.c_str()); - return false; - } - - bool success = model_loader.load_tensors(taehv_tensors, ignore_tensors, n_threads); - - if (!success) { - LOG_ERROR("load tae tensors from model loader failed"); - return false; - } - - LOG_INFO("taehv model loaded"); - return success; - } - void get_param_tensors(std::map& tensors, const std::string prefix) { taehv.get_param_tensors(tensors, prefix); } + ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) { + return vae_output; + } + + ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) { + return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + } + + ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) { + return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + } + + int get_encoder_output_channels(int input_channels) { + return taehv.z_channels; + } + struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); z = to_backend(z); @@ -679,11 +650,11 @@ struct TinyVideoAutoEncoder : public TinyAutoEncoder { return gf; } - bool compute(const int n_threads, - struct ggml_tensor* z, - bool decode_graph, - struct ggml_tensor** output, - struct ggml_context* output_ctx = nullptr) { + bool _compute(const int n_threads, + struct ggml_tensor* z, + bool decode_graph, + struct ggml_tensor** output, + struct ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(z, decode_graph); }; diff --git a/src/vae.hpp b/src/vae.hpp index 7ccba6ee..ad83e01a 100644 --- 
a/src/vae.hpp +++ b/src/vae.hpp @@ -3,635 +3,206 @@ #include "common_block.hpp" -/*================================================== AutoEncoderKL ===================================================*/ - -#define VAE_GRAPH_SIZE 20480 - -class ResnetBlock : public UnaryBlock { -protected: - int64_t in_channels; - int64_t out_channels; - -public: - ResnetBlock(int64_t in_channels, - int64_t out_channels) - : in_channels(in_channels), - out_channels(out_channels) { - // temb_channels is always 0 - blocks["norm1"] = std::shared_ptr(new GroupNorm32(in_channels)); - blocks["conv1"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); - - blocks["norm2"] = std::shared_ptr(new GroupNorm32(out_channels)); - blocks["conv2"] = std::shared_ptr(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); - - if (out_channels != in_channels) { - blocks["nin_shortcut"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {1, 1})); - } - } - - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { - // x: [N, in_channels, h, w] - // t_emb is always None - auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); - auto conv1 = std::dynamic_pointer_cast(blocks["conv1"]); - auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]); - auto conv2 = std::dynamic_pointer_cast(blocks["conv2"]); - - auto h = x; - h = norm1->forward(ctx, h); - h = ggml_silu_inplace(ctx->ggml_ctx, h); // swish - h = conv1->forward(ctx, h); - // return h; - - h = norm2->forward(ctx, h); - h = ggml_silu_inplace(ctx->ggml_ctx, h); // swish - // dropout, skip for inference - h = conv2->forward(ctx, h); - - // skip connection - if (out_channels != in_channels) { - auto nin_shortcut = std::dynamic_pointer_cast(blocks["nin_shortcut"]); - - x = nin_shortcut->forward(ctx, x); // [N, out_channels, h, w] - } - - h = ggml_add(ctx->ggml_ctx, h, x); - return h; // [N, out_channels, h, w] - } -}; - -class AttnBlock : public UnaryBlock { -protected: - int64_t 
in_channels; - bool use_linear; - - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") { - auto iter = tensor_storage_map.find(prefix + "proj_out.weight"); - if (iter != tensor_storage_map.end()) { - if (iter->second.n_dims == 4 && use_linear) { - use_linear = false; - blocks["q"] = std::make_shared(in_channels, in_channels, std::pair{1, 1}); - blocks["k"] = std::make_shared(in_channels, in_channels, std::pair{1, 1}); - blocks["v"] = std::make_shared(in_channels, in_channels, std::pair{1, 1}); - blocks["proj_out"] = std::make_shared(in_channels, in_channels, std::pair{1, 1}); - } else if (iter->second.n_dims == 2 && !use_linear) { - use_linear = true; - blocks["q"] = std::make_shared(in_channels, in_channels); - blocks["k"] = std::make_shared(in_channels, in_channels); - blocks["v"] = std::make_shared(in_channels, in_channels); - blocks["proj_out"] = std::make_shared(in_channels, in_channels); - } - } - } - -public: - AttnBlock(int64_t in_channels, bool use_linear) - : in_channels(in_channels), use_linear(use_linear) { - blocks["norm"] = std::shared_ptr(new GroupNorm32(in_channels)); - if (use_linear) { - blocks["q"] = std::shared_ptr(new Linear(in_channels, in_channels)); - blocks["k"] = std::shared_ptr(new Linear(in_channels, in_channels)); - blocks["v"] = std::shared_ptr(new Linear(in_channels, in_channels)); - blocks["proj_out"] = std::shared_ptr(new Linear(in_channels, in_channels)); - } else { - blocks["q"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); - blocks["k"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); - blocks["v"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); - blocks["proj_out"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); - } - } - - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { - // x: [N, in_channels, h, w] - auto norm = 
std::dynamic_pointer_cast(blocks["norm"]); - auto q_proj = std::dynamic_pointer_cast(blocks["q"]); - auto k_proj = std::dynamic_pointer_cast(blocks["k"]); - auto v_proj = std::dynamic_pointer_cast(blocks["v"]); - auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]); - - auto h_ = norm->forward(ctx, x); - - const int64_t n = h_->ne[3]; - const int64_t c = h_->ne[2]; - const int64_t h = h_->ne[1]; - const int64_t w = h_->ne[0]; - - ggml_tensor* q; - ggml_tensor* k; - ggml_tensor* v; - if (use_linear) { - h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 2, 0, 3)); // [N, h, w, in_channels] - h_ = ggml_reshape_3d(ctx->ggml_ctx, h_, c, h * w, n); // [N, h * w, in_channels] - - q = q_proj->forward(ctx, h_); // [N, h * w, in_channels] - k = k_proj->forward(ctx, h_); // [N, h * w, in_channels] - v = v_proj->forward(ctx, h_); // [N, h * w, in_channels] - } else { - q = q_proj->forward(ctx, h_); // [N, in_channels, h, w] - q = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, q, 1, 2, 0, 3)); // [N, h, w, in_channels] - q = ggml_reshape_3d(ctx->ggml_ctx, q, c, h * w, n); // [N, h * w, in_channels] - - k = k_proj->forward(ctx, h_); // [N, in_channels, h, w] - k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 1, 2, 0, 3)); // [N, h, w, in_channels] - k = ggml_reshape_3d(ctx->ggml_ctx, k, c, h * w, n); // [N, h * w, in_channels] - - v = v_proj->forward(ctx, h_); // [N, in_channels, h, w] - v = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, v, 1, 2, 0, 3)); // [N, h, w, in_channels] - v = ggml_reshape_3d(ctx->ggml_ctx, v, c, h * w, n); // [N, h * w, in_channels] - } - - h_ = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, 1, nullptr, false, ctx->flash_attn_enabled); - - if (use_linear) { - h_ = proj_out->forward(ctx, h_); // [N, h * w, in_channels] - - h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w] - h_ = ggml_reshape_4d(ctx->ggml_ctx, h_, w, h, c, n); // [N, 
in_channels, h, w] - } else { - h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w] - h_ = ggml_reshape_4d(ctx->ggml_ctx, h_, w, h, c, n); // [N, in_channels, h, w] - - h_ = proj_out->forward(ctx, h_); // [N, in_channels, h, w] - } - - h_ = ggml_add(ctx->ggml_ctx, h_, x); - return h_; - } -}; - -class AE3DConv : public Conv2d { -public: - AE3DConv(int64_t in_channels, - int64_t out_channels, - std::pair kernel_size, - int video_kernel_size = 3, - std::pair stride = {1, 1}, - std::pair padding = {0, 0}, - std::pair dilation = {1, 1}, - bool bias = true) - : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) { - int kernel_padding = video_kernel_size / 2; - blocks["time_mix_conv"] = std::shared_ptr(new Conv3d(out_channels, - out_channels, - {video_kernel_size, 1, 1}, - {1, 1, 1}, - {kernel_padding, 0, 0})); - } - - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x) override { - // timesteps always None - // skip_video always False - // x: [N, IC, IH, IW] - // result: [N, OC, OH, OW] - auto time_mix_conv = std::dynamic_pointer_cast(blocks["time_mix_conv"]); - - x = Conv2d::forward(ctx, x); - // timesteps = x.shape[0] - // x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps) - // x = conv3d(x) - // return rearrange(x, "b c t h w -> (b t) c h w") - int64_t T = x->ne[3]; - int64_t B = x->ne[3] / T; - int64_t C = x->ne[2]; - int64_t H = x->ne[1]; - int64_t W = x->ne[0]; - - x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w) - x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w) - x = time_mix_conv->forward(ctx, x); // [B, OC, T, OH * OW] - x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w) - x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w - return x; // [B*T, OC, OH, OW] - } -}; - -class 
VideoResnetBlock : public ResnetBlock { -protected: - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { - enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_storage_map, GGML_TYPE_F32); - params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); - } - - float get_alpha() { - float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]); - return sigmoid(alpha); - } - -public: - VideoResnetBlock(int64_t in_channels, - int64_t out_channels, - int video_kernel_size = 3) - : ResnetBlock(in_channels, out_channels) { - // merge_strategy is always learned - blocks["time_stack"] = std::shared_ptr(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true)); - } - - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { - // x: [N, in_channels, h, w] aka [b*t, in_channels, h, w] - // return: [N, out_channels, h, w] aka [b*t, out_channels, h, w] - // t_emb is always None - // skip_video is always False - // timesteps is always None - auto time_stack = std::dynamic_pointer_cast(blocks["time_stack"]); - - x = ResnetBlock::forward(ctx, x); // [N, out_channels, h, w] - // return x; - - int64_t T = x->ne[3]; - int64_t B = x->ne[3] / T; - int64_t C = x->ne[2]; - int64_t H = x->ne[1]; - int64_t W = x->ne[0]; - - x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w) - x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w) - auto x_mix = x; - - x = time_stack->forward(ctx, x); // b t c (h w) - - float alpha = get_alpha(); - x = ggml_add(ctx->ggml_ctx, - ggml_ext_scale(ctx->ggml_ctx, x, alpha), - ggml_ext_scale(ctx->ggml_ctx, x_mix, 1.0f - alpha)); - - x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w) - x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w - - 
return x; - } -}; - -// ldm.modules.diffusionmodules.model.Encoder -class Encoder : public GGMLBlock { -protected: - int ch = 128; - std::vector ch_mult = {1, 2, 4, 4}; - int num_res_blocks = 2; - int in_channels = 3; - int z_channels = 4; - bool double_z = true; - -public: - Encoder(int ch, - std::vector ch_mult, - int num_res_blocks, - int in_channels, - int z_channels, - bool double_z = true, - bool use_linear_projection = false) - : ch(ch), - ch_mult(ch_mult), - num_res_blocks(num_res_blocks), - in_channels(in_channels), - z_channels(z_channels), - double_z(double_z) { - blocks["conv_in"] = std::shared_ptr(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1})); - - size_t num_resolutions = ch_mult.size(); - - int block_in = 1; - for (int i = 0; i < num_resolutions; i++) { - if (i == 0) { - block_in = ch; - } else { - block_in = ch * ch_mult[i - 1]; - } - int block_out = ch * ch_mult[i]; - for (int j = 0; j < num_res_blocks; j++) { - std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j); - blocks[name] = std::shared_ptr(new ResnetBlock(block_in, block_out)); - block_in = block_out; - } - if (i != num_resolutions - 1) { - std::string name = "down." + std::to_string(i) + ".downsample"; - blocks[name] = std::shared_ptr(new DownSampleBlock(block_in, block_in, true)); - } - } - - blocks["mid.block_1"] = std::shared_ptr(new ResnetBlock(block_in, block_in)); - blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in, use_linear_projection)); - blocks["mid.block_2"] = std::shared_ptr(new ResnetBlock(block_in, block_in)); - - blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in)); - blocks["conv_out"] = std::shared_ptr(new Conv2d(block_in, double_z ? 
z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1})); - } - - virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { - // x: [N, in_channels, h, w] - - auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); - auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]); - auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]); - auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]); - auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]); - auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); - - auto h = conv_in->forward(ctx, x); // [N, ch, h, w] - - // downsampling - size_t num_resolutions = ch_mult.size(); - for (int i = 0; i < num_resolutions; i++) { - for (int j = 0; j < num_res_blocks; j++) { - std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j); - auto down_block = std::dynamic_pointer_cast(blocks[name]); - - h = down_block->forward(ctx, h); - } - if (i != num_resolutions - 1) { - std::string name = "down." 
+ std::to_string(i) + ".downsample"; - auto down_sample = std::dynamic_pointer_cast(blocks[name]); - - h = down_sample->forward(ctx, h); - } - } - - // middle - h = mid_block_1->forward(ctx, h); - h = mid_attn_1->forward(ctx, h); - h = mid_block_2->forward(ctx, h); // [N, block_in, h, w] - - // end - h = norm_out->forward(ctx, h); - h = ggml_silu_inplace(ctx->ggml_ctx, h); // nonlinearity/swish - h = conv_out->forward(ctx, h); // [N, z_channels*2, h, w] - return h; - } -}; - -// ldm.modules.diffusionmodules.model.Decoder -class Decoder : public GGMLBlock { -protected: - int ch = 128; - int out_ch = 3; - std::vector ch_mult = {1, 2, 4, 4}; - int num_res_blocks = 2; - int z_channels = 4; - bool video_decoder = false; - int video_kernel_size = 3; - - virtual std::shared_ptr get_conv_out(int64_t in_channels, - int64_t out_channels, - std::pair kernel_size, - std::pair stride = {1, 1}, - std::pair padding = {0, 0}) { - if (video_decoder) { - return std::shared_ptr(new AE3DConv(in_channels, out_channels, kernel_size, video_kernel_size, stride, padding)); - } else { - return std::shared_ptr(new Conv2d(in_channels, out_channels, kernel_size, stride, padding)); - } - } - - virtual std::shared_ptr get_resnet_block(int64_t in_channels, - int64_t out_channels) { - if (video_decoder) { - return std::shared_ptr(new VideoResnetBlock(in_channels, out_channels, video_kernel_size)); - } else { - return std::shared_ptr(new ResnetBlock(in_channels, out_channels)); - } - } - -public: - Decoder(int ch, - int out_ch, - std::vector ch_mult, - int num_res_blocks, - int z_channels, - bool use_linear_projection = false, - bool video_decoder = false, - int video_kernel_size = 3) - : ch(ch), - out_ch(out_ch), - ch_mult(ch_mult), - num_res_blocks(num_res_blocks), - z_channels(z_channels), - video_decoder(video_decoder), - video_kernel_size(video_kernel_size) { - int num_resolutions = static_cast(ch_mult.size()); - int block_in = ch * ch_mult[num_resolutions - 1]; - - blocks["conv_in"] = 
std::shared_ptr(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1})); - - blocks["mid.block_1"] = get_resnet_block(block_in, block_in); - blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in, use_linear_projection)); - blocks["mid.block_2"] = get_resnet_block(block_in, block_in); - - for (int i = num_resolutions - 1; i >= 0; i--) { - int mult = ch_mult[i]; - int block_out = ch * mult; - for (int j = 0; j < num_res_blocks + 1; j++) { - std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j); - blocks[name] = get_resnet_block(block_in, block_out); - - block_in = block_out; - } - if (i != 0) { - std::string name = "up." + std::to_string(i) + ".upsample"; - blocks[name] = std::shared_ptr(new UpSampleBlock(block_in, block_in)); - } - } - - blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in)); - blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1}); - } - - virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* z) { - // z: [N, z_channels, h, w] - // alpha is always 0 - // merge_strategy is always learned - // time_mode is always conv-only, so we need to replace conv_out_op/resnet_op to AE3DConv/VideoResBlock - // AttnVideoBlock will not be used - auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); - auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]); - auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]); - auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]); - auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]); - auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); - - // conv_in - auto h = conv_in->forward(ctx, z); // [N, block_in, h, w] - - // middle - h = mid_block_1->forward(ctx, h); - // return h; - - h = mid_attn_1->forward(ctx, h); - h = mid_block_2->forward(ctx, h); // [N, block_in, h, w] - - // upsampling - int num_resolutions = static_cast(ch_mult.size()); - for (int i = num_resolutions - 1; i 
>= 0; i--) { - for (int j = 0; j < num_res_blocks + 1; j++) { - std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j); - auto up_block = std::dynamic_pointer_cast(blocks[name]); - - h = up_block->forward(ctx, h); - } - if (i != 0) { - std::string name = "up." + std::to_string(i) + ".upsample"; - auto up_sample = std::dynamic_pointer_cast(blocks[name]); - - h = up_sample->forward(ctx, h); - } - } - - h = norm_out->forward(ctx, h); - h = ggml_silu_inplace(ctx->ggml_ctx, h); // nonlinearity/swish - h = conv_out->forward(ctx, h); // [N, out_ch, h*8, w*8] - return h; - } -}; - -// ldm.models.autoencoder.AutoencoderKL -class AutoencodingEngine : public GGMLBlock { +struct VAE : public GGMLRunner { protected: SDVersion version; - bool decode_only = true; - bool use_video_decoder = false; - bool use_quant = true; - int embed_dim = 4; - struct { - int z_channels = 4; - int resolution = 256; - int in_channels = 3; - int out_ch = 3; - int ch = 128; - std::vector ch_mult = {1, 2, 4, 4}; - int num_res_blocks = 2; - bool double_z = true; - } dd_config; + bool scale_input = true; + virtual bool _compute(const int n_threads, + struct ggml_tensor* z, + bool decode_graph, + struct ggml_tensor** output, + struct ggml_context* output_ctx) = 0; public: - AutoencodingEngine(SDVersion version = VERSION_SD1, - bool decode_only = true, - bool use_linear_projection = false, - bool use_video_decoder = false) - : version(version), decode_only(decode_only), use_video_decoder(use_video_decoder) { - if (sd_version_is_dit(version)) { - if (sd_version_is_flux2(version)) { - dd_config.z_channels = 32; - embed_dim = 32; + VAE(SDVersion version, ggml_backend_t backend, bool offload_params_to_cpu) + : version(version), GGMLRunner(backend, offload_params_to_cpu) {} + + int get_scale_factor() { + int scale_factor = 8; + if (version == VERSION_WAN2_2_TI2V) { + scale_factor = 16; + } else if (sd_version_is_flux2(version)) { + scale_factor = 16; + } else if (version == 
VERSION_CHROMA_RADIANCE) { + scale_factor = 1; + } + return scale_factor; + } + + virtual int get_encoder_output_channels(int input_channels) = 0; + + void get_tile_sizes(int& tile_size_x, + int& tile_size_y, + float& tile_overlap, + const sd_tiling_params_t& params, + int64_t latent_x, + int64_t latent_y, + float encoding_factor = 1.0f) { + tile_overlap = std::max(std::min(params.target_overlap, 0.5f), 0.0f); + auto get_tile_size = [&](int requested_size, float factor, int64_t latent_size) { + const int default_tile_size = 32; + const int min_tile_dimension = 4; + int tile_size = default_tile_size; + // factor <= 1 means simple fraction of the latent dimension + // factor > 1 means number of tiles across that dimension + if (factor > 0.f) { + if (factor > 1.0) + factor = 1 / (factor - factor * tile_overlap + tile_overlap); + tile_size = static_cast(std::round(latent_size * factor)); + } else if (requested_size >= min_tile_dimension) { + tile_size = requested_size; + } + tile_size = static_cast(tile_size * encoding_factor); + return std::max(std::min(tile_size, static_cast(latent_size)), min_tile_dimension); + }; + + tile_size_x = get_tile_size(params.tile_size_x, params.rel_size_x, latent_x); + tile_size_y = get_tile_size(params.tile_size_y, params.rel_size_y, latent_y); + } + + ggml_tensor* encode(int n_threads, + ggml_context* work_ctx, + ggml_tensor* x, + sd_tiling_params_t tiling_params, + bool circular_x = false, + bool circular_y = false) { + int64_t t0 = ggml_time_ms(); + ggml_tensor* result = nullptr; + const int scale_factor = get_scale_factor(); + int64_t W = x->ne[0] / scale_factor; + int64_t H = x->ne[1] / scale_factor; + int channel_dim = sd_version_is_wan(version) ? 
3 : 2; + int64_t C = get_encoder_output_channels(static_cast(x->ne[channel_dim])); + int64_t ne2; + int64_t ne3; + if (sd_version_is_wan(version)) { + int64_t T = x->ne[2]; + ne2 = (T - 1) / 4 + 1; + ne3 = C; + } else { + ne2 = C; + ne3 = x->ne[3]; + } + result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3); + + if (scale_input) { + scale_to_minus1_1(x); + } + + if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { + x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]); + } + + if (tiling_params.enabled) { + float tile_overlap; + int tile_size_x, tile_size_y; + // multiply tile size for encode to keep the compute buffer size consistent + get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, tiling_params, W, H, 1.30539f); + + LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y); + + auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { + return _compute(n_threads, in, false, &out, work_ctx); + }; + sd_tiling_non_square(x, result, scale_factor, tile_size_x, tile_size_y, tile_overlap, circular_x, circular_y, on_tiling); + } else { + _compute(n_threads, x, false, &result, work_ctx); + } + free_compute_buffer(); + + int64_t t1 = ggml_time_ms(); + LOG_DEBUG("computing vae encode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + return result; + } + + ggml_tensor* decode(int n_threads, + ggml_context* work_ctx, + ggml_tensor* x, + sd_tiling_params_t tiling_params, + bool decode_video = false, + bool circular_x = false, + bool circular_y = false, + ggml_tensor* result = nullptr, + bool silent = false) { + const int scale_factor = get_scale_factor(); + int64_t W = x->ne[0] * scale_factor; + int64_t H = x->ne[1] * scale_factor; + int64_t C = 3; + if (result == nullptr) { + if (decode_video) { + int64_t T = x->ne[2]; + if (sd_version_is_wan(version)) { + T = ((T - 1) * 4) + 1; + } + result = ggml_new_tensor_4d(work_ctx, + GGML_TYPE_F32, + W, + H, + T, + 3); } else { - use_quant = 
false; - dd_config.z_channels = 16; + result = ggml_new_tensor_4d(work_ctx, + GGML_TYPE_F32, + W, + H, + C, + x->ne[3]); } } - if (use_video_decoder) { - use_quant = false; + int64_t t0 = ggml_time_ms(); + if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { + x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]); } - blocks["decoder"] = std::shared_ptr(new Decoder(dd_config.ch, - dd_config.out_ch, - dd_config.ch_mult, - dd_config.num_res_blocks, - dd_config.z_channels, - use_linear_projection, - use_video_decoder)); - if (use_quant) { - blocks["post_quant_conv"] = std::shared_ptr(new Conv2d(dd_config.z_channels, - embed_dim, - {1, 1})); - } - if (!decode_only) { - blocks["encoder"] = std::shared_ptr(new Encoder(dd_config.ch, - dd_config.ch_mult, - dd_config.num_res_blocks, - dd_config.in_channels, - dd_config.z_channels, - dd_config.double_z, - use_linear_projection)); - if (use_quant) { - int factor = dd_config.double_z ? 2 : 1; + if (tiling_params.enabled) { + float tile_overlap; + int tile_size_x, tile_size_y; + get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, tiling_params, x->ne[0], x->ne[1]); - blocks["quant_conv"] = std::shared_ptr(new Conv2d(embed_dim * factor, - dd_config.z_channels * factor, - {1, 1})); + if (!silent) { + LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y); + } + + auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { + return _compute(n_threads, in, true, &out, nullptr); + }; + sd_tiling_non_square(x, result, scale_factor, tile_size_x, tile_size_y, tile_overlap, circular_x, circular_y, on_tiling, silent); + } else { + if (!_compute(n_threads, x, true, &result, work_ctx)) { + LOG_ERROR("Failed to decode latetnts"); + free_compute_buffer(); + return nullptr; } } + free_compute_buffer(); + if (scale_input) { + scale_to_0_1(result); + } + int64_t t1 = ggml_time_ms(); + LOG_DEBUG("computing vae decode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + 
ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f); + return result; } - struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) { - // z: [N, z_channels, h, w] - if (sd_version_is_flux2(version)) { - // [N, C*p*p, h, w] -> [N, C, h*p, w*p] - int64_t p = 2; - - int64_t N = z->ne[3]; - int64_t C = z->ne[2] / p / p; - int64_t h = z->ne[1]; - int64_t w = z->ne[0]; - int64_t H = h * p; - int64_t W = w * p; - - z = ggml_reshape_4d(ctx->ggml_ctx, z, w * h, p * p, C, N); // [N, C, p*p, h*w] - z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 1, 0, 2, 3)); // [N, C, h*w, p*p] - z = ggml_reshape_4d(ctx->ggml_ctx, z, p, p, w, h * C * N); // [N*C*h, w, p, p] - z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 0, 2, 1, 3)); // [N*C*h, p, w, p] - z = ggml_reshape_4d(ctx->ggml_ctx, z, W, H, C, N); // [N, C, h*p, w*p] - } - - if (use_quant) { - auto post_quant_conv = std::dynamic_pointer_cast(blocks["post_quant_conv"]); - z = post_quant_conv->forward(ctx, z); // [N, z_channels, h, w] - } - auto decoder = std::dynamic_pointer_cast(blocks["decoder"]); - - ggml_set_name(z, "bench-start"); - auto h = decoder->forward(ctx, z); - ggml_set_name(h, "bench-end"); - return h; - } - - struct ggml_tensor* encode(GGMLRunnerContext* ctx, struct ggml_tensor* x) { - // x: [N, in_channels, h, w] - auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); - - auto z = encoder->forward(ctx, x); // [N, 2*z_channels, h/8, w/8] - if (use_quant) { - auto quant_conv = std::dynamic_pointer_cast(blocks["quant_conv"]); - z = quant_conv->forward(ctx, z); // [N, 2*embed_dim, h/8, w/8] - } - if (sd_version_is_flux2(version)) { - z = ggml_ext_chunk(ctx->ggml_ctx, z, 2, 2)[0]; - - // [N, C, H, W] -> [N, C*p*p, H/p, W/p] - int64_t p = 2; - int64_t N = z->ne[3]; - int64_t C = z->ne[2]; - int64_t H = z->ne[1]; - int64_t W = z->ne[0]; - int64_t h = H / p; - int64_t w = W / p; - - z = ggml_reshape_4d(ctx->ggml_ctx, z, p, w, p, h * C * N); // [N*C*h, p, w, 
p] - z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 0, 2, 1, 3)); // [N*C*h, w, p, p] - z = ggml_reshape_4d(ctx->ggml_ctx, z, p * p, w * h, C, N); // [N, C, h*w, p*p] - z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 1, 0, 2, 3)); // [N, C, p*p, h*w] - z = ggml_reshape_4d(ctx->ggml_ctx, z, w, h, p * p * C, N); // [N, C*p*p, h*w] - } - return z; - } -}; - -struct VAE : public GGMLRunner { - VAE(ggml_backend_t backend, bool offload_params_to_cpu) - : GGMLRunner(backend, offload_params_to_cpu) {} - virtual bool compute(const int n_threads, - struct ggml_tensor* z, - bool decode_graph, - struct ggml_tensor** output, - struct ggml_context* output_ctx) = 0; - virtual void get_param_tensors(std::map& tensors, const std::string prefix) = 0; + virtual ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) = 0; + virtual ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) = 0; + virtual ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) = 0; + virtual void get_param_tensors(std::map& tensors, const std::string prefix) = 0; virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); }; }; struct FakeVAE : public VAE { - FakeVAE(ggml_backend_t backend, bool offload_params_to_cpu) - : VAE(backend, offload_params_to_cpu) {} - bool compute(const int n_threads, - struct ggml_tensor* z, - bool decode_graph, - struct ggml_tensor** output, - struct ggml_context* output_ctx) override { + FakeVAE(SDVersion version, ggml_backend_t backend, bool offload_params_to_cpu) + : VAE(version, backend, offload_params_to_cpu) {} + + int get_encoder_output_channels(int input_channels) { + return input_channels; + } + + bool _compute(const int n_threads, + struct ggml_tensor* z, + bool decode_graph, + struct ggml_tensor** output, + struct ggml_context* output_ctx) override { if (*output == nullptr && output_ctx != nullptr) { *output = 
ggml_dup_tensor(output_ctx, z); } @@ -642,6 +213,18 @@ struct FakeVAE : public VAE { return true; } + ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) { + return vae_output; + } + + ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) { + return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + } + + ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) { + return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + } + void get_param_tensors(std::map& tensors, const std::string prefix) override {} std::string get_desc() override { @@ -649,126 +232,4 @@ struct FakeVAE : public VAE { } }; -struct AutoEncoderKL : public VAE { - bool decode_only = true; - AutoencodingEngine ae; - - AutoEncoderKL(ggml_backend_t backend, - bool offload_params_to_cpu, - const String2TensorStorage& tensor_storage_map, - const std::string prefix, - bool decode_only = false, - bool use_video_decoder = false, - SDVersion version = VERSION_SD1) - : decode_only(decode_only), VAE(backend, offload_params_to_cpu) { - bool use_linear_projection = false; - for (const auto& [name, tensor_storage] : tensor_storage_map) { - if (!starts_with(name, prefix)) { - continue; - } - if (ends_with(name, "attn_1.proj_out.weight")) { - if (tensor_storage.n_dims == 2) { - use_linear_projection = true; - } - break; - } - } - ae = AutoencodingEngine(version, decode_only, use_linear_projection, use_video_decoder); - ae.init(params_ctx, tensor_storage_map, prefix); - } - - void set_conv2d_scale(float scale) override { - std::vector blocks; - ae.get_all_blocks(blocks); - for (auto block : blocks) { - if (block->get_desc() == "Conv2d") { - auto conv_block = (Conv2d*)block; - conv_block->set_scale(scale); - } - } - } - - std::string get_desc() override { - return "vae"; - } - - void get_param_tensors(std::map& tensors, const std::string prefix) override { - ae.get_param_tensors(tensors, prefix); - } - - 
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { - struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); - - z = to_backend(z); - - auto runner_ctx = get_context(); - - struct ggml_tensor* out = decode_graph ? ae.decode(&runner_ctx, z) : ae.encode(&runner_ctx, z); - - ggml_build_forward_expand(gf, out); - - return gf; - } - - bool compute(const int n_threads, - struct ggml_tensor* z, - bool decode_graph, - struct ggml_tensor** output, - struct ggml_context* output_ctx = nullptr) override { - GGML_ASSERT(!decode_only || decode_graph); - auto get_graph = [&]() -> struct ggml_cgraph* { - return build_graph(z, decode_graph); - }; - // ggml_set_f32(z, 0.5f); - // print_ggml_tensor(z); - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); - } - - void test() { - struct ggml_init_params params; - params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB - params.mem_buffer = nullptr; - params.no_alloc = false; - - struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); - - { - // CPU, x{1, 3, 64, 64}: Pass - // CUDA, x{1, 3, 64, 64}: Pass, but sill get wrong result for some image, may be due to interlnal nan - // CPU, x{2, 3, 64, 64}: Wrong result - // CUDA, x{2, 3, 64, 64}: Wrong result, and different from CPU result - auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 64, 64, 3, 2); - ggml_set_f32(x, 0.5f); - print_ggml_tensor(x); - struct ggml_tensor* out = nullptr; - - int64_t t0 = ggml_time_ms(); - compute(8, x, false, &out, work_ctx); - int64_t t1 = ggml_time_ms(); - - print_ggml_tensor(out); - LOG_DEBUG("encode test done in %lldms", t1 - t0); - } - - if (false) { - // CPU, z{1, 4, 8, 8}: Pass - // CUDA, z{1, 4, 8, 8}: Pass - // CPU, z{3, 4, 8, 8}: Wrong result - // CUDA, z{3, 4, 8, 8}: Wrong result, and different from CPU result - auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); - ggml_set_f32(z, 0.5f); - print_ggml_tensor(z); - struct ggml_tensor* out = nullptr; - 
- int64_t t0 = ggml_time_ms(); - compute(8, z, true, &out, work_ctx); - int64_t t1 = ggml_time_ms(); - - print_ggml_tensor(out); - LOG_DEBUG("decode test done in %lldms", t1 - t0); - } - }; -}; - -#endif +#endif // __VAE_HPP__ diff --git a/src/wan.hpp b/src/wan.hpp index d94fbd48..23119553 100644 --- a/src/wan.hpp +++ b/src/wan.hpp @@ -1109,7 +1109,8 @@ namespace WAN { }; struct WanVAERunner : public VAE { - bool decode_only = true; + float scale_factor = 1.0f; + bool decode_only = true; WanVAE ae; WanVAERunner(ggml_backend_t backend, @@ -1118,7 +1119,7 @@ namespace WAN { const std::string prefix = "", bool decode_only = false, SDVersion version = VERSION_WAN2) - : decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V), VAE(backend, offload_params_to_cpu) { + : decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V), VAE(version, backend, offload_params_to_cpu) { ae.init(params_ctx, tensor_storage_map, prefix); } @@ -1130,6 +1131,101 @@ namespace WAN { ae.get_param_tensors(tensors, prefix); } + ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) { + return vae_output; + } + + void get_latents_mean_std_vec(ggml_tensor* latents, int channel_dim, std::vector& latents_mean_vec, std::vector& latents_std_vec) { + GGML_ASSERT(latents->ne[channel_dim] == 16 || latents->ne[channel_dim] == 48); + if (latents->ne[channel_dim] == 16) { // Wan2.1 VAE + latents_mean_vec = {-0.7571f, -0.7089f, -0.9113f, 0.1075f, -0.1745f, 0.9653f, -0.1517f, 1.5508f, + 0.4134f, -0.0715f, 0.5517f, -0.3632f, -0.1922f, -0.9497f, 0.2503f, -0.2921f}; + latents_std_vec = {2.8184f, 1.4541f, 2.3275f, 2.6558f, 1.2196f, 1.7708f, 2.6052f, 2.0743f, + 3.2687f, 2.1526f, 2.8652f, 1.5579f, 1.6382f, 1.1253f, 2.8251f, 1.9160f}; + } else if (latents->ne[channel_dim] == 48) { // Wan2.2 VAE + latents_mean_vec = {-0.2289f, -0.0052f, -0.1323f, -0.2339f, -0.2799f, 0.0174f, 0.1838f, 0.1557f, + -0.1382f, 0.0542f, 0.2813f, 0.0891f, 
0.1570f, -0.0098f, 0.0375f, -0.1825f, + -0.2246f, -0.1207f, -0.0698f, 0.5109f, 0.2665f, -0.2108f, -0.2158f, 0.2502f, + -0.2055f, -0.0322f, 0.1109f, 0.1567f, -0.0729f, 0.0899f, -0.2799f, -0.1230f, + -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f, + 0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f}; + latents_std_vec = { + 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, + 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, + 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, + 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f, + 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, + 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}; + } + } + + ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) { + ggml_tensor* vae_latents = ggml_dup(work_ctx, latents); + int channel_dim = sd_version_is_wan(version) ? 3 : 2; + std::vector latents_mean_vec; + std::vector latents_std_vec; + get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec); + + float mean; + float std_; + for (int i = 0; i < latents->ne[3]; i++) { + if (channel_dim == 3) { + mean = latents_mean_vec[i]; + std_ = latents_std_vec[i]; + } + for (int j = 0; j < latents->ne[2]; j++) { + if (channel_dim == 2) { + mean = latents_mean_vec[j]; + std_ = latents_std_vec[j]; + } + for (int k = 0; k < latents->ne[1]; k++) { + for (int l = 0; l < latents->ne[0]; l++) { + float value = ggml_ext_tensor_get_f32(latents, l, k, j, i); + value = value * std_ / scale_factor + mean; + ggml_ext_tensor_set_f32(vae_latents, value, l, k, j, i); + } + } + } + } + + return vae_latents; + } + + ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) { + ggml_tensor* diffusion_latents = ggml_dup(work_ctx, latents); + int channel_dim = sd_version_is_wan(version) ? 
3 : 2; + std::vector latents_mean_vec; + std::vector latents_std_vec; + get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec); + + float mean; + float std_; + for (int i = 0; i < latents->ne[3]; i++) { + if (channel_dim == 3) { + mean = latents_mean_vec[i]; + std_ = latents_std_vec[i]; + } + for (int j = 0; j < latents->ne[2]; j++) { + if (channel_dim == 2) { + mean = latents_mean_vec[j]; + std_ = latents_std_vec[j]; + } + for (int k = 0; k < latents->ne[1]; k++) { + for (int l = 0; l < latents->ne[0]; l++) { + float value = ggml_ext_tensor_get_f32(latents, l, k, j, i); + value = (value - mean) * scale_factor / std_; + ggml_ext_tensor_set_f32(diffusion_latents, value, l, k, j, i); + } + } + } + } + return diffusion_latents; + } + + int get_encoder_output_channels(int input_channels) { + return static_cast(ae.z_dim); + } + struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { struct ggml_cgraph* gf = new_graph_custom(10240 * z->ne[2]); @@ -1173,11 +1269,11 @@ namespace WAN { return gf; } - bool compute(const int n_threads, - struct ggml_tensor* z, - bool decode_graph, - struct ggml_tensor** output, - struct ggml_context* output_ctx = nullptr) override { + bool _compute(const int n_threads, + struct ggml_tensor* z, + bool decode_graph, + struct ggml_tensor** output, + struct ggml_context* output_ctx = nullptr) override { if (true) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(z, decode_graph); @@ -1249,7 +1345,7 @@ namespace WAN { struct ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); - compute(8, z, true, &out, work_ctx); + _compute(8, z, true, &out, work_ctx); int64_t t1 = ggml_time_ms(); print_ggml_tensor(out); From 61d8331ef34dcdb28abcbd3993000b6f9dafba72 Mon Sep 17 00:00:00 2001 From: leejet Date: Sun, 15 Mar 2026 18:39:29 +0800 Subject: [PATCH 12/20] ci: avoid cuda docker build timeout by using -j16 --- Dockerfile.cuda | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/Dockerfile.cuda b/Dockerfile.cuda index 13fef89a..4deb7247 100644 --- a/Dockerfile.cuda +++ b/Dockerfile.cuda @@ -11,7 +11,7 @@ COPY . . ARG CUDACXX=/usr/local/cuda/bin/nvcc RUN cmake . -B ./build -DSD_CUDA=ON -RUN cmake --build ./build --config Release --parallel +RUN cmake --build ./build --config Release -j$(nproc) FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime From 862a6586cb6fcec037c14f9ed902329ecec7d990 Mon Sep 17 00:00:00 2001 From: leejet Date: Mon, 16 Mar 2026 00:26:57 +0800 Subject: [PATCH 13/20] feat: add embedded WebUI (#1207) --- .github/workflows/build.yml | 73 ++++++++++++++++++++++++++++ .gitmodules | 3 ++ examples/server/CMakeLists.txt | 67 +++++++++++++++++++++++++ examples/server/README.md | 89 ++++++++++++++++++++++++++++++++++ examples/server/frontend | 1 + examples/server/main.cpp | 14 +++++- 6 files changed, 245 insertions(+), 2 deletions(-) create mode 160000 examples/server/frontend diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9816e424..1fbcbf94 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -21,11 +21,13 @@ on: "**/*.c", "**/*.cpp", "**/*.cu", + "examples/server/frontend/**", ] pull_request: types: [opened, synchronize, reopened] paths: [ + ".github/workflows/**", "**/CMakeLists.txt", "**/Makefile", "**/*.h", @@ -33,6 +35,7 @@ on: "**/*.c", "**/*.cpp", "**/*.cu", + "examples/server/frontend/**", ] env: @@ -53,6 +56,16 @@ jobs: with: submodules: recursive + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: 20 + + - name: Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 9 + - name: Dependencies id: depends run: | @@ -106,6 +119,16 @@ jobs: with: submodules: recursive + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: 20 + + - name: Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 9 + - name: Dependencies id: depends run: | @@ -174,6 +197,16 @@ jobs: with: submodules: 
recursive + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: 20 + + - name: Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 9 + - name: Get commit hash id: commit if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} @@ -223,6 +256,16 @@ jobs: with: submodules: recursive + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: 20 + + - name: Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 9 + - name: Dependencies id: depends run: | @@ -294,6 +337,16 @@ jobs: with: submodules: recursive + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: 20 + + - name: Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 9 + - name: Install cuda-toolkit id: cuda-toolkit if: ${{ matrix.build == 'cuda12' }} @@ -399,6 +452,16 @@ jobs: with: submodules: recursive + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: 20 + + - name: Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 9 + - name: Cache ROCm Installation id: cache-rocm uses: actions/cache@v4 @@ -502,6 +565,16 @@ jobs: with: submodules: recursive + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: 20 + + - name: Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 9 + - name: Free disk space run: | # Remove preinstalled SDKs and caches not needed for this job diff --git a/.gitmodules b/.gitmodules index 5a785197..5d66c879 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "ggml"] path = ggml url = https://github.com/ggml-org/ggml.git +[submodule "examples/server/frontend"] + path = examples/server/frontend + url = https://github.com/leejet/stable-ui.git diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index d1912608..8f5beba8 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,6 +1,73 @@ set(TARGET sd-server) 
+option(SD_SERVER_BUILD_FRONTEND "Build server frontend with pnpm" ON) + +set(FRONTEND_DIR "${CMAKE_CURRENT_SOURCE_DIR}/frontend") +set(GENERATED_HTML_HEADER "${FRONTEND_DIR}/dist/gen_index_html.h") + +set(HAVE_FRONTEND_BUILD OFF) + +if(SD_SERVER_BUILD_FRONTEND AND EXISTS "${FRONTEND_DIR}") + if(WIN32) + find_program(PNPM_EXECUTABLE NAMES pnpm.cmd pnpm) + else() + find_program(PNPM_EXECUTABLE NAMES pnpm) + endif() + + if(PNPM_EXECUTABLE) + message(STATUS "Frontend dir found: ${FRONTEND_DIR}") + message(STATUS "pnpm found: ${PNPM_EXECUTABLE}") + + set(HAVE_FRONTEND_BUILD ON) + + add_custom_target(${TARGET}_frontend_install + COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" install + WORKING_DIRECTORY "${FRONTEND_DIR}" + COMMENT "Installing frontend dependencies" + VERBATIM + ) + + add_custom_target(${TARGET}_frontend_build + COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" run build + WORKING_DIRECTORY "${FRONTEND_DIR}" + COMMENT "Building frontend" + VERBATIM + ) + + add_custom_target(${TARGET}_frontend_header + COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" run build:header + WORKING_DIRECTORY "${FRONTEND_DIR}" + COMMENT "Generating gen_index_html.h" + VERBATIM + ) + + add_dependencies(${TARGET}_frontend_build ${TARGET}_frontend_install) + add_dependencies(${TARGET}_frontend_header ${TARGET}_frontend_build) + + add_custom_target(${TARGET}_frontend + DEPENDS ${TARGET}_frontend_header + ) + + set_source_files_properties("${GENERATED_HTML_HEADER}" PROPERTIES GENERATED TRUE) + else() + message(WARNING "pnpm not found, frontend build disabled") + endif() +else() + message(STATUS "Frontend disabled or directory not found: ${FRONTEND_DIR}") +endif() + add_executable(${TARGET} main.cpp) + +if(HAVE_FRONTEND_BUILD) + add_dependencies(${TARGET} ${TARGET}_frontend) + target_sources(${TARGET} PRIVATE "${GENERATED_HTML_HEADER}") + target_include_directories(${TARGET} PRIVATE "${FRONTEND_DIR}/dist") + target_compile_definitions(${TARGET} PRIVATE HAVE_INDEX_HTML) + 
message(STATUS "HAVE_INDEX_HTML enabled") +else() + message(STATUS "HAVE_INDEX_HTML disabled") +endif() + install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PUBLIC c_std_11 cxx_std_17) \ No newline at end of file diff --git a/examples/server/README.md b/examples/server/README.md index 38deff61..8aa2158f 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -1,3 +1,92 @@ +# Frontend + +## Build with Frontend + +The server can optionally build the web frontend and embed it into the binary as `gen_index_html.h`. + +### Requirements + +Install the following tools: + +* **Node.js** ≥ 22.18 + https://nodejs.org/ + +* **pnpm** ≥ 10 + Install via npm: + +```bash +npm install -g pnpm +``` + +Verify installation: + +```bash +node -v +pnpm -v +``` + +### Install frontend dependencies + +Go to the frontend directory and install dependencies: + +```bash +cd examples/server/frontend +pnpm install +``` + +### Build the server with CMake + +Enable the frontend build option when configuring CMake: + +```bash +cmake -B build -DSD_SERVER_BUILD_FRONTEND=ON +cmake --build build --config Release +``` + +If `pnpm` is available, the build system will automatically run: + +``` +pnpm run build +pnpm run build:header +``` + +and embed the generated frontend into the server binary. + +## Frontend Repository + +The web frontend is maintained in a **separate repository**, https://github.com/leejet/stable-ui. + +If you want to modify the UI or frontend logic, please submit pull requests to the **frontend repository**. + +This repository (`stable-diffusion.cpp`) only vendors the frontend periodically. Changes from the frontend repo are synchronized: + +* approximately **every 1–2 weeks**, or +* when there are **major frontend updates** + +Because of this, frontend changes will **not appear here immediately** after being merged upstream. 
+ +## Using an external frontend + +By default, the server uses the **embedded frontend** generated during the build (`gen_index_html.h`). + +You can also serve a custom frontend file instead of the embedded one by using: + +```bash +--serve-html-path +``` + +For example: + +```bash +sd-server --serve-html-path ./index.html +``` + +In this case, the server will load and serve the specified `index.html` file instead of the embedded frontend. This is useful when: + +* developing or testing frontend changes +* using a custom UI +* avoiding rebuilding the binary after frontend modifications + # Run ``` diff --git a/examples/server/frontend b/examples/server/frontend new file mode 160000 index 00000000..1a34176c --- /dev/null +++ b/examples/server/frontend @@ -0,0 +1 @@ +Subproject commit 1a34176cd6d39ad3a226b2b69047e71f6797f6bc diff --git a/examples/server/main.cpp b/examples/server/main.cpp index cc9e66cc..6e4340a6 100644 --- a/examples/server/main.cpp +++ b/examples/server/main.cpp @@ -13,6 +13,10 @@ #include "common/common.hpp" +#ifdef HAVE_INDEX_HTML +#include "frontend/dist/gen_index_html.h" +#endif + namespace fs = std::filesystem; // ----------------------- helpers ----------------------- @@ -380,7 +384,13 @@ int main(int argc, const char** argv) { return httplib::Server::HandlerResponse::Unhandled; }); - // root + // index html + std::string index_html; +#ifdef HAVE_INDEX_HTML + index_html.assign(reinterpret_cast(index_html_bytes), index_html_size); +#else + index_html = "Stable Diffusion Server is running"; +#endif svr.Get("/", [&](const httplib::Request&, httplib::Response& res) { if (!svr_params.serve_html_path.empty()) { std::ifstream file(svr_params.serve_html_path); @@ -392,7 +402,7 @@ int main(int argc, const char** argv) { res.set_content("Error: Unable to read HTML file", "text/plain"); } } else { - res.set_content("Stable Diffusion Server is running", "text/plain"); + res.set_content(index_html, "text/html"); } }); From 
997bb11fb6203f06fe65e92db413f579df2c4b47 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Mon, 16 Mar 2026 15:16:43 +0100 Subject: [PATCH 14/20] fix: correct encoder channels for flux2 (#1346) --- src/auto_encoder_kl.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/auto_encoder_kl.hpp b/src/auto_encoder_kl.hpp index 581bc59d..b703fd37 100644 --- a/src/auto_encoder_kl.hpp +++ b/src/auto_encoder_kl.hpp @@ -613,6 +613,9 @@ public: int get_encoder_output_channels() { int factor = dd_config.double_z ? 2 : 1; + if (sd_version_is_flux2(version)) { + return dd_config.z_channels * 4; + } return dd_config.z_channels * factor; } }; @@ -927,4 +930,4 @@ struct AutoEncoderKL : public VAE { }; }; -#endif // __AUTO_ENCODER_KL_HPP__ \ No newline at end of file +#endif // __AUTO_ENCODER_KL_HPP__ From 84cbd88df148345ac4ebb04e32f1760dba4166de Mon Sep 17 00:00:00 2001 From: leejet Date: Mon, 16 Mar 2026 22:17:22 +0800 Subject: [PATCH 15/20] style: remove redundant struct qualifiers for consistent C/C++ type usage (#1349) --- src/anima.hpp | 130 ++++----- src/auto_encoder_kl.hpp | 50 ++-- src/cache_dit.hpp | 4 +- src/clip.hpp | 118 ++++---- src/common_block.hpp | 48 ++-- src/conditioner.hpp | 120 ++++---- src/control.hpp | 80 +++--- src/denoiser.hpp | 62 ++--- src/diffusion_model.hpp | 86 +++--- src/esrgan.hpp | 22 +- src/flux.hpp | 286 +++++++++---------- src/ggml_extend.hpp | 582 +++++++++++++++++++-------------------- src/latent-preview.h | 2 +- src/llm.hpp | 180 ++++++------ src/lora.hpp | 32 +-- src/ltxv.hpp | 6 +- src/mmdit.hpp | 174 ++++++------ src/model.cpp | 18 +- src/model.h | 2 +- src/pmid.hpp | 204 +++++++------- src/preprocessing.hpp | 58 ++-- src/qwen_image.hpp | 106 +++---- src/rope.hpp | 24 +- src/spectrum.hpp | 4 +- src/stable-diffusion.cpp | 161 ++++++----- src/t5.hpp | 114 ++++---- src/tae.hpp | 80 +++--- src/unet.hpp | 114 ++++---- src/upscaler.cpp | 4 +- src/vae.hpp | 18 +- src/wan.hpp | 396 
+++++++++++++------------- src/z_image.hpp | 100 +++---- 32 files changed, 1692 insertions(+), 1693 deletions(-) diff --git a/src/anima.hpp b/src/anima.hpp index 191a096d..81dbefe7 100644 --- a/src/anima.hpp +++ b/src/anima.hpp @@ -13,9 +13,9 @@ namespace Anima { constexpr int ANIMA_GRAPH_SIZE = 65536; - __STATIC_INLINE__ struct ggml_tensor* apply_gate(struct ggml_context* ctx, - struct ggml_tensor* x, - struct ggml_tensor* gate) { + __STATIC_INLINE__ ggml_tensor* apply_gate(ggml_context* ctx, + ggml_tensor* x, + ggml_tensor* gate) { gate = ggml_reshape_3d(ctx, gate, gate->ne[0], 1, gate->ne[1]); // [N, 1, C] return ggml_mul(ctx, x, gate); } @@ -26,7 +26,7 @@ namespace Anima { blocks["proj.1"] = std::make_shared(in_dim, out_dim, false); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { auto proj = std::dynamic_pointer_cast(blocks["proj.1"]); return proj->forward(ctx, x); } @@ -39,7 +39,7 @@ namespace Anima { blocks["1.linear_2"] = std::make_shared(in_dim, out_dim, false); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { auto linear_1 = std::dynamic_pointer_cast(blocks["1.linear_1"]); auto linear_2 = std::dynamic_pointer_cast(blocks["1.linear_2"]); @@ -62,10 +62,10 @@ namespace Anima { blocks["2"] = std::make_shared(hidden_features, 3 * in_features, false); } - std::pair forward(GGMLRunnerContext* ctx, - struct ggml_tensor* hidden_states, - struct ggml_tensor* embedded_timestep, - struct ggml_tensor* temb = nullptr) { + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* hidden_states, + ggml_tensor* embedded_timestep, + ggml_tensor* temb = nullptr) { auto norm = std::dynamic_pointer_cast(blocks["norm"]); auto linear_1 = std::dynamic_pointer_cast(blocks["1"]); auto linear_2 = std::dynamic_pointer_cast(blocks["2"]); @@ -102,10 +102,10 @@ namespace Anima { blocks["2"] = 
std::make_shared(hidden_features, 2 * in_features, false); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* hidden_states, - struct ggml_tensor* embedded_timestep, - struct ggml_tensor* temb = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* hidden_states, + ggml_tensor* embedded_timestep, + ggml_tensor* temb = nullptr) { auto norm = std::dynamic_pointer_cast(blocks["norm"]); auto linear_1 = std::dynamic_pointer_cast(blocks["1"]); auto linear_2 = std::dynamic_pointer_cast(blocks["2"]); @@ -152,11 +152,11 @@ namespace Anima { blocks[this->out_proj_name] = std::make_shared(inner_dim, query_dim, false); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* hidden_states, - struct ggml_tensor* encoder_hidden_states = nullptr, - struct ggml_tensor* pe_q = nullptr, - struct ggml_tensor* pe_k = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* hidden_states, + ggml_tensor* encoder_hidden_states = nullptr, + ggml_tensor* pe_q = nullptr, + ggml_tensor* pe_k = nullptr) { if (encoder_hidden_states == nullptr) { encoder_hidden_states = hidden_states; } @@ -183,7 +183,7 @@ namespace Anima { q4 = q_norm->forward(ctx, q4); k4 = k_norm->forward(ctx, k4); - struct ggml_tensor* attn_out = nullptr; + ggml_tensor* attn_out = nullptr; if (pe_q != nullptr || pe_k != nullptr) { if (pe_q == nullptr) { pe_q = pe_k; @@ -227,7 +227,7 @@ namespace Anima { blocks["layer2"] = std::make_shared(hidden_dim, dim, false); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { auto layer1 = std::dynamic_pointer_cast(blocks["layer1"]); auto layer2 = std::dynamic_pointer_cast(blocks["layer2"]); @@ -245,7 +245,7 @@ namespace Anima { blocks["2"] = std::make_shared(hidden_dim, dim, true); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, 
ggml_tensor* x) { auto layer0 = std::dynamic_pointer_cast(blocks["0"]); auto layer2 = std::dynamic_pointer_cast(blocks["2"]); @@ -267,11 +267,11 @@ namespace Anima { blocks["mlp"] = std::make_shared(model_dim, model_dim * 4); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* context, - struct ggml_tensor* target_pe, - struct ggml_tensor* context_pe) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* context, + ggml_tensor* target_pe, + ggml_tensor* context_pe) { auto norm_self_attn = std::dynamic_pointer_cast(blocks["norm_self_attn"]); auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]); auto norm_cross_attn = std::dynamic_pointer_cast(blocks["norm_cross_attn"]); @@ -317,11 +317,11 @@ namespace Anima { blocks["norm"] = std::make_shared(target_dim, 1e-6f); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* source_hidden_states, - struct ggml_tensor* target_input_ids, - struct ggml_tensor* target_pe, - struct ggml_tensor* source_pe) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* source_hidden_states, + ggml_tensor* target_input_ids, + ggml_tensor* target_pe, + ggml_tensor* source_pe) { GGML_ASSERT(target_input_ids != nullptr); if (ggml_n_dims(target_input_ids) == 1) { target_input_ids = ggml_reshape_2d(ctx->ggml_ctx, target_input_ids, target_input_ids->ne[0], 1); @@ -360,12 +360,12 @@ namespace Anima { blocks["mlp"] = std::make_shared(hidden_size, hidden_size * mlp_ratio); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* hidden_states, - struct ggml_tensor* encoder_hidden_states, - struct ggml_tensor* embedded_timestep, - struct ggml_tensor* temb, - struct ggml_tensor* image_pe) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* hidden_states, + ggml_tensor* encoder_hidden_states, + ggml_tensor* embedded_timestep, + ggml_tensor* temb, + ggml_tensor* image_pe) { auto norm1 = 
std::dynamic_pointer_cast(blocks["adaln_modulation_self_attn"]); auto attn1 = std::dynamic_pointer_cast(blocks["self_attn"]); auto norm2 = std::dynamic_pointer_cast(blocks["adaln_modulation_cross_attn"]); @@ -402,10 +402,10 @@ namespace Anima { blocks["linear"] = std::make_shared(hidden_size, patch_size * patch_size * out_channels, false); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* hidden_states, - struct ggml_tensor* embedded_timestep, - struct ggml_tensor* temb) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* hidden_states, + ggml_tensor* embedded_timestep, + ggml_tensor* temb) { auto adaln = std::dynamic_pointer_cast(blocks["adaln_modulation"]); auto linear = std::dynamic_pointer_cast(blocks["linear"]); @@ -445,15 +445,15 @@ namespace Anima { blocks["llm_adapter"] = std::make_shared(1024, 1024, 1024, 6, 16); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* timestep, - struct ggml_tensor* encoder_hidden_states, - struct ggml_tensor* image_pe, - struct ggml_tensor* t5_ids = nullptr, - struct ggml_tensor* t5_weights = nullptr, - struct ggml_tensor* adapter_q_pe = nullptr, - struct ggml_tensor* adapter_k_pe = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* timestep, + ggml_tensor* encoder_hidden_states, + ggml_tensor* image_pe, + ggml_tensor* t5_ids = nullptr, + ggml_tensor* t5_weights = nullptr, + ggml_tensor* adapter_q_pe = nullptr, + ggml_tensor* adapter_k_pe = nullptr) { GGML_ASSERT(x->ne[3] == 1); auto x_embedder = std::dynamic_pointer_cast(blocks["x_embedder"]); @@ -553,7 +553,7 @@ namespace Anima { return "anima"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { net.get_param_tensors(tensors, prefix + ".net"); } @@ -602,13 +602,13 @@ namespace Anima { return Rope::embed_nd(ids, bs, axis_thetas, axes_dim); } - struct 
ggml_cgraph* build_graph(struct ggml_tensor* x, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, - struct ggml_tensor* t5_ids = nullptr, - struct ggml_tensor* t5_weights = nullptr) { + ggml_cgraph* build_graph(ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* t5_ids = nullptr, + ggml_tensor* t5_weights = nullptr) { GGML_ASSERT(x->ne[3] == 1); - struct ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE); + ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE); x = to_backend(x); timesteps = to_backend(timesteps); @@ -668,14 +668,14 @@ namespace Anima { } bool compute(int n_threads, - struct ggml_tensor* x, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, - struct ggml_tensor* t5_ids = nullptr, - struct ggml_tensor* t5_weights = nullptr, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr) { - auto get_graph = [&]() -> struct ggml_cgraph* { + ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* t5_ids = nullptr, + ggml_tensor* t5_weights = nullptr, + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, t5_ids, t5_weights); }; return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); diff --git a/src/auto_encoder_kl.hpp b/src/auto_encoder_kl.hpp index b703fd37..6efdb41a 100644 --- a/src/auto_encoder_kl.hpp +++ b/src/auto_encoder_kl.hpp @@ -29,7 +29,7 @@ public: } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { // x: [N, in_channels, h, w] // t_emb is always None auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); @@ -65,7 +65,7 @@ protected: int64_t in_channels; bool use_linear; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") { + 
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") { auto iter = tensor_storage_map.find(prefix + "proj_out.weight"); if (iter != tensor_storage_map.end()) { if (iter->second.n_dims == 4 && use_linear) { @@ -101,7 +101,7 @@ public: } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { // x: [N, in_channels, h, w] auto norm = std::dynamic_pointer_cast(blocks["norm"]); auto q_proj = std::dynamic_pointer_cast(blocks["q"]); @@ -178,8 +178,8 @@ public: {kernel_padding, 0, 0})); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x) override { // timesteps always None // skip_video always False // x: [N, IC, IH, IW] @@ -208,7 +208,7 @@ public: class VideoResnetBlock : public ResnetBlock { protected: - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_storage_map, GGML_TYPE_F32); params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); } @@ -227,7 +227,7 @@ public: blocks["time_stack"] = std::shared_ptr(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { // x: [N, in_channels, h, w] aka [b*t, in_channels, h, w] // return: [N, out_channels, h, w] aka [b*t, out_channels, h, w] // t_emb is always None @@ -317,7 +317,7 @@ public: blocks["conv_out"] = std::shared_ptr(new Conv2d(block_in, double_z ? 
z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1})); } - virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [N, in_channels, h, w] auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); @@ -435,7 +435,7 @@ public: blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1}); } - virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* z) { + virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* z) { // z: [N, z_channels, h, w] // alpha is always 0 // merge_strategy is always learned @@ -549,7 +549,7 @@ public: } } - struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) { + ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) { // z: [N, z_channels, h, w] if (sd_version_is_flux2(version)) { // [N, C*p*p, h, w] -> [N, C, h*p, w*p] @@ -581,7 +581,7 @@ public: return h; } - struct ggml_tensor* encode(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* encode(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [N, in_channels, h, w] auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); @@ -681,18 +681,18 @@ struct AutoEncoderKL : public VAE { return "vae"; } - void get_param_tensors(std::map& tensors, const std::string prefix) override { + void get_param_tensors(std::map& tensors, const std::string prefix) override { ae.get_param_tensors(tensors, prefix); } - struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { - struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); + ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) { + ggml_cgraph* gf = ggml_new_graph(compute_ctx); z = to_backend(z); auto runner_ctx = get_context(); - struct ggml_tensor* out = decode_graph ? ae.decode(&runner_ctx, z) : ae.encode(&runner_ctx, z); + ggml_tensor* out = decode_graph ? 
ae.decode(&runner_ctx, z) : ae.encode(&runner_ctx, z); ggml_build_forward_expand(gf, out); @@ -700,12 +700,12 @@ struct AutoEncoderKL : public VAE { } bool _compute(const int n_threads, - struct ggml_tensor* z, + ggml_tensor* z, bool decode_graph, - struct ggml_tensor** output, - struct ggml_context* output_ctx = nullptr) override { + ggml_tensor** output, + ggml_context* output_ctx = nullptr) override { GGML_ASSERT(!decode_only || decode_graph); - auto get_graph = [&]() -> struct ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(z, decode_graph); }; // ggml_set_f32(z, 0.5f); @@ -715,8 +715,8 @@ struct AutoEncoderKL : public VAE { ggml_tensor* gaussian_latent_sample(ggml_context* work_ctx, ggml_tensor* moments, std::shared_ptr rng) { // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample - ggml_tensor* latents = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latents); + ggml_tensor* latents = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); + ggml_tensor* noise = ggml_dup_tensor(work_ctx, latents); ggml_ext_im_set_randn_f32(noise, rng); { float mean = 0; @@ -884,12 +884,12 @@ struct AutoEncoderKL : public VAE { } void test() { - struct ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB params.mem_buffer = nullptr; params.no_alloc = false; - struct ggml_context* work_ctx = ggml_init(params); + ggml_context* work_ctx = ggml_init(params); GGML_ASSERT(work_ctx != nullptr); { @@ -900,7 +900,7 @@ struct AutoEncoderKL : public VAE { auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 64, 64, 3, 2); ggml_set_f32(x, 0.5f); print_ggml_tensor(x); - struct ggml_tensor* out = nullptr; + ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); _compute(8, x, false, &out, work_ctx); @@ 
-918,7 +918,7 @@ struct AutoEncoderKL : public VAE { auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); ggml_set_f32(z, 0.5f); print_ggml_tensor(z); - struct ggml_tensor* out = nullptr; + ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); _compute(8, z, true, &out, work_ctx); diff --git a/src/cache_dit.hpp b/src/cache_dit.hpp index 4e3cf693..9af627fb 100644 --- a/src/cache_dit.hpp +++ b/src/cache_dit.hpp @@ -799,7 +799,7 @@ struct CacheDitConditionState { } } - bool before_condition(const void* cond, struct ggml_tensor* input, struct ggml_tensor* output, float sigma, int step_index) { + bool before_condition(const void* cond, ggml_tensor* input, ggml_tensor* output, float sigma, int step_index) { if (!enabled() || step_index < 0) return false; @@ -867,7 +867,7 @@ struct CacheDitConditionState { return false; } - void after_condition(const void* cond, struct ggml_tensor* input, struct ggml_tensor* output) { + void after_condition(const void* cond, ggml_tensor* input, ggml_tensor* output) { if (!step_is_active()) return; diff --git a/src/clip.hpp b/src/clip.hpp index adecd4d2..f4e5ef78 100644 --- a/src/clip.hpp +++ b/src/clip.hpp @@ -473,7 +473,7 @@ public: } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [N, n_token, d_model] auto fc1 = std::dynamic_pointer_cast(blocks["fc1"]); auto fc2 = std::dynamic_pointer_cast(blocks["fc2"]); @@ -511,7 +511,7 @@ public: blocks["mlp"] = std::shared_ptr(new CLIPMLP(d_model, intermediate_size)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* mask = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* mask = nullptr) { // x: [N, n_token, d_model] auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]); auto layer_norm1 = std::dynamic_pointer_cast(blocks["layer_norm1"]); @@ -541,10 +541,10 @@ public: } } - struct 
ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* mask = nullptr, - int clip_skip = -1) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* mask = nullptr, + int clip_skip = -1) { // x: [N, n_token, d_model] int layer_idx = n_layer - 1; // LOG_DEBUG("clip_skip %d", clip_skip); @@ -573,7 +573,7 @@ protected: int64_t num_positions; bool force_clip_f32; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { enum ggml_type token_wtype = GGML_TYPE_F32; if (!force_clip_f32) { token_wtype = get_type(prefix + "token_embedding.weight", tensor_storage_map, GGML_TYPE_F32); @@ -597,13 +597,13 @@ public: force_clip_f32(force_clip_f32) { } - struct ggml_tensor* get_token_embed_weight() { + ggml_tensor* get_token_embed_weight() { return params["token_embedding.weight"]; } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* input_ids, - struct ggml_tensor* custom_embed_weight) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* custom_embed_weight) { // input_ids: [N, n_token] auto token_embed_weight = params["token_embedding.weight"]; auto position_embed_weight = params["position_embedding.weight"]; @@ -630,7 +630,7 @@ protected: int num_patches; int64_t num_positions; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { enum ggml_type patch_wtype = GGML_TYPE_F16; enum ggml_type class_wtype = GGML_TYPE_F32; enum ggml_type position_wtype = GGML_TYPE_F32; @@ -653,7 +653,7 @@ public: num_positions = num_patches + 1; } - 
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* pixel_values) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* pixel_values) { // pixel_values: [N, num_channels, image_size, image_size] // return: [N, num_positions, embed_dim] GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels); @@ -663,20 +663,20 @@ public: auto position_embed_weight = params["position_embedding.weight"]; // concat(patch_embedding, class_embedding) + position_embedding - struct ggml_tensor* patch_embedding; + ggml_tensor* patch_embedding; int64_t N = pixel_values->ne[3]; patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size] patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches] patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim] patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1] - struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N); - class_embedding = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding); // [N, embed_dim] - class_embedding = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1] + ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N); + class_embedding = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding); // [N, embed_dim] + class_embedding = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1] - struct ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1] - x = 
ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim] - x = ggml_add(ctx->ggml_ctx, x, position_embed_weight); + ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1] + x = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim] + x = ggml_add(ctx->ggml_ctx, x, position_embed_weight); return x; // [N, num_positions, embed_dim] } }; @@ -693,7 +693,7 @@ enum CLIPVersion { class CLIPTextModel : public GGMLBlock { protected: - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { if (version == OPEN_CLIP_VIT_BIGG_14) { enum ggml_type wtype = GGML_TYPE_F32; params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size); @@ -734,18 +734,18 @@ public: blocks["final_layer_norm"] = std::shared_ptr(new LayerNorm(hidden_size)); } - struct ggml_tensor* get_token_embed_weight() { + ggml_tensor* get_token_embed_weight() { auto embeddings = std::dynamic_pointer_cast(blocks["embeddings"]); return embeddings->get_token_embed_weight(); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* input_ids, - struct ggml_tensor* tkn_embeddings, - struct ggml_tensor* mask = nullptr, - size_t max_token_idx = 0, - bool return_pooled = false, - int clip_skip = -1) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* tkn_embeddings, + ggml_tensor* mask = nullptr, + size_t max_token_idx = 0, + bool return_pooled = false, + int clip_skip = -1) { // input_ids: [N, n_token] auto embeddings = std::dynamic_pointer_cast(blocks["embeddings"]); auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); @@ -804,10 +804,10 @@ public: blocks["post_layernorm"] = 
std::shared_ptr(new LayerNorm(hidden_size)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* pixel_values, - bool return_pooled = true, - int clip_skip = -1) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* pixel_values, + bool return_pooled = true, + int clip_skip = -1) { // pixel_values: [N, num_channels, image_size, image_size] auto embeddings = std::dynamic_pointer_cast(blocks["embeddings"]); auto pre_layernorm = std::dynamic_pointer_cast(blocks["pre_layernorm"]); @@ -839,7 +839,7 @@ protected: int64_t out_features; bool transpose_weight; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32); if (transpose_weight) { params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features); @@ -856,8 +856,8 @@ public: out_features(out_features), transpose_weight(transpose_weight) {} - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { - struct ggml_tensor* w = params["weight"]; + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + ggml_tensor* w = params["weight"]; if (transpose_weight) { w = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, w)); } @@ -886,10 +886,10 @@ public: blocks["visual_projection"] = std::shared_ptr(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* pixel_values, - bool return_pooled = true, - int clip_skip = -1) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* pixel_values, + bool return_pooled = true, + int clip_skip = -1) { // pixel_values: [N, num_channels, image_size, image_size] // return: [N, projection_dim] if 
return_pooled else [N, n_token, hidden_size] auto vision_model = std::dynamic_pointer_cast(blocks["vision_model"]); @@ -936,17 +936,17 @@ struct CLIPTextModelRunner : public GGMLRunner { return "clip"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { model.get_param_tensors(tensors, prefix); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* input_ids, - struct ggml_tensor* embeddings, - struct ggml_tensor* mask, - size_t max_token_idx = 0, - bool return_pooled = false, - int clip_skip = -1) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* embeddings, + ggml_tensor* mask, + size_t max_token_idx = 0, + bool return_pooled = false, + int clip_skip = -1) { size_t N = input_ids->ne[1]; size_t n_token = input_ids->ne[0]; if (input_ids->ne[0] > model.n_token) { @@ -957,17 +957,17 @@ struct CLIPTextModelRunner : public GGMLRunner { return model.forward(ctx, input_ids, embeddings, mask, max_token_idx, return_pooled, clip_skip); } - struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, - int num_custom_embeddings = 0, - void* custom_embeddings_data = nullptr, - size_t max_token_idx = 0, - bool return_pooled = false, - int clip_skip = -1) { - struct ggml_cgraph* gf = new_graph_custom(2048); + ggml_cgraph* build_graph(ggml_tensor* input_ids, + int num_custom_embeddings = 0, + void* custom_embeddings_data = nullptr, + size_t max_token_idx = 0, + bool return_pooled = false, + int clip_skip = -1) { + ggml_cgraph* gf = new_graph_custom(2048); input_ids = to_backend(input_ids); - struct ggml_tensor* embeddings = nullptr; + ggml_tensor* embeddings = nullptr; if (num_custom_embeddings > 0 && custom_embeddings_data != nullptr) { auto token_embed_weight = model.get_token_embed_weight(); @@ -997,7 +997,7 @@ struct CLIPTextModelRunner : public GGMLRunner { auto runner_ctx = get_context(); - struct ggml_tensor* 
hidden_states = forward(&runner_ctx, input_ids, embeddings, attention_mask, max_token_idx, return_pooled, clip_skip); + ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, attention_mask, max_token_idx, return_pooled, clip_skip); ggml_build_forward_expand(gf, hidden_states); @@ -1005,7 +1005,7 @@ struct CLIPTextModelRunner : public GGMLRunner { } bool compute(const int n_threads, - struct ggml_tensor* input_ids, + ggml_tensor* input_ids, int num_custom_embeddings, void* custom_embeddings_data, size_t max_token_idx, @@ -1013,7 +1013,7 @@ struct CLIPTextModelRunner : public GGMLRunner { int clip_skip, ggml_tensor** output, ggml_context* output_ctx = nullptr) { - auto get_graph = [&]() -> struct ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip); }; return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); diff --git a/src/common_block.hpp b/src/common_block.hpp index 435afa4f..2cef389a 100644 --- a/src/common_block.hpp +++ b/src/common_block.hpp @@ -23,7 +23,7 @@ public: } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [N, channels, h, w] if (vae_downsample) { auto conv = std::dynamic_pointer_cast(blocks["conv"]); @@ -52,7 +52,7 @@ public: blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [N, channels, h, w] auto conv = std::dynamic_pointer_cast(blocks["conv"]); @@ -121,7 +121,7 @@ public: } } - virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) { + virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* emb = nullptr) 
{ // For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml // [N, c, t, h, w] => [N, c, t, h * w] // x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w] @@ -188,7 +188,7 @@ public: blocks["proj"] = std::shared_ptr(new Linear(dim_in, dim_out * 2)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { // x: [ne3, ne2, ne1, dim_in] // return: [ne3, ne2, ne1, dim_out] auto proj = std::dynamic_pointer_cast(blocks["proj"]); @@ -214,7 +214,7 @@ public: blocks["proj"] = std::shared_ptr(new Linear(dim_in, dim_out, bias)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { // x: [ne3, ne2, ne1, dim_in] // return: [ne3, ne2, ne1, dim_out] auto proj = std::dynamic_pointer_cast(blocks["proj"]); @@ -258,7 +258,7 @@ public: blocks["net.2"] = std::shared_ptr(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [ne3, ne2, ne1, dim] // return: [ne3, ne2, ne1, dim_out] @@ -297,9 +297,9 @@ public: // to_out_1 is nn.Dropout(), skip for inference } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* context) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* context) { // x: [N, n_token, query_dim] // context: [N, n_context, context_dim] // return: [N, n_token, query_dim] @@ -355,9 +355,9 @@ public: } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* context) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* context) { // x: [N, n_token, query_dim] // context: [N, n_context, context_dim] 
// return: [N, n_token, query_dim] @@ -406,7 +406,7 @@ protected: int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2 bool use_linear = false; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") { auto iter = tensor_storage_map.find(prefix + "proj_out.weight"); if (iter != tensor_storage_map.end()) { int64_t inner_dim = n_head * d_head; @@ -456,9 +456,9 @@ public: } } - virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* context) { + virtual ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* context) { // x: [N, in_channels, h, w] // context: [N, max_position(aka n_token), hidden_size(aka context_dim)] auto norm = std::dynamic_pointer_cast(blocks["norm"]); @@ -510,7 +510,7 @@ public: class AlphaBlender : public GGMLBlock { protected: - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override { // Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix enum ggml_type wtype = GGML_TYPE_F32; params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); @@ -530,9 +530,9 @@ public: // since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x_spatial, - struct ggml_tensor* x_temporal) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x_spatial, + ggml_tensor* x_temporal) { // image_only_indicator is always tensor([0.]) float alpha = get_alpha(); auto x = ggml_add(ctx->ggml_ctx, @@ -555,10 +555,10 @@ public: blocks["time_mixer"] = 
std::shared_ptr(new AlphaBlender()); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* emb, - int num_video_frames) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* emb, + int num_video_frames) { // x: [N, channels, h, w] aka [b*t, channels, h, w] // emb: [N, emb_channels] aka [b*t, emb_channels] // image_only_indicator is always tensor([0.]) diff --git a/src/conditioner.hpp b/src/conditioner.hpp index d4a3146b..534a2f11 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -6,17 +6,17 @@ #include "t5.hpp" struct SDCondition { - struct ggml_tensor* c_crossattn = nullptr; // aka context - struct ggml_tensor* c_vector = nullptr; // aka y - struct ggml_tensor* c_concat = nullptr; + ggml_tensor* c_crossattn = nullptr; // aka context + ggml_tensor* c_vector = nullptr; // aka y + ggml_tensor* c_concat = nullptr; - std::vector extra_c_crossattns; + std::vector extra_c_crossattns; SDCondition() = default; - SDCondition(struct ggml_tensor* c_crossattn, - struct ggml_tensor* c_vector, - struct ggml_tensor* c_concat, - const std::vector& extra_c_crossattns = {}) + SDCondition(ggml_tensor* c_crossattn, + ggml_tensor* c_vector, + ggml_tensor* c_concat, + const std::vector& extra_c_crossattns = {}) : c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat), extra_c_crossattns(extra_c_crossattns) {} }; @@ -37,7 +37,7 @@ struct Conditioner { const ConditionerParams& conditioner_params) = 0; virtual void alloc_params_buffer() = 0; virtual void free_params_buffer() = 0; - virtual void get_param_tensors(std::map& tensors) = 0; + virtual void get_param_tensors(std::map& tensors) = 0; virtual size_t get_params_buffer_size() = 0; virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_weight_adapter(const std::shared_ptr& adapter) {} @@ -92,7 +92,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } } - void get_param_tensors(std::map& tensors) 
override { + void get_param_tensors(std::map& tensors) override { text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model"); if (sd_version_is_sdxl(version)) { text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model"); @@ -149,14 +149,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } return true; } - struct ggml_init_params params; - params.mem_size = 100 * 1024 * 1024; // max for custom embeddings 100 MB - params.mem_buffer = nullptr; - params.no_alloc = false; - struct ggml_context* embd_ctx = ggml_init(params); - struct ggml_tensor* embd = nullptr; - struct ggml_tensor* embd2 = nullptr; - auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) { + ggml_init_params params; + params.mem_size = 100 * 1024 * 1024; // max for custom embeddings 100 MB + params.mem_buffer = nullptr; + params.no_alloc = false; + ggml_context* embd_ctx = ggml_init(params); + ggml_tensor* embd = nullptr; + ggml_tensor* embd2 = nullptr; + auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) { if (tensor_storage.ne[0] != text_model->model.hidden_size) { if (text_model2) { if (tensor_storage.ne[0] == text_model2->model.hidden_size) { @@ -435,12 +435,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { int height, int adm_in_channels = -1, bool zero_out_masked = false) { - int64_t t0 = ggml_time_ms(); - struct ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size] - struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2] - struct ggml_tensor* chunk_hidden_states1 = nullptr; // [n_token, hidden_size] - struct ggml_tensor* chunk_hidden_states2 = nullptr; // [n_token, hidden_size2] - struct ggml_tensor* pooled = nullptr; + int64_t t0 = ggml_time_ms(); + ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size] + ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 
hidden_size] or [n_token, hidden_size + hidden_size2] + ggml_tensor* chunk_hidden_states1 = nullptr; // [n_token, hidden_size] + ggml_tensor* chunk_hidden_states2 = nullptr; // [n_token, hidden_size2] + ggml_tensor* pooled = nullptr; std::vector hidden_states_vec; if (clip_skip <= 0) { @@ -455,9 +455,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::vector chunk_weights(weights.begin() + chunk_idx * chunk_len, weights.begin() + (chunk_idx + 1) * chunk_len); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); - struct ggml_tensor* input_ids2 = nullptr; - size_t max_token_idx = 0; + auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); + ggml_tensor* input_ids2 = nullptr; + size_t max_token_idx = 0; if (sd_version_is_sdxl(version)) { auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID); if (it != chunk_tokens.end()) { @@ -676,18 +676,18 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner { return "clip_vision"; } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) { vision_model.get_param_tensors(tensors, "cond_stage_model.transformer"); } - struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values, bool return_pooled, int clip_skip) { - struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); + ggml_cgraph* build_graph(ggml_tensor* pixel_values, bool return_pooled, int clip_skip) { + ggml_cgraph* gf = ggml_new_graph(compute_ctx); pixel_values = to_backend(pixel_values); auto runner_ctx = get_context(); - struct ggml_tensor* hidden_states = vision_model.forward(&runner_ctx, pixel_values, return_pooled, clip_skip); + ggml_tensor* hidden_states = vision_model.forward(&runner_ctx, pixel_values, return_pooled, clip_skip); ggml_build_forward_expand(gf, hidden_states); @@ -700,7 +700,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner { int clip_skip, ggml_tensor** output, ggml_context* output_ctx) { - auto get_graph = [&]() -> struct 
ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(pixel_values, return_pooled, clip_skip); }; return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); @@ -746,7 +746,7 @@ struct SD3CLIPEmbedder : public Conditioner { } } - void get_param_tensors(std::map& tensors) override { + void get_param_tensors(std::map& tensors) override { if (clip_l) { clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model"); } @@ -909,15 +909,15 @@ struct SD3CLIPEmbedder : public Conditioner { clip_skip = 2; } - int64_t t0 = ggml_time_ms(); - struct ggml_tensor* hidden_states = nullptr; // [N, n_token*2, 4096] - struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token*2, 4096] - struct ggml_tensor* chunk_hidden_states_l = nullptr; // [n_token, hidden_size_l] - struct ggml_tensor* chunk_hidden_states_g = nullptr; // [n_token, hidden_size_g] - struct ggml_tensor* chunk_hidden_states_t5 = nullptr; // [n_token, hidden_size_t5] - struct ggml_tensor* pooled = nullptr; - struct ggml_tensor* pooled_l = nullptr; // [768,] - struct ggml_tensor* pooled_g = nullptr; // [1280,] + int64_t t0 = ggml_time_ms(); + ggml_tensor* hidden_states = nullptr; // [N, n_token*2, 4096] + ggml_tensor* chunk_hidden_states = nullptr; // [n_token*2, 4096] + ggml_tensor* chunk_hidden_states_l = nullptr; // [n_token, hidden_size_l] + ggml_tensor* chunk_hidden_states_g = nullptr; // [n_token, hidden_size_g] + ggml_tensor* chunk_hidden_states_t5 = nullptr; // [n_token, hidden_size_t5] + ggml_tensor* pooled = nullptr; + ggml_tensor* pooled_l = nullptr; // [768,] + ggml_tensor* pooled_g = nullptr; // [1280,] std::vector hidden_states_vec; size_t chunk_len = 77; @@ -1178,7 +1178,7 @@ struct FluxCLIPEmbedder : public Conditioner { } } - void get_param_tensors(std::map& tensors) override { + void get_param_tensors(std::map& tensors) override { if (clip_l) { clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model"); } @@ -1306,10 
+1306,10 @@ struct FluxCLIPEmbedder : public Conditioner { clip_skip = 2; } - int64_t t0 = ggml_time_ms(); - struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096] - struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096] - struct ggml_tensor* pooled = nullptr; // [768,] + int64_t t0 = ggml_time_ms(); + ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096] + ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096] + ggml_tensor* pooled = nullptr; // [768,] std::vector hidden_states_vec; size_t chunk_count = std::max(clip_l_tokens.size() > 0 ? chunk_len : 0, t5_tokens.size()) / chunk_len; @@ -1448,7 +1448,7 @@ struct T5CLIPEmbedder : public Conditioner { } } - void get_param_tensors(std::map& tensors) override { + void get_param_tensors(std::map& tensors) override { if (t5) { t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); } @@ -1523,7 +1523,7 @@ struct T5CLIPEmbedder : public Conditioner { return {t5_tokens, t5_weights, t5_mask}; } - void modify_mask_to_attend_padding(struct ggml_tensor* mask, int max_seq_length, int num_extra_padding = 8) { + void modify_mask_to_attend_padding(ggml_tensor* mask, int max_seq_length, int num_extra_padding = 8) { float* mask_data = (float*)mask->data; int num_pad = 0; for (int64_t i = 0; i < max_seq_length; i++) { @@ -1554,11 +1554,11 @@ struct T5CLIPEmbedder : public Conditioner { auto& t5_weights = std::get<1>(token_and_weights); auto& t5_attn_mask_vec = std::get<2>(token_and_weights); - int64_t t0 = ggml_time_ms(); - struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096] - struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096] - struct ggml_tensor* pooled = nullptr; - struct ggml_tensor* t5_attn_mask = vector_to_ggml_tensor(work_ctx, t5_attn_mask_vec); // [n_token] + int64_t t0 = ggml_time_ms(); + ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096] + ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096] + ggml_tensor* pooled = 
nullptr; + ggml_tensor* t5_attn_mask = vector_to_ggml_tensor(work_ctx, t5_attn_mask_vec); // [n_token] std::vector hidden_states_vec; @@ -1658,7 +1658,7 @@ struct AnimaConditioner : public Conditioner { false); } - void get_param_tensors(std::map& tensors) override { + void get_param_tensors(std::map& tensors) override { llm->get_param_tensors(tensors, "text_encoders.llm"); } @@ -1736,7 +1736,7 @@ struct AnimaConditioner : public Conditioner { auto input_ids = vector_to_ggml_tensor_i32(work_ctx, qwen_tokens); - struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 1024] + ggml_tensor* hidden_states = nullptr; // [N, n_token, 1024] llm->compute(n_threads, input_ids, nullptr, @@ -1763,8 +1763,8 @@ struct AnimaConditioner : public Conditioner { } } - struct ggml_tensor* t5_ids_tensor = nullptr; - struct ggml_tensor* t5_weight_tensor = nullptr; + ggml_tensor* t5_ids_tensor = nullptr; + ggml_tensor* t5_weight_tensor = nullptr; if (!t5_tokens.empty()) { t5_ids_tensor = vector_to_ggml_tensor_i32(work_ctx, t5_tokens); t5_weight_tensor = vector_to_ggml_tensor(work_ctx, t5_weights); @@ -1808,7 +1808,7 @@ struct LLMEmbedder : public Conditioner { enable_vision); } - void get_param_tensors(std::map& tensors) override { + void get_param_tensors(std::map& tensors) override { llm->get_param_tensors(tensors, "text_encoders.llm"); } @@ -1904,7 +1904,7 @@ struct LLMEmbedder : public Conditioner { tokenizer->pad_tokens(tokens, weights, max_length, true); } - struct ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size] + ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size] auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); diff --git a/src/control.hpp b/src/control.hpp index 5bab0381..93df10a4 100644 --- a/src/control.hpp +++ b/src/control.hpp @@ -164,26 +164,26 @@ public: blocks["middle_block_out.0"] = std::shared_ptr(make_zero_conv(ch)); } - struct ggml_tensor* resblock_forward(std::string name, - GGMLRunnerContext* ctx, - struct 
ggml_tensor* x, - struct ggml_tensor* emb) { + ggml_tensor* resblock_forward(std::string name, + GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* emb) { auto block = std::dynamic_pointer_cast(blocks[name]); return block->forward(ctx, x, emb); } - struct ggml_tensor* attention_layer_forward(std::string name, - GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* context) { + ggml_tensor* attention_layer_forward(std::string name, + GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* context) { auto block = std::dynamic_pointer_cast(blocks[name]); return block->forward(ctx, x, context); } - struct ggml_tensor* input_hint_block_forward(GGMLRunnerContext* ctx, - struct ggml_tensor* hint, - struct ggml_tensor* emb, - struct ggml_tensor* context) { + ggml_tensor* input_hint_block_forward(GGMLRunnerContext* ctx, + ggml_tensor* hint, + ggml_tensor* emb, + ggml_tensor* context) { int num_input_blocks = 15; auto h = hint; for (int i = 0; i < num_input_blocks; i++) { @@ -198,13 +198,13 @@ public: return h; } - std::vector forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* hint, - struct ggml_tensor* guided_hint, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, - struct ggml_tensor* y = nullptr) { + std::vector forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* hint, + ggml_tensor* guided_hint, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* y = nullptr) { // x: [N, in_channels, h, w] or [N, in_channels/2, h, w] // timesteps: [N,] // context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. 
for example, [N, 77, 768] @@ -246,7 +246,7 @@ public: emb = ggml_add(ctx->ggml_ctx, emb, label_emb); // [N, time_embed_dim] } - std::vector outs; + std::vector outs; if (guided_hint == nullptr) { guided_hint = input_hint_block_forward(ctx, hint, emb, context); @@ -312,9 +312,9 @@ struct ControlNet : public GGMLRunner { ggml_backend_buffer_t control_buffer = nullptr; // keep control output tensors in backend memory ggml_context* control_ctx = nullptr; - std::vector controls; // (12 input block outputs, 1 middle block output) SD 1.5 - struct ggml_tensor* guided_hint = nullptr; // guided_hint cache, for faster inference - bool guided_hint_cached = false; + std::vector controls; // (12 input block outputs, 1 middle block output) SD 1.5 + ggml_tensor* guided_hint = nullptr; // guided_hint cache, for faster inference + bool guided_hint_cached = false; ControlNet(ggml_backend_t backend, bool offload_params_to_cpu, @@ -328,8 +328,8 @@ struct ControlNet : public GGMLRunner { free_control_ctx(); } - void alloc_control_ctx(std::vector outs) { - struct ggml_init_params params; + void alloc_control_ctx(std::vector outs) { + ggml_init_params params; params.mem_size = static_cast(outs.size() * ggml_tensor_overhead()) + 1024 * 1024; params.mem_buffer = nullptr; params.no_alloc = true; @@ -370,16 +370,16 @@ struct ControlNet : public GGMLRunner { return "control_net"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { control_net.get_param_tensors(tensors, prefix); } - struct ggml_cgraph* build_graph(struct ggml_tensor* x, - struct ggml_tensor* hint, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, - struct ggml_tensor* y = nullptr) { - struct ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE); + ggml_cgraph* build_graph(ggml_tensor* x, + ggml_tensor* hint, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* y = nullptr) { + ggml_cgraph* gf = 
new_graph_custom(CONTROL_NET_GRAPH_SIZE); x = to_backend(x); if (guided_hint_cached) { @@ -414,18 +414,18 @@ struct ControlNet : public GGMLRunner { } bool compute(int n_threads, - struct ggml_tensor* x, - struct ggml_tensor* hint, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, - struct ggml_tensor* y, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr) { + ggml_tensor* x, + ggml_tensor* hint, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* y, + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size] // y: [N, adm_in_channels] or [1, adm_in_channels] - auto get_graph = [&]() -> struct ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, hint, timesteps, context, y); }; diff --git a/src/denoiser.hpp b/src/denoiser.hpp index 40bd7cb7..b92ca4e3 100644 --- a/src/denoiser.hpp +++ b/src/denoiser.hpp @@ -773,8 +773,8 @@ static bool sample_k_diffusion(sample_method_t method, // sample_euler_ancestral switch (method) { case EULER_A_SAMPLE_METHOD: { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); + ggml_tensor* d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -830,7 +830,7 @@ static bool sample_k_diffusion(sample_method_t method, } break; case EULER_SAMPLE_METHOD: // Implemented without any sigma churn { - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + ggml_tensor* d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -865,8 +865,8 @@ static bool sample_k_diffusion(sample_method_t method, } } break; case HEUN_SAMPLE_METHOD: { - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = 
ggml_dup_tensor(work_ctx, x); + ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise @@ -921,8 +921,8 @@ static bool sample_k_diffusion(sample_method_t method, } } break; case DPM2_SAMPLE_METHOD: { - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); + ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise @@ -979,8 +979,8 @@ static bool sample_k_diffusion(sample_method_t method, } break; case DPMPP2S_A_SAMPLE_METHOD: { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); + ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); + ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise @@ -1050,7 +1050,7 @@ static bool sample_k_diffusion(sample_method_t method, } break; case DPMPP2M_SAMPLE_METHOD: // DPM++ (2M) from Karras et al (2022) { - struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); + ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); auto t_fn = [](float sigma) -> float { return -log(sigma); }; @@ -1092,7 +1092,7 @@ static bool sample_k_diffusion(sample_method_t method, } break; case DPMPP2Mv2_SAMPLE_METHOD: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457 { - struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); + ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); auto t_fn = [](float sigma) -> float { return -log(sigma); }; @@ -1157,8 +1157,8 @@ static bool sample_k_diffusion(sample_method_t method, } float* vec_denoised = (float*)denoised->data; // d_cur = (x_cur - denoised) / sigma - struct ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x_cur); - float* vec_d_cur = (float*)d_cur->data; + ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, 
x_cur); + float* vec_d_cur = (float*)d_cur->data; for (int j = 0; j < ggml_nelements(d_cur); j++) { vec_d_cur[j] = (vec_x_cur[j] - vec_denoised[j]) / sigma; @@ -1225,11 +1225,11 @@ static bool sample_k_diffusion(sample_method_t method, float t_next = sigmas[i + 1]; // Denoising step - ggml_tensor* denoised = model(x, sigma, i + 1); - float* vec_denoised = (float*)denoised->data; - struct ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x); - float* vec_d_cur = (float*)d_cur->data; - float* vec_x = (float*)x->data; + ggml_tensor* denoised = model(x, sigma, i + 1); + float* vec_denoised = (float*)denoised->data; + ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x); + float* vec_d_cur = (float*)d_cur->data; + float* vec_x = (float*)x->data; // d_cur = (x - denoised) / sigma for (int j = 0; j < ggml_nelements(d_cur); j++) { @@ -1290,8 +1290,8 @@ static bool sample_k_diffusion(sample_method_t method, } break; case LCM_SAMPLE_METHOD: // Latent Consistency Models { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); + ggml_tensor* d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -1358,9 +1358,9 @@ static bool sample_k_diffusion(sample_method_t method, alphas_cumprod[i]); } - struct ggml_tensor* pred_original_sample = + ggml_tensor* pred_original_sample = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* variance_noise = + ggml_tensor* variance_noise = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { @@ -1422,7 +1422,7 @@ static bool sample_k_diffusion(sample_method_t method, // model_output = model() is the D(x, sigma) as // defined in Karras et al. (2022), p. 3, Table 1 and // p. 8 (7), compare also p. 38 (226) therein. 
- struct ggml_tensor* model_output = + ggml_tensor* model_output = model(x, sigma, i + 1); // Here model_output is still the k-diffusion denoiser // output, not the U-net output F_theta(c_in(sigma) x; @@ -1545,9 +1545,9 @@ static bool sample_k_diffusion(sample_method_t method, } int original_steps = 50; - struct ggml_tensor* pred_original_sample = + ggml_tensor* pred_original_sample = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* noise = + ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { @@ -1581,7 +1581,7 @@ static bool sample_k_diffusion(sample_method_t method, vec_x[j] *= std::sqrt(sigma * sigma + 1); } } - struct ggml_tensor* model_output = + ggml_tensor* model_output = model(x, sigma, i + 1); { float* vec_x = (float*)x->data; @@ -1689,8 +1689,8 @@ static bool sample_k_diffusion(sample_method_t method, } break; case RES_MULTISTEP_SAMPLE_METHOD: // Res Multistep sampler { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); + ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); + ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); bool have_old_sigma = false; float old_sigma_down = 0.0f; @@ -1797,9 +1797,9 @@ static bool sample_k_diffusion(sample_method_t method, } break; case RES_2S_SAMPLE_METHOD: // Res 2s sampler { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x0 = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); + ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); + ggml_tensor* x0 = ggml_dup_tensor(work_ctx, x); + ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); const float c2 = 0.5f; auto t_fn = [](float sigma) -> float { return -logf(sigma); }; diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index 329bb9d9..07d9df89 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -10,33 +10,33 @@ #include "z_image.hpp" struct DiffusionParams { - 
struct ggml_tensor* x = nullptr; - struct ggml_tensor* timesteps = nullptr; - struct ggml_tensor* context = nullptr; - struct ggml_tensor* c_concat = nullptr; - struct ggml_tensor* y = nullptr; - struct ggml_tensor* guidance = nullptr; - std::vector ref_latents = {}; - bool increase_ref_index = false; - int num_video_frames = -1; - std::vector controls = {}; - float control_strength = 0.f; - struct ggml_tensor* vace_context = nullptr; - float vace_strength = 1.f; - std::vector skip_layers = {}; + ggml_tensor* x = nullptr; + ggml_tensor* timesteps = nullptr; + ggml_tensor* context = nullptr; + ggml_tensor* c_concat = nullptr; + ggml_tensor* y = nullptr; + ggml_tensor* guidance = nullptr; + std::vector ref_latents = {}; + bool increase_ref_index = false; + int num_video_frames = -1; + std::vector controls = {}; + float control_strength = 0.f; + ggml_tensor* vace_context = nullptr; + float vace_strength = 1.f; + std::vector skip_layers = {}; }; struct DiffusionModel { - virtual std::string get_desc() = 0; + virtual std::string get_desc() = 0; virtual bool compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr) = 0; - virtual void alloc_params_buffer() = 0; - virtual void free_params_buffer() = 0; - virtual void free_compute_buffer() = 0; - virtual void get_param_tensors(std::map& tensors) = 0; - virtual size_t get_params_buffer_size() = 0; + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) = 0; + virtual void alloc_params_buffer() = 0; + virtual void free_params_buffer() = 0; + virtual void free_compute_buffer() = 0; + virtual void get_param_tensors(std::map& tensors) = 0; + virtual size_t get_params_buffer_size() = 0; virtual void set_weight_adapter(const std::shared_ptr& adapter){}; virtual int64_t get_adm_in_channels() = 0; virtual void set_flash_attention_enabled(bool enabled) = 0; @@ -69,7 +69,7 @@ struct UNetModel : public DiffusionModel { 
unet.free_compute_buffer(); } - void get_param_tensors(std::map& tensors) override { + void get_param_tensors(std::map& tensors) override { unet.get_param_tensors(tensors, "model.diffusion_model"); } @@ -95,8 +95,8 @@ struct UNetModel : public DiffusionModel { bool compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr) override { + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) override { return unet.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, @@ -134,7 +134,7 @@ struct MMDiTModel : public DiffusionModel { mmdit.free_compute_buffer(); } - void get_param_tensors(std::map& tensors) override { + void get_param_tensors(std::map& tensors) override { mmdit.get_param_tensors(tensors, "model.diffusion_model"); } @@ -160,8 +160,8 @@ struct MMDiTModel : public DiffusionModel { bool compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr) override { + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) override { return mmdit.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, @@ -200,7 +200,7 @@ struct FluxModel : public DiffusionModel { flux.free_compute_buffer(); } - void get_param_tensors(std::map& tensors) override { + void get_param_tensors(std::map& tensors) override { flux.get_param_tensors(tensors, "model.diffusion_model"); } @@ -226,8 +226,8 @@ struct FluxModel : public DiffusionModel { bool compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr) override { + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) override { return flux.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, @@ -270,7 +270,7 @@ struct AnimaModel : public DiffusionModel { anima.free_compute_buffer(); } - void 
get_param_tensors(std::map& tensors) override { + void get_param_tensors(std::map& tensors) override { anima.get_param_tensors(tensors, prefix); } @@ -296,8 +296,8 @@ struct AnimaModel : public DiffusionModel { bool compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr) override { + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) override { return anima.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, @@ -337,7 +337,7 @@ struct WanModel : public DiffusionModel { wan.free_compute_buffer(); } - void get_param_tensors(std::map& tensors) override { + void get_param_tensors(std::map& tensors) override { wan.get_param_tensors(tensors, prefix); } @@ -363,8 +363,8 @@ struct WanModel : public DiffusionModel { bool compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr) override { + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) override { return wan.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, @@ -408,7 +408,7 @@ struct QwenImageModel : public DiffusionModel { qwen_image.free_compute_buffer(); } - void get_param_tensors(std::map& tensors) override { + void get_param_tensors(std::map& tensors) override { qwen_image.get_param_tensors(tensors, prefix); } @@ -434,8 +434,8 @@ struct QwenImageModel : public DiffusionModel { bool compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr) override { + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) override { return qwen_image.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, @@ -475,7 +475,7 @@ struct ZImageModel : public DiffusionModel { z_image.free_compute_buffer(); } - void get_param_tensors(std::map& tensors) override { + void 
get_param_tensors(std::map& tensors) override { z_image.get_param_tensors(tensors, prefix); } @@ -501,8 +501,8 @@ struct ZImageModel : public DiffusionModel { bool compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr) override { + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) override { return z_image.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, diff --git a/src/esrgan.hpp b/src/esrgan.hpp index f740c2bc..efb3aed6 100644 --- a/src/esrgan.hpp +++ b/src/esrgan.hpp @@ -27,11 +27,11 @@ public: blocks["conv5"] = std::shared_ptr(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1})); } - struct ggml_tensor* lrelu(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* lrelu(GGMLRunnerContext* ctx, ggml_tensor* x) { return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [n, num_feat, h, w] // return: [n, num_feat, h, w] @@ -64,7 +64,7 @@ public: blocks["rdb3"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [n, num_feat, h, w] // return: [n, num_feat, h, w] @@ -112,11 +112,11 @@ public: int get_scale() { return scale; } int get_num_block() { return num_block; } - struct ggml_tensor* lrelu(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* lrelu(GGMLRunnerContext* ctx, ggml_tensor* x) { return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [n, num_in_ch, h, w] // return: [n, num_out_ch, h*scale, w*scale] auto conv_first = 
std::dynamic_pointer_cast(blocks["conv_first"]); @@ -341,24 +341,24 @@ struct ESRGAN : public GGMLRunner { return success; } - struct ggml_cgraph* build_graph(struct ggml_tensor* x) { + ggml_cgraph* build_graph(ggml_tensor* x) { if (!rrdb_net) return nullptr; constexpr int kGraphNodes = 1 << 16; // 65k - struct ggml_cgraph* gf = new_graph_custom(kGraphNodes); + ggml_cgraph* gf = new_graph_custom(kGraphNodes); x = to_backend(x); - auto runner_ctx = get_context(); - struct ggml_tensor* out = rrdb_net->forward(&runner_ctx, x); + auto runner_ctx = get_context(); + ggml_tensor* out = rrdb_net->forward(&runner_ctx, x); ggml_build_forward_expand(gf, out); return gf; } bool compute(const int n_threads, - struct ggml_tensor* x, + ggml_tensor* x, ggml_tensor** output, ggml_context* output_ctx = nullptr) { - auto get_graph = [&]() -> struct ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x); }; return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); diff --git a/src/flux.hpp b/src/flux.hpp index 1204ae1e..93b9350a 100644 --- a/src/flux.hpp +++ b/src/flux.hpp @@ -19,7 +19,7 @@ namespace Flux { blocks["out_layer"] = std::shared_ptr(new Linear(hidden_dim, hidden_dim, bias)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { // x: [..., in_dim] // return: [..., hidden_dim] auto in_layer = std::dynamic_pointer_cast(blocks["in_layer"]); @@ -37,7 +37,7 @@ namespace Flux { int64_t hidden_size; float eps; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { ggml_type wtype = GGML_TYPE_F32; params["scale"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); } @@ -48,10 +48,10 @@ namespace Flux { : 
hidden_size(hidden_size), eps(eps) {} - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { - struct ggml_tensor* w = params["scale"]; - x = ggml_rms_norm(ctx->ggml_ctx, x, eps); - x = ggml_mul(ctx->ggml_ctx, x, w); + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + ggml_tensor* w = params["scale"]; + x = ggml_rms_norm(ctx->ggml_ctx, x, eps); + x = ggml_mul(ctx->ggml_ctx, x, w); return x; } }; @@ -63,7 +63,7 @@ namespace Flux { blocks["key_norm"] = std::shared_ptr(new RMSNorm(dim)); } - struct ggml_tensor* query_norm(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* query_norm(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [..., dim] // return: [..., dim] auto norm = std::dynamic_pointer_cast(blocks["query_norm"]); @@ -72,7 +72,7 @@ namespace Flux { return x; } - struct ggml_tensor* key_norm(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* key_norm(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [..., dim] // return: [..., dim] auto norm = std::dynamic_pointer_cast(blocks["key_norm"]); @@ -98,7 +98,7 @@ namespace Flux { blocks["proj"] = std::shared_ptr(new Linear(dim, dim, proj_bias)); } - std::vector pre_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + std::vector pre_attention(GGMLRunnerContext* ctx, ggml_tensor* x) { auto qkv_proj = std::dynamic_pointer_cast(blocks["qkv"]); auto norm = std::dynamic_pointer_cast(blocks["norm"]); @@ -115,17 +115,17 @@ namespace Flux { return {q, k, v}; } - struct ggml_tensor* post_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* post_attention(GGMLRunnerContext* ctx, ggml_tensor* x) { auto proj = std::dynamic_pointer_cast(blocks["proj"]); x = proj->forward(ctx, x); // [N, n_token, dim] return x; } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* pe, - struct ggml_tensor* mask) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + 
ggml_tensor* pe, + ggml_tensor* mask) { // x: [N, n_token, dim] // pe: [n_token, d_head/2, 2, 2] // return [N, n_token, dim] @@ -147,7 +147,7 @@ namespace Flux { blocks["2"] = std::make_shared(intermediate_size, hidden_size, bias); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { auto mlp_0 = std::dynamic_pointer_cast(blocks["0"]); auto mlp_2 = std::dynamic_pointer_cast(blocks["2"]); @@ -170,7 +170,7 @@ namespace Flux { blocks["down_proj"] = std::make_shared(intermediate_size, hidden_size, bias); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { auto gate_proj = std::dynamic_pointer_cast(blocks["gate_proj"]); auto up_proj = std::dynamic_pointer_cast(blocks["up_proj"]); auto down_proj = std::dynamic_pointer_cast(blocks["down_proj"]); @@ -212,7 +212,7 @@ namespace Flux { blocks["lin"] = std::shared_ptr(new Linear(dim, dim * multiplier, bias)); } - std::vector forward(GGMLRunnerContext* ctx, struct ggml_tensor* vec) { + std::vector forward(GGMLRunnerContext* ctx, ggml_tensor* vec) { // x: [N, dim] // return: [ModulationOut, ModulationOut] auto lin = std::dynamic_pointer_cast(blocks["lin"]); @@ -232,11 +232,11 @@ namespace Flux { } }; - __STATIC_INLINE__ struct ggml_tensor* modulate(struct ggml_context* ctx, - struct ggml_tensor* x, - struct ggml_tensor* shift, - struct ggml_tensor* scale, - bool skip_reshape = false) { + __STATIC_INLINE__ ggml_tensor* modulate(ggml_context* ctx, + ggml_tensor* x, + ggml_tensor* shift, + ggml_tensor* scale, + bool skip_reshape = false) { // x: [N, L, C] // scale: [N, C] // shift: [N, C] @@ -294,7 +294,7 @@ namespace Flux { } } - std::vector get_distil_img_mod(GGMLRunnerContext* ctx, struct ggml_tensor* vec) { + std::vector get_distil_img_mod(GGMLRunnerContext* ctx, ggml_tensor* vec) { // TODO: not hardcoded? 
const int single_blocks_count = 38; const int double_blocks_count = 19; @@ -303,7 +303,7 @@ namespace Flux { return {ModulationOut(ctx, vec, offset), ModulationOut(ctx, vec, offset + 3)}; } - std::vector get_distil_txt_mod(GGMLRunnerContext* ctx, struct ggml_tensor* vec) { + std::vector get_distil_txt_mod(GGMLRunnerContext* ctx, ggml_tensor* vec) { // TODO: not hardcoded? const int single_blocks_count = 38; const int double_blocks_count = 19; @@ -312,14 +312,14 @@ namespace Flux { return {ModulationOut(ctx, vec, offset), ModulationOut(ctx, vec, offset + 3)}; } - std::pair forward(GGMLRunnerContext* ctx, - struct ggml_tensor* img, - struct ggml_tensor* txt, - struct ggml_tensor* vec, - struct ggml_tensor* pe, - struct ggml_tensor* mask = nullptr, - std::vector img_mods = {}, - std::vector txt_mods = {}) { + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* img, + ggml_tensor* txt, + ggml_tensor* vec, + ggml_tensor* pe, + ggml_tensor* mask = nullptr, + std::vector img_mods = {}, + std::vector txt_mods = {}) { // img: [N, n_img_token, hidden_size] // txt: [N, n_txt_token, hidden_size] // pe: [n_img_token + n_txt_token, d_head/2, 2, 2] @@ -457,17 +457,17 @@ namespace Flux { } } - ModulationOut get_distil_mod(GGMLRunnerContext* ctx, struct ggml_tensor* vec) { + ModulationOut get_distil_mod(GGMLRunnerContext* ctx, ggml_tensor* vec) { int64_t offset = 3 * idx; return ModulationOut(ctx, vec, offset); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* vec, - struct ggml_tensor* pe, - struct ggml_tensor* mask = nullptr, - std::vector mods = {}) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* vec, + ggml_tensor* pe, + ggml_tensor* mask = nullptr, + std::vector mods = {}) { // x: [N, n_token, hidden_size] // pe: [n_token, d_head/2, 2, 2] // return: [N, n_token, hidden_size] @@ -539,7 +539,7 @@ namespace Flux { } } - ModulationOut get_distil_mod(GGMLRunnerContext* ctx, struct 
ggml_tensor* vec) { + ModulationOut get_distil_mod(GGMLRunnerContext* ctx, ggml_tensor* vec) { int64_t offset = vec->ne[2] - 2; int64_t stride = vec->nb[1] * vec->ne[1]; auto shift = ggml_view_2d(ctx->ggml_ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 0)); // [N, dim] @@ -548,15 +548,15 @@ namespace Flux { return {shift, scale, nullptr}; } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* c) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* c) { // x: [N, n_token, hidden_size] // c: [N, hidden_size] // return: [N, n_token, patch_size * patch_size * out_channels] auto norm_final = std::dynamic_pointer_cast(blocks["norm_final"]); auto linear = std::dynamic_pointer_cast(blocks["linear"]); - struct ggml_tensor *shift, *scale; + ggml_tensor *shift, *scale; if (prune_mod) { auto mod = get_distil_mod(ctx, c); shift = mod.shift; @@ -589,7 +589,7 @@ namespace Flux { blocks["out_proj"] = std::shared_ptr(new Linear(inner_size, hidden_size, true)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { auto in_proj = std::dynamic_pointer_cast(blocks["in_proj"]); auto out_proj = std::dynamic_pointer_cast(blocks["out_proj"]); @@ -612,9 +612,9 @@ namespace Flux { blocks["embedder.0"] = std::make_shared(in_channels + max_freqs * max_freqs, hidden_size_input); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* dct) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* dct) { // x: (B, P^2, C) // dct: (1, P^2, max_freqs^2) // return: (B, P^2, hidden_size_input) @@ -639,9 +639,9 @@ namespace Flux { blocks["norm"] = std::make_shared(hidden_size_x); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* s) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* 
x, + ggml_tensor* s) { // x: (batch_size, n_token, hidden_size_x) // s: (batch_size, hidden_size_s) // return: (batch_size, n_token, hidden_size_x) @@ -689,8 +689,8 @@ namespace Flux { blocks["linear"] = std::make_shared(hidden_size, out_channels); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x) { auto norm = std::dynamic_pointer_cast(blocks["norm"]); auto linear = std::dynamic_pointer_cast(blocks["linear"]); @@ -708,8 +708,8 @@ namespace Flux { blocks["conv"] = std::make_shared(hidden_size, out_channels, std::pair{3, 3}, std::pair{1, 1}, std::pair{1, 1}); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x) { // x: [N, C, H, W] auto norm = std::dynamic_pointer_cast(blocks["norm"]); auto conv = std::dynamic_pointer_cast(blocks["conv"]); @@ -847,15 +847,15 @@ namespace Flux { } } - struct ggml_tensor* forward_orig(GGMLRunnerContext* ctx, - struct ggml_tensor* img, - struct ggml_tensor* txt, - struct ggml_tensor* timesteps, - struct ggml_tensor* y, - struct ggml_tensor* guidance, - struct ggml_tensor* pe, - struct ggml_tensor* mod_index_arange = nullptr, - std::vector skip_layers = {}) { + ggml_tensor* forward_orig(GGMLRunnerContext* ctx, + ggml_tensor* img, + ggml_tensor* txt, + ggml_tensor* timesteps, + ggml_tensor* y, + ggml_tensor* guidance, + ggml_tensor* pe, + ggml_tensor* mod_index_arange = nullptr, + std::vector skip_layers = {}) { auto img_in = std::dynamic_pointer_cast(blocks["img_in"]); auto txt_in = std::dynamic_pointer_cast(blocks["txt_in"]); auto final_layer = std::dynamic_pointer_cast(blocks["final_layer"]); @@ -864,8 +864,8 @@ namespace Flux { img = img_in->forward(ctx, img); } - struct ggml_tensor* vec; - struct ggml_tensor* txt_img_mask = nullptr; + ggml_tensor* vec; + ggml_tensor* txt_img_mask = nullptr; if (params.is_chroma) { int64_t mod_index_length = 
344; auto approx = std::dynamic_pointer_cast(blocks["distilled_guidance_layer"]); @@ -967,27 +967,27 @@ namespace Flux { return img; } - struct ggml_tensor* _apply_x0_residual(GGMLRunnerContext* ctx, - struct ggml_tensor* predicted, - struct ggml_tensor* noisy, - struct ggml_tensor* timesteps) { + ggml_tensor* _apply_x0_residual(GGMLRunnerContext* ctx, + ggml_tensor* predicted, + ggml_tensor* noisy, + ggml_tensor* timesteps) { auto x = ggml_sub(ctx->ggml_ctx, noisy, predicted); x = ggml_div(ctx->ggml_ctx, x, timesteps); return x; } - struct ggml_tensor* forward_chroma_radiance(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* timestep, - struct ggml_tensor* context, - struct ggml_tensor* c_concat, - struct ggml_tensor* y, - struct ggml_tensor* guidance, - struct ggml_tensor* pe, - struct ggml_tensor* mod_index_arange = nullptr, - struct ggml_tensor* dct = nullptr, - std::vector ref_latents = {}, - std::vector skip_layers = {}) { + ggml_tensor* forward_chroma_radiance(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* timestep, + ggml_tensor* context, + ggml_tensor* c_concat, + ggml_tensor* y, + ggml_tensor* guidance, + ggml_tensor* pe, + ggml_tensor* mod_index_arange = nullptr, + ggml_tensor* dct = nullptr, + std::vector ref_latents = {}, + std::vector skip_layers = {}) { GGML_ASSERT(x->ne[3] == 1); int64_t W = x->ne[0]; @@ -1050,18 +1050,18 @@ namespace Flux { return out; } - struct ggml_tensor* forward_flux_chroma(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* timestep, - struct ggml_tensor* context, - struct ggml_tensor* c_concat, - struct ggml_tensor* y, - struct ggml_tensor* guidance, - struct ggml_tensor* pe, - struct ggml_tensor* mod_index_arange = nullptr, - struct ggml_tensor* dct = nullptr, - std::vector ref_latents = {}, - std::vector skip_layers = {}) { + ggml_tensor* forward_flux_chroma(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* timestep, + ggml_tensor* context, + ggml_tensor* c_concat, + 
ggml_tensor* y, + ggml_tensor* guidance, + ggml_tensor* pe, + ggml_tensor* mod_index_arange = nullptr, + ggml_tensor* dct = nullptr, + std::vector ref_latents = {}, + std::vector skip_layers = {}) { GGML_ASSERT(x->ne[3] == 1); int64_t W = x->ne[0]; @@ -1119,18 +1119,18 @@ namespace Flux { return out; } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* timestep, - struct ggml_tensor* context, - struct ggml_tensor* c_concat, - struct ggml_tensor* y, - struct ggml_tensor* guidance, - struct ggml_tensor* pe, - struct ggml_tensor* mod_index_arange = nullptr, - struct ggml_tensor* dct = nullptr, - std::vector ref_latents = {}, - std::vector skip_layers = {}) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* timestep, + ggml_tensor* context, + ggml_tensor* c_concat, + ggml_tensor* y, + ggml_tensor* guidance, + ggml_tensor* pe, + ggml_tensor* mod_index_arange = nullptr, + ggml_tensor* dct = nullptr, + std::vector ref_latents = {}, + std::vector skip_layers = {}) { // Forward pass of DiT. 
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) // timestep: (N,) tensor of diffusion timesteps @@ -1299,7 +1299,7 @@ namespace Flux { return "flux"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { flux.get_param_tensors(tensors, prefix); } @@ -1353,20 +1353,20 @@ namespace Flux { return dct; } - struct ggml_cgraph* build_graph(struct ggml_tensor* x, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, - struct ggml_tensor* c_concat, - struct ggml_tensor* y, - struct ggml_tensor* guidance, - std::vector ref_latents = {}, - bool increase_ref_index = false, - std::vector skip_layers = {}) { + ggml_cgraph* build_graph(ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* c_concat, + ggml_tensor* y, + ggml_tensor* guidance, + std::vector ref_latents = {}, + bool increase_ref_index = false, + std::vector skip_layers = {}) { GGML_ASSERT(x->ne[3] == 1); - struct ggml_cgraph* gf = new_graph_custom(FLUX_GRAPH_SIZE); + ggml_cgraph* gf = new_graph_custom(FLUX_GRAPH_SIZE); - struct ggml_tensor* mod_index_arange = nullptr; - struct ggml_tensor* dct = nullptr; // for chroma radiance + ggml_tensor* mod_index_arange = nullptr; + ggml_tensor* dct = nullptr; // for chroma radiance x = to_backend(x); context = to_backend(context); @@ -1437,18 +1437,18 @@ namespace Flux { auto runner_ctx = get_context(); - struct ggml_tensor* out = flux.forward(&runner_ctx, - x, - timesteps, - context, - c_concat, - y, - guidance, - pe, - mod_index_arange, - dct, - ref_latents, - skip_layers); + ggml_tensor* out = flux.forward(&runner_ctx, + x, + timesteps, + context, + c_concat, + y, + guidance, + pe, + mod_index_arange, + dct, + ref_latents, + skip_layers); ggml_build_forward_expand(gf, out); @@ -1456,23 +1456,23 @@ namespace Flux { } bool compute(int n_threads, - struct ggml_tensor* x, - struct ggml_tensor* timesteps, - 
struct ggml_tensor* context, - struct ggml_tensor* c_concat, - struct ggml_tensor* y, - struct ggml_tensor* guidance, + ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* c_concat, + ggml_tensor* y, + ggml_tensor* guidance, std::vector ref_latents = {}, bool increase_ref_index = false, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr, + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr, std::vector skip_layers = std::vector()) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size] // y: [N, adm_in_channels] or [1, adm_in_channels] // guidance: [N, ] - auto get_graph = [&]() -> struct ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers); }; @@ -1480,12 +1480,12 @@ namespace Flux { } void test() { - struct ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1GB params.mem_buffer = nullptr; params.no_alloc = false; - struct ggml_context* work_ctx = ggml_init(params); + ggml_context* work_ctx = ggml_init(params); GGML_ASSERT(work_ctx != nullptr); { @@ -1513,7 +1513,7 @@ namespace Flux { auto y = nullptr; // print_ggml_tensor(y); - struct ggml_tensor* out = nullptr; + ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); compute(8, x, timesteps, context, nullptr, y, guidance, {}, false, &out, work_ctx); diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index a51976e1..e6b27cc7 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -95,7 +95,7 @@ static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128"); // A: [ne03, k, ne01, ne00] // B: k rows, m columns => [k, m] // result is [ne03, m, ne01, ne00] -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_mul_n_mode(struct ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b, int mode = 0) { 
+__STATIC_INLINE__ ggml_tensor* ggml_ext_mul_n_mode(ggml_context* ctx, ggml_tensor* a, ggml_tensor* b, int mode = 0) { // reshape A // swap 0th and nth axis a = ggml_cont(ctx, ggml_permute(ctx, a, mode, mode != 1 ? 1 : 0, mode != 2 ? 2 : 0, mode != 3 ? 3 : 0)); @@ -105,7 +105,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_mul_n_mode(struct ggml_context* c // make 2D a = ggml_cont(ctx, ggml_reshape_2d(ctx, a, a->ne[0], (ne3 * ne2 * ne1))); - struct ggml_tensor* result = ggml_cont(ctx, ggml_transpose(ctx, ggml_mul_mat(ctx, a, b))); + ggml_tensor* result = ggml_cont(ctx, ggml_transpose(ctx, ggml_mul_mat(ctx, a, b))); // reshape output (same shape as a after permutation except first dim) result = ggml_reshape_4d(ctx, result, result->ne[0], ne1, ne2, ne3); @@ -114,11 +114,11 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_mul_n_mode(struct ggml_context* c return result; } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_merge_lora(ggml_context* ctx, - ggml_tensor* lora_down, - ggml_tensor* lora_up, - ggml_tensor* lora_mid = nullptr) { - struct ggml_tensor* updown; +__STATIC_INLINE__ ggml_tensor* ggml_ext_merge_lora(ggml_context* ctx, + ggml_tensor* lora_down, + ggml_tensor* lora_up, + ggml_tensor* lora_mid = nullptr) { + ggml_tensor* updown; // flat lora tensors to multiply it int64_t lora_up_rows = lora_up->ne[ggml_n_dims(lora_up) - 1]; lora_up = ggml_reshape_2d(ctx, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows); @@ -147,7 +147,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_merge_lora(ggml_context* ctx, // Kronecker product // [ne03,ne02,ne01,ne00] x [ne13,ne12,ne11,ne10] => [ne03*ne13,ne02*ne12,ne01*ne11,ne00*ne10] -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_kronecker(ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_kronecker(ggml_context* ctx, ggml_tensor* a, ggml_tensor* b) { return ggml_mul(ctx, ggml_interpolate(ctx, a, @@ -159,7 +159,7 @@ __STATIC_INLINE__ struct 
ggml_tensor* ggml_ext_kronecker(ggml_context* ctx, stru b); } -__STATIC_INLINE__ void ggml_ext_im_set_randn_f32(struct ggml_tensor* tensor, std::shared_ptr rng) { +__STATIC_INLINE__ void ggml_ext_im_set_randn_f32(ggml_tensor* tensor, std::shared_ptr rng) { uint32_t n = (uint32_t)ggml_nelements(tensor); std::vector random_numbers = rng->randn(n); for (uint32_t i = 0; i < n; i++) { @@ -167,7 +167,7 @@ __STATIC_INLINE__ void ggml_ext_im_set_randn_f32(struct ggml_tensor* tensor, std } } -__STATIC_INLINE__ void ggml_ext_tensor_set_f32(struct ggml_tensor* tensor, float value, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) { +__STATIC_INLINE__ void ggml_ext_tensor_set_f32(ggml_tensor* tensor, float value, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) { GGML_ASSERT(tensor->nb[0] == sizeof(float)); *(float*)((char*)(tensor->data) + i3 * tensor->nb[3] + i2 * tensor->nb[2] + i1 * tensor->nb[1] + i0 * tensor->nb[0]) = value; } @@ -213,7 +213,7 @@ __STATIC_INLINE__ float sd_image_get_f32(sd_image_f32_t image, int64_t iw, int64 return value; } -__STATIC_INLINE__ void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_only = false, const char* mark = "") { +__STATIC_INLINE__ void print_ggml_tensor(ggml_tensor* tensor, bool shape_only = false, const char* mark = "") { printf("%s (%s): shape(%zu, %zu, %zu, %zu)\n", mark, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); fflush(stdout); if (shape_only) { @@ -350,7 +350,7 @@ __STATIC_INLINE__ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const st // file.close(); // } -__STATIC_INLINE__ void copy_ggml_tensor(struct ggml_tensor* dst, struct ggml_tensor* src) { +__STATIC_INLINE__ void copy_ggml_tensor(ggml_tensor* dst, ggml_tensor* src) { if (dst->type == src->type) { dst->nb[0] = src->nb[0]; dst->nb[1] = src->nb[1]; @@ -360,18 +360,18 @@ __STATIC_INLINE__ void copy_ggml_tensor(struct ggml_tensor* dst, struct ggml_ten memcpy(((char*)dst->data), 
((char*)src->data), ggml_nbytes(dst)); return; } - struct ggml_init_params params; - params.mem_size = 10 * 1024 * 1024; // for padding - params.mem_buffer = nullptr; - params.no_alloc = false; - struct ggml_context* ctx = ggml_init(params); + ggml_init_params params; + params.mem_size = 10 * 1024 * 1024; // for padding + params.mem_buffer = nullptr; + params.no_alloc = false; + ggml_context* ctx = ggml_init(params); if (!ctx) { LOG_ERROR("ggml_init() failed"); return; } ggml_tensor* final = ggml_cpy(ctx, src, dst); - struct ggml_cgraph* graph = ggml_new_graph(ctx); + ggml_cgraph* graph = ggml_new_graph(ctx); ggml_build_forward_expand(graph, final); ggml_graph_compute_with_ctx(ctx, graph, 1); ggml_free(ctx); @@ -389,7 +389,7 @@ __STATIC_INLINE__ float sigmoid(float x) { // SPECIAL OPERATIONS WITH TENSORS -__STATIC_INLINE__ uint8_t* ggml_tensor_to_sd_image(struct ggml_tensor* input, uint8_t* image_data = nullptr) { +__STATIC_INLINE__ uint8_t* ggml_tensor_to_sd_image(ggml_tensor* input, uint8_t* image_data = nullptr) { int64_t width = input->ne[0]; int64_t height = input->ne[1]; int64_t channels = input->ne[2]; @@ -408,7 +408,7 @@ __STATIC_INLINE__ uint8_t* ggml_tensor_to_sd_image(struct ggml_tensor* input, ui return image_data; } -__STATIC_INLINE__ uint8_t* ggml_tensor_to_sd_image(struct ggml_tensor* input, int idx, bool video = false) { +__STATIC_INLINE__ uint8_t* ggml_tensor_to_sd_image(ggml_tensor* input, int idx, bool video = false) { int64_t width = input->ne[0]; int64_t height = input->ne[1]; int64_t channels; @@ -449,9 +449,9 @@ __STATIC_INLINE__ void sd_image_to_ggml_tensor(sd_image_t image, }); } -__STATIC_INLINE__ void ggml_ext_tensor_apply_mask(struct ggml_tensor* image_data, - struct ggml_tensor* mask, - struct ggml_tensor* output, +__STATIC_INLINE__ void ggml_ext_tensor_apply_mask(ggml_tensor* image_data, + ggml_tensor* mask, + ggml_tensor* output, float masked_value = 0.5f) { int64_t width = output->ne[0]; int64_t height = output->ne[1]; @@ -489,8 
+489,8 @@ __STATIC_INLINE__ void sd_image_f32_to_ggml_tensor(sd_image_f32_t image, }); } -__STATIC_INLINE__ void ggml_ext_tensor_split_2d(struct ggml_tensor* input, - struct ggml_tensor* output, +__STATIC_INLINE__ void ggml_ext_tensor_split_2d(ggml_tensor* input, + ggml_tensor* output, int x, int y) { int64_t width = output->ne[0]; @@ -520,8 +520,8 @@ __STATIC_INLINE__ float smootherstep_f32(const float x) { return x * x * x * (x * (6.0f * x - 15.0f) + 10.0f); } -__STATIC_INLINE__ void ggml_ext_tensor_merge_2d(struct ggml_tensor* input, - struct ggml_tensor* output, +__STATIC_INLINE__ void ggml_ext_tensor_merge_2d(ggml_tensor* input, + ggml_tensor* output, int x, int y, int overlap_x, @@ -568,7 +568,7 @@ __STATIC_INLINE__ void ggml_ext_tensor_merge_2d(struct ggml_tensor* input, } } -__STATIC_INLINE__ float ggml_ext_tensor_mean(struct ggml_tensor* src) { +__STATIC_INLINE__ float ggml_ext_tensor_mean(ggml_tensor* src) { float mean = 0.0f; int64_t nelements = ggml_nelements(src); float* data = (float*)src->data; @@ -579,7 +579,7 @@ __STATIC_INLINE__ float ggml_ext_tensor_mean(struct ggml_tensor* src) { } // a = a+b -__STATIC_INLINE__ void ggml_ext_tensor_add_inplace(struct ggml_tensor* a, struct ggml_tensor* b) { +__STATIC_INLINE__ void ggml_ext_tensor_add_inplace(ggml_tensor* a, ggml_tensor* b) { GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); int64_t nelements = ggml_nelements(a); float* vec_a = (float*)a->data; @@ -589,7 +589,7 @@ __STATIC_INLINE__ void ggml_ext_tensor_add_inplace(struct ggml_tensor* a, struct } } -__STATIC_INLINE__ void ggml_ext_tensor_scale_inplace(struct ggml_tensor* src, float scale) { +__STATIC_INLINE__ void ggml_ext_tensor_scale_inplace(ggml_tensor* src, float scale) { int64_t nelements = ggml_nelements(src); float* data = (float*)src->data; for (int i = 0; i < nelements; i++) { @@ -597,7 +597,7 @@ __STATIC_INLINE__ void ggml_ext_tensor_scale_inplace(struct ggml_tensor* src, fl } } -__STATIC_INLINE__ void 
ggml_ext_tensor_clamp_inplace(struct ggml_tensor* src, float min, float max) { +__STATIC_INLINE__ void ggml_ext_tensor_clamp_inplace(ggml_tensor* src, float min, float max) { int64_t nelements = ggml_nelements(src); float* data = (float*)src->data; for (int i = 0; i < nelements; i++) { @@ -606,10 +606,10 @@ __STATIC_INLINE__ void ggml_ext_tensor_clamp_inplace(struct ggml_tensor* src, fl } } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_tensor_concat(struct ggml_context* ctx, - struct ggml_tensor* a, - struct ggml_tensor* b, - int dim) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_tensor_concat(ggml_context* ctx, + ggml_tensor* a, + ggml_tensor* b, + int dim) { int64_t ne[GGML_MAX_DIMS]; for (int d = 0; d < GGML_MAX_DIMS; ++d) { if (d == dim) { @@ -619,9 +619,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_tensor_concat(struct ggml_context GGML_ASSERT(a->ne[d] == b->ne[d]); ne[d] = a->ne[d]; } - struct ggml_tensor* result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne); - int64_t o[4] = {0, 0, 0, 0}; - o[dim] = a->ne[dim]; + ggml_tensor* result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne); + int64_t o[4] = {0, 0, 0, 0}; + o[dim] = a->ne[dim]; float v; for (int i3 = 0; i3 < result->ne[3]; i3++) { @@ -643,7 +643,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_tensor_concat(struct ggml_context } // convert values from [0, 1] to [-1, 1] -__STATIC_INLINE__ void scale_to_minus1_1(struct ggml_tensor* src) { +__STATIC_INLINE__ void scale_to_minus1_1(ggml_tensor* src) { int64_t nelements = ggml_nelements(src); float* data = (float*)src->data; for (int i = 0; i < nelements; i++) { @@ -653,7 +653,7 @@ __STATIC_INLINE__ void scale_to_minus1_1(struct ggml_tensor* src) { } // convert values from [-1, 1] to [0, 1] -__STATIC_INLINE__ void scale_to_0_1(struct ggml_tensor* src) { +__STATIC_INLINE__ void scale_to_0_1(ggml_tensor* src) { int64_t nelements = ggml_nelements(src); float* data = (float*)src->data; for (int i = 0; i < nelements; i++) { @@ -662,8 +662,8 @@ 
__STATIC_INLINE__ void scale_to_0_1(struct ggml_tensor* src) { } } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_cont(struct ggml_context* ctx, - struct ggml_tensor* x) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_cont(ggml_context* ctx, + ggml_tensor* x) { if (ggml_is_contiguous(x)) { return x; } @@ -671,12 +671,12 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_cont(struct ggml_context* ctx, } // torch like permute -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_torch_permute(struct ggml_context* ctx, - struct ggml_tensor* x, - int axis0, - int axis1, - int axis2, - int axis3) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_torch_permute(ggml_context* ctx, + ggml_tensor* x, + int axis0, + int axis1, + int axis2, + int axis3) { int torch_axes[4] = {axis0, axis1, axis2, axis3}; int ggml_axes[4] = {0}; @@ -695,12 +695,12 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_torch_permute(struct ggml_context return ggml_permute(ctx, x, ggml_axes[0], ggml_axes[1], ggml_axes[2], ggml_axes[3]); } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_slice(struct ggml_context* ctx, - struct ggml_tensor* x, - int dim, - int64_t start, - int64_t end, - bool cont = true) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_slice(ggml_context* ctx, + ggml_tensor* x, + int dim, + int64_t start, + int64_t end, + bool cont = true) { GGML_ASSERT(dim >= 0 && dim < 4); if (x->ne[dim] == 1) { return x; @@ -731,15 +731,15 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_slice(struct ggml_context* ctx, } // example: [N, 3*C, H, W] => ([N, C, H, W], [N, C, H, W], [N, C, H, W]) -__STATIC_INLINE__ std::vector ggml_ext_chunk(struct ggml_context* ctx, - struct ggml_tensor* x, - int num, - int64_t dim, - bool cont = true) { +__STATIC_INLINE__ std::vector ggml_ext_chunk(ggml_context* ctx, + ggml_tensor* x, + int num, + int64_t dim, + bool cont = true) { GGML_ASSERT(dim >= 0 && dim < 4); GGML_ASSERT(x->ne[dim] % num == 0); - std::vector chunks; + std::vector chunks; int64_t chunk_size = x->ne[dim] / num; int64_t 
stride = chunk_size * x->nb[dim]; int64_t chunk_ne[4] = {x->ne[0], x->ne[1], x->ne[2], x->ne[3]}; @@ -898,7 +898,7 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, input_tile_size_y *= scale; } - struct ggml_init_params params = {}; + ggml_init_params params = {}; params.mem_size += input_tile_size_x * input_tile_size_y * input->ne[2] * input->ne[3] * sizeof(float); // input chunk params.mem_size += output_tile_size_x * output_tile_size_y * output->ne[2] * output->ne[3] * sizeof(float); // output chunk params.mem_size += 3 * ggml_tensor_overhead(); @@ -910,7 +910,7 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, } // draft context - struct ggml_context* tiles_ctx = ggml_init(params); + ggml_context* tiles_ctx = ggml_init(params); if (!tiles_ctx) { LOG_ERROR("ggml_init() failed"); return; @@ -992,16 +992,16 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, sd_tiling_non_square(input, output, scale, tile_size, tile_size, tile_overlap_factor, circular_x, circular_y, on_processing); } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_group_norm_32(struct ggml_context* ctx, - struct ggml_tensor* a) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm_32(ggml_context* ctx, + ggml_tensor* a) { const float eps = 1e-6f; // default eps parameter return ggml_group_norm(ctx, a, 32, eps); } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_scale(struct ggml_context* ctx, - struct ggml_tensor* x, - float factor, - bool inplace = false) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_scale(ggml_context* ctx, + ggml_tensor* x, + float factor, + bool inplace = false) { if (!ggml_is_contiguous(x)) { x = ggml_cont(ctx, x); } @@ -1013,9 +1013,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_scale(struct ggml_context* ctx, return x; } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_gelu(struct ggml_context* ctx, - struct ggml_tensor* x, - bool inplace = false) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_gelu(ggml_context* ctx, + ggml_tensor* x, + 
bool inplace = false) { if (!ggml_is_contiguous(x)) { x = ggml_cont(ctx, x); } @@ -1027,9 +1027,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_gelu(struct ggml_context* ctx, return x; } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_gelu_quick(struct ggml_context* ctx, - struct ggml_tensor* x, - bool inplace = false) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_gelu_quick(ggml_context* ctx, + ggml_tensor* x, + bool inplace = false) { if (!ggml_is_contiguous(x)) { x = ggml_cont(ctx, x); } @@ -1041,12 +1041,12 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_gelu_quick(struct ggml_context* c return x; } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_linear(struct ggml_context* ctx, - struct ggml_tensor* x, - struct ggml_tensor* w, - struct ggml_tensor* b, - bool force_prec_f32 = false, - float scale = 1.f) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_linear(ggml_context* ctx, + ggml_tensor* x, + ggml_tensor* w, + ggml_tensor* b, + bool force_prec_f32 = false, + float scale = 1.f) { if (scale != 1.f) { x = ggml_ext_scale(ctx, x, scale); } @@ -1075,18 +1075,18 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_linear(struct ggml_context* ctx, return x; } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_pad_ext(struct ggml_context* ctx, - struct ggml_tensor* x, - int lp0, - int rp0, - int lp1, - int rp1, - int lp2, - int rp2, - int lp3, - int rp3, - bool circular_x = false, - bool circular_y = false) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_pad_ext(ggml_context* ctx, + ggml_tensor* x, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3, + bool circular_x = false, + bool circular_y = false) { if (circular_x && circular_y) { return ggml_pad_ext_circular(ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); } @@ -1106,14 +1106,14 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_pad_ext(struct ggml_context* ctx, return x; } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_pad(struct ggml_context* ctx, - struct ggml_tensor* x, - int p0, - 
int p1, - int p2 = 0, - int p3 = 0, - bool circular_x = false, - bool circular_y = false) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_pad(ggml_context* ctx, + ggml_tensor* x, + int p0, + int p1, + int p2 = 0, + int p3 = 0, + bool circular_x = false, + bool circular_y = false) { return ggml_ext_pad_ext(ctx, x, 0, p0, 0, p1, 0, p2, 0, p3, circular_x, circular_y); } @@ -1121,20 +1121,20 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_pad(struct ggml_context* ctx, // x: [N, IC, IH, IW] // b: [OC,] // result: [N, OC, OH, OW] -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_2d(struct ggml_context* ctx, - struct ggml_tensor* x, - struct ggml_tensor* w, - struct ggml_tensor* b, - int s0 = 1, - int s1 = 1, - int p0 = 0, - int p1 = 0, - int d0 = 1, - int d1 = 1, - bool direct = false, - bool circular_x = false, - bool circular_y = false, - float scale = 1.f) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_conv_2d(ggml_context* ctx, + ggml_tensor* x, + ggml_tensor* w, + ggml_tensor* b, + int s0 = 1, + int s1 = 1, + int p0 = 0, + int p1 = 0, + int d0 = 1, + int d1 = 1, + bool direct = false, + bool circular_x = false, + bool circular_y = false, + float scale = 1.f) { if (scale != 1.f) { x = ggml_ext_scale(ctx, x, scale); } @@ -1167,20 +1167,20 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_2d(struct ggml_context* ctx, // x: [N, IC, IH, IW] // b: [OC,] // result: [N*OC, OD, OH, OW] -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_3d(struct ggml_context* ctx, - struct ggml_tensor* x, - struct ggml_tensor* w, - struct ggml_tensor* b, - int64_t IC, - int s0 = 1, - int s1 = 1, - int s2 = 1, - int p0 = 0, - int p1 = 0, - int p2 = 0, - int d0 = 1, - int d1 = 1, - int d2 = 1) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_conv_3d(ggml_context* ctx, + ggml_tensor* x, + ggml_tensor* w, + ggml_tensor* b, + int64_t IC, + int s0 = 1, + int s1 = 1, + int s2 = 1, + int p0 = 0, + int p1 = 0, + int p2 = 0, + int d0 = 1, + int d1 = 1, + int d2 = 1) { int64_t OC = w->ne[3] / IC; int64_t 
N = x->ne[3] / IC; x = ggml_conv_3d(ctx, w, x, IC, s0, s1, s2, p0, p1, p2, d0, d1, d2); @@ -1196,13 +1196,13 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_3d(struct ggml_context* ctx, // x: [N, IC, ID, IH*IW] // b: [OC,] // result: [N, OC, OD, OH*OW] -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_3d_nx1x1(struct ggml_context* ctx, - struct ggml_tensor* x, - struct ggml_tensor* w, - struct ggml_tensor* b, - int s2 = 1, - int p2 = 1, - int d2 = 1) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_conv_3d_nx1x1(ggml_context* ctx, + ggml_tensor* x, + ggml_tensor* w, + ggml_tensor* b, + int s2 = 1, + int p2 = 1, + int d2 = 1) { x = ggml_conv_2d(ctx, w, x, 1, s2, 0, p2, 1, d2); // [N, OC, T, OH * OW] if (b != nullptr) { b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1); @@ -1213,8 +1213,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_3d_nx1x1(struct ggml_context // qkv: [N, L, 3*C] // return: ([N, L, C], [N, L, C], [N, L, C]) -__STATIC_INLINE__ std::vector split_qkv(struct ggml_context* ctx, - struct ggml_tensor* qkv) { +__STATIC_INLINE__ std::vector split_qkv(ggml_context* ctx, + ggml_tensor* qkv) { qkv = ggml_reshape_4d(ctx, qkv, qkv->ne[0] / 3, 3, qkv->ne[1], qkv->ne[2]); // [N, L, 3, C] qkv = ggml_cont(ctx, ggml_permute(ctx, qkv, 0, 3, 1, 2)); // [3, N, L, C] @@ -1227,8 +1227,8 @@ __STATIC_INLINE__ std::vector split_qkv(struct ggml_context // qkv: [N, 3*C, H, W] // return: ([N, C, H, W], [N, C, H, W], [N, C, H, W]) -__STATIC_INLINE__ std::vector split_image_qkv(struct ggml_context* ctx, - struct ggml_tensor* qkv) { +__STATIC_INLINE__ std::vector split_image_qkv(ggml_context* ctx, + ggml_tensor* qkv) { int64_t W = qkv->ne[0]; int64_t H = qkv->ne[1]; int64_t C = qkv->ne[2] / 3; @@ -1245,41 +1245,41 @@ __STATIC_INLINE__ std::vector split_image_qkv(struct ggml_c return {q, k, v}; } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_full(struct ggml_context* ctx, - float value, - int64_t ne0, - int64_t ne1, - int64_t ne2, - int64_t ne3) { +__STATIC_INLINE__ 
ggml_tensor* ggml_ext_full(ggml_context* ctx, + float value, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { auto one = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:one"); auto t = ggml_ext_scale(ctx, one, value); // [1,] t = ggml_repeat_4d(ctx, t, ne0, ne1, ne2, ne3); // [ne0, ne1, ne2, ne3] return t; } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_zeros(struct ggml_context* ctx, - int64_t ne0, - int64_t ne1, - int64_t ne2, - int64_t ne3) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_zeros(ggml_context* ctx, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { return ggml_ext_full(ctx, 0.f, ne0, ne1, ne2, ne3); } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_zeros_like(struct ggml_context* ctx, - struct ggml_tensor* x) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_zeros_like(ggml_context* ctx, + ggml_tensor* x) { return ggml_ext_zeros(ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]); } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx, - int64_t ne0, - int64_t ne1, - int64_t ne2, - int64_t ne3) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_ones(ggml_context* ctx, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { return ggml_ext_full(ctx, 1.f, ne0, ne1, ne2, ne3); } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones_like(struct ggml_context* ctx, - struct ggml_tensor* x) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_ones_like(ggml_context* ctx, + ggml_tensor* x) { return ggml_ext_ones(ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]); } @@ -1309,16 +1309,16 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* // v: [N, L_k, n_kv_head*d_head] or [N, L_k, n_kv_head, d_head] // mask: [N, L_q, L_k] // return: [N, L_q, C] -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context* ctx, - ggml_backend_t backend, - struct ggml_tensor* q, - struct ggml_tensor* k, - struct ggml_tensor* v, - int64_t n_head, - struct ggml_tensor* mask = nullptr, - bool skip_reshape = 
false, - bool flash_attn = false, - float kv_scale = 1.0f) { // avoid overflow +__STATIC_INLINE__ ggml_tensor* ggml_ext_attention_ext(ggml_context* ctx, + ggml_backend_t backend, + ggml_tensor* q, + ggml_tensor* k, + ggml_tensor* v, + int64_t n_head, + ggml_tensor* mask = nullptr, + bool skip_reshape = false, + bool flash_attn = false, + float kv_scale = 1.0f) { // avoid overflow int64_t L_q; int64_t L_k; int64_t C; @@ -1457,11 +1457,11 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context return kqv; } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_layer_norm(struct ggml_context* ctx, - struct ggml_tensor* x, - struct ggml_tensor* w, - struct ggml_tensor* b, - float eps = EPS) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_layer_norm(ggml_context* ctx, + ggml_tensor* x, + ggml_tensor* w, + ggml_tensor* b, + float eps = EPS) { x = ggml_norm(ctx, x, eps); if (w != nullptr) { x = ggml_mul_inplace(ctx, x, w); @@ -1472,11 +1472,11 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_layer_norm(struct ggml_context* c return x; } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_group_norm(struct ggml_context* ctx, - struct ggml_tensor* x, - struct ggml_tensor* w, - struct ggml_tensor* b, - int num_groups = 32) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm(ggml_context* ctx, + ggml_tensor* x, + ggml_tensor* w, + ggml_tensor* b, + int num_groups = 32) { if (ggml_n_dims(x) >= 3 && w != nullptr && b != nullptr) { w = ggml_reshape_4d(ctx, w, 1, 1, w->ne[0], 1); b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1); @@ -1492,7 +1492,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_group_norm(struct ggml_context* c return x; } -__STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backend, const struct ggml_tensor* tensor, void* data, size_t offset, size_t size) { +__STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backend, const ggml_tensor* tensor, void* data, size_t offset, size_t size) { #if 
defined(SD_USE_CUDA) || defined(SD_USE_SYCL) if (!ggml_backend_is_cpu(backend)) { ggml_backend_tensor_get_async(backend, tensor, data, offset, size); @@ -1526,16 +1526,16 @@ __STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32(ggml_tensor* tensor) { return value; } -__STATIC_INLINE__ struct ggml_tensor* vector_to_ggml_tensor(struct ggml_context* ctx, - const std::vector& vec) { - struct ggml_tensor* t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, vec.size()); +__STATIC_INLINE__ ggml_tensor* vector_to_ggml_tensor(ggml_context* ctx, + const std::vector& vec) { + ggml_tensor* t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, vec.size()); memcpy(t->data, (const void*)vec.data(), ggml_nbytes(t)); return t; } -__STATIC_INLINE__ struct ggml_tensor* vector_to_ggml_tensor_i32(struct ggml_context* ctx, - const std::vector& vec) { - struct ggml_tensor* t = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, vec.size()); +__STATIC_INLINE__ ggml_tensor* vector_to_ggml_tensor_i32(ggml_context* ctx, + const std::vector& vec) { + ggml_tensor* t = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, vec.size()); memcpy(t->data, (const void*)vec.data(), ggml_nbytes(t)); return t; } @@ -1581,21 +1581,21 @@ __STATIC_INLINE__ std::vector timestep_embedding(std::vector times } __STATIC_INLINE__ void set_timestep_embedding(std::vector timesteps, - struct ggml_tensor* embedding, + ggml_tensor* embedding, int dim, int max_period = 10000) { std::vector embedding_vec = timestep_embedding(timesteps, dim, max_period); memcpy(((char*)embedding->data), ((char*)embedding_vec.data()), ggml_nbytes(embedding)); } -__STATIC_INLINE__ struct ggml_tensor* new_timestep_embedding(struct ggml_context* ctx, - std::vector timesteps, - int dim, - int max_period = 10000) { +__STATIC_INLINE__ ggml_tensor* new_timestep_embedding(ggml_context* ctx, + std::vector timesteps, + int dim, + int max_period = 10000) { // timesteps: [N,] // embedding: [N, dim] std::vector embedding_vec = timestep_embedding(timesteps, dim, max_period); - struct ggml_tensor* 
embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, timesteps.size()); + ggml_tensor* embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, timesteps.size()); if (embedding->data != nullptr) { memcpy(((char*)embedding->data), ((char*)embedding_vec.data()), ggml_nbytes(embedding)); } else { @@ -1604,9 +1604,9 @@ __STATIC_INLINE__ struct ggml_tensor* new_timestep_embedding(struct ggml_context return embedding; } -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_timestep_embedding( - struct ggml_context* ctx, - struct ggml_tensor* timesteps, +__STATIC_INLINE__ ggml_tensor* ggml_ext_timestep_embedding( + ggml_context* ctx, + ggml_tensor* timesteps, int dim, int max_period = 10000, float time_factor = 1.0f) { @@ -1671,22 +1671,22 @@ struct GGMLRunnerContext { struct GGMLRunner { protected: - typedef std::function get_graph_cb_t; + typedef std::function get_graph_cb_t; ggml_backend_t params_backend = nullptr; ggml_backend_t runtime_backend = nullptr; - struct ggml_context* params_ctx = nullptr; + ggml_context* params_ctx = nullptr; ggml_backend_buffer_t params_buffer = nullptr; - struct ggml_context* offload_ctx = nullptr; + ggml_context* offload_ctx = nullptr; ggml_backend_buffer_t runtime_params_buffer = nullptr; bool params_on_runtime_backend = false; - struct ggml_context* cache_ctx = nullptr; + ggml_context* cache_ctx = nullptr; ggml_backend_buffer_t cache_buffer = nullptr; - struct ggml_context* compute_ctx = nullptr; - struct ggml_gallocr* compute_allocr = nullptr; + ggml_context* compute_ctx = nullptr; + ggml_gallocr* compute_allocr = nullptr; std::shared_ptr weight_adapter = nullptr; @@ -1696,8 +1696,8 @@ protected: std::vector zero_int_vec = {0}; ggml_tensor* zero_int_tensor = nullptr; - std::map backend_tensor_data_map; - std::map cache_tensor_map; // name -> tensor + std::map backend_tensor_data_map; + std::map cache_tensor_map; // name -> tensor const std::string final_result_name = "ggml_runner_final_result_tensor"; bool flash_attn_enabled = false; @@ 
-1706,7 +1706,7 @@ protected: bool circular_y_enabled = false; void alloc_params_ctx() { - struct ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead()); params.mem_buffer = nullptr; params.no_alloc = true; @@ -1731,7 +1731,7 @@ protected: } void alloc_cache_ctx() { - struct ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead()); params.mem_buffer = nullptr; params.no_alloc = true; @@ -1748,7 +1748,7 @@ protected: } void alloc_compute_ctx() { - struct ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(ggml_tensor_overhead() * MAX_GRAPH_SIZE + ggml_graph_overhead()); params.mem_buffer = nullptr; params.no_alloc = true; @@ -1774,21 +1774,21 @@ protected: set_backend_tensor_data(zero_int_tensor, zero_int_vec.data()); } - void prepare_build_in_tensor_after(struct ggml_cgraph* gf) { + void prepare_build_in_tensor_after(ggml_cgraph* gf) { ggml_build_forward_expand(gf, one_tensor); ggml_build_forward_expand(gf, zero_int_tensor); } - struct ggml_cgraph* new_graph_custom(size_t graph_size) { + ggml_cgraph* new_graph_custom(size_t graph_size) { if (weight_adapter) { graph_size += weight_adapter->get_extra_graph_size(); } return ggml_new_graph_custom(compute_ctx, graph_size, false); } - struct ggml_cgraph* get_compute_graph(get_graph_cb_t get_graph) { + ggml_cgraph* get_compute_graph(get_graph_cb_t get_graph) { prepare_build_in_tensor_before(); - struct ggml_cgraph* gf = get_graph(); + ggml_cgraph* gf = get_graph(); if (ggml_graph_n_nodes(gf) > 0) { auto result = ggml_graph_node(gf, -1); ggml_set_name(result, final_result_name.c_str()); @@ -1802,7 +1802,7 @@ protected: return true; } reset_compute_ctx(); - struct ggml_cgraph* gf = get_compute_graph(get_graph); + ggml_cgraph* gf = get_compute_graph(get_graph); backend_tensor_data_map.clear(); compute_allocr = 
ggml_gallocr_new(ggml_backend_get_default_buffer_type(runtime_backend)); @@ -2038,11 +2038,11 @@ public: } // do copy after alloc graph - void set_backend_tensor_data(struct ggml_tensor* tensor, const void* data) { + void set_backend_tensor_data(ggml_tensor* tensor, const void* data) { backend_tensor_data_map[tensor] = data; } - struct ggml_tensor* to_backend(struct ggml_tensor* tensor) { + ggml_tensor* to_backend(ggml_tensor* tensor) { GGML_ASSERT(compute_ctx != nullptr); if (tensor == nullptr) { return nullptr; @@ -2059,11 +2059,11 @@ public: } } - void cache(const std::string name, struct ggml_tensor* tensor) { + void cache(const std::string name, ggml_tensor* tensor) { cache_tensor_map[name] = tensor; } - struct ggml_tensor* get_cache_tensor_by_name(const std::string& name) { + ggml_tensor* get_cache_tensor_by_name(const std::string& name) { if (cache_ctx == nullptr) { return nullptr; } @@ -2073,8 +2073,8 @@ public: bool compute(get_graph_cb_t get_graph, int n_threads, bool free_compute_buffer_immediately = true, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr) { + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) { if (!offload_params_to_runtime_backend()) { LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str()); return false; @@ -2084,7 +2084,7 @@ public: return false; } reset_compute_ctx(); - struct ggml_cgraph* gf = get_compute_graph(get_graph); + ggml_cgraph* gf = get_compute_graph(get_graph); if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { LOG_ERROR("%s alloc compute graph failed", get_desc().c_str()); return false; @@ -2139,7 +2139,7 @@ public: class GGMLBlock { protected: - typedef std::unordered_map ParameterMap; + typedef std::unordered_map ParameterMap; typedef std::unordered_map> GGMLBlockMap; GGMLBlockMap blocks; ParameterMap params; @@ -2158,17 +2158,17 @@ protected: return wtype; } - void init_blocks(struct ggml_context* ctx, const String2TensorStorage& 
tensor_storage_map = {}, const std::string prefix = "") { + void init_blocks(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") { for (auto& pair : blocks) { auto& block = pair.second; block->init(ctx, tensor_storage_map, prefix + pair.first); } } - virtual void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {} + virtual void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {} public: - void init(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") { + void init(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") { if (prefix.size() > 0) { prefix = prefix + "."; } @@ -2201,7 +2201,7 @@ public: return mem_size; } - void get_param_tensors(std::map& tensors, std::string prefix = "") { + void get_param_tensors(std::map& tensors, std::string prefix = "") { if (prefix.size() > 0) { prefix = prefix + "."; } @@ -2211,7 +2211,7 @@ public: } for (auto& pair : params) { - struct ggml_tensor* param = pair.second; + ggml_tensor* param = pair.second; tensors[prefix + pair.first] = pair.second; } } @@ -2232,12 +2232,12 @@ public: class UnaryBlock : public GGMLBlock { public: - virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) = 0; + virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) = 0; }; class Identity : public UnaryBlock { public: - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { return x; } }; @@ -2252,7 +2252,7 @@ protected: float scale; std::string prefix; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const 
String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { this->prefix = prefix; enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32); if (in_features % ggml_blck_size(wtype) != 0 || force_f32) { @@ -2279,9 +2279,9 @@ public: force_prec_f32(force_prec_f32), scale(scale) {} - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { - struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = nullptr; + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + ggml_tensor* w = params["weight"]; + ggml_tensor* b = nullptr; if (bias) { b = params["bias"]; } @@ -2308,7 +2308,7 @@ class Embedding : public UnaryBlock { protected: int64_t embedding_dim; int64_t num_embeddings; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32); if (!support_get_rows(wtype)) { wtype = GGML_TYPE_F32; @@ -2322,8 +2322,8 @@ public: num_embeddings(num_embeddings) { } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* input_ids) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids) { // input_ids: [N, n_token] auto weight = params["weight"]; @@ -2353,7 +2353,7 @@ protected: float scale = 1.f; std::string prefix; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override { this->prefix = prefix; enum ggml_type wtype = GGML_TYPE_F16; params["weight"] = ggml_new_tensor_4d(ctx, wtype, kernel_size.second, kernel_size.first, in_channels, out_channels); @@ -2387,9 
+2387,9 @@ public: return "Conv2d"; } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { - struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = nullptr; + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + ggml_tensor* w = params["weight"]; + ggml_tensor* b = nullptr; if (bias) { b = params["bias"]; } @@ -2436,7 +2436,7 @@ protected: bool bias; std::string prefix; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override { this->prefix = prefix; enum ggml_type wtype = GGML_TYPE_F16; params["weight"] = ggml_new_tensor_4d(ctx, @@ -2466,9 +2466,9 @@ public: dilation(dilation), bias(bias) {} - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { - struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = nullptr; + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + ggml_tensor* w = params["weight"]; + ggml_tensor* b = nullptr; if (ctx->weight_adapter) { w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); if (w->type != GGML_TYPE_F16) { @@ -2496,7 +2496,7 @@ protected: bool bias; std::string prefix; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { this->prefix = prefix; if (elementwise_affine) { enum ggml_type wtype = GGML_TYPE_F32; @@ -2518,9 +2518,9 @@ public: elementwise_affine(elementwise_affine), bias(bias) {} - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { - struct ggml_tensor* w = nullptr; - struct ggml_tensor* b = nullptr; + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { 
+ ggml_tensor* w = nullptr; + ggml_tensor* b = nullptr; if (elementwise_affine) { w = params["weight"]; @@ -2546,7 +2546,7 @@ protected: bool affine; std::string prefix; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { this->prefix = prefix; if (affine) { enum ggml_type wtype = GGML_TYPE_F32; @@ -2566,9 +2566,9 @@ public: eps(eps), affine(affine) {} - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { - struct ggml_tensor* w = nullptr; - struct ggml_tensor* b = nullptr; + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + ggml_tensor* w = nullptr; + ggml_tensor* b = nullptr; if (affine) { w = params["weight"]; b = params["bias"]; @@ -2593,7 +2593,7 @@ protected: float eps; std::string prefix; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override { this->prefix = prefix; enum ggml_type wtype = GGML_TYPE_F32; params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); @@ -2605,8 +2605,8 @@ public: : hidden_size(hidden_size), eps(eps) {} - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { - struct ggml_tensor* w = params["weight"]; + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + ggml_tensor* w = params["weight"]; if (ctx->weight_adapter) { w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); } @@ -2657,9 +2657,9 @@ public: } // x: [N, n_token, embed_dim] - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* mask = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + 
ggml_tensor* mask = nullptr) { auto out_proj = std::dynamic_pointer_cast(blocks[out_proj_name]); ggml_tensor* q; @@ -2689,15 +2689,15 @@ public: } }; -__STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( - struct ggml_context* ctx, - struct ggml_tensor* h, // Input: [q, batch] or [W, H, q, batch] - struct ggml_tensor* w1, // Outer C (Full rank) - struct ggml_tensor* w1a, // Outer A (Low rank part 1) - struct ggml_tensor* w1b, // Outer B (Low rank part 2) - struct ggml_tensor* w2, // Inner BA (Full rank) - struct ggml_tensor* w2a, // Inner A (Low rank part 1) - struct ggml_tensor* w2b, // Inner B (Low rank part 2) +__STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward( + ggml_context* ctx, + ggml_tensor* h, // Input: [q, batch] or [W, H, q, batch] + ggml_tensor* w1, // Outer C (Full rank) + ggml_tensor* w1a, // Outer A (Low rank part 1) + ggml_tensor* w1b, // Outer B (Low rank part 2) + ggml_tensor* w2, // Inner BA (Full rank) + ggml_tensor* w2a, // Inner A (Low rank part 1) + ggml_tensor* w2b, // Inner B (Low rank part 2) bool is_conv, WeightAdapter::ForwardParams::conv2d_params_t conv_params, float scale) { @@ -2714,7 +2714,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( : (int)w2a->ne[1]; GGML_ASSERT(q_actual == (uq * vq) && "Input dimension mismatch for LoKR split"); - struct ggml_tensor* hb; + ggml_tensor* hb; if (!is_conv) { int batch = (int)h->ne[1]; @@ -2745,7 +2745,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( } #endif - struct ggml_tensor* h_split = ggml_reshape_3d(ctx, h, vq, uq * merge_batch_uq, batch / merge_batch_uq); + ggml_tensor* h_split = ggml_reshape_3d(ctx, h, vq, uq * merge_batch_uq, batch / merge_batch_uq); if (w2 != NULL) { hb = ggml_mul_mat(ctx, w2, h_split); } else { @@ -2755,10 +2755,10 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( if (batch > 1) { hb = ggml_reshape_3d(ctx, hb, vp, uq, batch); } - struct ggml_tensor* hb_t = ggml_cont(ctx, ggml_transpose(ctx, hb)); - hb_t = 
ggml_reshape_3d(ctx, hb_t, uq, vp * merge_batch_vp, batch / merge_batch_vp); + ggml_tensor* hb_t = ggml_cont(ctx, ggml_transpose(ctx, hb)); + hb_t = ggml_reshape_3d(ctx, hb_t, uq, vp * merge_batch_vp, batch / merge_batch_vp); - struct ggml_tensor* hc_t; + ggml_tensor* hc_t; if (w1 != NULL) { hc_t = ggml_mul_mat(ctx, w1, hb_t); } else { @@ -2769,13 +2769,13 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( hc_t = ggml_reshape_3d(ctx, hc_t, up, vp, batch); } - struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); - struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch); + ggml_tensor* hc = ggml_transpose(ctx, hc_t); + ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc), up * vp, batch); return ggml_scale(ctx, out, scale); } else { int batch = (int)h->ne[3]; // 1. Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] - struct ggml_tensor* h_split = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * batch); + ggml_tensor* h_split = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * batch); if (w2 != NULL) { hb = ggml_ext_conv_2d(ctx, h_split, w2, nullptr, @@ -2791,8 +2791,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( conv_params.scale); } else { // swap a and b order for conv lora - struct ggml_tensor* a = w2b; - struct ggml_tensor* b = w2a; + ggml_tensor* a = w2b; + ggml_tensor* b = w2a; // unpack conv2d weights if needed if (ggml_n_dims(a) < 4) { @@ -2804,17 +2804,17 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( GGML_ASSERT(k * k * h_split->ne[2] == a->ne[2]); a = ggml_reshape_4d(ctx, a, a->ne[0] * k, a->ne[1] * k, a->ne[2] / (k * k), a->ne[3]); } - struct ggml_tensor* ha = ggml_ext_conv_2d(ctx, h_split, a, nullptr, - conv_params.s0, - conv_params.s1, - conv_params.p0, - conv_params.p1, - conv_params.d0, - conv_params.d1, - conv_params.direct, - conv_params.circular_x, - conv_params.circular_y, - conv_params.scale); + ggml_tensor* ha = ggml_ext_conv_2d(ctx, h_split, a, 
nullptr, + conv_params.s0, + conv_params.s1, + conv_params.p0, + conv_params.p1, + conv_params.d0, + conv_params.d1, + conv_params.direct, + conv_params.circular_x, + conv_params.circular_y, + conv_params.scale); // not supporting lora_mid here hb = ggml_ext_conv_2d(ctx, @@ -2837,23 +2837,23 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( int w_out = (int)hb->ne[0]; int h_out = (int)hb->ne[1]; - // struct ggml_tensor* hb_cat = ggml_reshape_4d(ctx, hb, w_out , h_out , vp * uq, batch); + // ggml_tensor* hb_cat = ggml_reshape_4d(ctx, hb, w_out , h_out , vp * uq, batch); // [W_out, H_out, vp * uq, batch] // Now left to compute (W1 kr Id) * hb_cat == (W1 kr W2) cv h // merge the uq groups of size vp*w_out*h_out - struct ggml_tensor* hb_merged = ggml_reshape_2d(ctx, hb, w_out * h_out * vp, uq * batch); - struct ggml_tensor* hc_t; - struct ggml_tensor* hb_merged_t = ggml_cont(ctx, ggml_transpose(ctx, hb_merged)); + ggml_tensor* hb_merged = ggml_reshape_2d(ctx, hb, w_out * h_out * vp, uq * batch); + ggml_tensor* hc_t; + ggml_tensor* hb_merged_t = ggml_cont(ctx, ggml_transpose(ctx, hb_merged)); if (w1 != NULL) { // Would be great to be able to transpose w1 instead to avoid transposing both hb and hc hc_t = ggml_mul_mat(ctx, w1, hb_merged_t); } else { hc_t = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_merged_t)); } - struct ggml_tensor* hc = ggml_transpose(ctx, hc_t); + ggml_tensor* hc = ggml_transpose(ctx, hc_t); // ungroup - struct ggml_tensor* out = ggml_reshape_4d(ctx, ggml_cont(ctx, hc), w_out, h_out, up * vp, batch); + ggml_tensor* out = ggml_reshape_4d(ctx, ggml_cont(ctx, hc), w_out, h_out, up * vp, batch); return ggml_scale(ctx, out, scale); } } diff --git a/src/latent-preview.h b/src/latent-preview.h index 85c8e0dc..5078a6bd 100644 --- a/src/latent-preview.h +++ b/src/latent-preview.h @@ -163,7 +163,7 @@ const float sd_latent_rgb_proj[4][3] = { {-0.178022f, -0.200862f, -0.678514f}}; float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, 
-0.105825f}; -void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) { +void preview_latent_video(uint8_t* buffer, ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) { size_t buffer_head = 0; uint32_t latent_width = static_cast(latents->ne[0]); diff --git a/src/llm.hpp b/src/llm.hpp index 5490f07c..5a9c25c8 100644 --- a/src/llm.hpp +++ b/src/llm.hpp @@ -522,7 +522,7 @@ namespace LLM { blocks["down_proj"] = std::shared_ptr(new Linear(intermediate_size, hidden_size, bias)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [N, n_token, hidden_size] auto gate_proj = std::dynamic_pointer_cast(blocks["gate_proj"]); auto up_proj = std::dynamic_pointer_cast(blocks["up_proj"]); @@ -582,7 +582,7 @@ namespace LLM { } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [N*grid_t*grid_h*grid_w, in_channels, temporal_patch_size*patch_size*patch_size] // return: [N*grid_t*grid_h*grid_w, embed_dim] x = ggml_reshape_4d(ctx->ggml_ctx, @@ -631,7 +631,7 @@ namespace LLM { blocks["mlp.2"] = std::shared_ptr(new Linear(hidden_size, dim)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { auto ln_q = std::dynamic_pointer_cast(blocks["ln_q"]); auto mlp_0 = std::dynamic_pointer_cast(blocks["mlp.0"]); auto mlp_2 = std::dynamic_pointer_cast(blocks["mlp.2"]); @@ -668,10 +668,10 @@ namespace LLM { blocks["proj"] = std::shared_ptr(new Linear(hidden_size, hidden_size)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* pe, - struct ggml_tensor* mask = nullptr) { + ggml_tensor* 
forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* pe, + ggml_tensor* mask = nullptr) { // x: [N, n_token, hidden_size] int64_t n_token = x->ne[1]; int64_t N = x->ne[2]; @@ -718,10 +718,10 @@ namespace LLM { blocks["norm2"] = std::shared_ptr(new RMSNorm(hidden_size, eps)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* pe, - struct ggml_tensor* mask = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* pe, + ggml_tensor* mask = nullptr) { // x: [N, n_token, hidden_size] auto attn = std::dynamic_pointer_cast(blocks["attn"]); auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); @@ -778,12 +778,12 @@ namespace LLM { blocks["merger"] = std::shared_ptr(new PatchMerger(out_hidden_size, hidden_size, spatial_merge_size)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* pixel_values, - struct ggml_tensor* pe, - struct ggml_tensor* window_index, - struct ggml_tensor* window_inverse_index, - struct ggml_tensor* window_mask) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* pixel_values, + ggml_tensor* pe, + ggml_tensor* window_index, + ggml_tensor* window_inverse_index, + ggml_tensor* window_mask) { // pixel_values: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw] // window_index: [grid_t*(H/mh/ph)*(W/mw/pw)] // window_inverse_index: [grid_t*(H/mh/ph)*(W/mw/pw)] @@ -836,10 +836,10 @@ namespace LLM { } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* input_pos, - struct ggml_tensor* attention_mask = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* input_pos, + ggml_tensor* attention_mask = nullptr) { // x: [N, n_token, hidden_size] int64_t n_token = x->ne[1]; int64_t N = x->ne[2]; @@ -898,10 +898,10 @@ namespace LLM { blocks["post_attention_layernorm"] = std::make_shared(params.hidden_size, params.rms_norm_eps); } - struct 
ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* input_pos, - struct ggml_tensor* attention_mask = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* input_pos, + ggml_tensor* attention_mask = nullptr) { // x: [N, n_token, hidden_size] auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]); auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); @@ -936,12 +936,12 @@ namespace LLM { blocks["norm"] = std::shared_ptr(new RMSNorm(params.hidden_size, params.rms_norm_eps)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* input_ids, - struct ggml_tensor* input_pos, - struct ggml_tensor* attention_mask, - std::vector> image_embeds, - std::set out_layers) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* input_pos, + ggml_tensor* attention_mask, + std::vector> image_embeds, + std::set out_layers) { // input_ids: [N, n_token] // return: [N, n_token, hidden_size] @@ -1037,12 +1037,12 @@ namespace LLM { } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* input_ids, - struct ggml_tensor* input_pos, - struct ggml_tensor* attention_mask, - std::vector> image_embeds, - std::set out_layers) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* input_pos, + ggml_tensor* attention_mask, + std::vector> image_embeds, + std::set out_layers) { // input_ids: [N, n_token] auto model = std::dynamic_pointer_cast(blocks["model"]); @@ -1050,12 +1050,12 @@ namespace LLM { return x; } - struct ggml_tensor* vision_forward(GGMLRunnerContext* ctx, - struct ggml_tensor* pixel_values, - struct ggml_tensor* pe, - struct ggml_tensor* window_index, - struct ggml_tensor* window_inverse_index, - struct ggml_tensor* window_mask) { + ggml_tensor* vision_forward(GGMLRunnerContext* ctx, + ggml_tensor* pixel_values, + ggml_tensor* pe, + ggml_tensor* window_index, + ggml_tensor* 
window_inverse_index, + ggml_tensor* window_mask) { GGML_ASSERT(enable_vision); auto vision_model = std::dynamic_pointer_cast(blocks["visual"]); return vision_model->forward(ctx, pixel_values, pe, window_index, window_inverse_index, window_mask); @@ -1156,35 +1156,35 @@ namespace LLM { return llm_arch_to_str[static_cast(params.arch)]; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { model.get_param_tensors(tensors, prefix); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* input_ids, - struct ggml_tensor* input_pos, - struct ggml_tensor* attention_mask, - std::vector> image_embeds, - std::set out_layers) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* input_pos, + ggml_tensor* attention_mask, + std::vector> image_embeds, + std::set out_layers) { auto hidden_states = model.forward(ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers); // [N, n_token, hidden_size] return hidden_states; } - struct ggml_tensor* vision_forward(GGMLRunnerContext* ctx, - struct ggml_tensor* pixel_values, - struct ggml_tensor* input_pos, - struct ggml_tensor* window_index, - struct ggml_tensor* window_inverse_index, - struct ggml_tensor* window_mask) { + ggml_tensor* vision_forward(GGMLRunnerContext* ctx, + ggml_tensor* pixel_values, + ggml_tensor* input_pos, + ggml_tensor* window_index, + ggml_tensor* window_inverse_index, + ggml_tensor* window_mask) { auto hidden_states = model.vision_forward(ctx, pixel_values, input_pos, window_index, window_inverse_index, window_mask); return hidden_states; } - struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, - struct ggml_tensor* attention_mask, - std::vector> image_embeds, - std::set out_layers) { - struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); + ggml_cgraph* build_graph(ggml_tensor* input_ids, + ggml_tensor* attention_mask, + std::vector> 
image_embeds, + std::set out_layers) { + ggml_cgraph* gf = ggml_new_graph(compute_ctx); input_ids = to_backend(input_ids); @@ -1232,7 +1232,7 @@ namespace LLM { auto runner_ctx = get_context(); - struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers); + ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, attention_mask, image_embeds, out_layers); ggml_build_forward_expand(gf, hidden_states); @@ -1240,13 +1240,13 @@ namespace LLM { } bool compute(const int n_threads, - struct ggml_tensor* input_ids, - struct ggml_tensor* attention_mask, + ggml_tensor* input_ids, + ggml_tensor* attention_mask, std::vector> image_embeds, std::set out_layers, ggml_tensor** output, ggml_context* output_ctx = nullptr) { - auto get_graph = [&]() -> struct ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(input_ids, attention_mask, image_embeds, out_layers); }; return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); @@ -1261,7 +1261,7 @@ namespace LLM { return grid_t * grid_h * grid_w; } - struct ggml_tensor* process_image(struct ggml_context* ctx, struct ggml_tensor* image) { + ggml_tensor* process_image(ggml_context* ctx, ggml_tensor* image) { // image: [C, H, W] // return: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw], grid_t == 1 int64_t C = image->ne[2]; @@ -1288,8 +1288,8 @@ namespace LLM { return image; } - struct ggml_cgraph* build_encode_image_graph(struct ggml_tensor* image) { - struct ggml_cgraph* gf = new_graph_custom(LLM_GRAPH_SIZE); + ggml_cgraph* build_encode_image_graph(ggml_tensor* image) { + ggml_cgraph* gf = new_graph_custom(LLM_GRAPH_SIZE); GGML_ASSERT(image->ne[1] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); GGML_ASSERT(image->ne[0] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); @@ -1399,23 +1399,23 @@ namespace LLM { // pe->data = nullptr; set_backend_tensor_data(pe, pe_vec.data()); - 
auto runnter_ctx = get_context(); - struct ggml_tensor* hidden_states = vision_forward(&runnter_ctx, - pixel_values, - pe, - window_index, - window_inverse_index, - window_mask); + auto runnter_ctx = get_context(); + ggml_tensor* hidden_states = vision_forward(&runnter_ctx, + pixel_values, + pe, + window_index, + window_inverse_index, + window_mask); ggml_build_forward_expand(gf, hidden_states); return gf; } void encode_image(const int n_threads, - struct ggml_tensor* image, + ggml_tensor* image, ggml_tensor** output, ggml_context* output_ctx = nullptr) { - auto get_graph = [&]() -> struct ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { return build_encode_image_graph(image); }; GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); @@ -1440,7 +1440,7 @@ namespace LLM { } } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { model.get_param_tensors(tensors, prefix); } @@ -1492,12 +1492,12 @@ namespace LLM { } void test() { - struct ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1GB params.mem_buffer = nullptr; params.no_alloc = false; - struct ggml_context* work_ctx = ggml_init(params); + ggml_context* work_ctx = ggml_init(params); GGML_ASSERT(work_ctx != nullptr); bool test_mistral = false; bool test_qwen3 = true; @@ -1509,7 +1509,7 @@ namespace LLM { { auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin"); print_ggml_tensor(image, false, "image"); - struct ggml_tensor* out = nullptr; + ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); model.encode_image(8, image, &out, work_ctx); @@ -1547,8 +1547,8 @@ namespace LLM { printf("%d ", token); } printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - struct ggml_tensor* out = nullptr; + auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); + ggml_tensor* out = nullptr; int64_t t0 = 
ggml_time_ms(); model.compute(8, input_ids, nullptr, image_embeds, {}, &out, work_ctx); @@ -1561,7 +1561,7 @@ namespace LLM { // ggml_set_f32(image, 0.f); auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin"); print_ggml_tensor(image, false, "image"); - struct ggml_tensor* out = nullptr; + ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); model.encode_image(8, image, &out, work_ctx); @@ -1587,8 +1587,8 @@ namespace LLM { printf("%d ", token); } printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - struct ggml_tensor* out = nullptr; + auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); + ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); model.compute(8, input_ids, nullptr, {}, {10, 20, 30}, &out, work_ctx); @@ -1610,8 +1610,8 @@ namespace LLM { printf("%d ", token); } printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - struct ggml_tensor* out = nullptr; + auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); + ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); model.compute(8, input_ids, nullptr, {}, {35}, &out, work_ctx); @@ -1633,8 +1633,8 @@ namespace LLM { printf("%d ", token); } printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - struct ggml_tensor* out = nullptr; + auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); + ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); model.compute(8, input_ids, nullptr, {}, {}, &out, work_ctx); diff --git a/src/lora.hpp b/src/lora.hpp index d2f91cd4..7df04ea2 100644 --- a/src/lora.hpp +++ b/src/lora.hpp @@ -9,7 +9,7 @@ struct LoraModel : public GGMLRunner { std::string lora_id; float multiplier = 1.0f; - std::unordered_map lora_tensors; + std::unordered_map lora_tensors; std::map original_tensor_to_final_tensor; std::set applied_lora_tensors; std::string file_path; @@ -76,13 +76,13 @@ struct LoraModel : public GGMLRunner { } for (const auto& pair : tensors_to_create) { - 
const auto& name = pair.first; - const auto& ts = pair.second; - struct ggml_tensor* real = ggml_new_tensor(params_ctx, - ts.type, - ts.n_dims, - ts.ne); - lora_tensors[name] = real; + const auto& name = pair.first; + const auto& ts = pair.second; + ggml_tensor* real = ggml_new_tensor(params_ctx, + ts.type, + ts.n_dims, + ts.ne); + lora_tensors[name] = real; } alloc_params_buffer(); @@ -337,10 +337,10 @@ struct LoraModel : public GGMLRunner { } scale_value *= multiplier; - struct ggml_tensor* updown_1 = ggml_ext_merge_lora(ctx, hada_1_down, hada_1_up, hada_1_mid); - struct ggml_tensor* updown_2 = ggml_ext_merge_lora(ctx, hada_2_down, hada_2_up, hada_2_mid); - auto curr_updown = ggml_mul_inplace(ctx, updown_1, updown_2); - curr_updown = ggml_ext_scale(ctx, curr_updown, scale_value, true); + ggml_tensor* updown_1 = ggml_ext_merge_lora(ctx, hada_1_down, hada_1_up, hada_1_mid); + ggml_tensor* updown_2 = ggml_ext_merge_lora(ctx, hada_2_down, hada_2_up, hada_2_mid); + auto curr_updown = ggml_mul_inplace(ctx, updown_1, updown_2); + curr_updown = ggml_ext_scale(ctx, curr_updown, scale_value, true); if (updown == nullptr) { updown = curr_updown; } else { @@ -747,9 +747,9 @@ struct LoraModel : public GGMLRunner { return out_diff; } - struct ggml_cgraph* build_lora_graph(const std::map& model_tensors, SDVersion version) { + ggml_cgraph* build_lora_graph(const std::map& model_tensors, SDVersion version) { size_t lora_graph_size = LORA_GRAPH_BASE_SIZE + lora_tensors.size() * 10; - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, lora_graph_size, false); + ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, lora_graph_size, false); preprocess_lora_tensors(model_tensors); @@ -788,8 +788,8 @@ struct LoraModel : public GGMLRunner { return gf; } - void apply(std::map model_tensors, SDVersion version, int n_threads) { - auto get_graph = [&]() -> struct ggml_cgraph* { + void apply(std::map model_tensors, SDVersion version, int n_threads) { + auto get_graph = [&]() -> 
ggml_cgraph* { return build_lora_graph(model_tensors, version); }; GGMLRunner::compute(get_graph, n_threads, false); diff --git a/src/ltxv.hpp b/src/ltxv.hpp index 9dcdd4b2..fb37dbe0 100644 --- a/src/ltxv.hpp +++ b/src/ltxv.hpp @@ -26,9 +26,9 @@ namespace LTXV { bias)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - bool causal = true) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + bool causal = true) { // x: [N*IC, ID, IH, IW] // result: [N*OC, OD, OH, OW] auto conv = std::dynamic_pointer_cast(blocks["conv"]); diff --git a/src/mmdit.hpp b/src/mmdit.hpp index ba1c35d6..7fbb2b24 100644 --- a/src/mmdit.hpp +++ b/src/mmdit.hpp @@ -27,7 +27,7 @@ public: blocks["fc2"] = std::shared_ptr(new Linear(hidden_features, out_features, bias)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [N, n_token, in_features] auto fc1 = std::dynamic_pointer_cast(blocks["fc1"]); auto fc2 = std::dynamic_pointer_cast(blocks["fc2"]); @@ -72,7 +72,7 @@ public: bias)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [N, C, H, W] // return: [N, H*W, embed_dim] auto proj = std::dynamic_pointer_cast(blocks["proj"]); @@ -111,7 +111,7 @@ public: blocks["mlp.2"] = std::shared_ptr(new Linear(hidden_size, out_channels, true, true)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* t) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* t) { // t: [N, ] // return: [N, hidden_size] auto mlp_0 = std::dynamic_pointer_cast(blocks["mlp.0"]); @@ -135,7 +135,7 @@ public: blocks["mlp.2"] = std::shared_ptr(new Linear(hidden_size, hidden_size, true, true)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [N, 
input_dim] // return: [N, hidden_size] auto mlp_0 = std::dynamic_pointer_cast(blocks["mlp.0"]); @@ -175,7 +175,7 @@ public: } } - std::vector pre_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + std::vector pre_attention(GGMLRunnerContext* ctx, ggml_tensor* x) { auto qkv_proj = std::dynamic_pointer_cast(blocks["qkv"]); auto qkv = qkv_proj->forward(ctx, x); @@ -198,7 +198,7 @@ public: return {q, k, v}; } - struct ggml_tensor* post_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* post_attention(GGMLRunnerContext* ctx, ggml_tensor* x) { GGML_ASSERT(!pre_only); auto proj = std::dynamic_pointer_cast(blocks["proj"]); @@ -208,8 +208,8 @@ public: } // x: [N, n_token, dim] - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x) { auto qkv = pre_attention(ctx, x); x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, dim] x = post_attention(ctx, x); // [N, n_token, dim] @@ -217,10 +217,10 @@ public: } }; -__STATIC_INLINE__ struct ggml_tensor* modulate(struct ggml_context* ctx, - struct ggml_tensor* x, - struct ggml_tensor* shift, - struct ggml_tensor* scale) { +__STATIC_INLINE__ ggml_tensor* modulate(ggml_context* ctx, + ggml_tensor* x, + ggml_tensor* shift, + ggml_tensor* scale) { // x: [N, L, C] // scale: [N, C] // shift: [N, C] @@ -274,8 +274,8 @@ public: } std::tuple, std::vector, std::vector> pre_attention_x(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* c) { + ggml_tensor* x, + ggml_tensor* c) { GGML_ASSERT(self_attn); // x: [N, n_token, hidden_size] // c: [N, hidden_size] @@ -309,9 +309,9 @@ public: return {qkv, qkv2, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2}}; } - std::pair, std::vector> pre_attention(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* c) { + std::pair, std::vector> 
pre_attention(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* c) { // x: [N, n_token, hidden_size] // c: [N, hidden_size] auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); @@ -346,15 +346,15 @@ public: } } - struct ggml_tensor* post_attention_x(GGMLRunnerContext* ctx, - struct ggml_tensor* attn_out, - struct ggml_tensor* attn2_out, - struct ggml_tensor* x, - struct ggml_tensor* gate_msa, - struct ggml_tensor* shift_mlp, - struct ggml_tensor* scale_mlp, - struct ggml_tensor* gate_mlp, - struct ggml_tensor* gate_msa2) { + ggml_tensor* post_attention_x(GGMLRunnerContext* ctx, + ggml_tensor* attn_out, + ggml_tensor* attn2_out, + ggml_tensor* x, + ggml_tensor* gate_msa, + ggml_tensor* shift_mlp, + ggml_tensor* scale_mlp, + ggml_tensor* gate_mlp, + ggml_tensor* gate_msa2) { // attn_out: [N, n_token, hidden_size] // x: [N, n_token, hidden_size] // gate_msa: [N, hidden_size] @@ -384,13 +384,13 @@ public: return x; } - struct ggml_tensor* post_attention(GGMLRunnerContext* ctx, - struct ggml_tensor* attn_out, - struct ggml_tensor* x, - struct ggml_tensor* gate_msa, - struct ggml_tensor* shift_mlp, - struct ggml_tensor* scale_mlp, - struct ggml_tensor* gate_mlp) { + ggml_tensor* post_attention(GGMLRunnerContext* ctx, + ggml_tensor* attn_out, + ggml_tensor* x, + ggml_tensor* gate_msa, + ggml_tensor* shift_mlp, + ggml_tensor* scale_mlp, + ggml_tensor* gate_mlp) { // attn_out: [N, n_token, hidden_size] // x: [N, n_token, hidden_size] // gate_msa: [N, hidden_size] @@ -416,9 +416,9 @@ public: return x; } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* c) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* c) { // x: [N, n_token, hidden_size] // c: [N, hidden_size] // return: [N, n_token, hidden_size] @@ -463,11 +463,11 @@ public: } }; -__STATIC_INLINE__ std::pair +__STATIC_INLINE__ std::pair block_mixing(GGMLRunnerContext* ctx, - struct ggml_tensor* context, - struct ggml_tensor* x, 
- struct ggml_tensor* c, + ggml_tensor* context, + ggml_tensor* x, + ggml_tensor* c, std::shared_ptr context_block, std::shared_ptr x_block) { // context: [N, n_context, hidden_size] @@ -489,7 +489,7 @@ block_mixing(GGMLRunnerContext* ctx, x_qkv = x_qkv_intermediates.first; x_intermediates = x_qkv_intermediates.second; } - std::vector qkv; + std::vector qkv; for (int i = 0; i < 3; i++) { qkv.push_back(ggml_concat(ctx->ggml_ctx, context_qkv[i], x_qkv[i], 1)); } @@ -563,10 +563,10 @@ public: blocks["x_block"] = std::shared_ptr(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x)); } - std::pair forward(GGMLRunnerContext* ctx, - struct ggml_tensor* context, - struct ggml_tensor* x, - struct ggml_tensor* c) { + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* context, + ggml_tensor* x, + ggml_tensor* c) { auto context_block = std::dynamic_pointer_cast(blocks["context_block"]); auto x_block = std::dynamic_pointer_cast(blocks["x_block"]); @@ -586,9 +586,9 @@ public: blocks["adaLN_modulation.1"] = std::shared_ptr(new Linear(hidden_size, 2 * hidden_size)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* c) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* c) { // x: [N, n_token, hidden_size] // c: [N, hidden_size] // return: [N, n_token, patch_size * patch_size * out_channels] @@ -626,7 +626,7 @@ protected: int64_t hidden_size; std::string qk_norm; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override { enum ggml_type wtype = GGML_TYPE_F32; params["pos_embed"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1); } @@ -705,8 +705,8 @@ public: blocks["final_layer"] = std::shared_ptr(new FinalLayer(hidden_size, patch_size, 
out_channels)); } - struct ggml_tensor* - cropped_pos_embed(struct ggml_context* ctx, + ggml_tensor* + cropped_pos_embed(ggml_context* ctx, int64_t h, int64_t w) { auto pos_embed = params["pos_embed"]; @@ -745,11 +745,11 @@ public: return spatial_pos_embed; } - struct ggml_tensor* forward_core_with_concat(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* c_mod, - struct ggml_tensor* context, - std::vector skip_layers = std::vector()) { + ggml_tensor* forward_core_with_concat(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* c_mod, + ggml_tensor* context, + std::vector skip_layers = std::vector()) { // x: [N, H*W, hidden_size] // context: [N, n_context, d_context] // c: [N, hidden_size] @@ -774,12 +774,12 @@ public: return x; } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* t, - struct ggml_tensor* y = nullptr, - struct ggml_tensor* context = nullptr, - std::vector skip_layers = std::vector()) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* t, + ggml_tensor* y = nullptr, + ggml_tensor* context = nullptr, + std::vector skip_layers = std::vector()) { // Forward pass of DiT. 
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) // t: (N,) tensor of diffusion timesteps @@ -832,29 +832,29 @@ struct MMDiTRunner : public GGMLRunner { return "mmdit"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { mmdit.get_param_tensors(tensors, prefix); } - struct ggml_cgraph* build_graph(struct ggml_tensor* x, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, - struct ggml_tensor* y, - std::vector skip_layers = std::vector()) { - struct ggml_cgraph* gf = new_graph_custom(MMDIT_GRAPH_SIZE); + ggml_cgraph* build_graph(ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* y, + std::vector skip_layers = std::vector()) { + ggml_cgraph* gf = new_graph_custom(MMDIT_GRAPH_SIZE); x = to_backend(x); context = to_backend(context); y = to_backend(y); timesteps = to_backend(timesteps); - auto runner_ctx = get_context(); - struct ggml_tensor* out = mmdit.forward(&runner_ctx, - x, - timesteps, - y, - context, - skip_layers); + auto runner_ctx = get_context(); + ggml_tensor* out = mmdit.forward(&runner_ctx, + x, + timesteps, + y, + context, + skip_layers); ggml_build_forward_expand(gf, out); @@ -862,18 +862,18 @@ struct MMDiTRunner : public GGMLRunner { } bool compute(int n_threads, - struct ggml_tensor* x, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, - struct ggml_tensor* y, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr, - std::vector skip_layers = std::vector()) { + ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* y, + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr, + std::vector skip_layers = std::vector()) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size]([N, 154, 4096]) or [1, max_position, hidden_size] // y: [N, adm_in_channels] or [1, 
adm_in_channels] - auto get_graph = [&]() -> struct ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, y, skip_layers); }; @@ -881,12 +881,12 @@ struct MMDiTRunner : public GGMLRunner { } void test() { - struct ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB params.mem_buffer = nullptr; params.no_alloc = false; - struct ggml_context* work_ctx = ggml_init(params); + ggml_context* work_ctx = ggml_init(params); GGML_ASSERT(work_ctx != nullptr); { @@ -908,7 +908,7 @@ struct MMDiTRunner : public GGMLRunner { ggml_set_f32(y, 0.01f); // print_ggml_tensor(y); - struct ggml_tensor* out = nullptr; + ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); compute(8, x, timesteps, context, y, &out, work_ctx); diff --git a/src/model.cpp b/src/model.cpp index 87b65455..d23b97fa 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -287,7 +287,7 @@ void ModelLoader::add_tensor_storage(const TensorStorage& tensor_storage) { } bool is_zip_file(const std::string& file_path) { - struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); + zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); if (zip == nullptr) { return false; } @@ -453,9 +453,9 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s size_t total_size = 0; size_t data_offset = gguf_get_data_offset(ctx_gguf_); for (int i = 0; i < n_tensors; i++) { - std::string name = gguf_get_tensor_name(ctx_gguf_, i); - struct ggml_tensor* dummy = ggml_get_tensor(ctx_meta_, name.c_str()); - size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf_, i); + std::string name = gguf_get_tensor_name(ctx_gguf_, i); + ggml_tensor* dummy = ggml_get_tensor(ctx_meta_, name.c_str()); + size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf_, i); // LOG_DEBUG("%s", name.c_str()); @@ -812,7 +812,7 @@ struct PickleTensorReader { } } - void read_string(const std::string& str, struct zip_t* zip, std::string 
dir) { + void read_string(const std::string& str, zip_t* zip, std::string dir) { if (str == "storage") { read_global_type = true; } else if (str != "state_dict") { @@ -995,7 +995,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; - struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); + zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); if (zip == nullptr) { LOG_ERROR("failed to open '%s'", file_path.c_str()); return false; @@ -1413,7 +1413,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread for (int i = 0; i < n_threads; ++i) { workers.emplace_back([&, file_path, is_zip]() { std::ifstream file; - struct zip_t* zip = nullptr; + zip_t* zip = nullptr; if (is_zip) { zip = zip_open(file_path.c_str(), 0, 'r'); if (zip == nullptr) { @@ -1601,7 +1601,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread return success; } -bool ModelLoader::load_tensors(std::map& tensors, +bool ModelLoader::load_tensors(std::map& tensors, std::set ignore_tensors, int n_threads, bool enable_mmap) { @@ -1615,7 +1615,7 @@ bool ModelLoader::load_tensors(std::map& tenso tensor_names_in_file.insert(name); } - struct ggml_tensor* real; + ggml_tensor* real; if (tensors.find(name) != tensors.end()) { real = tensors[name]; } else { diff --git a/src/model.h b/src/model.h index 5b9ce18a..3af35eb7 100644 --- a/src/model.h +++ b/src/model.h @@ -323,7 +323,7 @@ public: String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; } void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = ""); bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false); - bool load_tensors(std::map& tensors, + bool load_tensors(std::map& tensors, std::set ignore_tensors = {}, int n_threads = 0, bool use_mmap = false); diff --git a/src/pmid.hpp b/src/pmid.hpp index 
8ce78d3a..30c47325 100644 --- a/src/pmid.hpp +++ b/src/pmid.hpp @@ -21,14 +21,14 @@ public: blocks["layernorm"] = std::shared_ptr(new LayerNorm(in_dim)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { // x: [N, channels, h, w] auto fc1 = std::dynamic_pointer_cast(blocks["fc1"]); auto fc2 = std::dynamic_pointer_cast(blocks["fc2"]); auto layer_norm = std::dynamic_pointer_cast(blocks["layernorm"]); - struct ggml_tensor* r = x; + ggml_tensor* r = x; // x = ggml_ext_layer_norm(ctx, x, ln_w, ln_b); x = layer_norm->forward(ctx, x); // x = ggml_add(ctx, ggml_mul_mat(ctx, fc1_w, x), fc1_b); @@ -54,8 +54,8 @@ public: blocks["1"] = std::shared_ptr(new Mlp(dim, inner_dim, dim, false)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x) { auto norm = std::dynamic_pointer_cast(blocks["0"]); auto ff = std::dynamic_pointer_cast(blocks["1"]); @@ -81,9 +81,9 @@ public: blocks["to_out"] = std::shared_ptr(new Linear(inner_dim, dim, false)); } - struct ggml_tensor* reshape_tensor(struct ggml_context* ctx, - struct ggml_tensor* x, - int heads) { + ggml_tensor* reshape_tensor(ggml_context* ctx, + ggml_tensor* x, + int heads) { int64_t ne[4]; for (int i = 0; i < 4; ++i) ne[i] = x->ne[i]; @@ -92,17 +92,17 @@ public: return x; } - std::vector chunk_half(struct ggml_context* ctx, - struct ggml_tensor* x) { + std::vector chunk_half(ggml_context* ctx, + ggml_tensor* x) { auto tlo = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], 0); auto tli = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], x->nb[0] * x->ne[0] / 2); return {ggml_cont(ctx, tlo), ggml_cont(ctx, tli)}; } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* latents) { + ggml_tensor* 
forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* latents) { // x (torch.Tensor): image features // shape (b, n1, D) // latent (torch.Tensor): latent features @@ -176,9 +176,9 @@ public: } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* latents, - struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* latents, + ggml_tensor* x) { // x: [N, channels, h, w] auto proj_in = std::dynamic_pointer_cast(blocks["proj_in"]); auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]); @@ -225,19 +225,19 @@ public: 4)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* last_hidden_state) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* last_hidden_state) { // x: [N, channels, h, w] auto token_proj = std::dynamic_pointer_cast(blocks["token_proj"]); auto token_norm = std::dynamic_pointer_cast(blocks["token_norm"]); auto perceiver_resampler = std::dynamic_pointer_cast(blocks["perceiver_resampler"]); - x = token_proj->forward(ctx, x); - int64_t nel = ggml_nelements(x); - x = ggml_reshape_3d(ctx->ggml_ctx, x, cross_attention_dim, num_tokens, nel / (cross_attention_dim * num_tokens)); - x = token_norm->forward(ctx, x); - struct ggml_tensor* out = perceiver_resampler->forward(ctx, x, last_hidden_state); + x = token_proj->forward(ctx, x); + int64_t nel = ggml_nelements(x); + x = ggml_reshape_3d(ctx->ggml_ctx, x, cross_attention_dim, num_tokens, nel / (cross_attention_dim * num_tokens)); + x = token_norm->forward(ctx, x); + ggml_tensor* out = perceiver_resampler->forward(ctx, x, last_hidden_state); if (use_residul) out = ggml_add(ctx->ggml_ctx, x, out); return out; @@ -256,9 +256,9 @@ public: blocks["layer_norm"] = std::shared_ptr(new LayerNorm(embed_dim)); } - struct ggml_tensor* fuse_fn(GGMLRunnerContext* ctx, - struct ggml_tensor* prompt_embeds, - struct ggml_tensor* id_embeds) { + ggml_tensor* 
fuse_fn(GGMLRunnerContext* ctx, + ggml_tensor* prompt_embeds, + ggml_tensor* id_embeds) { auto mlp1 = std::dynamic_pointer_cast(blocks["mlp1"]); auto mlp2 = std::dynamic_pointer_cast(blocks["mlp2"]); auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); @@ -273,24 +273,24 @@ public: return stacked_id_embeds; } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* prompt_embeds, - struct ggml_tensor* id_embeds, - struct ggml_tensor* class_tokens_mask, - struct ggml_tensor* class_tokens_mask_pos, - struct ggml_tensor* left, - struct ggml_tensor* right) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* prompt_embeds, + ggml_tensor* id_embeds, + ggml_tensor* class_tokens_mask, + ggml_tensor* class_tokens_mask_pos, + ggml_tensor* left, + ggml_tensor* right) { // x: [N, channels, h, w] - struct ggml_tensor* valid_id_embeds = id_embeds; + ggml_tensor* valid_id_embeds = id_embeds; // # slice out the image token embeddings ggml_set_name(class_tokens_mask_pos, "class_tokens_mask_pos"); ggml_set_name(prompt_embeds, "prompt_embeds"); - struct ggml_tensor* image_token_embeds = ggml_get_rows(ctx->ggml_ctx, prompt_embeds, class_tokens_mask_pos); + ggml_tensor* image_token_embeds = ggml_get_rows(ctx->ggml_ctx, prompt_embeds, class_tokens_mask_pos); ggml_set_name(image_token_embeds, "image_token_embeds"); - valid_id_embeds = ggml_reshape_2d(ctx->ggml_ctx, valid_id_embeds, valid_id_embeds->ne[0], - ggml_nelements(valid_id_embeds) / valid_id_embeds->ne[0]); - struct ggml_tensor* stacked_id_embeds = fuse_fn(ctx, image_token_embeds, valid_id_embeds); + valid_id_embeds = ggml_reshape_2d(ctx->ggml_ctx, valid_id_embeds, valid_id_embeds->ne[0], + ggml_nelements(valid_id_embeds) / valid_id_embeds->ne[0]); + ggml_tensor* stacked_id_embeds = fuse_fn(ctx, image_token_embeds, valid_id_embeds); if (left && right) { stacked_id_embeds = ggml_concat(ctx->ggml_ctx, left, stacked_id_embeds, 1); @@ -301,10 +301,10 @@ public: stacked_id_embeds = 
ggml_concat(ctx->ggml_ctx, stacked_id_embeds, right, 1); } - class_tokens_mask = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, class_tokens_mask)); - class_tokens_mask = ggml_repeat(ctx->ggml_ctx, class_tokens_mask, prompt_embeds); - prompt_embeds = ggml_mul(ctx->ggml_ctx, prompt_embeds, class_tokens_mask); - struct ggml_tensor* updated_prompt_embeds = ggml_add(ctx->ggml_ctx, prompt_embeds, stacked_id_embeds); + class_tokens_mask = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, class_tokens_mask)); + class_tokens_mask = ggml_repeat(ctx->ggml_ctx, class_tokens_mask, prompt_embeds); + prompt_embeds = ggml_mul(ctx->ggml_ctx, prompt_embeds, class_tokens_mask); + ggml_tensor* updated_prompt_embeds = ggml_add(ctx->ggml_ctx, prompt_embeds, stacked_id_embeds); ggml_set_name(updated_prompt_embeds, "updated_prompt_embeds"); return updated_prompt_embeds; } @@ -317,22 +317,22 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection { blocks["fuse_module"] = std::shared_ptr(new FuseModule(2048)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* id_pixel_values, - struct ggml_tensor* prompt_embeds, - struct ggml_tensor* class_tokens_mask, - struct ggml_tensor* class_tokens_mask_pos, - struct ggml_tensor* left, - struct ggml_tensor* right) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* id_pixel_values, + ggml_tensor* prompt_embeds, + ggml_tensor* class_tokens_mask, + ggml_tensor* class_tokens_mask_pos, + ggml_tensor* left, + ggml_tensor* right) { // x: [N, channels, h, w] auto vision_model = std::dynamic_pointer_cast(blocks["vision_model"]); auto visual_projection = std::dynamic_pointer_cast(blocks["visual_projection"]); auto visual_projection_2 = std::dynamic_pointer_cast(blocks["visual_projection_2"]); auto fuse_module = std::dynamic_pointer_cast(blocks["fuse_module"]); - struct ggml_tensor* shared_id_embeds = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size] - struct ggml_tensor* 
id_embeds = visual_projection->forward(ctx, shared_id_embeds); // [N, proj_dim(768)] - struct ggml_tensor* id_embeds_2 = visual_projection_2->forward(ctx, shared_id_embeds); // [N, 1280] + ggml_tensor* shared_id_embeds = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size] + ggml_tensor* id_embeds = visual_projection->forward(ctx, shared_id_embeds); // [N, proj_dim(768)] + ggml_tensor* id_embeds_2 = visual_projection_2->forward(ctx, shared_id_embeds); // [N, 1280] id_embeds = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 2, 0, 1, 3)); id_embeds_2 = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds_2, 2, 0, 1, 3)); @@ -340,12 +340,12 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection { id_embeds = ggml_concat(ctx->ggml_ctx, id_embeds, id_embeds_2, 2); // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right id_embeds = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 1, 2, 0, 3)); - struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx, - prompt_embeds, - id_embeds, - class_tokens_mask, - class_tokens_mask_pos, - left, right); + ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx, + prompt_embeds, + id_embeds, + class_tokens_mask, + class_tokens_mask_pos, + left, right); return updated_prompt_embeds; } }; @@ -365,29 +365,29 @@ struct PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock : public CLIPVisionMo num_tokens)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* id_pixel_values, - struct ggml_tensor* prompt_embeds, - struct ggml_tensor* class_tokens_mask, - struct ggml_tensor* class_tokens_mask_pos, - struct ggml_tensor* id_embeds, - struct ggml_tensor* left, - struct ggml_tensor* right) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* id_pixel_values, + ggml_tensor* prompt_embeds, + ggml_tensor* class_tokens_mask, + ggml_tensor* class_tokens_mask_pos, + ggml_tensor* id_embeds, + ggml_tensor* left, 
+ ggml_tensor* right) { // x: [N, channels, h, w] auto vision_model = std::dynamic_pointer_cast(blocks["vision_model"]); auto fuse_module = std::dynamic_pointer_cast(blocks["fuse_module"]); auto qformer_perceiver = std::dynamic_pointer_cast(blocks["qformer_perceiver"]); - // struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size] - struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values, false); // [N, hidden_size] - id_embeds = qformer_perceiver->forward(ctx, id_embeds, last_hidden_state); + // ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size] + ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values, false); // [N, hidden_size] + id_embeds = qformer_perceiver->forward(ctx, id_embeds, last_hidden_state); - struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx, - prompt_embeds, - id_embeds, - class_tokens_mask, - class_tokens_mask_pos, - left, right); + ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx, + prompt_embeds, + id_embeds, + class_tokens_mask, + class_tokens_mask_pos, + left, right); return updated_prompt_embeds; } }; @@ -436,18 +436,18 @@ public: return pm_version; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { if (pm_version == PM_VERSION_1) id_encoder.get_param_tensors(tensors, prefix); else if (pm_version == PM_VERSION_2) id_encoder2.get_param_tensors(tensors, prefix); } - struct ggml_cgraph* build_graph( // struct ggml_allocr* allocr, - struct ggml_tensor* id_pixel_values, - struct ggml_tensor* prompt_embeds, + ggml_cgraph* build_graph( // ggml_allocr* allocr, + ggml_tensor* id_pixel_values, + ggml_tensor* prompt_embeds, std::vector& class_tokens_mask, - struct ggml_tensor* id_embeds) { + ggml_tensor* id_embeds) { ctm.clear(); ctmf16.clear(); ctmpos.clear(); @@ -458,20 +458,20 @@ 
public: auto runner_ctx = get_context(); - struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); + ggml_cgraph* gf = ggml_new_graph(compute_ctx); int64_t hidden_size = prompt_embeds->ne[0]; int64_t seq_length = prompt_embeds->ne[1]; ggml_type type = GGML_TYPE_F32; - struct ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(runner_ctx.ggml_ctx, type, class_tokens_mask.size()); + ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(runner_ctx.ggml_ctx, type, class_tokens_mask.size()); - struct ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values); - struct ggml_tensor* prompt_embeds_d = to_backend(prompt_embeds); - struct ggml_tensor* id_embeds_d = to_backend(id_embeds); + ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values); + ggml_tensor* prompt_embeds_d = to_backend(prompt_embeds); + ggml_tensor* id_embeds_d = to_backend(id_embeds); - struct ggml_tensor* left = nullptr; - struct ggml_tensor* right = nullptr; + ggml_tensor* left = nullptr; + ggml_tensor* right = nullptr; for (int i = 0; i < class_tokens_mask.size(); i++) { if (class_tokens_mask[i]) { // printf(" 1,"); @@ -495,7 +495,7 @@ public: right = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type, hidden_size, seq_length - ctmpos[ctmpos.size() - 1] - 1, 1); } - struct ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(runner_ctx.ggml_ctx, GGML_TYPE_I32, ctmpos.size()); + ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(runner_ctx.ggml_ctx, GGML_TYPE_I32, ctmpos.size()); { if (type == GGML_TYPE_F16) @@ -526,7 +526,7 @@ public: } } } - struct ggml_tensor* updated_prompt_embeds = nullptr; + ggml_tensor* updated_prompt_embeds = nullptr; if (pm_version == PM_VERSION_1) updated_prompt_embeds = id_encoder.forward(&runner_ctx, id_pixel_values_d, @@ -549,13 +549,13 @@ public: } bool compute(const int n_threads, - struct ggml_tensor* id_pixel_values, - struct ggml_tensor* prompt_embeds, - struct ggml_tensor* id_embeds, + ggml_tensor* id_pixel_values, + ggml_tensor* prompt_embeds, + 
ggml_tensor* id_embeds, std::vector& class_tokens_mask, - struct ggml_tensor** updated_prompt_embeds, + ggml_tensor** updated_prompt_embeds, ggml_context* output_ctx) { - auto get_graph = [&]() -> struct ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { // return build_graph(compute_allocr, id_pixel_values, prompt_embeds, class_tokens_mask); return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds); }; @@ -566,7 +566,7 @@ public: }; struct PhotoMakerIDEmbed : public GGMLRunner { - std::map tensors; + std::map tensors; std::string file_path; ModelLoader* model_loader; bool load_failed = false; @@ -606,11 +606,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner { } if (dry_run) { std::lock_guard lock(tensor_mutex); - struct ggml_tensor* real = ggml_new_tensor(params_ctx, - tensor_storage.type, - tensor_storage.n_dims, - tensor_storage.ne); - tensors[name] = real; + ggml_tensor* real = ggml_new_tensor(params_ctx, + tensor_storage.type, + tensor_storage.n_dims, + tensor_storage.ne); + tensors[name] = real; } else { auto real = tensors[name]; *dst_tensor = real; @@ -629,8 +629,8 @@ struct PhotoMakerIDEmbed : public GGMLRunner { return true; } - struct ggml_tensor* get() { - std::map::iterator pos; + ggml_tensor* get() { + std::map::iterator pos; pos = tensors.find("pmid.id_embeds"); if (pos != tensors.end()) return pos->second; diff --git a/src/preprocessing.hpp b/src/preprocessing.hpp index 84e0ed3f..ca05ca22 100644 --- a/src/preprocessing.hpp +++ b/src/preprocessing.hpp @@ -4,13 +4,13 @@ #include "ggml_extend.hpp" #define M_PI_ 3.14159265358979323846f -void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml_tensor* kernel, int padding) { - struct ggml_init_params params; - params.mem_size = 80 * input->ne[0] * input->ne[1]; // 20M for 512x512 - params.mem_buffer = nullptr; - params.no_alloc = false; - struct ggml_context* ctx0 = ggml_init(params); - struct ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, 
GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1); +void convolve(ggml_tensor* input, ggml_tensor* output, ggml_tensor* kernel, int padding) { + ggml_init_params params; + params.mem_size = 80 * input->ne[0] * input->ne[1]; // 20M for 512x512 + params.mem_buffer = nullptr; + params.no_alloc = false; + ggml_context* ctx0 = ggml_init(params); + ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1); ggml_fp32_to_fp16_row((float*)kernel->data, (ggml_fp16_t*)kernel_fp16->data, ggml_nelements(kernel)); ggml_tensor* h = ggml_conv_2d(ctx0, kernel_fp16, input, 1, 1, padding, padding, 1, 1); ggml_cgraph* gf = ggml_new_graph(ctx0); @@ -19,7 +19,7 @@ void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml ggml_free(ctx0); } -void gaussian_kernel(struct ggml_tensor* kernel) { +void gaussian_kernel(ggml_tensor* kernel) { int ks_mid = static_cast(kernel->ne[0] / 2); float sigma = 1.4f; float normal = 1.f / (2.0f * M_PI_ * powf(sigma, 2.0f)); @@ -33,7 +33,7 @@ void gaussian_kernel(struct ggml_tensor* kernel) { } } -void grayscale(struct ggml_tensor* rgb_img, struct ggml_tensor* grayscale) { +void grayscale(ggml_tensor* rgb_img, ggml_tensor* grayscale) { for (int iy = 0; iy < rgb_img->ne[1]; iy++) { for (int ix = 0; ix < rgb_img->ne[0]; ix++) { float r = ggml_ext_tensor_get_f32(rgb_img, ix, iy); @@ -45,7 +45,7 @@ void grayscale(struct ggml_tensor* rgb_img, struct ggml_tensor* grayscale) { } } -void prop_hypot(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) { +void prop_hypot(ggml_tensor* x, ggml_tensor* y, ggml_tensor* h) { int n_elements = static_cast(ggml_nelements(h)); float* dx = (float*)x->data; float* dy = (float*)y->data; @@ -55,7 +55,7 @@ void prop_hypot(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor } } -void prop_arctan2(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) { +void prop_arctan2(ggml_tensor* x, ggml_tensor* y, ggml_tensor* h) { 
int n_elements = static_cast(ggml_nelements(h)); float* dx = (float*)x->data; float* dy = (float*)y->data; @@ -65,7 +65,7 @@ void prop_arctan2(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tens } } -void normalize_tensor(struct ggml_tensor* g) { +void normalize_tensor(ggml_tensor* g) { int n_elements = static_cast(ggml_nelements(g)); float* dg = (float*)g->data; float max = -INFINITY; @@ -78,7 +78,7 @@ void normalize_tensor(struct ggml_tensor* g) { } } -void non_max_supression(struct ggml_tensor* result, struct ggml_tensor* G, struct ggml_tensor* D) { +void non_max_supression(ggml_tensor* result, ggml_tensor* G, ggml_tensor* D) { for (int iy = 1; iy < result->ne[1] - 1; iy++) { for (int ix = 1; ix < result->ne[0] - 1; ix++) { float angle = ggml_ext_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_; @@ -117,7 +117,7 @@ void non_max_supression(struct ggml_tensor* result, struct ggml_tensor* G, struc } } -void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float low_threshold, float weak, float strong) { +void threshold_hystersis(ggml_tensor* img, float high_threshold, float low_threshold, float weak, float strong) { int n_elements = static_cast(ggml_nelements(img)); float* imd = (float*)img->data; float max = -INFINITY; @@ -163,11 +163,11 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo } bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) { - struct ggml_init_params params; - params.mem_size = static_cast(40 * img.width * img.height); // 10MB for 512x512 - params.mem_buffer = nullptr; - params.no_alloc = false; - struct ggml_context* work_ctx = ggml_init(params); + ggml_init_params params; + params.mem_size = static_cast(40 * img.width * img.height); // 10MB for 512x512 + params.mem_buffer = nullptr; + params.no_alloc = false; + ggml_context* work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); @@ -185,19 +185,19 
@@ bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, -1, -2, -1}; // generate kernel - int kernel_size = 5; - struct ggml_tensor* gkernel = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kernel_size, kernel_size, 1, 1); - struct ggml_tensor* sf_kx = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1); + int kernel_size = 5; + ggml_tensor* gkernel = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kernel_size, kernel_size, 1, 1); + ggml_tensor* sf_kx = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1); memcpy(sf_kx->data, kX, ggml_nbytes(sf_kx)); - struct ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1); + ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1); memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky)); gaussian_kernel(gkernel); - struct ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 3, 1); - struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 1, 1); - struct ggml_tensor* iX = ggml_dup_tensor(work_ctx, image_gray); - struct ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray); - struct ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray); - struct ggml_tensor* tetha = ggml_dup_tensor(work_ctx, image_gray); + ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 3, 1); + ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 1, 1); + ggml_tensor* iX = ggml_dup_tensor(work_ctx, image_gray); + ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray); + ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray); + ggml_tensor* tetha = ggml_dup_tensor(work_ctx, image_gray); sd_image_to_ggml_tensor(img, image); grayscale(image, image_gray); convolve(image_gray, image_gray, gkernel, 2); diff --git a/src/qwen_image.hpp b/src/qwen_image.hpp index 2c70344c..68af0e8e 100644 --- a/src/qwen_image.hpp +++ b/src/qwen_image.hpp @@ -26,9 
+26,9 @@ namespace Qwen { blocks["linear_2"] = std::shared_ptr(new Linear(time_embed_dim, out_dim, sample_proj_bias)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* sample, - struct ggml_tensor* condition = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* sample, + ggml_tensor* condition = nullptr) { if (condition != nullptr) { auto cond_proj = std::dynamic_pointer_cast(blocks["cond_proj"]); sample = ggml_add(ctx->ggml_ctx, sample, cond_proj->forward(ctx, condition)); @@ -49,8 +49,8 @@ namespace Qwen { blocks["timestep_embedder"] = std::shared_ptr(new TimestepEmbedding(256, embedding_dim)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* timesteps) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* timesteps) { // timesteps: [N,] // return: [N, embedding_dim] auto timestep_embedder = std::dynamic_pointer_cast(blocks["timestep_embedder"]); @@ -107,10 +107,10 @@ namespace Qwen { } std::pair forward(GGMLRunnerContext* ctx, - struct ggml_tensor* img, - struct ggml_tensor* txt, - struct ggml_tensor* pe, - struct ggml_tensor* mask = nullptr) { + ggml_tensor* img, + ggml_tensor* txt, + ggml_tensor* pe, + ggml_tensor* mask = nullptr) { // img: [N, n_img_token, hidden_size] // txt: [N, n_txt_token, hidden_size] // pe: [n_img_token + n_txt_token, d_head/2, 2, 2] @@ -249,11 +249,11 @@ namespace Qwen { } virtual std::pair forward(GGMLRunnerContext* ctx, - struct ggml_tensor* img, - struct ggml_tensor* txt, - struct ggml_tensor* t_emb, - struct ggml_tensor* pe, - struct ggml_tensor* modulate_index = nullptr) { + ggml_tensor* img, + ggml_tensor* txt, + ggml_tensor* t_emb, + ggml_tensor* pe, + ggml_tensor* modulate_index = nullptr) { // img: [N, n_img_token, hidden_size] // txt: [N, n_txt_token, hidden_size] // pe: [n_img_token + n_txt_token, d_head/2, 2, 2] @@ -325,9 +325,9 @@ namespace Qwen { blocks["linear"] = std::shared_ptr(new Linear(conditioning_embedding_dim, embedding_dim * 
2, bias)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* c) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* c) { // x: [N, n_token, hidden_size] // c: [N, hidden_size] // return: [N, n_token, patch_size * patch_size * out_channels] @@ -389,12 +389,12 @@ namespace Qwen { blocks["proj_out"] = std::shared_ptr(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels)); } - struct ggml_tensor* forward_orig(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* timestep, - struct ggml_tensor* context, - struct ggml_tensor* pe, - struct ggml_tensor* modulate_index = nullptr) { + ggml_tensor* forward_orig(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* timestep, + ggml_tensor* context, + ggml_tensor* pe, + ggml_tensor* modulate_index = nullptr) { auto time_text_embed = std::dynamic_pointer_cast(blocks["time_text_embed"]); auto txt_norm = std::dynamic_pointer_cast(blocks["txt_norm"]); auto img_in = std::dynamic_pointer_cast(blocks["img_in"]); @@ -429,13 +429,13 @@ namespace Qwen { return img; } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* timestep, - struct ggml_tensor* context, - struct ggml_tensor* pe, - std::vector ref_latents = {}, - struct ggml_tensor* modulate_index = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* timestep, + ggml_tensor* context, + ggml_tensor* pe, + std::vector ref_latents = {}, + ggml_tensor* modulate_index = nullptr) { // Forward pass of DiT. 
// x: [N, C, H, W] // timestep: [N,] @@ -521,17 +521,17 @@ namespace Qwen { return "qwen_image"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { qwen_image.get_param_tensors(tensors, prefix); } - struct ggml_cgraph* build_graph(struct ggml_tensor* x, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, - std::vector ref_latents = {}, - bool increase_ref_index = false) { + ggml_cgraph* build_graph(ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + std::vector ref_latents = {}, + bool increase_ref_index = false) { GGML_ASSERT(x->ne[3] == 1); - struct ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE); + ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE); x = to_backend(x); context = to_backend(context); @@ -587,13 +587,13 @@ namespace Qwen { auto runner_ctx = get_context(); - struct ggml_tensor* out = qwen_image.forward(&runner_ctx, - x, - timesteps, - context, - pe, - ref_latents, - modulate_index); + ggml_tensor* out = qwen_image.forward(&runner_ctx, + x, + timesteps, + context, + pe, + ref_latents, + modulate_index); ggml_build_forward_expand(gf, out); @@ -601,17 +601,17 @@ namespace Qwen { } bool compute(int n_threads, - struct ggml_tensor* x, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, + ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, std::vector ref_latents = {}, bool increase_ref_index = false, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr) { + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size] - auto get_graph = [&]() -> struct ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, ref_latents, increase_ref_index); }; @@ -619,12 +619,12 @@ namespace Qwen { } void test() { - struct 
ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1GB params.mem_buffer = nullptr; params.no_alloc = false; - struct ggml_context* work_ctx = ggml_init(params); + ggml_context* work_ctx = ggml_init(params); GGML_ASSERT(work_ctx != nullptr); { @@ -641,7 +641,7 @@ namespace Qwen { auto context = load_tensor_from_file(work_ctx, "./qwen_image_context.bin"); print_ggml_tensor(context); - struct ggml_tensor* out = nullptr; + ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); compute(8, x, timesteps, context, {}, false, &out, work_ctx); diff --git a/src/rope.hpp b/src/rope.hpp index b26e4fcc..db577f5d 100644 --- a/src/rope.hpp +++ b/src/rope.hpp @@ -600,10 +600,10 @@ namespace Rope { return embed_nd(ids, bs, static_cast(theta), axes_dim, wrap_dims); } - __STATIC_INLINE__ struct ggml_tensor* apply_rope(struct ggml_context* ctx, - struct ggml_tensor* x, - struct ggml_tensor* pe, - bool rope_interleaved = true) { + __STATIC_INLINE__ ggml_tensor* apply_rope(ggml_context* ctx, + ggml_tensor* x, + ggml_tensor* pe, + bool rope_interleaved = true) { // x: [N, L, n_head, d_head] // pe: [L, d_head/2, 2, 2], [[cos, -sin], [sin, cos]] int64_t d_head = x->ne[0]; @@ -641,14 +641,14 @@ namespace Rope { return x_out; } - __STATIC_INLINE__ struct ggml_tensor* attention(GGMLRunnerContext* ctx, - struct ggml_tensor* q, - struct ggml_tensor* k, - struct ggml_tensor* v, - struct ggml_tensor* pe, - struct ggml_tensor* mask, - float kv_scale = 1.0f, - bool rope_interleaved = true) { + __STATIC_INLINE__ ggml_tensor* attention(GGMLRunnerContext* ctx, + ggml_tensor* q, + ggml_tensor* k, + ggml_tensor* v, + ggml_tensor* pe, + ggml_tensor* mask, + float kv_scale = 1.0f, + bool rope_interleaved = true) { // q,k,v: [N, L, n_head, d_head] // pe: [L, d_head/2, 2, 2] // return: [N, L, n_head*d_head] diff --git a/src/spectrum.hpp b/src/spectrum.hpp index 0b206c18..9542a8f3 100644 --- a/src/spectrum.hpp +++ b/src/spectrum.hpp @@ -57,7 +57,7 @@ 
struct SpectrumState { return (num_cached + 1) % ws != 0; } - void update(const struct ggml_tensor* denoised) { + void update(const ggml_tensor* denoised) { int64_t ne = ggml_nelements(denoised); const float* data = (const float*)denoised->data; @@ -76,7 +76,7 @@ struct SpectrumState { cnt++; } - void predict(struct ggml_tensor* denoised) { + void predict(ggml_tensor* denoised) { int64_t F = (int64_t)H_buf[0].size(); int K_curr = (int)H_buf.size(); int M1 = config.m + 1; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index b1243d69..a5dbc772 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -96,8 +96,7 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) { if (reuse_threshold == INFINITY) { if (params.mode == SD_CACHE_EASYCACHE) { reuse_threshold = 0.2; - } - else if (params.mode == SD_CACHE_UCACHE) { + } else if (params.mode == SD_CACHE_UCACHE) { reuse_threshold = 1.0; } } @@ -149,7 +148,7 @@ public: bool is_using_v_parameterization = false; bool is_using_edm_v_parameterization = false; - std::map tensors; + std::map tensors; // lora_name => multiplier std::unordered_map curr_lora_state; @@ -783,12 +782,12 @@ public: circular_y = sd_ctx_params->circular_y; } - struct ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(10 * 1024) * 1024; // 10M params.mem_buffer = nullptr; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check GGML_ASSERT(ctx != nullptr); ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS); calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data); @@ -998,21 +997,21 @@ public: } bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false) { - struct ggml_tensor* x_t = 
ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); + ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); ggml_set_f32(x_t, 0.5); - struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); + ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); ggml_set_f32(c, 0.5); - struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); + ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); ggml_set_f32(timesteps, 999); - struct ggml_tensor* concat = is_inpaint ? ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : nullptr; + ggml_tensor* concat = is_inpaint ? ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : nullptr; if (concat != nullptr) { ggml_set_f32(concat, 0); } - int64_t t0 = ggml_time_ms(); - struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); + int64_t t0 = ggml_time_ms(); + ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); DiffusionParams diffusion_params; diffusion_params.x = x_t; diffusion_params.timesteps = timesteps; @@ -1321,7 +1320,7 @@ public: condition_params); id_cond = std::get<0>(cond_tup); auto class_tokens_mask = std::get<1>(cond_tup); - struct ggml_tensor* id_embeds = nullptr; + ggml_tensor* id_embeds = nullptr; if (pmv2 && pm_params.id_embed_path != nullptr) { id_embeds = load_tensor_from_file(work_ctx, pm_params.id_embed_path); } @@ -1402,11 +1401,11 @@ public: float augmentation_level = 0.f, bool zero_out_masked = false) { // c_crossattn - int64_t t0 = ggml_time_ms(); - struct ggml_tensor* c_crossattn = get_clip_vision_output(work_ctx, init_image, true, -1, zero_out_masked); + int64_t t0 = ggml_time_ms(); + ggml_tensor* c_crossattn = get_clip_vision_output(work_ctx, init_image, true, -1, zero_out_masked); // c_concat - struct ggml_tensor* c_concat = nullptr; + ggml_tensor* c_concat = nullptr; { if (zero_out_masked) { c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / get_vae_scale_factor(), height / 
get_vae_scale_factor(), 4, 1); @@ -1426,7 +1425,7 @@ public: sd_image_to_ggml_tensor(init_image, init_img); } if (augmentation_level > 0.f) { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_img); + ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_img); ggml_ext_im_set_randn_f32(noise, rng); // encode_pixels += torch.randn_like(pixels) * augmentation_level ggml_ext_tensor_scale_inplace(noise, augmentation_level); @@ -1437,7 +1436,7 @@ public: } // y - struct ggml_tensor* y = nullptr; + ggml_tensor* y = nullptr; { y = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->get_adm_in_channels()); int out_dim = 256; @@ -1486,7 +1485,7 @@ public: void preview_image(ggml_context* work_ctx, int step, - struct ggml_tensor* latents, + ggml_tensor* latents, enum SDVersion version, preview_t preview_mode, ggml_tensor* result, @@ -1806,24 +1805,24 @@ public: cachedit_state.set_sigmas(sigmas); } - size_t steps = sigmas.size() - 1; - struct ggml_tensor* x = ggml_ext_dup_and_cpy_tensor(work_ctx, init_latent); + size_t steps = sigmas.size() - 1; + ggml_tensor* x = ggml_ext_dup_and_cpy_tensor(work_ctx, init_latent); if (noise) { x = denoiser->noise_scaling(sigmas[0], noise, x); } - struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x); + ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x); bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != nullptr; bool has_img_cond = cfg_scale != img_cfg_scale && img_cond.c_crossattn != nullptr; bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; // denoise wrapper - struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* out_uncond = nullptr; - struct ggml_tensor* out_skip = nullptr; - struct ggml_tensor* out_img_cond = nullptr; + ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); + ggml_tensor* out_uncond = nullptr; + ggml_tensor* out_skip = nullptr; + ggml_tensor* out_img_cond = nullptr; if (has_unconditioned) { out_uncond = 
ggml_dup_tensor(work_ctx, x); @@ -1839,12 +1838,12 @@ public: if (has_img_cond) { out_img_cond = ggml_dup_tensor(work_ctx, x); } - struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); + ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); int64_t t0 = ggml_time_us(); - struct ggml_tensor* preview_tensor = nullptr; - auto sd_preview_mode = sd_get_preview_mode(); + ggml_tensor* preview_tensor = nullptr; + auto sd_preview_mode = sd_get_preview_mode(); if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) { int64_t W = x->ne[0] * get_vae_scale_factor(); int64_t H = x->ne[1] * get_vae_scale_factor(); @@ -1884,7 +1883,7 @@ public: easycache_state.begin_step(easycache_step_index, sigma); } - auto easycache_before_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) -> bool { + auto easycache_before_condition = [&](const SDCondition* condition, ggml_tensor* output_tensor) -> bool { if (!easycache_step_active || condition == nullptr || output_tensor == nullptr) { return false; } @@ -1895,7 +1894,7 @@ public: easycache_step_index); }; - auto easycache_after_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) { + auto easycache_after_condition = [&](const SDCondition* condition, ggml_tensor* output_tensor) { if (!easycache_step_active || condition == nullptr || output_tensor == nullptr) { return; } @@ -1914,7 +1913,7 @@ public: ucache_state.begin_step(ucache_step_index, sigma); } - auto ucache_before_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) -> bool { + auto ucache_before_condition = [&](const SDCondition* condition, ggml_tensor* output_tensor) -> bool { if (!ucache_step_active || condition == nullptr || output_tensor == nullptr) { return false; } @@ -1925,7 +1924,7 @@ public: ucache_step_index); }; - auto ucache_after_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) { + auto ucache_after_condition = [&](const SDCondition* 
condition, ggml_tensor* output_tensor) { if (!ucache_step_active || condition == nullptr || output_tensor == nullptr) { return; } @@ -1944,7 +1943,7 @@ public: cachedit_state.begin_step(cachedit_step_index, sigma); } - auto cachedit_before_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) -> bool { + auto cachedit_before_condition = [&](const SDCondition* condition, ggml_tensor* output_tensor) -> bool { if (!cachedit_step_active || condition == nullptr || output_tensor == nullptr) { return false; } @@ -1955,7 +1954,7 @@ public: cachedit_step_index); }; - auto cachedit_after_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) { + auto cachedit_after_condition = [&](const SDCondition* condition, ggml_tensor* output_tensor) { if (!cachedit_step_active || condition == nullptr || output_tensor == nullptr) { return; } @@ -1968,7 +1967,7 @@ public: return cachedit_step_active && cachedit_state.is_step_skipped(); }; - auto cache_before_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) -> bool { + auto cache_before_condition = [&](const SDCondition* condition, ggml_tensor* output_tensor) -> bool { if (easycache_step_active) { return easycache_before_condition(condition, output_tensor); } else if (ucache_step_active) { @@ -1979,7 +1978,7 @@ public: return false; }; - auto cache_after_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) { + auto cache_after_condition = [&](const SDCondition* condition, ggml_tensor* output_tensor) { if (easycache_step_active) { easycache_after_condition(condition, output_tensor); } else if (ucache_step_active) { @@ -2056,7 +2055,7 @@ public: } } - std::vector controls; + std::vector controls; if (control_hint != nullptr && control_net != nullptr) { if (control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector)) { @@ -2079,7 +2078,7 @@ public: diffusion_params.vace_strength = 
vace_strength; const SDCondition* active_condition = nullptr; - struct ggml_tensor** active_output = &out_cond; + ggml_tensor** active_output = &out_cond; if (start_merge_step == -1 || step <= start_merge_step) { // cond diffusion_params.context = cond.c_crossattn; @@ -2922,7 +2921,7 @@ enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_me } sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, - struct ggml_context* work_ctx, + ggml_context* work_ctx, ggml_tensor* init_latent, std::string prompt, std::string negative_prompt, @@ -2999,19 +2998,19 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, } // Control net hint - struct ggml_tensor* image_hint = nullptr; + ggml_tensor* image_hint = nullptr; if (control_image.data != nullptr) { image_hint = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); sd_image_to_ggml_tensor(control_image, image_hint); } // Sample - std::vector final_latents; // collect latents to decode + std::vector final_latents; // collect latents to decode int C = sd_ctx->sd->get_latent_channel(); int W = width / sd_ctx->sd->get_vae_scale_factor(); int H = height / sd_ctx->sd->get_vae_scale_factor(); - struct ggml_tensor* control_latent = nullptr; + ggml_tensor* control_latent = nullptr; if (sd_version_is_control(sd_ctx->sd->version) && image_hint != nullptr) { control_latent = sd_ctx->sd->encode_first_stage(work_ctx, image_hint); ggml_ext_tensor_scale_inplace(control_latent, control_strength); @@ -3107,8 +3106,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, sd_ctx->sd->rng->manual_seed(cur_seed); sd_ctx->sd->sampler_rng->manual_seed(cur_seed); - struct ggml_tensor* x_t = init_latent; - struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); + ggml_tensor* x_t = init_latent; + ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); ggml_ext_im_set_randn_f32(noise, sd_ctx->sd->rng); int start_merge_step = -1; @@ -3119,30 +3118,30 @@ 
sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step); } - struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, - sd_ctx->sd->diffusion_model, - true, - x_t, - noise, - cond, - uncond, - img_cond, - image_hint, - control_strength, - guidance, - eta, - shifted_timestep, - sample_method, - sigmas, - start_merge_step, - id_cond, - ref_latents, - increase_ref_index, - denoise_mask, - nullptr, - 1.0f, - cache_params); - int64_t sampling_end = ggml_time_ms(); + ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, + sd_ctx->sd->diffusion_model, + true, + x_t, + noise, + cond, + uncond, + img_cond, + image_hint, + control_strength, + guidance, + eta, + shifted_timestep, + sample_method, + sigmas, + start_merge_step, + id_cond, + ref_latents, + increase_ref_index, + denoise_mask, + nullptr, + 1.0f, + cache_params); + int64_t sampling_end = ggml_time_ms(); if (x_0 != nullptr) { // print_ggml_tensor(x_0); LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); @@ -3160,10 +3159,10 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, // Decode to image LOG_INFO("decoding %zu latents", final_latents.size()); - std::vector decoded_images; // collect decoded images + std::vector decoded_images; // collect decoded images for (size_t i = 0; i < final_latents.size(); i++) { - t1 = ggml_time_ms(); - struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */); + t1 = ggml_time_ms(); + ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */); // print_ggml_tensor(img); if (img != nullptr) { decoded_images.push_back(img); @@ -3255,13 +3254,13 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g return nullptr; } - struct ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1G params.mem_buffer = nullptr; params.no_alloc = false; // 
LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context* work_ctx = ggml_init(params); + ggml_context* work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); return nullptr; @@ -3605,7 +3604,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } if (high_noise_sample_steps < 0) { - // timesteps ∝ sigmas for Flow models (like wan2.2 a14b) + // timesteps ∝ sigmas for Flow models (like wan2.2 a14b) for (size_t i = 0; i < sigmas.size(); ++i) { if (sigmas[i] < sd_vid_gen_params->moe_boundary) { high_noise_sample_steps = static_cast(i); @@ -3615,13 +3614,13 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s LOG_DEBUG("switching from high noise model at step %d", high_noise_sample_steps); } - struct ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1G params.mem_buffer = nullptr; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context* work_ctx = ggml_init(params); + ggml_context* work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); return nullptr; @@ -3853,9 +3852,9 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s int T = static_cast(init_latent->ne[2]); int C = sd_ctx->sd->get_latent_channel(); - struct ggml_tensor* final_latent; - struct ggml_tensor* x_t = init_latent; - struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, T, C); + ggml_tensor* final_latent; + ggml_tensor* x_t = init_latent; + ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, T, C); ggml_ext_im_set_randn_f32(noise, sd_ctx->sd->rng); // High Noise Sample if (high_noise_sample_steps > 0) { @@ -3955,8 +3954,8 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s int64_t t4 = ggml_time_ms(); LOG_INFO("generating latent video completed, taking %.2fs", (t4 - t2) * 1.0f / 1000); -
struct ggml_tensor* vid = sd_ctx->sd->decode_first_stage(work_ctx, final_latent, true); - int64_t t5 = ggml_time_ms(); + ggml_tensor* vid = sd_ctx->sd->decode_first_stage(work_ctx, final_latent, true); + int64_t t5 = ggml_time_ms(); LOG_INFO("decode_first_stage completed, taking %.2fs", (t5 - t4) * 1.0f / 1000); if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->first_stage_model->free_params_buffer(); diff --git a/src/t5.hpp b/src/t5.hpp index d789c5bd..5f8c99dd 100644 --- a/src/t5.hpp +++ b/src/t5.hpp @@ -211,9 +211,9 @@ protected: // implementation. It's based on the following three ideas: // // 1. Because it uses the *unigram* model: - // best_score(x1, x2, …, xt) = best_score(x1, x2, …, x{t-1}) + score(xt) + // best_score(x1, x2, …, xt) = best_score(x1, x2, …, x{t-1}) + score(xt) // Deciding the best path (and score) can be decoupled into two isolated - // terms: (a) the best path ended before the last token `best_score(x1, x2, …, + // terms: (a) the best path ended before the last token `best_score(x1, x2, …, // x{t-1})`, and (b) the last token and its `score(xt)`. The two terms are // not related to each other at all.
// @@ -462,7 +462,7 @@ protected: int64_t hidden_size; float eps; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { enum ggml_type wtype = GGML_TYPE_F32; params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); } @@ -473,10 +473,10 @@ public: : hidden_size(hidden_size), eps(eps) {} - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { - struct ggml_tensor* w = params["weight"]; - x = ggml_rms_norm(ctx->ggml_ctx, x, eps); - x = ggml_mul(ctx->ggml_ctx, x, w); + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + ggml_tensor* w = params["weight"]; + x = ggml_rms_norm(ctx->ggml_ctx, x, eps); + x = ggml_mul(ctx->ggml_ctx, x, w); return x; } }; @@ -488,7 +488,7 @@ public: blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { // x: [N, n_token, model_dim] auto wi = std::dynamic_pointer_cast(blocks["wi"]); auto wo = std::dynamic_pointer_cast(blocks["wo"]); @@ -510,7 +510,7 @@ public: blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false, false, false, scale)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { // x: [N, n_token, model_dim] auto wi_0 = std::dynamic_pointer_cast(blocks["wi_0"]); auto wi_1 = std::dynamic_pointer_cast(blocks["wi_1"]); @@ -531,7 +531,7 @@ public: blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) 
override { // x: [N, n_token, model_dim] auto DenseReluDense = std::dynamic_pointer_cast(blocks["DenseReluDense"]); auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); @@ -570,8 +570,8 @@ public: } } - struct ggml_tensor* compute_bias(GGMLRunnerContext* ctx, - struct ggml_tensor* relative_position_bucket) { + ggml_tensor* compute_bias(GGMLRunnerContext* ctx, + ggml_tensor* relative_position_bucket) { auto relative_attention_bias = std::dynamic_pointer_cast(blocks["relative_attention_bias"]); auto values = relative_attention_bias->forward(ctx, relative_position_bucket); // shape (query_length, key_length, num_heads) @@ -580,11 +580,11 @@ public: } // x: [N, n_token, model_dim] - std::pair forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* past_bias = nullptr, - struct ggml_tensor* mask = nullptr, - struct ggml_tensor* relative_position_bucket = nullptr) { + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { auto q_proj = std::dynamic_pointer_cast(blocks["q"]); auto k_proj = std::dynamic_pointer_cast(blocks["k"]); auto v_proj = std::dynamic_pointer_cast(blocks["v"]); @@ -629,11 +629,11 @@ public: blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); } - std::pair forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* past_bias = nullptr, - struct ggml_tensor* mask = nullptr, - struct ggml_tensor* relative_position_bucket = nullptr) { + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { // x: [N, n_token, model_dim] auto SelfAttention = std::dynamic_pointer_cast(blocks["SelfAttention"]); auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); @@ -655,11 +655,11 @@ public: blocks["layer.1"] = std::shared_ptr(new 
T5LayerFF(model_dim, ff_dim)); } - std::pair forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* past_bias = nullptr, - struct ggml_tensor* mask = nullptr, - struct ggml_tensor* relative_position_bucket = nullptr) { + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { // x: [N, n_token, model_dim] auto layer_0 = std::dynamic_pointer_cast(blocks["layer.0"]); auto layer_1 = std::dynamic_pointer_cast(blocks["layer.1"]); @@ -690,11 +690,11 @@ public: blocks["final_layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* past_bias = nullptr, - struct ggml_tensor* attention_mask = nullptr, - struct ggml_tensor* relative_position_bucket = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* attention_mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { // x: [N, n_token, model_dim] for (int i = 0; i < num_layers; i++) { auto block = std::dynamic_pointer_cast(blocks["block." 
+ std::to_string(i)]); @@ -737,11 +737,11 @@ public: params.model_dim)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* input_ids, - struct ggml_tensor* past_bias = nullptr, - struct ggml_tensor* attention_mask = nullptr, - struct ggml_tensor* relative_position_bucket = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* past_bias = nullptr, + ggml_tensor* attention_mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { // input_ids: [N, n_token] auto shared = std::dynamic_pointer_cast(blocks["shared"]); @@ -776,14 +776,14 @@ struct T5Runner : public GGMLRunner { return "t5"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { model.get_param_tensors(tensors, prefix); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* input_ids, - struct ggml_tensor* relative_position_bucket, - struct ggml_tensor* attention_mask = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* relative_position_bucket, + ggml_tensor* attention_mask = nullptr) { size_t N = input_ids->ne[1]; size_t n_token = input_ids->ne[0]; @@ -791,9 +791,9 @@ struct T5Runner : public GGMLRunner { return hidden_states; } - struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, - struct ggml_tensor* attention_mask = nullptr) { - struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); + ggml_cgraph* build_graph(ggml_tensor* input_ids, + ggml_tensor* attention_mask = nullptr) { + ggml_cgraph* gf = ggml_new_graph(compute_ctx); input_ids = to_backend(input_ids); attention_mask = to_backend(attention_mask); @@ -813,8 +813,8 @@ struct T5Runner : public GGMLRunner { input_ids->ne[0]); set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data()); - auto runner_ctx = get_context(); - struct ggml_tensor* hidden_states = 
forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask); + auto runner_ctx = get_context(); + ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask); ggml_build_forward_expand(gf, hidden_states); @@ -822,11 +822,11 @@ struct T5Runner : public GGMLRunner { } bool compute(const int n_threads, - struct ggml_tensor* input_ids, - struct ggml_tensor* attention_mask, + ggml_tensor* input_ids, + ggml_tensor* attention_mask, ggml_tensor** output, ggml_context* output_ctx = nullptr) { - auto get_graph = [&]() -> struct ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(input_ids, attention_mask); }; return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); @@ -912,7 +912,7 @@ struct T5Embedder { : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) { } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { model.get_param_tensors(tensors, prefix); } @@ -962,17 +962,17 @@ struct T5Embedder { } void test() { - struct ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB params.mem_buffer = nullptr; params.no_alloc = false; - struct ggml_context* work_ctx = ggml_init(params); + ggml_context* work_ctx = ggml_init(params); GGML_ASSERT(work_ctx != nullptr); { std::string text("a lovely cat"); - // std::string text("一只可爱的猫"); // umt5 chinease test + // std::string text("一只可爱的猫"); // umt5 chinease test auto tokens_and_weights = tokenize(text, 512, true); std::vector& tokens = std::get<0>(tokens_and_weights); std::vector& weights = std::get<1>(tokens_and_weights); @@ -981,9 +981,9 @@ struct T5Embedder { printf("%d ", token); } printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - auto attention_mask = vector_to_ggml_tensor(work_ctx, masks); - struct ggml_tensor* out
= nullptr; + auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); + auto attention_mask = vector_to_ggml_tensor(work_ctx, masks); + ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); model.compute(8, input_ids, attention_mask, &out, work_ctx); diff --git a/src/tae.hpp b/src/tae.hpp index 60df7b29..3df09e4e 100644 --- a/src/tae.hpp +++ b/src/tae.hpp @@ -37,7 +37,7 @@ public: } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { // x: [n, n_in, h, w] // return: [n, n_out, h, w] @@ -107,7 +107,7 @@ public: blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1})); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { // x: [n, in_channels, h, w] // return: [n, z_channels, h/8, w/8] @@ -157,7 +157,7 @@ public: blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* z) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* z) override { // z: [n, z_channels, h, w] // return: [n, out_channels, h*8, w*8] @@ -192,7 +192,7 @@ public: blocks["conv"] = std::shared_ptr(new Conv2d(channels * stride, channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, false)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { auto conv = std::dynamic_pointer_cast(blocks["conv"]); auto h = x; if (stride != 1) { @@ -212,7 +212,7 @@ public: blocks["conv"] = std::shared_ptr(new Conv2d(channels, channels * stride, {1, 1}, {1, 1}, {0, 0}, {1, 1}, false)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + ggml_tensor* 
forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { auto conv = std::dynamic_pointer_cast(blocks["conv"]); auto h = conv->forward(ctx, x); if (stride != 1) { @@ -236,7 +236,7 @@ public: } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* past) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* past) { // x: [n, channels, h, w] auto conv0 = std::dynamic_pointer_cast(blocks["conv.0"]); auto conv1 = std::dynamic_pointer_cast(blocks["conv.2"]); @@ -260,10 +260,10 @@ public: } }; -struct ggml_tensor* patchify(struct ggml_context* ctx, - struct ggml_tensor* x, - int64_t patch_size, - int64_t b = 1) { +ggml_tensor* patchify(ggml_context* ctx, + ggml_tensor* x, + int64_t patch_size, + int64_t b = 1) { // x: [f, b*c, h*q, w*r] // return: [f, b*c*r*q, h, w] if (patch_size == 1) { @@ -289,10 +289,10 @@ struct ggml_tensor* patchify(struct ggml_context* ctx, return x; } -struct ggml_tensor* unpatchify(struct ggml_context* ctx, - struct ggml_tensor* x, - int64_t patch_size, - int64_t b = 1) { +ggml_tensor* unpatchify(ggml_context* ctx, + ggml_tensor* x, + int64_t patch_size, + int64_t b = 1) { // x: [f, b*c*r*q, h, w] // return: [f, b*c, h*q, w*r] if (patch_size == 1) { @@ -339,7 +339,7 @@ public: blocks[std::to_string(index)] = std::shared_ptr(new Conv2d(hidden, z_channels, {3, 3}, {1, 1}, {1, 1})); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* z) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* z) override { auto first_conv = std::dynamic_pointer_cast(blocks["0"]); if (patch_size > 1) { @@ -396,7 +396,7 @@ public: blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels[num_layers], out_channels * patch_size * patch_size, {3, 3}, {1, 1}, {1, 1})); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* z) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* z) override { auto first_conv = 
std::dynamic_pointer_cast(blocks["1"]); // Clamp() @@ -459,7 +459,7 @@ public: } } - struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) { + ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) { auto decoder = std::dynamic_pointer_cast(blocks["decoder"]); if (sd_version_is_wan(version)) { // (W, H, C, T) -> (W, H, T, C) @@ -473,7 +473,7 @@ public: return result; } - struct ggml_tensor* encode(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* encode(GGMLRunnerContext* ctx, ggml_tensor* x) { auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); // (W, H, T, C) -> (W, H, C, T) x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 1, 3, 2)); @@ -519,7 +519,7 @@ public: } } - struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) { + ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) { auto decoder = std::dynamic_pointer_cast(blocks["decoder.layers"]); if (taef2) { z = unpatchify(ctx->ggml_ctx, z, 2); @@ -527,7 +527,7 @@ public: return decoder->forward(ctx, z); } - struct ggml_tensor* encode(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* encode(GGMLRunnerContext* ctx, ggml_tensor* x) { auto encoder = std::dynamic_pointer_cast(blocks["encoder.layers"]); auto z = encoder->forward(ctx, x); if (taef2) { @@ -558,7 +558,7 @@ struct TinyImageAutoEncoder : public VAE { return "taesd"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { taesd.get_param_tensors(tensors, prefix); } @@ -578,21 +578,21 @@ struct TinyImageAutoEncoder : public VAE { return taesd.z_channels; } - struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { - struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); - z = to_backend(z); - auto runner_ctx = get_context(); - struct ggml_tensor* out = decode_graph ? 
taesd.decode(&runner_ctx, z) : taesd.encode(&runner_ctx, z); + ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) { + ggml_cgraph* gf = ggml_new_graph(compute_ctx); + z = to_backend(z); + auto runner_ctx = get_context(); + ggml_tensor* out = decode_graph ? taesd.decode(&runner_ctx, z) : taesd.encode(&runner_ctx, z); ggml_build_forward_expand(gf, out); return gf; } bool _compute(const int n_threads, - struct ggml_tensor* z, + ggml_tensor* z, bool decode_graph, - struct ggml_tensor** output, - struct ggml_context* output_ctx = nullptr) { - auto get_graph = [&]() -> struct ggml_cgraph* { + ggml_tensor** output, + ggml_context* output_ctx = nullptr) { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(z, decode_graph); }; @@ -621,7 +621,7 @@ struct TinyVideoAutoEncoder : public VAE { return "taehv"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { taehv.get_param_tensors(tensors, prefix); } @@ -641,21 +641,21 @@ struct TinyVideoAutoEncoder : public VAE { return taehv.z_channels; } - struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { - struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); - z = to_backend(z); - auto runner_ctx = get_context(); - struct ggml_tensor* out = decode_graph ? taehv.decode(&runner_ctx, z) : taehv.encode(&runner_ctx, z); + ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) { + ggml_cgraph* gf = ggml_new_graph(compute_ctx); + z = to_backend(z); + auto runner_ctx = get_context(); + ggml_tensor* out = decode_graph ? 
taehv.decode(&runner_ctx, z) : taehv.encode(&runner_ctx, z); ggml_build_forward_expand(gf, out); return gf; } bool _compute(const int n_threads, - struct ggml_tensor* z, + ggml_tensor* z, bool decode_graph, - struct ggml_tensor** output, - struct ggml_context* output_ctx = nullptr) { - auto get_graph = [&]() -> struct ggml_cgraph* { + ggml_tensor** output, + ggml_context* output_ctx = nullptr) { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(z, decode_graph); }; diff --git a/src/unet.hpp b/src/unet.hpp index e0fd4c52..f7aa3f05 100644 --- a/src/unet.hpp +++ b/src/unet.hpp @@ -60,10 +60,10 @@ public: blocks["time_mixer"] = std::shared_ptr(new AlphaBlender()); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* context, - int timesteps) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* context, + int timesteps) { // x: [N, in_channels, h, w] aka [b*t, in_channels, h, w], t == timesteps // context: [N, max_position(aka n_context), hidden_size(aka context_dim)] aka [b*t, n_context, context_dim], t == timesteps // t_emb: [N, in_channels] aka [b*t, in_channels] @@ -388,11 +388,11 @@ public: blocks["out.2"] = std::shared_ptr(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); } - struct ggml_tensor* resblock_forward(std::string name, - GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* emb, - int num_video_frames) { + ggml_tensor* resblock_forward(std::string name, + GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* emb, + int num_video_frames) { if (version == VERSION_SVD) { auto block = std::dynamic_pointer_cast(blocks[name]); @@ -404,11 +404,11 @@ public: } } - struct ggml_tensor* attention_layer_forward(std::string name, - GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* context, - int timesteps) { + ggml_tensor* attention_layer_forward(std::string name, + GGMLRunnerContext* ctx, + ggml_tensor* x, + 
ggml_tensor* context, + int timesteps) { if (version == VERSION_SVD) { auto block = std::dynamic_pointer_cast(blocks[name]); @@ -420,15 +420,15 @@ public: } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, - struct ggml_tensor* c_concat = nullptr, - struct ggml_tensor* y = nullptr, - int num_video_frames = -1, - std::vector controls = {}, - float control_strength = 0.f) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* c_concat = nullptr, + ggml_tensor* y = nullptr, + int num_video_frames = -1, + std::vector controls = {}, + float control_strength = 0.f) { // x: [N, in_channels, h, w] or [N, in_channels/2, h, w] // timesteps: [N,] // context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768] @@ -480,7 +480,7 @@ public: } // input_blocks - std::vector hs; + std::vector hs; // input block 0 auto h = input_blocks_0_0->forward(ctx, x); @@ -605,19 +605,19 @@ struct UNetModelRunner : public GGMLRunner { return "unet"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { unet.get_param_tensors(tensors, prefix); } - struct ggml_cgraph* build_graph(struct ggml_tensor* x, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, - struct ggml_tensor* c_concat = nullptr, - struct ggml_tensor* y = nullptr, - int num_video_frames = -1, - std::vector controls = {}, - float control_strength = 0.f) { - struct ggml_cgraph* gf = new_graph_custom(UNET_GRAPH_SIZE); + ggml_cgraph* build_graph(ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* c_concat = nullptr, + ggml_tensor* y = nullptr, + int num_video_frames = -1, + std::vector controls = {}, + float control_strength = 0.f) { + ggml_cgraph* gf = new_graph_custom(UNET_GRAPH_SIZE); if 
(num_video_frames == -1) { num_video_frames = static_cast(x->ne[3]); @@ -635,15 +635,15 @@ struct UNetModelRunner : public GGMLRunner { auto runner_ctx = get_context(); - struct ggml_tensor* out = unet.forward(&runner_ctx, - x, - timesteps, - context, - c_concat, - y, - num_video_frames, - controls, - control_strength); + ggml_tensor* out = unet.forward(&runner_ctx, + x, + timesteps, + context, + c_concat, + y, + num_video_frames, + controls, + control_strength); ggml_build_forward_expand(gf, out); @@ -651,22 +651,22 @@ struct UNetModelRunner : public GGMLRunner { } bool compute(int n_threads, - struct ggml_tensor* x, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, - struct ggml_tensor* c_concat, - struct ggml_tensor* y, - int num_video_frames = -1, - std::vector controls = {}, - float control_strength = 0.f, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr) { + ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* c_concat, + ggml_tensor* y, + int num_video_frames = -1, + std::vector controls = {}, + float control_strength = 0.f, + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size] // c_concat: [N, in_channels, h, w] or [1, in_channels, h, w] // y: [N, adm_in_channels] or [1, adm_in_channels] - auto get_graph = [&]() -> struct ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength); }; @@ -674,12 +674,12 @@ struct UNetModelRunner : public GGMLRunner { } void test() { - struct ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB params.mem_buffer = nullptr; params.no_alloc = false; - struct ggml_context* work_ctx = ggml_init(params); + ggml_context* work_ctx = 
ggml_init(params); GGML_ASSERT(work_ctx != nullptr); { @@ -703,7 +703,7 @@ struct UNetModelRunner : public GGMLRunner { ggml_set_f32(y, 0.5f); // print_ggml_tensor(y); - struct ggml_tensor* out = nullptr; + ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); compute(8, x, timesteps, context, nullptr, y, num_video_frames, {}, 0.f, &out, work_ctx); diff --git a/src/upscaler.cpp b/src/upscaler.cpp index 41825ee5..18e185d0 100644 --- a/src/upscaler.cpp +++ b/src/upscaler.cpp @@ -72,13 +72,13 @@ struct UpscalerGGML { LOG_INFO("upscaling from (%i x %i) to (%i x %i)", input_image.width, input_image.height, output_width, output_height); - struct ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1G params.mem_buffer = nullptr; params.no_alloc = false; // draft context - struct ggml_context* upscale_ctx = ggml_init(params); + ggml_context* upscale_ctx = ggml_init(params); if (!upscale_ctx) { LOG_ERROR("ggml_init() failed"); return upscaled_image; diff --git a/src/vae.hpp b/src/vae.hpp index ad83e01a..dafc0d4b 100644 --- a/src/vae.hpp +++ b/src/vae.hpp @@ -6,12 +6,12 @@ struct VAE : public GGMLRunner { protected: SDVersion version; - bool scale_input = true; + bool scale_input = true; virtual bool _compute(const int n_threads, - struct ggml_tensor* z, + ggml_tensor* z, bool decode_graph, - struct ggml_tensor** output, - struct ggml_context* output_ctx) = 0; + ggml_tensor** output, + ggml_context* output_ctx) = 0; public: VAE(SDVersion version, ggml_backend_t backend, bool offload_params_to_cpu) @@ -186,7 +186,7 @@ public: virtual ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) = 0; virtual ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) = 0; virtual ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) = 0; - virtual void get_param_tensors(std::map& tensors, const std::string prefix) = 0; + 
virtual void get_param_tensors(std::map& tensors, const std::string prefix) = 0; virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); }; }; @@ -199,10 +199,10 @@ struct FakeVAE : public VAE { } bool _compute(const int n_threads, - struct ggml_tensor* z, + ggml_tensor* z, bool decode_graph, - struct ggml_tensor** output, - struct ggml_context* output_ctx) override { + ggml_tensor** output, + ggml_context* output_ctx) override { if (*output == nullptr && output_ctx != nullptr) { *output = ggml_dup_tensor(output_ctx, z); } @@ -225,7 +225,7 @@ struct FakeVAE : public VAE { return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); } - void get_param_tensors(std::map& tensors, const std::string prefix) override {} + void get_param_tensors(std::map& tensors, const std::string prefix) override {} std::string get_desc() override { return "fake_vae"; diff --git a/src/wan.hpp b/src/wan.hpp index 23119553..af8acbfd 100644 --- a/src/wan.hpp +++ b/src/wan.hpp @@ -25,7 +25,7 @@ namespace WAN { std::tuple dilation; bool bias; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { params["weight"] = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, std::get<2>(kernel_size), @@ -53,11 +53,11 @@ namespace WAN { dilation(std::move(dilation)), bias(bias) {} - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* cache_x = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* cache_x = nullptr) { // x: [N*IC, ID, IH, IW] // result: x: [N*OC, ID, IH, IW] - struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = nullptr; + ggml_tensor* w = params["weight"]; + ggml_tensor* b = nullptr; if (bias) { b = params["bias"]; } @@ -86,7 +86,7 @@ namespace WAN { protected: int64_t dim; - void 
init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { ggml_type wtype = GGML_TYPE_F32; auto iter = tensor_storage_map.find(prefix + "gamma"); if (iter != tensor_storage_map.end()) { @@ -100,16 +100,16 @@ namespace WAN { RMS_norm(int64_t dim) : dim(dim) {} - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { // x: [N*IC, ID, IH, IW], IC == dim // assert N == 1 - struct ggml_tensor* w = params["gamma"]; - w = ggml_reshape_1d(ctx->ggml_ctx, w, ggml_nelements(w)); - auto h = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 3, 0, 1, 2)); // [ID, IH, IW, N*IC] - h = ggml_rms_norm(ctx->ggml_ctx, h, 1e-12f); - h = ggml_mul(ctx->ggml_ctx, h, w); - h = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, h, 1, 2, 3, 0)); + ggml_tensor* w = params["gamma"]; + w = ggml_reshape_1d(ctx->ggml_ctx, w, ggml_nelements(w)); + auto h = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 3, 0, 1, 2)); // [ID, IH, IW, N*IC] + h = ggml_rms_norm(ctx->ggml_ctx, h, 1e-12f); + h = ggml_mul(ctx->ggml_ctx, h, w); + h = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, h, 1, 2, 3, 0)); return h; } @@ -148,12 +148,12 @@ namespace WAN { } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - int64_t b, - std::vector& feat_cache, - int& feat_idx, - int chunk_idx) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + int64_t b, + std::vector& feat_cache, + int& feat_idx, + int chunk_idx) { // x: [b*c, t, h, w] GGML_ASSERT(b == 1); int64_t c = x->ne[3] / b; @@ -254,9 +254,9 @@ namespace WAN { GGML_ASSERT(in_channels * factor % out_channels == 0); group_size = in_channels * 
factor / out_channels; } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - int64_t B = 1) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + int64_t B = 1) { // x: [B*IC, T, H, W] // return: [B*OC, T/factor_t, H/factor_s, W/factor_s] GGML_ASSERT(B == 1); @@ -301,10 +301,10 @@ namespace WAN { GGML_ASSERT(out_channels * factor % in_channels == 0); repeats = out_channels * factor / in_channels; } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - bool first_chunk = false, - int64_t B = 1) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + bool first_chunk = false, + int64_t B = 1) { // x: [B*IC, T, H, W] // return: [B*OC, T/factor_t, H/factor_s, W/factor_s] GGML_ASSERT(B == 1); @@ -356,14 +356,14 @@ namespace WAN { } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - int64_t b, - std::vector& feat_cache, - int& feat_idx) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + int64_t b, + std::vector& feat_cache, + int& feat_idx) { // x: [b*c, t, h, w] GGML_ASSERT(b == 1); - struct ggml_tensor* h = x; + ggml_tensor* h = x; if (in_dim != out_dim) { auto shortcut = std::dynamic_pointer_cast(blocks["shortcut"]); @@ -430,15 +430,15 @@ namespace WAN { } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - int64_t b, - std::vector& feat_cache, - int& feat_idx, - int chunk_idx) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + int64_t b, + std::vector& feat_cache, + int& feat_idx, + int chunk_idx) { // x: [b*c, t, h, w] GGML_ASSERT(b == 1); - struct ggml_tensor* x_copy = x; + ggml_tensor* x_copy = x; auto avg_shortcut = std::dynamic_pointer_cast(blocks["avg_shortcut"]); @@ -492,15 +492,15 @@ namespace WAN { } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - int64_t b, - std::vector& feat_cache, - int& feat_idx, - int chunk_idx) { + ggml_tensor* 
forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + int64_t b, + std::vector& feat_cache, + int& feat_idx, + int chunk_idx) { // x: [b*c, t, h, w] GGML_ASSERT(b == 1); - struct ggml_tensor* x_copy = x; + ggml_tensor* x_copy = x; int i = 0; for (; i < mult; i++) { @@ -537,9 +537,9 @@ namespace WAN { blocks["proj"] = std::shared_ptr(new Conv2d(dim, dim, {1, 1})); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - int64_t b) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + int64_t b) { // x: [b*c, t, h, w] GGML_ASSERT(b == 1); auto norm = std::dynamic_pointer_cast(blocks["norm"]); @@ -659,12 +659,12 @@ namespace WAN { blocks["head.2"] = std::shared_ptr(new CausalConv3d(out_dim, z_dim, {3, 3, 3}, {1, 1, 1}, {1, 1, 1})); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - int64_t b, - std::vector& feat_cache, - int& feat_idx, - int chunk_idx) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + int64_t b, + std::vector& feat_cache, + int& feat_idx, + int chunk_idx) { // x: [b*c, t, h, w] GGML_ASSERT(b == 1); auto conv1 = std::dynamic_pointer_cast(blocks["conv1"]); @@ -830,12 +830,12 @@ namespace WAN { } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - int64_t b, - std::vector& feat_cache, - int& feat_idx, - int chunk_idx) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + int64_t b, + std::vector& feat_cache, + int& feat_idx, + int chunk_idx) { // x: [b*c, t, h, w] GGML_ASSERT(b == 1); auto conv1 = std::dynamic_pointer_cast(blocks["conv1"]); @@ -934,16 +934,16 @@ namespace WAN { int _conv_num = 33; int _conv_idx = 0; - std::vector _feat_map; + std::vector _feat_map; int _enc_conv_num = 28; int _enc_conv_idx = 0; - std::vector _enc_feat_map; + std::vector _enc_feat_map; void clear_cache() { _conv_idx = 0; - _feat_map = std::vector(_conv_num, nullptr); + _feat_map = std::vector(_conv_num, nullptr); _enc_conv_idx = 0; - 
_enc_feat_map = std::vector(_enc_conv_num, nullptr); + _enc_feat_map = std::vector(_enc_conv_num, nullptr); } public: @@ -966,10 +966,10 @@ namespace WAN { blocks["conv2"] = std::shared_ptr(new CausalConv3d(z_dim, z_dim, {1, 1, 1})); } - struct ggml_tensor* patchify(struct ggml_context* ctx, - struct ggml_tensor* x, - int64_t patch_size, - int64_t b = 1) { + ggml_tensor* patchify(ggml_context* ctx, + ggml_tensor* x, + int64_t patch_size, + int64_t b = 1) { // x: [b*c, f, h*q, w*r] // return: [b*c*r*q, f, h, w] if (patch_size == 1) { @@ -993,10 +993,10 @@ namespace WAN { return x; } - struct ggml_tensor* unpatchify(struct ggml_context* ctx, - struct ggml_tensor* x, - int64_t patch_size, - int64_t b = 1) { + ggml_tensor* unpatchify(ggml_context* ctx, + ggml_tensor* x, + int64_t patch_size, + int64_t b = 1) { // x: [b*c*r*q, f, h, w] // return: [b*c, f, h*q, w*r] if (patch_size == 1) { @@ -1019,9 +1019,9 @@ namespace WAN { return x; } - struct ggml_tensor* encode(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - int64_t b = 1) { + ggml_tensor* encode(GGMLRunnerContext* ctx, + ggml_tensor* x, + int64_t b = 1) { // x: [b*c, t, h, w] GGML_ASSERT(b == 1); GGML_ASSERT(decode_only == false); @@ -1037,7 +1037,7 @@ namespace WAN { int64_t t = x->ne[2]; int64_t iter_ = 1 + (t - 1) / 4; - struct ggml_tensor* out; + ggml_tensor* out; for (int i = 0; i < iter_; i++) { _enc_conv_idx = 0; if (i == 0) { @@ -1055,9 +1055,9 @@ namespace WAN { return mu; } - struct ggml_tensor* decode(GGMLRunnerContext* ctx, - struct ggml_tensor* z, - int64_t b = 1) { + ggml_tensor* decode(GGMLRunnerContext* ctx, + ggml_tensor* z, + int64_t b = 1) { // z: [b*c, t, h, w] GGML_ASSERT(b == 1); @@ -1068,7 +1068,7 @@ namespace WAN { int64_t iter_ = z->ne[2]; auto x = conv2->forward(ctx, z); - struct ggml_tensor* out; + ggml_tensor* out; for (int i = 0; i < iter_; i++) { _conv_idx = 0; if (i == 0) { @@ -1087,10 +1087,10 @@ namespace WAN { return out; } - struct ggml_tensor* decode_partial(GGMLRunnerContext* 
ctx, - struct ggml_tensor* z, - int i, - int64_t b = 1) { + ggml_tensor* decode_partial(GGMLRunnerContext* ctx, + ggml_tensor* z, + int i, + int64_t b = 1) { // z: [b*c, t, h, w] GGML_ASSERT(b == 1); @@ -1127,7 +1127,7 @@ namespace WAN { return "wan_vae"; } - void get_param_tensors(std::map& tensors, const std::string prefix) override { + void get_param_tensors(std::map& tensors, const std::string prefix) override { ae.get_param_tensors(tensors, prefix); } @@ -1226,22 +1226,22 @@ namespace WAN { return static_cast(ae.z_dim); } - struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { - struct ggml_cgraph* gf = new_graph_custom(10240 * z->ne[2]); + ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) { + ggml_cgraph* gf = new_graph_custom(10240 * z->ne[2]); z = to_backend(z); auto runner_ctx = get_context(); - struct ggml_tensor* out = decode_graph ? ae.decode(&runner_ctx, z) : ae.encode(&runner_ctx, z); + ggml_tensor* out = decode_graph ? ae.decode(&runner_ctx, z) : ae.encode(&runner_ctx, z); ggml_build_forward_expand(gf, out); return gf; } - struct ggml_cgraph* build_graph_partial(struct ggml_tensor* z, bool decode_graph, int i) { - struct ggml_cgraph* gf = new_graph_custom(20480); + ggml_cgraph* build_graph_partial(ggml_tensor* z, bool decode_graph, int i) { + ggml_cgraph* gf = new_graph_custom(20480); ae.clear_cache(); @@ -1254,7 +1254,7 @@ namespace WAN { auto runner_ctx = get_context(); - struct ggml_tensor* out = decode_graph ? ae.decode_partial(&runner_ctx, z, i) : ae.encode(&runner_ctx, z); + ggml_tensor* out = decode_graph ? 
ae.decode_partial(&runner_ctx, z, i) : ae.encode(&runner_ctx, z); for (size_t feat_idx = 0; feat_idx < ae._feat_map.size(); feat_idx++) { ggml_tensor* feat_cache = ae._feat_map[feat_idx]; @@ -1270,12 +1270,12 @@ namespace WAN { } bool _compute(const int n_threads, - struct ggml_tensor* z, + ggml_tensor* z, bool decode_graph, - struct ggml_tensor** output, - struct ggml_context* output_ctx = nullptr) override { + ggml_tensor** output, + ggml_context* output_ctx = nullptr) override { if (true) { - auto get_graph = [&]() -> struct ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(z, decode_graph); }; return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); @@ -1283,11 +1283,11 @@ namespace WAN { ae.clear_cache(); int64_t t = z->ne[2]; int i = 0; - auto get_graph = [&]() -> struct ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph_partial(z, decode_graph, i); }; - struct ggml_tensor* out = nullptr; - bool res = GGMLRunner::compute(get_graph, n_threads, true, &out, output_ctx); + ggml_tensor* out = nullptr; + bool res = GGMLRunner::compute(get_graph, n_threads, true, &out, output_ctx); ae.clear_cache(); if (t == 1) { *output = out; @@ -1325,12 +1325,12 @@ namespace WAN { } void test() { - struct ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1G params.mem_buffer = nullptr; params.no_alloc = false; - struct ggml_context* work_ctx = ggml_init(params); + ggml_context* work_ctx = ggml_init(params); GGML_ASSERT(work_ctx != nullptr); if (true) { @@ -1342,7 +1342,7 @@ namespace WAN { ggml_set_f32(z, 0.5f); z = load_tensor_from_file(work_ctx, "wan_vae_z.bin"); print_ggml_tensor(z); - struct ggml_tensor* out = nullptr; + ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); _compute(8, z, true, &out, work_ctx); @@ -1410,10 +1410,10 @@ namespace WAN { } } - virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - 
struct ggml_tensor* pe, - struct ggml_tensor* mask = nullptr) { + virtual ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* pe, + ggml_tensor* mask = nullptr) { // x: [N, n_token, dim] // pe: [n_token, d_head/2, 2, 2] // return [N, n_token, dim] @@ -1451,10 +1451,10 @@ namespace WAN { bool qk_norm = true, float eps = 1e-6) : WanSelfAttention(dim, num_heads, qk_norm, eps) {} - virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* context, - int64_t context_img_len) = 0; + virtual ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* context, + int64_t context_img_len) = 0; }; class WanT2VCrossAttention : public WanCrossAttention { @@ -1464,10 +1464,10 @@ namespace WAN { bool qk_norm = true, float eps = 1e-6) : WanCrossAttention(dim, num_heads, qk_norm, eps) {} - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* context, - int64_t context_img_len) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* context, + int64_t context_img_len) override { // x: [N, n_token, dim] // context: [N, n_context, dim] // context_img_len: unused @@ -1512,10 +1512,10 @@ namespace WAN { } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* context, - int64_t context_img_len) override { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* context, + int64_t context_img_len) override { // x: [N, n_token, dim] // context: [N, context_img_len + context_txt_len, dim] // return [N, n_token, dim] @@ -1560,7 +1560,7 @@ namespace WAN { } }; - static struct ggml_tensor* modulate_add(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* e) { + static ggml_tensor* modulate_add(ggml_context* ctx, ggml_tensor* x, ggml_tensor* e) { // x: [N, n_token, dim] // e: [N, 1, dim] or [N, T, 1, dim] if (ggml_n_dims(e) == 3) { @@ 
-1574,7 +1574,7 @@ namespace WAN { return x; } - static struct ggml_tensor* modulate_mul(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* e) { + static ggml_tensor* modulate_mul(ggml_context* ctx, ggml_tensor* x, ggml_tensor* e) { // x: [N, n_token, dim] // e: [N, 1, dim] or [N, T, 1, dim] if (ggml_n_dims(e) == 3) { @@ -1592,7 +1592,7 @@ namespace WAN { protected: int64_t dim; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32); params["modulation"] = ggml_new_tensor_3d(ctx, wtype, dim, 6, 1); } @@ -1626,12 +1626,12 @@ namespace WAN { blocks["ffn.2"] = std::shared_ptr(new Linear(ffn_dim, dim)); } - virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* e, - struct ggml_tensor* pe, - struct ggml_tensor* context, - int64_t context_img_len = 257) { + virtual ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* e, + ggml_tensor* pe, + ggml_tensor* context, + int64_t context_img_len = 257) { // x: [N, n_token, dim] // e: [N, 6, dim] or [N, T, 6, dim] // context: [N, context_img_len + context_txt_len, dim] @@ -1680,7 +1680,7 @@ namespace WAN { class VaceWanAttentionBlock : public WanAttentionBlock { protected: int block_id; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32); params["modulation"] = ggml_new_tensor_3d(ctx, wtype, dim, 6, 1); } @@ -1702,11 +1702,11 @@ 
namespace WAN { } std::pair forward(GGMLRunnerContext* ctx, - struct ggml_tensor* c, - struct ggml_tensor* x, - struct ggml_tensor* e, - struct ggml_tensor* pe, - struct ggml_tensor* context, + ggml_tensor* c, + ggml_tensor* x, + ggml_tensor* e, + ggml_tensor* pe, + ggml_tensor* context, int64_t context_img_len = 257) { // x: [N, n_token, dim] // e: [N, 6, dim] or [N, T, 6, dim] @@ -1732,7 +1732,7 @@ namespace WAN { protected: int64_t dim; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32); params["modulation"] = ggml_new_tensor_3d(ctx, wtype, dim, 2, 1); } @@ -1749,9 +1749,9 @@ namespace WAN { blocks["head"] = std::shared_ptr(new Linear(dim, out_dim)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* e) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* e) { // x: [N, n_token, dim] // e: [N, dim] or [N, T, dim] // return [N, n_token, out_dim] @@ -1779,7 +1779,7 @@ namespace WAN { int64_t in_dim; int64_t flf_pos_embed_token_number; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { if (flf_pos_embed_token_number > 0) { params["emb_pos"] = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, in_dim, flf_pos_embed_token_number, 1); } @@ -1797,8 +1797,8 @@ namespace WAN { blocks["proj.4"] = std::shared_ptr(new LayerNorm(out_dim)); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* image_embeds) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + 
ggml_tensor* image_embeds) { if (flf_pos_embed_token_number > 0) { auto emb_pos = params["emb_pos"]; @@ -1917,8 +1917,8 @@ namespace WAN { } } - struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, - struct ggml_tensor* x) { + ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, + ggml_tensor* x) { int64_t W = x->ne[0]; int64_t H = x->ne[1]; int64_t T = x->ne[2]; @@ -1930,11 +1930,11 @@ namespace WAN { return x; } - struct ggml_tensor* unpatchify(struct ggml_context* ctx, - struct ggml_tensor* x, - int64_t t_len, - int64_t h_len, - int64_t w_len) { + ggml_tensor* unpatchify(ggml_context* ctx, + ggml_tensor* x, + int64_t t_len, + int64_t h_len, + int64_t w_len) { // x: [N, t_len*h_len*w_len, pt*ph*pw*C] // return: [N*C, t_len*pt, h_len*ph, w_len*pw] int64_t N = x->ne[3]; @@ -1957,15 +1957,15 @@ namespace WAN { return x; } - struct ggml_tensor* forward_orig(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* timestep, - struct ggml_tensor* context, - struct ggml_tensor* pe, - struct ggml_tensor* clip_fea = nullptr, - struct ggml_tensor* vace_context = nullptr, - float vace_strength = 1.f, - int64_t N = 1) { + ggml_tensor* forward_orig(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* timestep, + ggml_tensor* context, + ggml_tensor* pe, + ggml_tensor* clip_fea = nullptr, + ggml_tensor* vace_context = nullptr, + float vace_strength = 1.f, + int64_t N = 1) { // x: [N*C, T, H, W], C => in_dim // vace_context: [N*vace_in_dim, T, H, W] // timestep: [N,] or [T] @@ -2051,16 +2051,16 @@ namespace WAN { return x; } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* timestep, - struct ggml_tensor* context, - struct ggml_tensor* pe, - struct ggml_tensor* clip_fea = nullptr, - struct ggml_tensor* time_dim_concat = nullptr, - struct ggml_tensor* vace_context = nullptr, - float vace_strength = 1.f, - int64_t N = 1) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* 
timestep, + ggml_tensor* context, + ggml_tensor* pe, + ggml_tensor* clip_fea = nullptr, + ggml_tensor* time_dim_concat = nullptr, + ggml_tensor* vace_context = nullptr, + float vace_strength = 1.f, + int64_t N = 1) { // Forward pass of DiT. // x: [N*C, T, H, W] // timestep: [N,] @@ -2225,19 +2225,19 @@ namespace WAN { return desc; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { wan.get_param_tensors(tensors, prefix); } - struct ggml_cgraph* build_graph(struct ggml_tensor* x, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, - struct ggml_tensor* clip_fea = nullptr, - struct ggml_tensor* c_concat = nullptr, - struct ggml_tensor* time_dim_concat = nullptr, - struct ggml_tensor* vace_context = nullptr, - float vace_strength = 1.f) { - struct ggml_cgraph* gf = new_graph_custom(WAN_GRAPH_SIZE); + ggml_cgraph* build_graph(ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* clip_fea = nullptr, + ggml_tensor* c_concat = nullptr, + ggml_tensor* time_dim_concat = nullptr, + ggml_tensor* vace_context = nullptr, + float vace_strength = 1.f) { + ggml_cgraph* gf = new_graph_custom(WAN_GRAPH_SIZE); x = to_backend(x); timesteps = to_backend(timesteps); @@ -2270,15 +2270,15 @@ namespace WAN { auto runner_ctx = get_context(); - struct ggml_tensor* out = wan.forward(&runner_ctx, - x, - timesteps, - context, - pe, - clip_fea, - time_dim_concat, - vace_context, - vace_strength); + ggml_tensor* out = wan.forward(&runner_ctx, + x, + timesteps, + context, + pe, + clip_fea, + time_dim_concat, + vace_context, + vace_strength); ggml_build_forward_expand(gf, out); @@ -2286,17 +2286,17 @@ namespace WAN { } bool compute(int n_threads, - struct ggml_tensor* x, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, - struct ggml_tensor* clip_fea = nullptr, - struct ggml_tensor* c_concat = nullptr, - struct ggml_tensor* time_dim_concat = nullptr, - 
struct ggml_tensor* vace_context = nullptr, - float vace_strength = 1.f, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr) { - auto get_graph = [&]() -> struct ggml_cgraph* { + ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + ggml_tensor* clip_fea = nullptr, + ggml_tensor* c_concat = nullptr, + ggml_tensor* time_dim_concat = nullptr, + ggml_tensor* vace_context = nullptr, + float vace_strength = 1.f, + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, clip_fea, c_concat, time_dim_concat, vace_context, vace_strength); }; @@ -2304,12 +2304,12 @@ namespace WAN { } void test() { - struct ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(200 * 1024 * 1024); // 200 MB params.mem_buffer = nullptr; params.no_alloc = false; - struct ggml_context* work_ctx = ggml_init(params); + ggml_context* work_ctx = ggml_init(params); GGML_ASSERT(work_ctx != nullptr); { @@ -2332,7 +2332,7 @@ namespace WAN { // auto clip_fea = load_tensor_from_file(work_ctx, "wan_dit_clip_fea.bin"); // print_ggml_tensor(clip_fea); - struct ggml_tensor* out = nullptr; + ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); compute(8, x, timesteps, context, nullptr, nullptr, nullptr, nullptr, 1.f, &out, work_ctx); diff --git a/src/z_image.hpp b/src/z_image.hpp index 8f405a59..9f17890b 100644 --- a/src/z_image.hpp +++ b/src/z_image.hpp @@ -42,10 +42,10 @@ namespace ZImage { } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* pe, - struct ggml_tensor* mask = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* pe, + ggml_tensor* mask = nullptr) { // x: [N, n_token, hidden_size] int64_t n_token = x->ne[1]; int64_t N = x->ne[2]; @@ -124,7 +124,7 @@ namespace ZImage { blocks["w3"] = std::make_shared(dim, hidden_dim, false); } - 
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { auto w1 = std::dynamic_pointer_cast(blocks["w1"]); auto w2 = std::dynamic_pointer_cast(blocks["w2"]); auto w3 = std::dynamic_pointer_cast(blocks["w3"]); @@ -138,9 +138,9 @@ namespace ZImage { } }; - __STATIC_INLINE__ struct ggml_tensor* modulate(struct ggml_context* ctx, - struct ggml_tensor* x, - struct ggml_tensor* scale) { + __STATIC_INLINE__ ggml_tensor* modulate(ggml_context* ctx, + ggml_tensor* x, + ggml_tensor* scale) { // x: [N, L, C] // scale: [N, C] scale = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]); // [N, 1, C] @@ -175,11 +175,11 @@ namespace ZImage { } } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* pe, - struct ggml_tensor* mask = nullptr, - struct ggml_tensor* adaln_input = nullptr) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* pe, + ggml_tensor* mask = nullptr, + ggml_tensor* adaln_input = nullptr) { auto attention = std::dynamic_pointer_cast(blocks["attention"]); auto feed_forward = std::dynamic_pointer_cast(blocks["feed_forward"]); auto attention_norm1 = std::dynamic_pointer_cast(blocks["attention_norm1"]); @@ -241,9 +241,9 @@ namespace ZImage { blocks["adaLN_modulation.1"] = std::make_shared(MIN(hidden_size, ADALN_EMBED_DIM), hidden_size); } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* c) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* c) { // x: [N, n_token, hidden_size] // c: [N, hidden_size] // return: [N, n_token, patch_size * patch_size * out_channels] @@ -284,7 +284,7 @@ namespace ZImage { protected: ZImageParams z_image_params; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + void init_params(ggml_context* ctx, 
const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { params["cap_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_image_params.hidden_size); params["x_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_image_params.hidden_size); } @@ -346,11 +346,11 @@ namespace ZImage { blocks["final_layer"] = std::make_shared(z_image_params.hidden_size, z_image_params.patch_size, z_image_params.out_channels); } - struct ggml_tensor* forward_core(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* timestep, - struct ggml_tensor* context, - struct ggml_tensor* pe) { + ggml_tensor* forward_core(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* timestep, + ggml_tensor* context, + ggml_tensor* pe) { auto x_embedder = std::dynamic_pointer_cast(blocks["x_embedder"]); auto t_embedder = std::dynamic_pointer_cast(blocks["t_embedder"]); auto cap_embedder_0 = std::dynamic_pointer_cast(blocks["cap_embedder.0"]); @@ -414,12 +414,12 @@ namespace ZImage { return img; } - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* timestep, - struct ggml_tensor* context, - struct ggml_tensor* pe, - std::vector ref_latents = {}) { + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* timestep, + ggml_tensor* context, + ggml_tensor* pe, + std::vector ref_latents = {}) { // Forward pass of DiT. 
// x: [N, C, H, W] // timestep: [N,] @@ -477,17 +477,17 @@ namespace ZImage { return "z_image"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) { z_image.get_param_tensors(tensors, prefix); } - struct ggml_cgraph* build_graph(struct ggml_tensor* x, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, - std::vector ref_latents = {}, - bool increase_ref_index = false) { + ggml_cgraph* build_graph(ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, + std::vector ref_latents = {}, + bool increase_ref_index = false) { GGML_ASSERT(x->ne[3] == 1); - struct ggml_cgraph* gf = new_graph_custom(Z_IMAGE_GRAPH_SIZE); + ggml_cgraph* gf = new_graph_custom(Z_IMAGE_GRAPH_SIZE); x = to_backend(x); context = to_backend(context); @@ -518,12 +518,12 @@ namespace ZImage { set_backend_tensor_data(pe, pe_vec.data()); auto runner_ctx = get_context(); - struct ggml_tensor* out = z_image.forward(&runner_ctx, - x, - timesteps, - context, - pe, - ref_latents); + ggml_tensor* out = z_image.forward(&runner_ctx, + x, + timesteps, + context, + pe, + ref_latents); ggml_build_forward_expand(gf, out); @@ -531,17 +531,17 @@ namespace ZImage { } bool compute(int n_threads, - struct ggml_tensor* x, - struct ggml_tensor* timesteps, - struct ggml_tensor* context, + ggml_tensor* x, + ggml_tensor* timesteps, + ggml_tensor* context, std::vector ref_latents = {}, bool increase_ref_index = false, - struct ggml_tensor** output = nullptr, - struct ggml_context* output_ctx = nullptr) { + ggml_tensor** output = nullptr, + ggml_context* output_ctx = nullptr) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size] - auto get_graph = [&]() -> struct ggml_cgraph* { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, ref_latents, increase_ref_index); }; @@ -549,12 +549,12 @@ namespace ZImage { } void test() { - struct 
ggml_init_params params; + ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1GB params.mem_buffer = nullptr; params.no_alloc = false; - struct ggml_context* work_ctx = ggml_init(params); + ggml_context* work_ctx = ggml_init(params); GGML_ASSERT(work_ctx != nullptr); { @@ -571,7 +571,7 @@ namespace ZImage { auto context = load_tensor_from_file(work_ctx, "./z_image_context.bin"); print_ggml_tensor(context); - struct ggml_tensor* out = nullptr; + ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); compute(8, x, timesteps, context, {}, false, &out, work_ctx); From 5265a5efa1cf0fc2b5ac0faa41c6898430387624 Mon Sep 17 00:00:00 2001 From: Tay Date: Mon, 16 Mar 2026 11:27:46 -0500 Subject: [PATCH 16/20] perf(z-image): switch to fused SwiGLU kernel (#1302) --- src/z_image.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/z_image.hpp b/src/z_image.hpp index 9f17890b..53a7cf82 100644 --- a/src/z_image.hpp +++ b/src/z_image.hpp @@ -131,7 +131,7 @@ namespace ZImage { auto x1 = w1->forward(ctx, x); auto x3 = w3->forward(ctx, x); - x = ggml_mul(ctx->ggml_ctx, ggml_silu(ctx->ggml_ctx, x1), x3); + x = ggml_swiglu_split(ctx->ggml_ctx, x1, x3); x = w2->forward(ctx, x); return x; From 545fac4f3fb0117a4e962b1a04cf933a7e635933 Mon Sep 17 00:00:00 2001 From: leejet Date: Tue, 17 Mar 2026 00:28:03 +0800 Subject: [PATCH 17/20] refactor: simplify sample cache flow (#1350) --- src/stable-diffusion.cpp | 766 +++++++++++++++++++++------------------ 1 file changed, 406 insertions(+), 360 deletions(-) diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index a5dbc772..bbf2f979 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -103,6 +103,379 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) { return std::max(0.0f, reuse_threshold); } +enum class SampleCacheMode { + NONE, + EASYCACHE, + UCACHE, + CACHEDIT, +}; + +struct SampleCacheRuntime { + SampleCacheMode mode = 
SampleCacheMode::NONE; + + EasyCacheState easycache; + UCacheState ucache; + CacheDitConditionState cachedit; + SpectrumState spectrum; + + bool spectrum_enabled = false; + + bool has_step_cache() const { + return mode != SampleCacheMode::NONE; + } + + bool easycache_enabled() const { + return mode == SampleCacheMode::EASYCACHE; + } + + bool ucache_enabled() const { + return mode == SampleCacheMode::UCACHE; + } + + bool cachedit_enabled() const { + return mode == SampleCacheMode::CACHEDIT; + } +}; + +static bool has_valid_cache_percent_range(const sd_cache_params_t& cache_params) { + if (cache_params.mode != SD_CACHE_EASYCACHE && cache_params.mode != SD_CACHE_UCACHE) { + return true; + } + + return cache_params.start_percent >= 0.0f && + cache_params.start_percent < 1.0f && + cache_params.end_percent > 0.0f && + cache_params.end_percent <= 1.0f && + cache_params.start_percent < cache_params.end_percent; +} + +static void init_easycache_runtime(SampleCacheRuntime& runtime, + SDVersion version, + const sd_cache_params_t& cache_params, + Denoiser* denoiser) { + if (!sd_version_is_dit(version)) { + LOG_WARN("EasyCache requested but not supported for this model type"); + return; + } + + EasyCacheConfig config; + config.enabled = true; + config.reuse_threshold = get_cache_reuse_threshold(cache_params); + config.start_percent = cache_params.start_percent; + config.end_percent = cache_params.end_percent; + + runtime.easycache.init(config, denoiser); + if (!runtime.easycache.enabled()) { + LOG_WARN("EasyCache requested but could not be initialized for this run"); + return; + } + + runtime.mode = SampleCacheMode::EASYCACHE; + LOG_INFO("EasyCache enabled - threshold: %.3f, start: %.2f, end: %.2f", + config.reuse_threshold, + config.start_percent, + config.end_percent); +} + +static void init_ucache_runtime(SampleCacheRuntime& runtime, + SDVersion version, + const sd_cache_params_t& cache_params, + Denoiser* denoiser, + const std::vector& sigmas) { + if 
(!sd_version_is_unet(version)) { + LOG_WARN("UCache requested but not supported for this model type (only UNET models)"); + return; + } + + UCacheConfig config; + config.enabled = true; + config.reuse_threshold = get_cache_reuse_threshold(cache_params); + config.start_percent = cache_params.start_percent; + config.end_percent = cache_params.end_percent; + config.error_decay_rate = std::max(0.0f, std::min(1.0f, cache_params.error_decay_rate)); + config.use_relative_threshold = cache_params.use_relative_threshold; + config.reset_error_on_compute = cache_params.reset_error_on_compute; + + runtime.ucache.init(config, denoiser); + if (!runtime.ucache.enabled()) { + LOG_WARN("UCache requested but could not be initialized for this run"); + return; + } + + runtime.ucache.set_sigmas(sigmas); + runtime.mode = SampleCacheMode::UCACHE; + LOG_INFO("UCache enabled - threshold: %.3f, start: %.2f, end: %.2f, decay: %.2f, relative: %s, reset: %s", + config.reuse_threshold, + config.start_percent, + config.end_percent, + config.error_decay_rate, + config.use_relative_threshold ? "true" : "false", + config.reset_error_on_compute ? 
"true" : "false"); +} + +static void init_cachedit_runtime(SampleCacheRuntime& runtime, + SDVersion version, + const sd_cache_params_t& cache_params, + const std::vector& sigmas) { + if (!sd_version_is_dit(version)) { + LOG_WARN("CacheDIT requested but not supported for this model type (only DiT models)"); + return; + } + + DBCacheConfig dbcfg; + dbcfg.enabled = (cache_params.mode == SD_CACHE_DBCACHE || + cache_params.mode == SD_CACHE_CACHE_DIT); + dbcfg.Fn_compute_blocks = cache_params.Fn_compute_blocks; + dbcfg.Bn_compute_blocks = cache_params.Bn_compute_blocks; + dbcfg.residual_diff_threshold = cache_params.residual_diff_threshold; + dbcfg.max_warmup_steps = cache_params.max_warmup_steps; + dbcfg.max_cached_steps = cache_params.max_cached_steps; + dbcfg.max_continuous_cached_steps = cache_params.max_continuous_cached_steps; + if (cache_params.scm_mask != nullptr && strlen(cache_params.scm_mask) > 0) { + dbcfg.steps_computation_mask = parse_scm_mask(cache_params.scm_mask); + } + dbcfg.scm_policy_dynamic = cache_params.scm_policy_dynamic; + + TaylorSeerConfig tcfg; + tcfg.enabled = (cache_params.mode == SD_CACHE_TAYLORSEER || + cache_params.mode == SD_CACHE_CACHE_DIT); + tcfg.n_derivatives = cache_params.taylorseer_n_derivatives; + tcfg.skip_interval_steps = cache_params.taylorseer_skip_interval; + + runtime.cachedit.init(dbcfg, tcfg); + if (!runtime.cachedit.enabled()) { + LOG_WARN("CacheDIT requested but could not be initialized for this run"); + return; + } + + runtime.cachedit.set_sigmas(sigmas); + runtime.mode = SampleCacheMode::CACHEDIT; + LOG_INFO("CacheDIT enabled - mode: %s, Fn: %d, Bn: %d, threshold: %.3f, warmup: %d", + cache_params.mode == SD_CACHE_CACHE_DIT ? "DBCache+TaylorSeer" : (cache_params.mode == SD_CACHE_DBCACHE ? 
"DBCache" : "TaylorSeer"), + dbcfg.Fn_compute_blocks, + dbcfg.Bn_compute_blocks, + dbcfg.residual_diff_threshold, + dbcfg.max_warmup_steps); +} + +static void init_spectrum_runtime(SampleCacheRuntime& runtime, + SDVersion version, + const sd_cache_params_t& cache_params, + const std::vector& sigmas) { + if (!sd_version_is_unet(version) && !sd_version_is_dit(version)) { + LOG_WARN("Spectrum requested but not supported for this model type (only UNET and DiT models)"); + return; + } + + SpectrumConfig config; + config.w = cache_params.spectrum_w; + config.m = cache_params.spectrum_m; + config.lam = cache_params.spectrum_lam; + config.window_size = cache_params.spectrum_window_size; + config.flex_window = cache_params.spectrum_flex_window; + config.warmup_steps = cache_params.spectrum_warmup_steps; + config.stop_percent = cache_params.spectrum_stop_percent; + + size_t total_steps = sigmas.size() > 0 ? sigmas.size() - 1 : 0; + runtime.spectrum.init(config, total_steps); + runtime.spectrum_enabled = true; + + LOG_INFO("Spectrum enabled - w: %.2f, m: %d, lam: %.2f, window: %d, flex: %.2f, warmup: %d, stop: %.0f%%", + config.w, config.m, config.lam, + config.window_size, config.flex_window, + config.warmup_steps, config.stop_percent * 100.0f); +} + +static SampleCacheRuntime init_sample_cache_runtime(SDVersion version, + const sd_cache_params_t* cache_params, + Denoiser* denoiser, + const std::vector& sigmas) { + SampleCacheRuntime runtime; + if (cache_params == nullptr || cache_params->mode == SD_CACHE_DISABLED) { + return runtime; + } + + if (!has_valid_cache_percent_range(*cache_params)) { + LOG_WARN("Cache disabled due to invalid percent range (start=%.3f, end=%.3f)", + cache_params->start_percent, + cache_params->end_percent); + return runtime; + } + + switch (cache_params->mode) { + case SD_CACHE_EASYCACHE: + init_easycache_runtime(runtime, version, *cache_params, denoiser); + break; + case SD_CACHE_UCACHE: + init_ucache_runtime(runtime, version, *cache_params, 
denoiser, sigmas); + break; + case SD_CACHE_DBCACHE: + case SD_CACHE_TAYLORSEER: + case SD_CACHE_CACHE_DIT: + init_cachedit_runtime(runtime, version, *cache_params, sigmas); + break; + case SD_CACHE_SPECTRUM: + init_spectrum_runtime(runtime, version, *cache_params, sigmas); + break; + default: + break; + } + + return runtime; +} + +struct SampleStepCacheDispatcher { + SampleCacheRuntime& runtime; + int step; + float sigma; + int step_index; + + SampleStepCacheDispatcher(SampleCacheRuntime& runtime, int step, float sigma) + : runtime(runtime), step(step), sigma(sigma), step_index(step > 0 ? (step - 1) : -1) { + if (step_index < 0) { + return; + } + + switch (runtime.mode) { + case SampleCacheMode::EASYCACHE: + runtime.easycache.begin_step(step_index, sigma); + break; + case SampleCacheMode::UCACHE: + runtime.ucache.begin_step(step_index, sigma); + break; + case SampleCacheMode::CACHEDIT: + runtime.cachedit.begin_step(step_index, sigma); + break; + case SampleCacheMode::NONE: + break; + } + } + + bool before_condition(const SDCondition* condition, ggml_tensor* input, ggml_tensor* output) { + if (step_index < 0 || condition == nullptr || input == nullptr || output == nullptr) { + return false; + } + + switch (runtime.mode) { + case SampleCacheMode::EASYCACHE: + return runtime.easycache.before_condition(condition, input, output, sigma, step_index); + case SampleCacheMode::UCACHE: + return runtime.ucache.before_condition(condition, input, output, sigma, step_index); + case SampleCacheMode::CACHEDIT: + return runtime.cachedit.before_condition(condition, input, output, sigma, step_index); + case SampleCacheMode::NONE: + return false; + } + + return false; + } + + void after_condition(const SDCondition* condition, ggml_tensor* input, ggml_tensor* output) { + if (step_index < 0 || condition == nullptr || input == nullptr || output == nullptr) { + return; + } + + switch (runtime.mode) { + case SampleCacheMode::EASYCACHE: + runtime.easycache.after_condition(condition, input, 
output); + break; + case SampleCacheMode::UCACHE: + runtime.ucache.after_condition(condition, input, output); + break; + case SampleCacheMode::CACHEDIT: + runtime.cachedit.after_condition(condition, input, output); + break; + case SampleCacheMode::NONE: + break; + } + } + + bool is_step_skipped() const { + switch (runtime.mode) { + case SampleCacheMode::EASYCACHE: + return runtime.easycache.is_step_skipped(); + case SampleCacheMode::UCACHE: + return runtime.ucache.is_step_skipped(); + case SampleCacheMode::CACHEDIT: + return runtime.cachedit.is_step_skipped(); + case SampleCacheMode::NONE: + return false; + } + + return false; + } +}; + +static void log_sample_cache_summary(const SampleCacheRuntime& runtime, size_t total_steps) { + if (runtime.easycache_enabled()) { + if (runtime.easycache.total_steps_skipped > 0 && total_steps > 0) { + if (runtime.easycache.total_steps_skipped < static_cast(total_steps)) { + double speedup = static_cast(total_steps) / + static_cast(total_steps - runtime.easycache.total_steps_skipped); + LOG_INFO("EasyCache skipped %d/%zu steps (%.2fx estimated speedup)", + runtime.easycache.total_steps_skipped, + total_steps, + speedup); + } else { + LOG_INFO("EasyCache skipped %d/%zu steps", + runtime.easycache.total_steps_skipped, + total_steps); + } + } else if (total_steps > 0) { + LOG_INFO("EasyCache completed without skipping steps"); + } + } + + if (runtime.ucache_enabled()) { + if (runtime.ucache.total_steps_skipped > 0 && total_steps > 0) { + if (runtime.ucache.total_steps_skipped < static_cast(total_steps)) { + double speedup = static_cast(total_steps) / + static_cast(total_steps - runtime.ucache.total_steps_skipped); + LOG_INFO("UCache skipped %d/%zu steps (%.2fx estimated speedup)", + runtime.ucache.total_steps_skipped, + total_steps, + speedup); + } else { + LOG_INFO("UCache skipped %d/%zu steps", + runtime.ucache.total_steps_skipped, + total_steps); + } + } else if (total_steps > 0) { + LOG_INFO("UCache completed without skipping 
steps"); + } + } + + if (runtime.cachedit_enabled()) { + if (runtime.cachedit.total_steps_skipped > 0 && total_steps > 0) { + if (runtime.cachedit.total_steps_skipped < static_cast(total_steps)) { + double speedup = static_cast(total_steps) / + static_cast(total_steps - runtime.cachedit.total_steps_skipped); + LOG_INFO("CacheDIT skipped %d/%zu steps (%.2fx estimated speedup), accum_diff: %.4f", + runtime.cachedit.total_steps_skipped, + total_steps, + speedup, + runtime.cachedit.accumulated_residual_diff); + } else { + LOG_INFO("CacheDIT skipped %d/%zu steps, accum_diff: %.4f", + runtime.cachedit.total_steps_skipped, + total_steps, + runtime.cachedit.accumulated_residual_diff); + } + } else if (total_steps > 0) { + LOG_INFO("CacheDIT completed without skipping steps"); + } + } + + if (runtime.spectrum_enabled && runtime.spectrum.total_steps_skipped > 0 && total_steps > 0) { + double speedup = static_cast(total_steps) / + static_cast(total_steps - runtime.spectrum.total_steps_skipped); + LOG_INFO("Spectrum skipped %d/%zu steps (%.2fx estimated speedup)", + runtime.spectrum.total_steps_skipped, + total_steps, + speedup); + } +} + /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { @@ -1662,148 +2035,7 @@ public: img_cfg_scale = cfg_scale; } - EasyCacheState easycache_state; - UCacheState ucache_state; - CacheDitConditionState cachedit_state; - SpectrumState spectrum_state; - bool easycache_enabled = false; - bool ucache_enabled = false; - bool cachedit_enabled = false; - bool spectrum_enabled = false; - - if (cache_params != nullptr && cache_params->mode != SD_CACHE_DISABLED) { - bool percent_valid = true; - if (cache_params->mode == SD_CACHE_EASYCACHE || cache_params->mode == SD_CACHE_UCACHE) { - percent_valid = cache_params->start_percent >= 0.0f && - cache_params->start_percent < 1.0f && - cache_params->end_percent > 0.0f && - cache_params->end_percent <= 1.0f && - 
cache_params->start_percent < cache_params->end_percent; - } - - if (!percent_valid) { - LOG_WARN("Cache disabled due to invalid percent range (start=%.3f, end=%.3f)", - cache_params->start_percent, - cache_params->end_percent); - } else if (cache_params->mode == SD_CACHE_EASYCACHE) { - bool easycache_supported = sd_version_is_dit(version); - if (!easycache_supported) { - LOG_WARN("EasyCache requested but not supported for this model type"); - } else { - EasyCacheConfig easycache_config; - easycache_config.enabled = true; - easycache_config.reuse_threshold = get_cache_reuse_threshold(*cache_params); - easycache_config.start_percent = cache_params->start_percent; - easycache_config.end_percent = cache_params->end_percent; - easycache_state.init(easycache_config, denoiser.get()); - if (easycache_state.enabled()) { - easycache_enabled = true; - LOG_INFO("EasyCache enabled - threshold: %.3f, start: %.2f, end: %.2f", - easycache_config.reuse_threshold, - easycache_config.start_percent, - easycache_config.end_percent); - } else { - LOG_WARN("EasyCache requested but could not be initialized for this run"); - } - } - } else if (cache_params->mode == SD_CACHE_UCACHE) { - bool ucache_supported = sd_version_is_unet(version); - if (!ucache_supported) { - LOG_WARN("UCache requested but not supported for this model type (only UNET models)"); - } else { - UCacheConfig ucache_config; - ucache_config.enabled = true; - ucache_config.reuse_threshold = get_cache_reuse_threshold(*cache_params); - ucache_config.start_percent = cache_params->start_percent; - ucache_config.end_percent = cache_params->end_percent; - ucache_config.error_decay_rate = std::max(0.0f, std::min(1.0f, cache_params->error_decay_rate)); - ucache_config.use_relative_threshold = cache_params->use_relative_threshold; - ucache_config.reset_error_on_compute = cache_params->reset_error_on_compute; - ucache_state.init(ucache_config, denoiser.get()); - if (ucache_state.enabled()) { - ucache_enabled = true; - 
LOG_INFO("UCache enabled - threshold: %.3f, start: %.2f, end: %.2f, decay: %.2f, relative: %s, reset: %s", - ucache_config.reuse_threshold, - ucache_config.start_percent, - ucache_config.end_percent, - ucache_config.error_decay_rate, - ucache_config.use_relative_threshold ? "true" : "false", - ucache_config.reset_error_on_compute ? "true" : "false"); - } else { - LOG_WARN("UCache requested but could not be initialized for this run"); - } - } - } else if (cache_params->mode == SD_CACHE_DBCACHE || - cache_params->mode == SD_CACHE_TAYLORSEER || - cache_params->mode == SD_CACHE_CACHE_DIT) { - bool cachedit_supported = sd_version_is_dit(version); - if (!cachedit_supported) { - LOG_WARN("CacheDIT requested but not supported for this model type (only DiT models)"); - } else { - DBCacheConfig dbcfg; - dbcfg.enabled = (cache_params->mode == SD_CACHE_DBCACHE || - cache_params->mode == SD_CACHE_CACHE_DIT); - dbcfg.Fn_compute_blocks = cache_params->Fn_compute_blocks; - dbcfg.Bn_compute_blocks = cache_params->Bn_compute_blocks; - dbcfg.residual_diff_threshold = cache_params->residual_diff_threshold; - dbcfg.max_warmup_steps = cache_params->max_warmup_steps; - dbcfg.max_cached_steps = cache_params->max_cached_steps; - dbcfg.max_continuous_cached_steps = cache_params->max_continuous_cached_steps; - if (cache_params->scm_mask != nullptr && strlen(cache_params->scm_mask) > 0) { - dbcfg.steps_computation_mask = parse_scm_mask(cache_params->scm_mask); - } - dbcfg.scm_policy_dynamic = cache_params->scm_policy_dynamic; - - TaylorSeerConfig tcfg; - tcfg.enabled = (cache_params->mode == SD_CACHE_TAYLORSEER || - cache_params->mode == SD_CACHE_CACHE_DIT); - tcfg.n_derivatives = cache_params->taylorseer_n_derivatives; - tcfg.skip_interval_steps = cache_params->taylorseer_skip_interval; - - cachedit_state.init(dbcfg, tcfg); - if (cachedit_state.enabled()) { - cachedit_enabled = true; - LOG_INFO("CacheDIT enabled - mode: %s, Fn: %d, Bn: %d, threshold: %.3f, warmup: %d", - cache_params->mode 
== SD_CACHE_CACHE_DIT ? "DBCache+TaylorSeer" : (cache_params->mode == SD_CACHE_DBCACHE ? "DBCache" : "TaylorSeer"), - dbcfg.Fn_compute_blocks, - dbcfg.Bn_compute_blocks, - dbcfg.residual_diff_threshold, - dbcfg.max_warmup_steps); - } else { - LOG_WARN("CacheDIT requested but could not be initialized for this run"); - } - } - } else if (cache_params->mode == SD_CACHE_SPECTRUM) { - bool spectrum_supported = sd_version_is_unet(version) || sd_version_is_dit(version); - if (!spectrum_supported) { - LOG_WARN("Spectrum requested but not supported for this model type (only UNET and DiT models)"); - } else { - SpectrumConfig spectrum_config; - spectrum_config.w = cache_params->spectrum_w; - spectrum_config.m = cache_params->spectrum_m; - spectrum_config.lam = cache_params->spectrum_lam; - spectrum_config.window_size = cache_params->spectrum_window_size; - spectrum_config.flex_window = cache_params->spectrum_flex_window; - spectrum_config.warmup_steps = cache_params->spectrum_warmup_steps; - spectrum_config.stop_percent = cache_params->spectrum_stop_percent; - size_t total_steps = sigmas.size() > 0 ? 
sigmas.size() - 1 : 0; - spectrum_state.init(spectrum_config, total_steps); - spectrum_enabled = true; - LOG_INFO("Spectrum enabled - w: %.2f, m: %d, lam: %.2f, window: %d, flex: %.2f, warmup: %d, stop: %.0f%%", - spectrum_config.w, spectrum_config.m, spectrum_config.lam, - spectrum_config.window_size, spectrum_config.flex_window, - spectrum_config.warmup_steps, spectrum_config.stop_percent * 100.0f); - } - } - } - - if (ucache_enabled) { - ucache_state.set_sigmas(sigmas); - } - - if (cachedit_enabled) { - cachedit_state.set_sigmas(sigmas); - } + SampleCacheRuntime cache_runtime = init_sample_cache_runtime(version, cache_params, denoiser.get(), sigmas); size_t steps = sigmas.size() - 1; ggml_tensor* x = ggml_ext_dup_and_cpy_tensor(work_ctx, init_latent); @@ -1876,121 +2108,7 @@ public: } DiffusionParams diffusion_params; - - const bool easycache_step_active = easycache_enabled && step > 0; - int easycache_step_index = easycache_step_active ? (step - 1) : -1; - if (easycache_step_active) { - easycache_state.begin_step(easycache_step_index, sigma); - } - - auto easycache_before_condition = [&](const SDCondition* condition, ggml_tensor* output_tensor) -> bool { - if (!easycache_step_active || condition == nullptr || output_tensor == nullptr) { - return false; - } - return easycache_state.before_condition(condition, - diffusion_params.x, - output_tensor, - sigma, - easycache_step_index); - }; - - auto easycache_after_condition = [&](const SDCondition* condition, ggml_tensor* output_tensor) { - if (!easycache_step_active || condition == nullptr || output_tensor == nullptr) { - return; - } - easycache_state.after_condition(condition, - diffusion_params.x, - output_tensor); - }; - - auto easycache_step_is_skipped = [&]() { - return easycache_step_active && easycache_state.is_step_skipped(); - }; - - const bool ucache_step_active = ucache_enabled && step > 0; - int ucache_step_index = ucache_step_active ? 
(step - 1) : -1; - if (ucache_step_active) { - ucache_state.begin_step(ucache_step_index, sigma); - } - - auto ucache_before_condition = [&](const SDCondition* condition, ggml_tensor* output_tensor) -> bool { - if (!ucache_step_active || condition == nullptr || output_tensor == nullptr) { - return false; - } - return ucache_state.before_condition(condition, - diffusion_params.x, - output_tensor, - sigma, - ucache_step_index); - }; - - auto ucache_after_condition = [&](const SDCondition* condition, ggml_tensor* output_tensor) { - if (!ucache_step_active || condition == nullptr || output_tensor == nullptr) { - return; - } - ucache_state.after_condition(condition, - diffusion_params.x, - output_tensor); - }; - - auto ucache_step_is_skipped = [&]() { - return ucache_step_active && ucache_state.is_step_skipped(); - }; - - const bool cachedit_step_active = cachedit_enabled && step > 0; - int cachedit_step_index = cachedit_step_active ? (step - 1) : -1; - if (cachedit_step_active) { - cachedit_state.begin_step(cachedit_step_index, sigma); - } - - auto cachedit_before_condition = [&](const SDCondition* condition, ggml_tensor* output_tensor) -> bool { - if (!cachedit_step_active || condition == nullptr || output_tensor == nullptr) { - return false; - } - return cachedit_state.before_condition(condition, - diffusion_params.x, - output_tensor, - sigma, - cachedit_step_index); - }; - - auto cachedit_after_condition = [&](const SDCondition* condition, ggml_tensor* output_tensor) { - if (!cachedit_step_active || condition == nullptr || output_tensor == nullptr) { - return; - } - cachedit_state.after_condition(condition, - diffusion_params.x, - output_tensor); - }; - - auto cachedit_step_is_skipped = [&]() { - return cachedit_step_active && cachedit_state.is_step_skipped(); - }; - - auto cache_before_condition = [&](const SDCondition* condition, ggml_tensor* output_tensor) -> bool { - if (easycache_step_active) { - return easycache_before_condition(condition, output_tensor); - } 
else if (ucache_step_active) { - return ucache_before_condition(condition, output_tensor); - } else if (cachedit_step_active) { - return cachedit_before_condition(condition, output_tensor); - } - return false; - }; - - auto cache_after_condition = [&](const SDCondition* condition, ggml_tensor* output_tensor) { - if (easycache_step_active) { - easycache_after_condition(condition, output_tensor); - } else if (ucache_step_active) { - ucache_after_condition(condition, output_tensor); - } else if (cachedit_step_active) { - cachedit_after_condition(condition, output_tensor); - } - }; - - auto cache_step_is_skipped = [&]() { - return easycache_step_is_skipped() || ucache_step_is_skipped() || cachedit_step_is_skipped(); - }; + SampleStepCacheDispatcher step_cache(cache_runtime, step, sigma); std::vector scaling = denoiser->get_scalings(sigma); GGML_ASSERT(scaling.size() == 3); @@ -2017,8 +2135,8 @@ public: timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask); - if (spectrum_enabled && spectrum_state.should_predict()) { - spectrum_state.predict(denoised); + if (cache_runtime.spectrum_enabled && cache_runtime.spectrum.should_predict()) { + cache_runtime.spectrum.predict(denoised); if (denoise_mask != nullptr) { apply_mask(denoised, init_latent, denoise_mask); @@ -2077,6 +2195,22 @@ public: diffusion_params.vace_context = vace_context; diffusion_params.vace_strength = vace_strength; + auto run_diffusion_condition = [&](const SDCondition* condition, ggml_tensor** output_tensor) -> bool { + if (step_cache.before_condition(condition, diffusion_params.x, *output_tensor)) { + return true; + } + + if (!work_diffusion_model->compute(n_threads, + diffusion_params, + output_tensor)) { + LOG_ERROR("diffusion model compute failed"); + return false; + } + + step_cache.after_condition(condition, diffusion_params.x, *output_tensor); + return true; + }; + const SDCondition* active_condition = nullptr; ggml_tensor** active_output = &out_cond; if (start_merge_step == -1 
|| step <= start_merge_step) { @@ -2092,18 +2226,11 @@ public: active_condition = &id_cond; } - bool skip_model = cache_before_condition(active_condition, *active_output); - if (!skip_model) { - if (!work_diffusion_model->compute(n_threads, - diffusion_params, - active_output)) { - LOG_ERROR("diffusion model compute failed"); - return nullptr; - } - cache_after_condition(active_condition, *active_output); + if (!run_diffusion_condition(active_condition, active_output)) { + return nullptr; } - bool current_step_skipped = cache_step_is_skipped(); + bool current_step_skipped = step_cache.is_step_skipped(); float* negative_data = nullptr; if (has_unconditioned) { @@ -2115,20 +2242,13 @@ public: LOG_ERROR("controlnet compute failed"); } } - current_step_skipped = cache_step_is_skipped(); + current_step_skipped = step_cache.is_step_skipped(); diffusion_params.controls = controls; diffusion_params.context = uncond.c_crossattn; diffusion_params.c_concat = uncond.c_concat; diffusion_params.y = uncond.c_vector; - bool skip_uncond = cache_before_condition(&uncond, out_uncond); - if (!skip_uncond) { - if (!work_diffusion_model->compute(n_threads, - diffusion_params, - &out_uncond)) { - LOG_ERROR("diffusion model compute failed"); - return nullptr; - } - cache_after_condition(&uncond, out_uncond); + if (!run_diffusion_condition(&uncond, &out_uncond)) { + return nullptr; } negative_data = (float*)out_uncond->data; } @@ -2138,15 +2258,8 @@ public: diffusion_params.context = img_cond.c_crossattn; diffusion_params.c_concat = img_cond.c_concat; diffusion_params.y = img_cond.c_vector; - bool skip_img_cond = cache_before_condition(&img_cond, out_img_cond); - if (!skip_img_cond) { - if (!work_diffusion_model->compute(n_threads, - diffusion_params, - &out_img_cond)) { - LOG_ERROR("diffusion model compute failed"); - return nullptr; - } - cache_after_condition(&img_cond, out_img_cond); + if (!run_diffusion_condition(&img_cond, &out_img_cond)) { + return nullptr; } img_cond_data = 
(float*)out_img_cond->data; } @@ -2156,7 +2269,7 @@ public: float* skip_layer_data = has_skiplayer ? (float*)out_skip->data : nullptr; if (is_skiplayer_step) { LOG_DEBUG("Skipping layers at step %d\n", step); - if (!cache_step_is_skipped()) { + if (!step_cache.is_step_skipped()) { // skip layer (same as conditioned) diffusion_params.context = cond.c_crossattn; diffusion_params.c_concat = cond.c_concat; @@ -2211,8 +2324,8 @@ public: vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; } - if (spectrum_enabled) { - spectrum_state.update(denoised); + if (cache_runtime.spectrum_enabled) { + cache_runtime.spectrum.update(denoised); } if (denoise_mask != nullptr) { @@ -2244,75 +2357,8 @@ public: return NULL; } - if (easycache_enabled) { - size_t total_steps = sigmas.size() > 0 ? sigmas.size() - 1 : 0; - if (easycache_state.total_steps_skipped > 0 && total_steps > 0) { - if (easycache_state.total_steps_skipped < static_cast(total_steps)) { - double speedup = static_cast(total_steps) / - static_cast(total_steps - easycache_state.total_steps_skipped); - LOG_INFO("EasyCache skipped %d/%zu steps (%.2fx estimated speedup)", - easycache_state.total_steps_skipped, - total_steps, - speedup); - } else { - LOG_INFO("EasyCache skipped %d/%zu steps", - easycache_state.total_steps_skipped, - total_steps); - } - } else if (total_steps > 0) { - LOG_INFO("EasyCache completed without skipping steps"); - } - } - - if (ucache_enabled) { - size_t total_steps = sigmas.size() > 0 ? 
sigmas.size() - 1 : 0; - if (ucache_state.total_steps_skipped > 0 && total_steps > 0) { - if (ucache_state.total_steps_skipped < static_cast(total_steps)) { - double speedup = static_cast(total_steps) / - static_cast(total_steps - ucache_state.total_steps_skipped); - LOG_INFO("UCache skipped %d/%zu steps (%.2fx estimated speedup)", - ucache_state.total_steps_skipped, - total_steps, - speedup); - } else { - LOG_INFO("UCache skipped %d/%zu steps", - ucache_state.total_steps_skipped, - total_steps); - } - } else if (total_steps > 0) { - LOG_INFO("UCache completed without skipping steps"); - } - } - - if (cachedit_enabled) { - size_t total_steps = sigmas.size() > 0 ? sigmas.size() - 1 : 0; - if (cachedit_state.total_steps_skipped > 0 && total_steps > 0) { - if (cachedit_state.total_steps_skipped < static_cast(total_steps)) { - double speedup = static_cast(total_steps) / - static_cast(total_steps - cachedit_state.total_steps_skipped); - LOG_INFO("CacheDIT skipped %d/%zu steps (%.2fx estimated speedup), accum_diff: %.4f", - cachedit_state.total_steps_skipped, - total_steps, - speedup, - cachedit_state.accumulated_residual_diff); - } else { - LOG_INFO("CacheDIT skipped %d/%zu steps, accum_diff: %.4f", - cachedit_state.total_steps_skipped, - total_steps, - cachedit_state.accumulated_residual_diff); - } - } else if (total_steps > 0) { - LOG_INFO("CacheDIT completed without skipping steps"); - } - } - - if (spectrum_enabled && spectrum_state.total_steps_skipped > 0) { - size_t total_steps = sigmas.size() > 0 ? sigmas.size() - 1 : 0; - double speedup = static_cast(total_steps) / - static_cast(total_steps - spectrum_state.total_steps_skipped); - LOG_INFO("Spectrum skipped %d/%zu steps (%.2fx estimated speedup)", - spectrum_state.total_steps_skipped, total_steps, speedup); - } + size_t total_steps = sigmas.size() > 0 ? 
sigmas.size() - 1 : 0; + log_sample_cache_summary(cache_runtime, total_steps); if (inverse_noise_scaling) { x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); From 6293ab5aaf22d2e56978cea11dd9181cde706f51 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sun, 29 Mar 2026 13:12:57 -0300 Subject: [PATCH 18/20] docs: update Spectrum info about DiT models (#1360) --- docs/caching.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/caching.md b/docs/caching.md index cb103aee..b02a541b 100644 --- a/docs/caching.md +++ b/docs/caching.md @@ -11,7 +11,7 @@ Caching methods accelerate diffusion inference by reusing intermediate computati | `dbcache` | DiT models | Block-level L1 residual threshold | | `taylorseer` | DiT models | Taylor series approximation | | `cache-dit` | DiT models | Combined DBCache + TaylorSeer | -| `spectrum` | UNET models | Chebyshev + Taylor output forecasting | +| `spectrum` | UNET and DiT models | Chebyshev + Taylor output forecasting | ### UCache (UNET Models) @@ -111,9 +111,9 @@ Mask values: `1` = compute, `0` = can cache. --scm-policy dynamic ``` -### Spectrum (UNET Models) +### Spectrum (UNET and DiT Models) -Spectrum uses Chebyshev polynomial fitting blended with Taylor extrapolation to predict denoised outputs, skipping entire UNet forward passes. Based on the paper [Spectrum: Adaptive Spectral Feature Forecasting for Efficient Diffusion Sampling](https://github.com/tingyu215/Spectrum). +Spectrum uses Chebyshev polynomial fitting blended with Taylor extrapolation to predict denoised outputs, skipping entire forward passes. Based on the paper [Spectrum: Adaptive Spectral Feature Forecasting for Efficient Diffusion Sampling](https://github.com/tingyu215/Spectrum). 
```bash sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum From ed88e215a213a8daec128d35f6fb85749f203703 Mon Sep 17 00:00:00 2001 From: stduhpf Date: Sun, 29 Mar 2026 18:14:33 +0200 Subject: [PATCH 19/20] refactor: simplify f8_e5m2_to_f16 function a little bit (#1358) --- src/model.cpp | 38 +------------------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/src/model.cpp b/src/model.cpp index d23b97fa..2c708ed6 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -162,43 +162,7 @@ uint16_t f8_e4m3_to_f16(uint8_t f8) { } uint16_t f8_e5m2_to_f16(uint8_t fp8) { - uint8_t sign = (fp8 >> 7) & 0x1; - uint8_t exponent = (fp8 >> 2) & 0x1F; - uint8_t mantissa = fp8 & 0x3; - - uint16_t fp16_sign = sign << 15; - uint16_t fp16_exponent; - uint16_t fp16_mantissa; - - if (exponent == 0 && mantissa == 0) { // zero - return fp16_sign; - } - - if (exponent == 0x1F) { // NAN and INF - fp16_exponent = 0x1F; - fp16_mantissa = mantissa ? (mantissa << 8) : 0; - return fp16_sign | (fp16_exponent << 10) | fp16_mantissa; - } - - if (exponent == 0) { // subnormal numbers - fp16_mantissa = (mantissa << 8); - return fp16_sign | fp16_mantissa; - } - - // normal numbers - int16_t true_exponent = (int16_t)exponent - 15 + 15; - if (true_exponent <= 0) { - fp16_exponent = 0; - fp16_mantissa = (mantissa << 8); - } else if (true_exponent >= 0x1F) { - fp16_exponent = 0x1F; - fp16_mantissa = 0; - } else { - fp16_exponent = (uint16_t)true_exponent; - fp16_mantissa = mantissa << 8; - } - - return fp16_sign | (fp16_exponent << 10) | fp16_mantissa; + return static_cast(fp8) << 8; } void f8_e4m3_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) { From f16a110f8776398ef23a2a6b7b57522c2471637a Mon Sep 17 00:00:00 2001 From: leejet Date: Mon, 30 Mar 2026 00:19:25 +0800 Subject: [PATCH 20/20] refactor: migrate generation pipeline to sd::Tensor (#1373) --- examples/cli/main.cpp | 2 +- src/anima.hpp | 37 +- src/auto_encoder_kl.hpp | 261 +-- src/cache_dit.hpp | 52 +- 
src/clip.hpp | 29 +- src/common_dit.hpp | 58 +- src/condition_cache_utils.hpp | 64 + src/conditioner.hpp | 951 ++++------ src/control.hpp | 98 +- src/denoiser.hpp | 1329 ++++--------- src/diffusion_model.hpp | 204 +- src/easycache.hpp | 72 +- src/esrgan.hpp | 19 +- src/flux.hpp | 119 +- src/ggml_extend.hpp | 469 +++-- src/latent-preview.h | 66 + src/llm.hpp | 156 +- src/lora.hpp | 2 +- src/mmdit.hpp | 68 +- src/pmid.hpp | 43 +- src/preprocessing.hpp | 362 ++-- src/qwen_image.hpp | 78 +- src/sample-cache.cpp | 361 ++++ src/sample-cache.h | 61 + src/spectrum.hpp | 20 +- src/stable-diffusion.cpp | 3297 ++++++++++++++------------------- src/t5.hpp | 2074 +++++++++++---------- src/tae.hpp | 56 +- src/tensor.hpp | 1249 +++++++++++++ src/tensor_ggml.hpp | 127 ++ src/tokenize_util.cpp | 1986 ++++++++++---------- src/ucache.hpp | 67 +- src/unet.hpp | 97 +- src/upscaler.cpp | 73 +- src/util.cpp | 194 +- src/util.h | 19 +- src/vae.hpp | 260 +-- src/wan.hpp | 308 ++- src/z_image.hpp | 78 +- 39 files changed, 7768 insertions(+), 7098 deletions(-) create mode 100644 src/condition_cache_utils.hpp create mode 100644 src/sample-cache.cpp create mode 100644 src/sample-cache.h create mode 100644 src/tensor.hpp create mode 100644 src/tensor_ggml.hpp diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index f9e4928e..ddb88c97 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -601,7 +601,7 @@ int main(int argc, const char* argv[]) { if (gen_params.end_image_path.size() > 0) { vae_decode_only = false; - if (!load_image_and_update_size(gen_params.init_image_path, end_image)) { + if (!load_image_and_update_size(gen_params.end_image_path, end_image)) { return 1; } } diff --git a/src/anima.hpp b/src/anima.hpp index 81dbefe7..5850cc3e 100644 --- a/src/anima.hpp +++ b/src/anima.hpp @@ -602,20 +602,19 @@ namespace Anima { return Rope::embed_nd(ids, bs, axis_thetas, axes_dim); } - ggml_cgraph* build_graph(ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - 
ggml_tensor* t5_ids = nullptr, - ggml_tensor* t5_weights = nullptr) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor = {}, + const sd::Tensor& t5_ids_tensor = {}, + const sd::Tensor& t5_weights_tensor = {}) { + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); + ggml_tensor* context = make_optional_input(context_tensor); + ggml_tensor* t5_ids = make_optional_input(t5_ids_tensor); + ggml_tensor* t5_weights = make_optional_input(t5_weights_tensor); GGML_ASSERT(x->ne[3] == 1); ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE); - x = to_backend(x); - timesteps = to_backend(timesteps); - context = to_backend(context); - t5_ids = to_backend(t5_ids); - t5_weights = to_backend(t5_weights); - int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size; int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size; int64_t h_pad = x->ne[1] + pad_h; @@ -667,18 +666,16 @@ namespace Anima { return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* t5_ids = nullptr, - ggml_tensor* t5_weights = nullptr, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) { + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context = {}, + const sd::Tensor& t5_ids = {}, + const sd::Tensor& t5_weights = {}) { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, t5_ids, t5_weights); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); } }; } // namespace Anima diff --git a/src/auto_encoder_kl.hpp b/src/auto_encoder_kl.hpp index 6efdb41a..039fb9df 100644 --- a/src/auto_encoder_kl.hpp +++ b/src/auto_encoder_kl.hpp @@ -1,4 +1,4 @@ -#ifndef 
__AUTO_ENCODER_KL_HPP__ +#ifndef __AUTO_ENCODER_KL_HPP__ #define __AUTO_ENCODER_KL_HPP__ #include "vae.hpp" @@ -685,10 +685,9 @@ struct AutoEncoderKL : public VAE { ae.get_param_tensors(tensors, prefix); } - ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) { + ggml_cgraph* build_graph(const sd::Tensor& z_tensor, bool decode_graph) { ggml_cgraph* gf = ggml_new_graph(compute_ctx); - - z = to_backend(z); + ggml_tensor* z = make_input(z_tensor); auto runner_ctx = get_context(); @@ -699,184 +698,100 @@ struct AutoEncoderKL : public VAE { return gf; } - bool _compute(const int n_threads, - ggml_tensor* z, - bool decode_graph, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) override { + sd::Tensor _compute(const int n_threads, + const sd::Tensor& z, + bool decode_graph) override { GGML_ASSERT(!decode_only || decode_graph); auto get_graph = [&]() -> ggml_cgraph* { return build_graph(z, decode_graph); }; - // ggml_set_f32(z, 0.5f); - // print_ggml_tensor(z); - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), z.dim()); } - ggml_tensor* gaussian_latent_sample(ggml_context* work_ctx, ggml_tensor* moments, std::shared_ptr rng) { + sd::Tensor gaussian_latent_sample(const sd::Tensor& moments, std::shared_ptr rng) { // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample - ggml_tensor* latents = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); - ggml_tensor* noise = ggml_dup_tensor(work_ctx, latents); - ggml_ext_im_set_randn_f32(noise, rng); - { - float mean = 0; - float logvar = 0; - float value = 0; - float std_ = 0; - for (int i = 0; i < latents->ne[3]; i++) { - for (int j = 0; j < latents->ne[2]; j++) { - for (int k = 0; k < latents->ne[1]; k++) { - for (int l = 0; l < latents->ne[0]; l++) { - mean = ggml_ext_tensor_get_f32(moments, l, k, j, i); - 
logvar = ggml_ext_tensor_get_f32(moments, l, k, j + (int)latents->ne[2], i); - logvar = std::max(-30.0f, std::min(logvar, 20.0f)); - std_ = std::exp(0.5f * logvar); - value = mean + std_ * ggml_ext_tensor_get_f32(noise, l, k, j, i); - // printf("%d %d %d %d -> %f\n", i, j, k, l, value); - ggml_ext_tensor_set_f32(latents, value, l, k, j, i); - } - } - } - } - } + auto chunks = sd::ops::chunk(moments, 2, 2); + const auto& mean = chunks[0]; + const auto& logvar = chunks[1]; + sd::Tensor stddev = sd::ops::exp(0.5f * sd::ops::clamp(logvar, -30.0f, 20.0f)); + sd::Tensor noise = sd::Tensor::randn_like(mean, rng); + sd::Tensor latents = mean + stddev * noise; return latents; } - ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) { + sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override { if (sd_version_is_flux2(version)) { return vae_output; } else if (version == VERSION_SD1_PIX2PIX) { - return ggml_view_3d(work_ctx, - vae_output, - vae_output->ne[0], - vae_output->ne[1], - vae_output->ne[2] / 2, - vae_output->nb[1], - vae_output->nb[2], - 0); + return sd::ops::chunk(vae_output, 2, 2)[0]; } else { - return gaussian_latent_sample(work_ctx, vae_output, rng); + return gaussian_latent_sample(vae_output, rng); } } - void get_latents_mean_std_vec(ggml_tensor* latents, int channel_dim, std::vector& latents_mean_vec, std::vector& latents_std_vec) { - // flux2 + std::pair, sd::Tensor> get_latents_mean_std(const sd::Tensor& latents, int channel_dim) { + GGML_ASSERT(channel_dim >= 0 && static_cast(channel_dim) < static_cast(latents.dim())); if (sd_version_is_flux2(version)) { - GGML_ASSERT(latents->ne[channel_dim] == 128); - latents_mean_vec = {-0.0676f, -0.0715f, -0.0753f, -0.0745f, 0.0223f, 0.0180f, 0.0142f, 0.0184f, - -0.0001f, -0.0063f, -0.0002f, -0.0031f, -0.0272f, -0.0281f, -0.0276f, -0.0290f, - -0.0769f, -0.0672f, -0.0902f, -0.0892f, 0.0168f, 0.0152f, 0.0079f, 0.0086f, - 0.0083f, 
0.0015f, 0.0003f, -0.0043f, -0.0439f, -0.0419f, -0.0438f, -0.0431f, - -0.0102f, -0.0132f, -0.0066f, -0.0048f, -0.0311f, -0.0306f, -0.0279f, -0.0180f, - 0.0030f, 0.0015f, 0.0126f, 0.0145f, 0.0347f, 0.0338f, 0.0337f, 0.0283f, - 0.0020f, 0.0047f, 0.0047f, 0.0050f, 0.0123f, 0.0081f, 0.0081f, 0.0146f, - 0.0681f, 0.0679f, 0.0767f, 0.0732f, -0.0462f, -0.0474f, -0.0392f, -0.0511f, - -0.0528f, -0.0477f, -0.0470f, -0.0517f, -0.0317f, -0.0316f, -0.0345f, -0.0283f, - 0.0510f, 0.0445f, 0.0578f, 0.0458f, -0.0412f, -0.0458f, -0.0487f, -0.0467f, - -0.0088f, -0.0106f, -0.0088f, -0.0046f, -0.0376f, -0.0432f, -0.0436f, -0.0499f, - 0.0118f, 0.0166f, 0.0203f, 0.0279f, 0.0113f, 0.0129f, 0.0016f, 0.0072f, - -0.0118f, -0.0018f, -0.0141f, -0.0054f, -0.0091f, -0.0138f, -0.0145f, -0.0187f, - 0.0323f, 0.0305f, 0.0259f, 0.0300f, 0.0540f, 0.0614f, 0.0495f, 0.0590f, - -0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f, - -0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 0.0054f, 0.0163f, 0.0104f}; - latents_std_vec = { - 1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f, - 1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f, - 1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f, - 1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f, - 1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f, - 1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f, - 1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f, - 1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f, - 1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f, - 1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f, - 1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f, - 1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f, - 1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f, - 
1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f, - 1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f, - 1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f}; + GGML_ASSERT(latents.shape()[channel_dim] == 128); + std::vector stats_shape(static_cast(latents.dim()), 1); + stats_shape[static_cast(channel_dim)] = latents.shape()[channel_dim]; + + auto mean_tensor = sd::Tensor::from_vector({-0.0676f, -0.0715f, -0.0753f, -0.0745f, 0.0223f, 0.0180f, 0.0142f, 0.0184f, + -0.0001f, -0.0063f, -0.0002f, -0.0031f, -0.0272f, -0.0281f, -0.0276f, -0.0290f, + -0.0769f, -0.0672f, -0.0902f, -0.0892f, 0.0168f, 0.0152f, 0.0079f, 0.0086f, + 0.0083f, 0.0015f, 0.0003f, -0.0043f, -0.0439f, -0.0419f, -0.0438f, -0.0431f, + -0.0102f, -0.0132f, -0.0066f, -0.0048f, -0.0311f, -0.0306f, -0.0279f, -0.0180f, + 0.0030f, 0.0015f, 0.0126f, 0.0145f, 0.0347f, 0.0338f, 0.0337f, 0.0283f, + 0.0020f, 0.0047f, 0.0047f, 0.0050f, 0.0123f, 0.0081f, 0.0081f, 0.0146f, + 0.0681f, 0.0679f, 0.0767f, 0.0732f, -0.0462f, -0.0474f, -0.0392f, -0.0511f, + -0.0528f, -0.0477f, -0.0470f, -0.0517f, -0.0317f, -0.0316f, -0.0345f, -0.0283f, + 0.0510f, 0.0445f, 0.0578f, 0.0458f, -0.0412f, -0.0458f, -0.0487f, -0.0467f, + -0.0088f, -0.0106f, -0.0088f, -0.0046f, -0.0376f, -0.0432f, -0.0436f, -0.0499f, + 0.0118f, 0.0166f, 0.0203f, 0.0279f, 0.0113f, 0.0129f, 0.0016f, 0.0072f, + -0.0118f, -0.0018f, -0.0141f, -0.0054f, -0.0091f, -0.0138f, -0.0145f, -0.0187f, + 0.0323f, 0.0305f, 0.0259f, 0.0300f, 0.0540f, 0.0614f, 0.0495f, 0.0590f, + -0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f, + -0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 0.0054f, 0.0163f, 0.0104f}); + mean_tensor.reshape_(stats_shape); + auto std_tensor = sd::Tensor::from_vector({1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f, + 1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f, + 1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 
1.7355f, 1.7394f, + 1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f, + 1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f, + 1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f, + 1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f, + 1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f, + 1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f, + 1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f, + 1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f, + 1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f, + 1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f, + 1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f, + 1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f, + 1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f}); + std_tensor.reshape_(stats_shape); + return {std::move(mean_tensor), std::move(std_tensor)}; } else { GGML_ABORT("unknown version %d", version); } } - ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) { - ggml_tensor* vae_latents = ggml_dup(work_ctx, latents); + sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) override { if (sd_version_is_flux2(version)) { - int channel_dim = 2; - std::vector latents_mean_vec; - std::vector latents_std_vec; - get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec); - - float mean; - float std_; - for (int i = 0; i < latents->ne[3]; i++) { - if (channel_dim == 3) { - mean = latents_mean_vec[i]; - std_ = latents_std_vec[i]; - } - for (int j = 0; j < latents->ne[2]; j++) { - if (channel_dim == 2) { - mean = latents_mean_vec[j]; - std_ = latents_std_vec[j]; - } - for (int k = 0; k < latents->ne[1]; k++) { - for (int l = 0; l < latents->ne[0]; l++) { - float value = ggml_ext_tensor_get_f32(latents, 
l, k, j, i); - value = value * std_ / scale_factor + mean; - ggml_ext_tensor_set_f32(vae_latents, value, l, k, j, i); - } - } - } - } - } else { - ggml_ext_tensor_iter(latents, [&](ggml_tensor* latents, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = ggml_ext_tensor_get_f32(latents, i0, i1, i2, i3); - value = (value / scale_factor) + shift_factor; - ggml_ext_tensor_set_f32(vae_latents, value, i0, i1, i2, i3); - }); + int channel_dim = 2; + auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim); + return (latents * std_tensor) / scale_factor + mean_tensor; } - return vae_latents; + return (latents / scale_factor) + shift_factor; } - ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) { - ggml_tensor* diffusion_latents = ggml_dup(work_ctx, latents); + sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) override { if (sd_version_is_flux2(version)) { - int channel_dim = 2; - std::vector latents_mean_vec; - std::vector latents_std_vec; - get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec); - - float mean; - float std_; - for (int i = 0; i < latents->ne[3]; i++) { - if (channel_dim == 3) { - mean = latents_mean_vec[i]; - std_ = latents_std_vec[i]; - } - for (int j = 0; j < latents->ne[2]; j++) { - if (channel_dim == 2) { - mean = latents_mean_vec[j]; - std_ = latents_std_vec[j]; - } - for (int k = 0; k < latents->ne[1]; k++) { - for (int l = 0; l < latents->ne[0]; l++) { - float value = ggml_ext_tensor_get_f32(latents, l, k, j, i); - value = (value - mean) * scale_factor / std_; - ggml_ext_tensor_set_f32(diffusion_latents, value, l, k, j, i); - } - } - } - } - } else { - ggml_ext_tensor_iter(latents, [&](ggml_tensor* latents, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = ggml_ext_tensor_get_f32(latents, i0, i1, i2, i3); - value = (value - shift_factor) * scale_factor; - ggml_ext_tensor_set_f32(diffusion_latents, value, i0, i1, i2, i3); - }); 
+ int channel_dim = 2; + auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim); + return ((latents - mean_tensor) * scale_factor) / std_tensor; } - return diffusion_latents; + return (latents - shift_factor) * scale_factor; } int get_encoder_output_channels(int input_channels) { @@ -889,24 +804,26 @@ struct AutoEncoderKL : public VAE { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); { // CPU, x{1, 3, 64, 64}: Pass // CUDA, x{1, 3, 64, 64}: Pass, but sill get wrong result for some image, may be due to interlnal nan // CPU, x{2, 3, 64, 64}: Wrong result // CUDA, x{2, 3, 64, 64}: Wrong result, and different from CPU result - auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 64, 64, 3, 2); - ggml_set_f32(x, 0.5f); - print_ggml_tensor(x); - ggml_tensor* out = nullptr; + sd::Tensor x({64, 64, 3, 2}); + x.fill_(0.5f); + print_sd_tensor(x); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - _compute(8, x, false, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = _compute(8, x, false); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("encode test done in %lldms", t1 - t0); } @@ -915,16 +832,18 @@ struct AutoEncoderKL : public VAE { // CUDA, z{1, 4, 8, 8}: Pass // CPU, z{3, 4, 8, 8}: Wrong result // CUDA, z{3, 4, 8, 8}: Wrong result, and different from CPU result - auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); - ggml_set_f32(z, 0.5f); - print_ggml_tensor(z); - ggml_tensor* out = nullptr; + sd::Tensor z({8, 8, 4, 1}); + z.fill_(0.5f); + print_sd_tensor(z); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - _compute(8, z, true, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = 
_compute(8, z, true); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("decode test done in %lldms", t1 - t0); } }; diff --git a/src/cache_dit.hpp b/src/cache_dit.hpp index 9af627fb..dad67d45 100644 --- a/src/cache_dit.hpp +++ b/src/cache_dit.hpp @@ -8,7 +8,9 @@ #include #include +#include "condition_cache_utils.hpp" #include "ggml_extend.hpp" +#include "tensor.hpp" struct DBCacheConfig { bool enabled = false; @@ -771,35 +773,37 @@ struct CacheDitConditionState { return it != cache_diffs.end() && !it->second.diff.empty(); } - void update_cache(const void* cond, const float* input, const float* output, size_t size) { + void update_cache(const void* cond, const sd::Tensor& input, const sd::Tensor& output) { CacheEntry& entry = cache_diffs[cond]; - entry.diff.resize(size); - for (size_t i = 0; i < size; i++) { - entry.diff[i] = output[i] - input[i]; + if (!sd::store_condition_cache_diff(&entry.diff, input, output)) { + entry.prev_input.clear(); + entry.prev_output.clear(); + entry.has_prev = false; + return; } + size_t size = static_cast(output.numel()); + const float* input_data = input.data(); + const float* output_data = output.data(); entry.prev_input.resize(size); entry.prev_output.resize(size); for (size_t i = 0; i < size; i++) { - entry.prev_input[i] = input[i]; - entry.prev_output[i] = output[i]; + entry.prev_input[i] = input_data[i]; + entry.prev_output[i] = output_data[i]; } entry.has_prev = true; } - void apply_cache(const void* cond, const float* input, float* output, size_t size) { + void apply_cache(const void* cond, + const sd::Tensor& input, + sd::Tensor* output) { auto it = cache_diffs.find(cond); if (it == cache_diffs.end() || it->second.diff.empty()) return; - if (it->second.diff.size() != size) - return; - - for (size_t i = 0; i < size; i++) { - output[i] = input[i] + it->second.diff[i]; - } + sd::apply_condition_cache_diff(it->second.diff, input, 
output); } - bool before_condition(const void* cond, ggml_tensor* input, ggml_tensor* output, float sigma, int step_index) { + bool before_condition(const void* cond, const sd::Tensor& input, sd::Tensor* output, float sigma, int step_index) { if (!enabled() || step_index < 0) return false; @@ -819,8 +823,7 @@ struct CacheDitConditionState { if (skip_current_step) { if (has_cache(cond)) { - apply_cache(cond, (float*)input->data, (float*)output->data, - static_cast(ggml_nelements(output))); + apply_cache(cond, input, output); return true; } return false; @@ -833,13 +836,13 @@ struct CacheDitConditionState { if (it == cache_diffs.end() || !it->second.has_prev) return false; - size_t ne = static_cast(ggml_nelements(input)); + size_t ne = static_cast(input.numel()); if (it->second.prev_input.size() != ne) return false; - float* input_data = (float*)input->data; - float diff = CacheDitState::calculate_residual_diff( - it->second.prev_input.data(), input_data, ne); + const float* input_data = input.data(); + float diff = CacheDitState::calculate_residual_diff( + it->second.prev_input.data(), input_data, ne); float effective_threshold = config.residual_diff_threshold; if (config.Fn_compute_blocks > 0) { @@ -859,7 +862,7 @@ struct CacheDitConditionState { cached_steps.push_back(current_step_index); continuous_cached_steps++; accumulated_residual_diff += diff; - apply_cache(cond, input_data, (float*)output->data, ne); + apply_cache(cond, input, output); return true; } @@ -867,15 +870,14 @@ struct CacheDitConditionState { return false; } - void after_condition(const void* cond, ggml_tensor* input, ggml_tensor* output) { + void after_condition(const void* cond, const sd::Tensor& input, const sd::Tensor& output) { if (!step_is_active()) return; - size_t ne = static_cast(ggml_nelements(output)); - update_cache(cond, (float*)input->data, (float*)output->data, ne); + update_cache(cond, input, output); if (cond == anchor_condition && taylor_config.enabled) { - 
taylor_state.update_derivatives((float*)output->data, ne, current_step_index); + taylor_state.update_derivatives(output.data(), static_cast(output.numel()), current_step_index); } } diff --git a/src/clip.hpp b/src/clip.hpp index f4e5ef78..8f2ac064 100644 --- a/src/clip.hpp +++ b/src/clip.hpp @@ -957,15 +957,14 @@ struct CLIPTextModelRunner : public GGMLRunner { return model.forward(ctx, input_ids, embeddings, mask, max_token_idx, return_pooled, clip_skip); } - ggml_cgraph* build_graph(ggml_tensor* input_ids, + ggml_cgraph* build_graph(const sd::Tensor& input_ids_tensor, int num_custom_embeddings = 0, void* custom_embeddings_data = nullptr, size_t max_token_idx = 0, bool return_pooled = false, int clip_skip = -1) { - ggml_cgraph* gf = new_graph_custom(2048); - - input_ids = to_backend(input_ids); + ggml_cgraph* gf = new_graph_custom(2048); + ggml_tensor* input_ids = make_input(input_ids_tensor); ggml_tensor* embeddings = nullptr; @@ -1004,19 +1003,21 @@ struct CLIPTextModelRunner : public GGMLRunner { return gf; } - bool compute(const int n_threads, - ggml_tensor* input_ids, - int num_custom_embeddings, - void* custom_embeddings_data, - size_t max_token_idx, - bool return_pooled, - int clip_skip, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { + sd::Tensor compute(const int n_threads, + const sd::Tensor& input_ids, + int num_custom_embeddings, + void* custom_embeddings_data, + size_t max_token_idx, + bool return_pooled, + int clip_skip) { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip); }; - return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); + auto result = GGMLRunner::compute(get_graph, n_threads, true); + if (return_pooled) { + return take_or_empty(std::move(result)); + } + return restore_trailing_singleton_dims(std::move(result), 3); } }; diff --git a/src/common_dit.hpp b/src/common_dit.hpp index 
0e6f0f08..30141d42 100644 --- a/src/common_dit.hpp +++ b/src/common_dit.hpp @@ -4,11 +4,11 @@ #include "ggml_extend.hpp" namespace DiT { - ggml_tensor* patchify(ggml_context* ctx, - ggml_tensor* x, - int pw, - int ph, - bool patch_last = true) { + inline ggml_tensor* patchify(ggml_context* ctx, + ggml_tensor* x, + int pw, + int ph, + bool patch_last = true) { // x: [N, C, H, W] // return: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C] int64_t N = x->ne[3]; @@ -33,13 +33,13 @@ namespace DiT { return x; } - ggml_tensor* unpatchify(ggml_context* ctx, - ggml_tensor* x, - int64_t h, - int64_t w, - int ph, - int pw, - bool patch_last = true) { + inline ggml_tensor* unpatchify(ggml_context* ctx, + ggml_tensor* x, + int64_t h, + int64_t w, + int ph, + int pw, + bool patch_last = true) { // x: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C] // return: [N, C, H, W] int64_t N = x->ne[2]; @@ -64,10 +64,10 @@ namespace DiT { return x; } - ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, - ggml_tensor* x, - int ph, - int pw) { + inline ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, + ggml_tensor* x, + int ph, + int pw) { int64_t W = x->ne[0]; int64_t H = x->ne[1]; @@ -77,23 +77,23 @@ namespace DiT { return x; } - ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx, - ggml_tensor* x, - int ph, - int pw, - bool patch_last = true) { + inline ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx, + ggml_tensor* x, + int ph, + int pw, + bool patch_last = true) { x = pad_to_patch_size(ctx, x, ph, pw); x = patchify(ctx->ggml_ctx, x, ph, pw, patch_last); return x; } - ggml_tensor* unpatchify_and_crop(ggml_context* ctx, - ggml_tensor* x, - int64_t H, - int64_t W, - int ph, - int pw, - bool patch_last = true) { + inline ggml_tensor* unpatchify_and_crop(ggml_context* ctx, + ggml_tensor* x, + int64_t H, + int64_t W, + int ph, + int pw, + bool patch_last = true) { int pad_h = (ph - H % ph) % ph; int pad_w = (pw - W % pw) % pw; int64_t h = ((H + pad_h) / ph); @@ 
-105,4 +105,4 @@ namespace DiT { } } // namespace DiT -#endif // __COMMON_DIT_HPP__ \ No newline at end of file +#endif // __COMMON_DIT_HPP__ diff --git a/src/condition_cache_utils.hpp b/src/condition_cache_utils.hpp new file mode 100644 index 00000000..903d64e3 --- /dev/null +++ b/src/condition_cache_utils.hpp @@ -0,0 +1,64 @@ +#ifndef __CONDITION_CACHE_UTILS_HPP__ +#define __CONDITION_CACHE_UTILS_HPP__ + +#include + +#include "tensor.hpp" + +namespace sd { + + inline bool store_condition_cache_diff(std::vector* diff, + const sd::Tensor& input, + const sd::Tensor& output) { + if (diff == nullptr || input.empty() || output.empty()) { + return false; + } + + size_t input_size = static_cast(input.numel()); + size_t output_size = static_cast(output.numel()); + if (input_size == 0 || input_size != output_size) { + diff->clear(); + return false; + } + + const float* input_data = input.data(); + const float* output_data = output.data(); + if (input_data == nullptr || output_data == nullptr) { + diff->clear(); + return false; + } + + diff->resize(output_size); + for (size_t i = 0; i < output_size; ++i) { + (*diff)[i] = output_data[i] - input_data[i]; + } + return true; + } + + inline bool apply_condition_cache_diff(const std::vector& diff, + const sd::Tensor& input, + sd::Tensor* output) { + if (output == nullptr || input.empty() || diff.empty()) { + return false; + } + + size_t input_size = static_cast(input.numel()); + if (input_size == 0 || diff.size() != input_size) { + return false; + } + + *output = input; + float* output_data = output->data(); + if (output_data == nullptr) { + return false; + } + + for (size_t i = 0; i < input_size; ++i) { + output_data[i] += diff[i]; + } + return true; + } + +} // namespace sd + +#endif // __CONDITION_CACHE_UTILS_HPP__ diff --git a/src/conditioner.hpp b/src/conditioner.hpp index 534a2f11..05167cfd 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -1,39 +1,85 @@ #ifndef __CONDITIONER_HPP__ #define __CONDITIONER_HPP__ 
+#include + #include "clip.hpp" #include "llm.hpp" #include "t5.hpp" +#include "tensor_ggml.hpp" struct SDCondition { - ggml_tensor* c_crossattn = nullptr; // aka context - ggml_tensor* c_vector = nullptr; // aka y - ggml_tensor* c_concat = nullptr; + sd::Tensor c_crossattn; + sd::Tensor c_vector; + sd::Tensor c_concat; + sd::Tensor c_t5_ids; + sd::Tensor c_t5_weights; - std::vector extra_c_crossattns; + std::vector> extra_c_crossattns; SDCondition() = default; - SDCondition(ggml_tensor* c_crossattn, - ggml_tensor* c_vector, - ggml_tensor* c_concat, - const std::vector& extra_c_crossattns = {}) - : c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat), extra_c_crossattns(extra_c_crossattns) {} + + SDCondition(sd::Tensor c_crossattn, + sd::Tensor c_vector, + sd::Tensor c_concat) + : c_crossattn(std::move(c_crossattn)), c_vector(std::move(c_vector)), c_concat(std::move(c_concat)) {} + + bool empty() const { + if (!c_crossattn.empty() || !c_vector.empty() || !c_concat.empty() || + !c_t5_ids.empty() || !c_t5_weights.empty()) { + return false; + } + + for (const auto& tensor : extra_c_crossattns) { + if (!tensor.empty()) { + return false; + } + } + + return true; + } }; +static inline sd::Tensor apply_token_weights(sd::Tensor hidden_states, + const std::vector& weights) { + if (hidden_states.empty()) { + return hidden_states; + } + + if (hidden_states.dim() == 1) { + hidden_states.unsqueeze_(1); + } + + GGML_ASSERT(static_cast(hidden_states.shape()[1]) == weights.size()); + + float original_mean = hidden_states.mean(); + auto chunk_weights = sd::Tensor::from_vector(weights); + chunk_weights.reshape_({1, static_cast(weights.size())}); + hidden_states *= chunk_weights; + float new_mean = hidden_states.mean(); + if (new_mean != 0.0f) { + hidden_states *= (original_mean / new_mean); + } + + return hidden_states; +} + struct ConditionerParams { std::string text; - int clip_skip = -1; - int width = -1; - int height = -1; - int adm_in_channels = -1; - bool 
zero_out_masked = false; - int num_input_imgs = 0; // for photomaker - std::vector ref_images = {}; // for qwen image edit + int clip_skip = -1; + int width = -1; + int height = -1; + int adm_in_channels = -1; + bool zero_out_masked = false; + int num_input_imgs = 0; // for photomaker + const std::vector>* ref_images = nullptr; // for qwen image edit }; struct Conditioner { - virtual SDCondition get_learned_condition(ggml_context* work_ctx, - int n_threads, + virtual ~Conditioner() = default; + +public: + virtual SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) = 0; virtual void alloc_params_buffer() = 0; virtual void free_params_buffer() = 0; @@ -41,13 +87,11 @@ struct Conditioner { virtual size_t get_params_buffer_size() = 0; virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_weight_adapter(const std::shared_ptr& adapter) {} - virtual std::tuple> get_learned_condition_with_trigger(ggml_context* work_ctx, - int n_threads, + virtual std::tuple> get_learned_condition_with_trigger(int n_threads, const ConditionerParams& conditioner_params) { GGML_ABORT("Not implemented yet!"); } - virtual std::string remove_trigger_from_prompt(ggml_context* work_ctx, - const std::string& prompt) { + virtual std::string remove_trigger_from_prompt(const std::string& prompt) { GGML_ABORT("Not implemented yet!"); } }; @@ -426,8 +470,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { return {tokens, weights}; } - SDCondition get_learned_condition_common(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition_common(int n_threads, std::vector& tokens, std::vector& weights, int clip_skip, @@ -435,13 +478,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { int height, int adm_in_channels = -1, bool zero_out_masked = false) { - int64_t t0 = ggml_time_ms(); - ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size] - ggml_tensor* chunk_hidden_states = 
nullptr; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2] - ggml_tensor* chunk_hidden_states1 = nullptr; // [n_token, hidden_size] - ggml_tensor* chunk_hidden_states2 = nullptr; // [n_token, hidden_size2] - ggml_tensor* pooled = nullptr; - std::vector hidden_states_vec; + int64_t t0 = ggml_time_ms(); + sd::Tensor hidden_states; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2] + sd::Tensor pooled; if (clip_skip <= 0) { clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1; @@ -455,9 +494,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::vector chunk_weights(weights.begin() + chunk_idx * chunk_len, weights.begin() + (chunk_idx + 1) * chunk_len); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); - ggml_tensor* input_ids2 = nullptr; - size_t max_token_idx = 0; + sd::Tensor input_ids({static_cast(chunk_tokens.size())}, chunk_tokens); + sd::Tensor input_ids2; + size_t max_token_idx = 0; if (sd_version_is_sdxl(version)) { auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID); if (it != chunk_tokens.end()) { @@ -466,7 +505,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1); - input_ids2 = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); + input_ids2 = sd::Tensor({static_cast(chunk_tokens.size())}, chunk_tokens); // for (int i = 0; i < chunk_tokens.size(); i++) { // printf("%d ", chunk_tokens[i]); @@ -475,118 +514,87 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } { - text_model->compute(n_threads, - input_ids, - num_custom_embeddings, - token_embed_custom.data(), - max_token_idx, - false, - clip_skip, - &chunk_hidden_states1, - work_ctx); + auto chunk_hidden_states = text_model->compute(n_threads, + input_ids, + num_custom_embeddings, + token_embed_custom.data(), + max_token_idx, + false, + clip_skip); + 
GGML_ASSERT(!chunk_hidden_states.empty()); if (sd_version_is_sdxl(version)) { - text_model2->compute(n_threads, - input_ids2, - num_custom_embeddings, - token_embed_custom.data(), - max_token_idx, - false, - clip_skip, - &chunk_hidden_states2, work_ctx); - // concat - chunk_hidden_states = ggml_ext_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0); + auto chunk_hidden_states2 = text_model2->compute(n_threads, + input_ids2, + num_custom_embeddings, + token_embed_custom.data(), + max_token_idx, + false, + clip_skip); + GGML_ASSERT(!chunk_hidden_states2.empty()); + chunk_hidden_states = sd::ops::concat(chunk_hidden_states, chunk_hidden_states2, 0); if (chunk_idx == 0) { - text_model2->compute(n_threads, - input_ids2, - num_custom_embeddings, - token_embed_custom.data(), - max_token_idx, - true, - clip_skip, - &pooled, - work_ctx); + pooled = text_model2->compute(n_threads, + input_ids2, + num_custom_embeddings, + token_embed_custom.data(), + max_token_idx, + true, + clip_skip); + GGML_ASSERT(!pooled.empty()); } - } else { - chunk_hidden_states = chunk_hidden_states1; } - } + int64_t t1 = ggml_time_ms(); + LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - int64_t t1 = ggml_time_ms(); - LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - ggml_tensor* result = ggml_dup_tensor(work_ctx, chunk_hidden_states); - { - float original_mean = ggml_ext_tensor_mean(chunk_hidden_states); - for (int i2 = 0; i2 < chunk_hidden_states->ne[2]; i2++) { - for (int i1 = 0; i1 < chunk_hidden_states->ne[1]; i1++) { - for (int i0 = 0; i0 < chunk_hidden_states->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(chunk_hidden_states, i0, i1, i2); - value *= chunk_weights[i1]; - ggml_ext_tensor_set_f32(result, value, i0, i1, i2); - } - } + chunk_hidden_states = apply_token_weights(std::move(chunk_hidden_states), chunk_weights); + + if (zero_out_masked) { + chunk_hidden_states.fill_(0.0f); } - float new_mean 
= ggml_ext_tensor_mean(result); - ggml_ext_tensor_scale_inplace(result, (original_mean / new_mean)); - } - if (zero_out_masked) { - float* vec = (float*)result->data; - for (int i = 0; i < ggml_nelements(result); i++) { - vec[i] = 0; + if (!hidden_states.empty()) { + hidden_states = sd::ops::concat(hidden_states, chunk_hidden_states, 1); + } else { + hidden_states = std::move(chunk_hidden_states); } } - hidden_states_vec.insert(hidden_states_vec.end(), (float*)result->data, ((float*)result->data) + ggml_nelements(result)); } - hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); - hidden_states = ggml_reshape_2d(work_ctx, - hidden_states, - chunk_hidden_states->ne[0], - ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); - - ggml_tensor* vec = nullptr; + sd::Tensor vec; if (sd_version_is_sdxl(version)) { int out_dim = 256; - vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels); - // [0:1280] + GGML_ASSERT(!pooled.empty()); + vec = sd::Tensor({adm_in_channels}); + vec.fill_(0.0f); size_t offset = 0; - memcpy(vec->data, pooled->data, ggml_nbytes(pooled)); - offset += ggml_nbytes(pooled); + std::copy(pooled.values().begin(), pooled.values().end(), vec.values().begin()); + offset += pooled.values().size(); - // original_size_as_tuple - float orig_width = (float)width; - float orig_height = (float)height; - std::vector timesteps = {orig_height, orig_width}; + auto append_embedding = [&](const std::vector& timesteps) { + sd::Tensor embedding; + set_timestep_embedding(timesteps, &embedding, out_dim); + std::copy(embedding.values().begin(), embedding.values().end(), vec.values().begin() + static_cast(offset)); + offset += embedding.values().size(); + }; - ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); - offset += ggml_nbytes(embed_view); - set_timestep_embedding(timesteps, embed_view, out_dim); - // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 
2)); - // crop_coords_top_left - float crop_coord_top = 0.f; - float crop_coord_left = 0.f; - timesteps = {crop_coord_top, crop_coord_left}; - embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); - offset += ggml_nbytes(embed_view); - set_timestep_embedding(timesteps, embed_view, out_dim); - // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); - // target_size_as_tuple - float target_width = (float)width; - float target_height = (float)height; - timesteps = {target_height, target_width}; - embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); - offset += ggml_nbytes(embed_view); - set_timestep_embedding(timesteps, embed_view, out_dim); - // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); - GGML_ASSERT(offset == ggml_nbytes(vec)); + append_embedding({static_cast(height), static_cast(width)}); + append_embedding({0.0f, 0.0f}); + append_embedding({static_cast(height), static_cast(width)}); + GGML_ASSERT(offset == vec.values().size()); } - // print_ggml_tensor(result); - return {hidden_states, vec, nullptr}; + SDCondition result; + if (!hidden_states.empty()) { + result.c_crossattn = std::move(hidden_states); + } + + if (!vec.empty()) { + result.c_vector = std::move(vec); + } + return result; } std::tuple> - get_learned_condition_with_trigger(ggml_context* work_ctx, - int n_threads, + get_learned_condition_with_trigger(int n_threads, const ConditionerParams& conditioner_params) override { auto image_tokens = convert_token_to_id(trigger_word); // if(image_tokens.size() == 1){ @@ -608,8 +616,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { // for(int i = 0; i < clsm.size(); ++i) // printf("%d ", clsm[i]?1:0); // printf("\n"); - auto cond = get_learned_condition_common(work_ctx, - n_threads, + auto cond = get_learned_condition_common(n_threads, tokens, weights, conditioner_params.clip_skip, @@ -620,8 +627,7 @@ 
struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { return std::make_tuple(cond, clsm); } - std::string remove_trigger_from_prompt(ggml_context* work_ctx, - const std::string& prompt) override { + std::string remove_trigger_from_prompt(const std::string& prompt) override { auto image_tokens = convert_token_to_id(trigger_word); GGML_ASSERT(image_tokens.size() == 1); auto tokens_and_weights = tokenize(prompt, false); @@ -632,14 +638,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { return decode(tokens); } - SDCondition get_learned_condition(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { auto tokens_and_weights = tokenize(conditioner_params.text, true); std::vector& tokens = tokens_and_weights.first; std::vector& weights = tokens_and_weights.second; - return get_learned_condition_common(work_ctx, - n_threads, + return get_learned_condition_common(n_threads, tokens, weights, conditioner_params.clip_skip, @@ -680,10 +684,9 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner { vision_model.get_param_tensors(tensors, "cond_stage_model.transformer"); } - ggml_cgraph* build_graph(ggml_tensor* pixel_values, bool return_pooled, int clip_skip) { - ggml_cgraph* gf = ggml_new_graph(compute_ctx); - - pixel_values = to_backend(pixel_values); + ggml_cgraph* build_graph(const sd::Tensor& pixel_values_tensor, bool return_pooled, int clip_skip) { + ggml_cgraph* gf = ggml_new_graph(compute_ctx); + ggml_tensor* pixel_values = make_input(pixel_values_tensor); auto runner_ctx = get_context(); @@ -694,16 +697,14 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner { return gf; } - bool compute(const int n_threads, - ggml_tensor* pixel_values, - bool return_pooled, - int clip_skip, - ggml_tensor** output, - ggml_context* output_ctx) { + sd::Tensor compute(const int n_threads, + const sd::Tensor& pixel_values, + bool return_pooled, + int clip_skip) { auto 
get_graph = [&]() -> ggml_cgraph* { return build_graph(pixel_values, return_pooled, clip_skip); }; - return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); + return take_or_empty(GGMLRunner::compute(get_graph, n_threads, true)); } }; @@ -893,8 +894,7 @@ struct SD3CLIPEmbedder : public Conditioner { return {{clip_l_tokens, clip_l_weights}, {clip_g_tokens, clip_g_weights}, {t5_tokens, t5_weights}}; } - SDCondition get_learned_condition_common(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition_common(int n_threads, std::vector, std::vector>> token_and_weights, int clip_skip, bool zero_out_masked = false) { @@ -909,232 +909,155 @@ struct SD3CLIPEmbedder : public Conditioner { clip_skip = 2; } - int64_t t0 = ggml_time_ms(); - ggml_tensor* hidden_states = nullptr; // [N, n_token*2, 4096] - ggml_tensor* chunk_hidden_states = nullptr; // [n_token*2, 4096] - ggml_tensor* chunk_hidden_states_l = nullptr; // [n_token, hidden_size_l] - ggml_tensor* chunk_hidden_states_g = nullptr; // [n_token, hidden_size_g] - ggml_tensor* chunk_hidden_states_t5 = nullptr; // [n_token, hidden_size_t5] - ggml_tensor* pooled = nullptr; - ggml_tensor* pooled_l = nullptr; // [768,] - ggml_tensor* pooled_g = nullptr; // [1280,] - std::vector hidden_states_vec; + size_t chunk_len = 77; + int64_t t0 = ggml_time_ms(); + sd::Tensor hidden_states; + sd::Tensor pooled; - size_t chunk_len = 77; size_t chunk_count = std::max(std::max(clip_l_tokens.size(), clip_g_tokens.size()), t5_tokens.size()) / chunk_len; + for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) { // clip_l + sd::Tensor chunk_hidden_states_l; + sd::Tensor pooled_l; if (clip_l) { std::vector chunk_tokens(clip_l_tokens.begin() + chunk_idx * chunk_len, clip_l_tokens.begin() + (chunk_idx + 1) * chunk_len); std::vector chunk_weights(clip_l_weights.begin() + chunk_idx * chunk_len, clip_l_weights.begin() + (chunk_idx + 1) * chunk_len); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, 
chunk_tokens); + sd::Tensor input_ids({static_cast(chunk_tokens.size())}, chunk_tokens); size_t max_token_idx = 0; - clip_l->compute(n_threads, - input_ids, - 0, - nullptr, - max_token_idx, - false, - clip_skip, - &chunk_hidden_states_l, - work_ctx); - { - auto tensor = chunk_hidden_states_l; - float original_mean = ggml_ext_tensor_mean(tensor); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2); - value *= chunk_weights[i1]; - ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2); - } - } - } - float new_mean = ggml_ext_tensor_mean(tensor); - ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); - } + chunk_hidden_states_l = clip_l->compute(n_threads, + input_ids, + 0, + nullptr, + max_token_idx, + false, + clip_skip); + GGML_ASSERT(!chunk_hidden_states_l.empty()); + chunk_hidden_states_l = ::apply_token_weights(std::move(chunk_hidden_states_l), chunk_weights); if (chunk_idx == 0) { auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID); max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1); - clip_l->compute(n_threads, - input_ids, - 0, - nullptr, - max_token_idx, - true, - clip_skip, - &pooled_l, - work_ctx); + pooled_l = clip_l->compute(n_threads, + input_ids, + 0, + nullptr, + max_token_idx, + true, + clip_skip); + GGML_ASSERT(!pooled_l.empty()); } } else { - chunk_hidden_states_l = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, chunk_len); - ggml_set_f32(chunk_hidden_states_l, 0.f); + chunk_hidden_states_l = sd::Tensor::zeros({768, static_cast(chunk_len), 1}); if (chunk_idx == 0) { - pooled_l = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768); - ggml_set_f32(pooled_l, 0.f); + pooled = sd::Tensor::zeros({768, 1}); } } // clip_g + sd::Tensor chunk_hidden_states_g; + sd::Tensor pooled_g; if (clip_g) { std::vector 
chunk_tokens(clip_g_tokens.begin() + chunk_idx * chunk_len, clip_g_tokens.begin() + (chunk_idx + 1) * chunk_len); std::vector chunk_weights(clip_g_weights.begin() + chunk_idx * chunk_len, clip_g_weights.begin() + (chunk_idx + 1) * chunk_len); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); + sd::Tensor input_ids({static_cast(chunk_tokens.size())}, chunk_tokens); size_t max_token_idx = 0; - clip_g->compute(n_threads, - input_ids, - 0, - nullptr, - max_token_idx, - false, - clip_skip, - &chunk_hidden_states_g, - work_ctx); - - { - auto tensor = chunk_hidden_states_g; - float original_mean = ggml_ext_tensor_mean(tensor); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2); - value *= chunk_weights[i1]; - ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2); - } - } - } - float new_mean = ggml_ext_tensor_mean(tensor); - ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); - } + chunk_hidden_states_g = clip_g->compute(n_threads, + input_ids, + 0, + nullptr, + max_token_idx, + false, + clip_skip); + GGML_ASSERT(!chunk_hidden_states_g.empty()); + chunk_hidden_states_g = ::apply_token_weights(std::move(chunk_hidden_states_g), chunk_weights); if (chunk_idx == 0) { auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_g_tokenizer.EOS_TOKEN_ID); max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1); - clip_g->compute(n_threads, - input_ids, - 0, - nullptr, - max_token_idx, - true, - clip_skip, - &pooled_g, - work_ctx); + pooled_g = clip_g->compute(n_threads, + input_ids, + 0, + nullptr, + max_token_idx, + true, + clip_skip); + GGML_ASSERT(!pooled_g.empty()); } } else { - chunk_hidden_states_g = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 1280, chunk_len); - ggml_set_f32(chunk_hidden_states_g, 0.f); + chunk_hidden_states_g = 
sd::Tensor::zeros({1280, static_cast(chunk_len), 1}); if (chunk_idx == 0) { - pooled_g = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1280); - ggml_set_f32(pooled_g, 0.f); + pooled_g = sd::Tensor::zeros({1280, 1}); } } // t5 + sd::Tensor chunk_hidden_states_t5; if (t5) { std::vector chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len, t5_tokens.begin() + (chunk_idx + 1) * chunk_len); std::vector chunk_weights(t5_weights.begin() + chunk_idx * chunk_len, t5_weights.begin() + (chunk_idx + 1) * chunk_len); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); + sd::Tensor input_ids({static_cast(chunk_tokens.size())}, chunk_tokens); - t5->compute(n_threads, - input_ids, - nullptr, - &chunk_hidden_states_t5, - work_ctx); - { - auto tensor = chunk_hidden_states_t5; - float original_mean = ggml_ext_tensor_mean(tensor); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2); - value *= chunk_weights[i1]; - ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2); - } - } - } - float new_mean = ggml_ext_tensor_mean(tensor); - ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); - } + chunk_hidden_states_t5 = t5->compute(n_threads, + input_ids, + sd::Tensor()); + GGML_ASSERT(!chunk_hidden_states_t5.empty()); + chunk_hidden_states_t5 = ::apply_token_weights(std::move(chunk_hidden_states_t5), chunk_weights); } else { - chunk_hidden_states_t5 = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len); - ggml_set_f32(chunk_hidden_states_t5, 0.f); + chunk_hidden_states_t5 = sd::Tensor::zeros({4096, static_cast(chunk_len), 1}); } - auto chunk_hidden_states_lg_pad = ggml_new_tensor_3d(work_ctx, - chunk_hidden_states_l->type, - 4096, - chunk_hidden_states_l->ne[1], - chunk_hidden_states_l->ne[2]); // [n_token, 4096] - - for (int i2 = 0; i2 < chunk_hidden_states_lg_pad->ne[2]; i2++) { - for (int i1 = 0; i1 < 
chunk_hidden_states_lg_pad->ne[1]; i1++) { - for (int i0 = 0; i0 < chunk_hidden_states_lg_pad->ne[0]; i0++) { - float value = 0.f; - if (i0 < chunk_hidden_states_l->ne[0]) { - value = ggml_ext_tensor_get_f32(chunk_hidden_states_l, i0, i1, i2); - } else if (i0 < chunk_hidden_states_l->ne[0] + chunk_hidden_states_g->ne[0]) { - value = ggml_ext_tensor_get_f32(chunk_hidden_states_g, i0 - chunk_hidden_states_l->ne[0], i1, i2); - } - ggml_ext_tensor_set_f32(chunk_hidden_states_lg_pad, value, i0, i1, i2); - } - } + sd::Tensor chunk_hidden_states_lg = sd::ops::concat(chunk_hidden_states_l, chunk_hidden_states_g, 0); + if (chunk_hidden_states_lg.shape()[0] < 4096) { + auto pad_shape = chunk_hidden_states_lg.shape(); + pad_shape[0] = 4096 - chunk_hidden_states_lg.shape()[0]; + chunk_hidden_states_lg = sd::ops::concat(chunk_hidden_states_lg, + sd::Tensor::zeros(pad_shape), + 0); } - chunk_hidden_states = ggml_ext_tensor_concat(work_ctx, chunk_hidden_states_lg_pad, chunk_hidden_states_t5, 1); // [n_token*2, 4096] + sd::Tensor chunk_hidden_states = sd::ops::concat(chunk_hidden_states_lg, + chunk_hidden_states_t5, + 1); // [n_token*2, 4096] if (chunk_idx == 0) { - pooled = ggml_ext_tensor_concat(work_ctx, pooled_l, pooled_g, 0); // [768 + 1280] + pooled = sd::ops::concat(pooled_l, pooled_g, 0); // [768 + 1280] } int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); if (zero_out_masked) { - float* vec = (float*)chunk_hidden_states->data; - for (int i = 0; i < ggml_nelements(chunk_hidden_states); i++) { - vec[i] = 0; - } + chunk_hidden_states.fill_(0.0f); } - hidden_states_vec.insert(hidden_states_vec.end(), - (float*)chunk_hidden_states->data, - ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states)); + if (!hidden_states.empty()) { + hidden_states = sd::ops::concat(hidden_states, chunk_hidden_states, 1); + } else { + hidden_states = std::move(chunk_hidden_states); + } } - if (hidden_states_vec.size() > 
0) { - hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); - hidden_states = ggml_reshape_2d(work_ctx, - hidden_states, - chunk_hidden_states->ne[0], - ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); - } else { - hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256); - ggml_set_f32(hidden_states, 0.f); - } - if (pooled == nullptr) { - pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2048); - ggml_set_f32(pooled, 0.f); - } - return {hidden_states, pooled, nullptr}; + SDCondition result; + result.c_crossattn = std::move(hidden_states); + result.c_vector = std::move(pooled); + return result; } - SDCondition get_learned_condition(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { auto tokens_and_weights = tokenize(conditioner_params.text, 77, true); - return get_learned_condition_common(work_ctx, - n_threads, + return get_learned_condition_common(n_threads, tokens_and_weights, conditioner_params.clip_skip, conditioner_params.zero_out_masked); @@ -1292,8 +1215,7 @@ struct FluxCLIPEmbedder : public Conditioner { return {{clip_l_tokens, clip_l_weights}, {t5_tokens, t5_weights}}; } - SDCondition get_learned_condition_common(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition_common(int n_threads, std::vector, std::vector>> token_and_weights, int clip_skip, bool zero_out_masked = false) { @@ -1306,11 +1228,9 @@ struct FluxCLIPEmbedder : public Conditioner { clip_skip = 2; } - int64_t t0 = ggml_time_ms(); - ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096] - ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096] - ggml_tensor* pooled = nullptr; // [768,] - std::vector hidden_states_vec; + int64_t t0 = ggml_time_ms(); + sd::Tensor hidden_states; // [N, n_token, 4096] + sd::Tensor pooled; // [768,] size_t chunk_count = std::max(clip_l_tokens.size() > 0 ? 
chunk_len : 0, t5_tokens.size()) / chunk_len; for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) { @@ -1323,95 +1243,65 @@ struct FluxCLIPEmbedder : public Conditioner { std::vector chunk_weights(clip_l_weights.begin(), clip_l_weights.begin() + chunk_len_l); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); + sd::Tensor input_ids({static_cast(chunk_tokens.size())}, chunk_tokens); size_t max_token_idx = 0; auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID); max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1); - clip_l->compute(n_threads, - input_ids, - 0, - nullptr, - max_token_idx, - true, - clip_skip, - &pooled, - work_ctx); + pooled = clip_l->compute(n_threads, + input_ids, + 0, + nullptr, + max_token_idx, + true, + clip_skip); + GGML_ASSERT(!pooled.empty()); + } else { + pooled = sd::Tensor::zeros({768}); } } // t5 + sd::Tensor chunk_hidden_states; if (t5) { std::vector chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len, t5_tokens.begin() + (chunk_idx + 1) * chunk_len); std::vector chunk_weights(t5_weights.begin() + chunk_idx * chunk_len, t5_weights.begin() + (chunk_idx + 1) * chunk_len); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); - - t5->compute(n_threads, - input_ids, - nullptr, - &chunk_hidden_states, - work_ctx); - { - auto tensor = chunk_hidden_states; - float original_mean = ggml_ext_tensor_mean(tensor); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2); - value *= chunk_weights[i1]; - ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2); - } - } - } - float new_mean = ggml_ext_tensor_mean(tensor); - ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); + sd::Tensor input_ids({static_cast(chunk_tokens.size())}, chunk_tokens); + chunk_hidden_states = 
t5->compute(n_threads, + input_ids, + sd::Tensor()); + GGML_ASSERT(!chunk_hidden_states.empty()); + chunk_hidden_states = ::apply_token_weights(std::move(chunk_hidden_states), chunk_weights); + if (zero_out_masked) { + chunk_hidden_states.fill_(0.0f); } } else { - chunk_hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len); - ggml_set_f32(chunk_hidden_states, 0.f); + chunk_hidden_states = sd::Tensor::zeros({4096, static_cast(chunk_len)}); } int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - if (zero_out_masked) { - float* vec = (float*)chunk_hidden_states->data; - for (int i = 0; i < ggml_nelements(chunk_hidden_states); i++) { - vec[i] = 0; - } + if (!hidden_states.empty()) { + hidden_states = sd::ops::concat(hidden_states, chunk_hidden_states, 1); + } else { + hidden_states = std::move(chunk_hidden_states); } - - hidden_states_vec.insert(hidden_states_vec.end(), - (float*)chunk_hidden_states->data, - ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states)); } - if (hidden_states_vec.size() > 0) { - hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); - hidden_states = ggml_reshape_2d(work_ctx, - hidden_states, - chunk_hidden_states->ne[0], - ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); - } else { - hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256); - ggml_set_f32(hidden_states, 0.f); - } - if (pooled == nullptr) { - pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768); - ggml_set_f32(pooled, 0.f); - } - return {hidden_states, pooled, nullptr}; + SDCondition result; + result.c_crossattn = std::move(hidden_states); + result.c_vector = std::move(pooled); + return result; } - SDCondition get_learned_condition(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { auto tokens_and_weights = 
tokenize(conditioner_params.text, chunk_len, true); - return get_learned_condition_common(work_ctx, - n_threads, + return get_learned_condition_common(n_threads, tokens_and_weights, conditioner_params.clip_skip, conditioner_params.zero_out_masked); @@ -1523,8 +1413,9 @@ struct T5CLIPEmbedder : public Conditioner { return {t5_tokens, t5_weights, t5_mask}; } - void modify_mask_to_attend_padding(ggml_tensor* mask, int max_seq_length, int num_extra_padding = 8) { - float* mask_data = (float*)mask->data; + void modify_mask_to_attend_padding(sd::Tensor* mask, int max_seq_length, int num_extra_padding = 8) { + GGML_ASSERT(mask != nullptr); + float* mask_data = mask->data(); int num_pad = 0; for (int64_t i = 0; i < max_seq_length; i++) { if (num_pad >= num_extra_padding) { @@ -1538,29 +1429,23 @@ struct T5CLIPEmbedder : public Conditioner { // LOG_DEBUG("PAD: %d", num_pad); } - SDCondition get_learned_condition_common(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition_common(int n_threads, std::tuple, std::vector, std::vector> token_and_weights, int clip_skip, bool zero_out_masked = false) { if (!t5) { - auto hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256); - ggml_set_f32(hidden_states, 0.f); - auto t5_attn_mask = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 256); - ggml_set_f32(t5_attn_mask, -HUGE_VALF); - return {hidden_states, t5_attn_mask, nullptr}; + SDCondition result; + result.c_crossattn = sd::Tensor::zeros({4096, 256}); + result.c_vector = sd::Tensor::full({256}, -HUGE_VALF); + return result; } auto& t5_tokens = std::get<0>(token_and_weights); auto& t5_weights = std::get<1>(token_and_weights); auto& t5_attn_mask_vec = std::get<2>(token_and_weights); - int64_t t0 = ggml_time_ms(); - ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096] - ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096] - ggml_tensor* pooled = nullptr; - ggml_tensor* t5_attn_mask = vector_to_ggml_tensor(work_ctx, t5_attn_mask_vec); 
// [n_token] - - std::vector hidden_states_vec; + int64_t t0 = ggml_time_ms(); + sd::Tensor t5_attn_mask = sd::Tensor::from_vector(t5_attn_mask_vec); + sd::Tensor hidden_states; size_t chunk_count = t5_tokens.size() / chunk_len; @@ -1573,68 +1458,46 @@ struct T5CLIPEmbedder : public Conditioner { std::vector chunk_mask(t5_attn_mask_vec.begin() + chunk_idx * chunk_len, t5_attn_mask_vec.begin() + (chunk_idx + 1) * chunk_len); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); - auto t5_attn_mask_chunk = use_mask ? vector_to_ggml_tensor(work_ctx, chunk_mask) : nullptr; + sd::Tensor input_ids({static_cast(chunk_tokens.size())}, chunk_tokens); + sd::Tensor t5_attn_mask_chunk; + if (use_mask) { + t5_attn_mask_chunk = sd::Tensor({static_cast(chunk_mask.size())}, chunk_mask); + } - t5->compute(n_threads, - input_ids, - t5_attn_mask_chunk, - &chunk_hidden_states, - work_ctx); - { - auto tensor = chunk_hidden_states; - float original_mean = ggml_ext_tensor_mean(tensor); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2); - value *= chunk_weights[i1]; - ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2); - } - } - } - float new_mean = ggml_ext_tensor_mean(tensor); - ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); + auto chunk_hidden_states = t5->compute(n_threads, + input_ids, + t5_attn_mask_chunk); + GGML_ASSERT(!chunk_hidden_states.empty()); + chunk_hidden_states = apply_token_weights(std::move(chunk_hidden_states), chunk_weights); + + if (zero_out_masked) { + auto chunk_mask_tensor = sd::Tensor::from_vector(chunk_mask) + .reshape_({1, static_cast(chunk_mask.size())}); + chunk_hidden_states.masked_fill_(chunk_mask_tensor < 0.0f, 0.0f); } int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - if (zero_out_masked) { - auto tensor = 
chunk_hidden_states; - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - if (chunk_mask[i1] < 0.f) { - ggml_ext_tensor_set_f32(tensor, 0.f, i0, i1, i2); - } - } - } - } - } - hidden_states_vec.insert(hidden_states_vec.end(), - (float*)chunk_hidden_states->data, - ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states)); + if (!hidden_states.empty()) { + hidden_states = sd::ops::concat(hidden_states, chunk_hidden_states, 1); + } else { + hidden_states = std::move(chunk_hidden_states); + } } - GGML_ASSERT(hidden_states_vec.size() > 0); - hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); - hidden_states = ggml_reshape_2d(work_ctx, - hidden_states, - chunk_hidden_states->ne[0], - ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); + modify_mask_to_attend_padding(&t5_attn_mask, static_cast(t5_attn_mask.numel()), mask_pad); - modify_mask_to_attend_padding(t5_attn_mask, static_cast(ggml_nelements(t5_attn_mask)), mask_pad); - - return {hidden_states, t5_attn_mask, nullptr}; + SDCondition result; + result.c_crossattn = std::move(hidden_states); + result.c_vector = std::move(t5_attn_mask); + return result; } - SDCondition get_learned_condition(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true); - return get_learned_condition_common(work_ctx, - n_threads, + return get_learned_condition_common(n_threads, tokens_and_weights, conditioner_params.clip_skip, conditioner_params.zero_out_masked); @@ -1723,8 +1586,7 @@ struct AnimaConditioner : public Conditioner { return {qwen_tokens, qwen_weights, t5_tokens, t5_weights}; } - SDCondition get_learned_condition(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition(int n_threads, const ConditionerParams& 
conditioner_params) override { int64_t t0 = ggml_time_ms(); @@ -1734,46 +1596,25 @@ struct AnimaConditioner : public Conditioner { auto& t5_tokens = std::get<2>(tokenized); auto& t5_weights = std::get<3>(tokenized); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, qwen_tokens); - - ggml_tensor* hidden_states = nullptr; // [N, n_token, 1024] - llm->compute(n_threads, - input_ids, - nullptr, - {}, - {}, - &hidden_states, - work_ctx); - - { - auto tensor = hidden_states; - float original_mean = ggml_ext_tensor_mean(tensor); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2); - value *= qwen_weights[i1]; - ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2); - } - } - } - float new_mean = ggml_ext_tensor_mean(tensor); - if (new_mean != 0.f) { - ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); - } - } - - ggml_tensor* t5_ids_tensor = nullptr; - ggml_tensor* t5_weight_tensor = nullptr; - if (!t5_tokens.empty()) { - t5_ids_tensor = vector_to_ggml_tensor_i32(work_ctx, t5_tokens); - t5_weight_tensor = vector_to_ggml_tensor(work_ctx, t5_weights); - } + sd::Tensor input_ids({static_cast(qwen_tokens.size()), 1}, qwen_tokens); + auto hidden_states = llm->compute(n_threads, + input_ids, + sd::Tensor(), + {}, + {}); + GGML_ASSERT(!hidden_states.empty()); + hidden_states = apply_token_weights(std::move(hidden_states), qwen_weights); + auto t5_ids_tensor = sd::Tensor::from_vector(t5_tokens); + auto t5_weight_tensor = sd::Tensor::from_vector(t5_weights); int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - return {hidden_states, t5_weight_tensor, t5_ids_tensor}; + SDCondition result; + result.c_crossattn = std::move(hidden_states); + result.c_t5_ids = std::move(t5_ids_tensor); + result.c_t5_weights = std::move(t5_weight_tensor); + return result; } }; @@ 
-1884,15 +1725,14 @@ struct LLMEmbedder : public Conditioner { return {tokens, weights}; } - ggml_tensor* encode_prompt(ggml_context* work_ctx, - int n_threads, - const std::string prompt, - const std::pair& prompt_attn_range, - int max_length, - int min_length, - std::vector> image_embeds, - const std::set& out_layers, - int prompt_template_encode_start_idx) { + sd::Tensor encode_prompt(int n_threads, + const std::string prompt, + const std::pair& prompt_attn_range, + int max_length, + int min_length, + const std::vector>>& image_embeds, + const std::set& out_layers, + int prompt_template_encode_start_idx) { auto tokens_and_weights = tokenize(prompt, prompt_attn_range); auto& tokens = std::get<0>(tokens_and_weights); auto& weights = std::get<1>(tokens_and_weights); @@ -1904,81 +1744,59 @@ struct LLMEmbedder : public Conditioner { tokenizer->pad_tokens(tokens, weights, max_length, true); } - ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size] - - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - - ggml_tensor* attention_mask = nullptr; + sd::Tensor input_ids({static_cast(tokens.size())}, tokens); + sd::Tensor attention_mask; if (!mask.empty()) { - attention_mask = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, mask.size(), mask.size()); - ggml_ext_tensor_iter(attention_mask, [&](ggml_tensor* attention_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = 0.f; - if (mask[i0] == 0.f) { - value = -INFINITY; - } else if (i0 > i1) { - value = -INFINITY; - } - ggml_ext_tensor_set_f32(attention_mask, value, i0, i1, i2, i3); - }); - } - - llm->compute(n_threads, - input_ids, - attention_mask, - image_embeds, - out_layers, - &hidden_states, - work_ctx); - { - auto tensor = hidden_states; - float original_mean = ggml_ext_tensor_mean(tensor); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(tensor, i0, 
i1, i2); - value *= weights[i1]; - ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2); + attention_mask = sd::Tensor({static_cast(mask.size()), static_cast(mask.size())}); + for (size_t i1 = 0; i1 < mask.size(); ++i1) { + for (size_t i0 = 0; i0 < mask.size(); ++i0) { + float value = 0.0f; + if (mask[i0] == 0.0f || i0 > i1) { + value = -INFINITY; } + attention_mask[static_cast(i0 + mask.size() * i1)] = value; } } - float new_mean = ggml_ext_tensor_mean(tensor); - ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); } - GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx); + auto hidden_states = llm->compute(n_threads, + input_ids, + attention_mask, + image_embeds, + out_layers); + GGML_ASSERT(!hidden_states.empty()); + hidden_states = apply_token_weights(std::move(hidden_states), weights); + GGML_ASSERT(hidden_states.shape()[1] > prompt_template_encode_start_idx); int64_t zero_pad_len = 0; if (min_length > 0) { - if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) { - zero_pad_len = min_length - hidden_states->ne[1] + prompt_template_encode_start_idx; + if (hidden_states.shape()[1] - prompt_template_encode_start_idx < min_length) { + zero_pad_len = min_length - hidden_states.shape()[1] + prompt_template_encode_start_idx; } } - ggml_tensor* new_hidden_states = ggml_new_tensor_3d(work_ctx, - GGML_TYPE_F32, - hidden_states->ne[0], - hidden_states->ne[1] - prompt_template_encode_start_idx + zero_pad_len, - hidden_states->ne[2]); - - ggml_ext_tensor_iter(new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = 0.f; - if (i1 + prompt_template_encode_start_idx < hidden_states->ne[1]) { - value = ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3); - } - ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3); - }); + sd::Tensor new_hidden_states = sd::ops::slice(hidden_states, + 1, + 
prompt_template_encode_start_idx, + hidden_states.shape()[1]); + if (zero_pad_len > 0) { + auto pad_shape = new_hidden_states.shape(); + pad_shape[1] = zero_pad_len; + new_hidden_states = sd::ops::concat(new_hidden_states, + sd::Tensor::zeros(std::move(pad_shape)), + 1); + } return new_hidden_states; } - SDCondition get_learned_condition(ggml_context* work_ctx, - int n_threads, + SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { std::string prompt; std::pair prompt_attn_range; std::vector extra_prompts; std::vector> extra_prompts_attn_range; - std::vector> image_embeds; + std::vector>> image_embeds; int prompt_template_encode_start_idx = 34; int max_length = 0; // pad tokens int min_length = 0; // zero pad hidden_states @@ -1987,7 +1805,7 @@ struct LLMEmbedder : public Conditioner { int64_t t0 = ggml_time_ms(); if (sd_version_is_qwen_image(version)) { - if (llm->enable_vision && !conditioner_params.ref_images.empty()) { + if (llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty()) { LOG_INFO("QwenImageEditPlusPipeline"); prompt_template_encode_start_idx = 64; int image_embed_idx = 64 + 6; @@ -1997,13 +1815,13 @@ struct LLMEmbedder : public Conditioner { std::string placeholder = "<|image_pad|>"; std::string img_prompt; - for (int i = 0; i < conditioner_params.ref_images.size(); i++) { - sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]); - double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size; - int height = image.height; - int width = image.width; - int h_bar = static_cast(std::round(height / factor) * factor); - int w_bar = static_cast(std::round(width / factor) * factor); + for (int i = 0; i < conditioner_params.ref_images->size(); i++) { + const auto& image = (*conditioner_params.ref_images)[i]; + double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size; + int 
height = static_cast(image.shape()[1]); + int width = static_cast(image.shape()[0]); + int h_bar = static_cast(std::round(height / factor) * factor); + int w_bar = static_cast(std::round(width / factor) * factor); if (static_cast(h_bar) * w_bar > max_pixels) { double beta = std::sqrt((height * width) / static_cast(max_pixels)); @@ -2017,24 +1835,17 @@ struct LLMEmbedder : public Conditioner { w_bar = static_cast(std::ceil(width * beta / factor)) * static_cast(factor); } - LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, image.height, image.width, h_bar, w_bar); + LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar); - sd_image_f32_t resized_image = clip_preprocess(image, w_bar, h_bar); - free(image.data); - image.data = nullptr; + auto resized_image = clip_preprocess(image, w_bar, h_bar); - ggml_tensor* image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1); - sd_image_f32_to_ggml_tensor(resized_image, image_tensor, false); - free(resized_image.data); - resized_image.data = nullptr; - - ggml_tensor* image_embed = nullptr; - llm->encode_image(n_threads, image_tensor, &image_embed, work_ctx); + auto image_embed = llm->encode_image(n_threads, resized_image); + GGML_ASSERT(!image_embed.empty()); image_embeds.emplace_back(image_embed_idx, image_embed); - image_embed_idx += 1 + static_cast(image_embed->ne[1]) + 6; + image_embed_idx += 1 + static_cast(image_embed.shape()[1]) + 6; img_prompt += "Picture " + std::to_string(i + 1) + ": <|vision_start|>"; // [24669, 220, index, 25, 220, 151652] - int64_t num_image_tokens = image_embed->ne[1]; + int64_t num_image_tokens = image_embed.shape()[1]; img_prompt.reserve(num_image_tokens * placeholder.size()); for (int j = 0; j < num_image_tokens; j++) { img_prompt += placeholder; @@ -2077,10 +1888,10 @@ struct LLMEmbedder : public Conditioner { prompt_template_encode_start_idx = 0; out_layers = {35}; // -2 - if 
(!conditioner_params.ref_images.empty()) { + if (conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty()) { LOG_INFO("ZImageOmniPipeline"); prompt = "<|im_start|>user\n<|vision_start|>"; - for (int i = 0; i < conditioner_params.ref_images.size() - 1; i++) { + for (int i = 0; i < conditioner_params.ref_images->size() - 1; i++) { extra_prompts.push_back("<|vision_end|><|vision_start|>"); } extra_prompts.push_back("<|vision_end|>" + conditioner_params.text + "<|im_end|>\n<|im_start|>assistant\n<|vision_start|>"); @@ -2121,8 +1932,7 @@ struct LLMEmbedder : public Conditioner { GGML_ABORT("unknown version %d", version); } - auto hidden_states = encode_prompt(work_ctx, - n_threads, + auto hidden_states = encode_prompt(n_threads, prompt, prompt_attn_range, max_length, @@ -2130,11 +1940,9 @@ struct LLMEmbedder : public Conditioner { image_embeds, out_layers, prompt_template_encode_start_idx); - - std::vector extra_hidden_states_vec; + std::vector> extra_hidden_states_vec; for (int i = 0; i < extra_prompts.size(); i++) { - auto extra_hidden_states = encode_prompt(work_ctx, - n_threads, + auto extra_hidden_states = encode_prompt(n_threads, extra_prompts[i], extra_prompts_attn_range[i], max_length, @@ -2142,12 +1950,15 @@ struct LLMEmbedder : public Conditioner { image_embeds, out_layers, prompt_template_encode_start_idx); - extra_hidden_states_vec.push_back(extra_hidden_states); + extra_hidden_states_vec.push_back(std::move(extra_hidden_states)); } int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - return {hidden_states, nullptr, nullptr, extra_hidden_states_vec}; + SDCondition result; + result.c_crossattn = std::move(hidden_states); + result.extra_c_crossattns = std::move(extra_hidden_states_vec); + return result; } }; diff --git a/src/control.hpp b/src/control.hpp index 93df10a4..d227ec94 100644 --- a/src/control.hpp +++ b/src/control.hpp @@ -310,11 +310,13 @@ struct ControlNet : 
public GGMLRunner { SDVersion version = VERSION_SD1; ControlNetBlock control_net; - ggml_backend_buffer_t control_buffer = nullptr; // keep control output tensors in backend memory + ggml_backend_buffer_t control_buffer = nullptr; ggml_context* control_ctx = nullptr; - std::vector controls; // (12 input block outputs, 1 middle block output) SD 1.5 - ggml_tensor* guided_hint = nullptr; // guided_hint cache, for faster inference - bool guided_hint_cached = false; + std::vector control_outputs_ggml; + ggml_tensor* guided_hint_output_ggml = nullptr; + std::vector> controls; + sd::Tensor guided_hint; + bool guided_hint_cached = false; ControlNet(ggml_backend_t backend, bool offload_params_to_cpu, @@ -335,16 +337,16 @@ struct ControlNet : public GGMLRunner { params.no_alloc = true; control_ctx = ggml_init(params); - controls.resize(outs.size() - 1); + control_outputs_ggml.resize(outs.size() - 1); size_t control_buffer_size = 0; - guided_hint = ggml_dup_tensor(control_ctx, outs[0]); - control_buffer_size += ggml_nbytes(guided_hint); + guided_hint_output_ggml = ggml_dup_tensor(control_ctx, outs[0]); + control_buffer_size += ggml_nbytes(guided_hint_output_ggml); for (int i = 0; i < outs.size() - 1; i++) { - controls[i] = ggml_dup_tensor(control_ctx, outs[i + 1]); - control_buffer_size += ggml_nbytes(controls[i]); + control_outputs_ggml[i] = ggml_dup_tensor(control_ctx, outs[i + 1]); + control_buffer_size += ggml_nbytes(control_outputs_ggml[i]); } control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend); @@ -361,8 +363,10 @@ struct ControlNet : public GGMLRunner { ggml_free(control_ctx); control_ctx = nullptr; } - guided_hint = nullptr; - guided_hint_cached = false; + guided_hint_output_ggml = nullptr; + guided_hint_cached = false; + guided_hint = {}; + control_outputs_ggml.clear(); controls.clear(); } @@ -374,29 +378,33 @@ struct ControlNet : public GGMLRunner { control_net.get_param_tensors(tensors, prefix); } - ggml_cgraph* build_graph(ggml_tensor* x, 
- ggml_tensor* hint, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* y = nullptr) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& hint_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor = {}, + const sd::Tensor& y_tensor = {}) { ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE); - x = to_backend(x); - if (guided_hint_cached) { - hint = nullptr; + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* hint = nullptr; + ggml_tensor* timesteps = make_input(timesteps_tensor); + ggml_tensor* context = make_optional_input(context_tensor); + ggml_tensor* y = make_optional_input(y_tensor); + + ggml_tensor* guided_hint_input = nullptr; + if (guided_hint_cached && !guided_hint.empty()) { + guided_hint_input = make_input(guided_hint); + hint = nullptr; } else { - hint = to_backend(hint); + hint = make_input(hint_tensor); } - context = to_backend(context); - y = to_backend(y); - timesteps = to_backend(timesteps); auto runner_ctx = get_context(); auto outs = control_net.forward(&runner_ctx, x, hint, - guided_hint_cached ? 
guided_hint : nullptr, + guided_hint_input, timesteps, context, y); @@ -405,22 +413,20 @@ struct ControlNet : public GGMLRunner { alloc_control_ctx(outs); } - ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint)); + ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint_output_ggml)); for (int i = 0; i < outs.size() - 1; i++) { - ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], controls[i])); + ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], control_outputs_ggml[i])); } return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* hint, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* y, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) { + std::optional>> compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& hint, + const sd::Tensor& timesteps, + const sd::Tensor& context = {}, + const sd::Tensor& y = {}) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size] @@ -429,12 +435,24 @@ struct ControlNet : public GGMLRunner { return build_graph(x, hint, timesteps, context, y); }; - bool res = GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); - if (res) { - // cache guided_hint - guided_hint_cached = true; + auto compute_result = GGMLRunner::compute(get_graph, n_threads, false); + if (!compute_result.has_value()) { + return std::nullopt; } - return res; + + if (guided_hint_output_ggml != nullptr) { + guided_hint = restore_trailing_singleton_dims(sd::make_sd_tensor_from_ggml(guided_hint_output_ggml), + 4); + } + controls.clear(); + controls.reserve(control_outputs_ggml.size()); + for (ggml_tensor* control : control_outputs_ggml) { + auto control_host = restore_trailing_singleton_dims(sd::make_sd_tensor_from_ggml(control), 4); + GGML_ASSERT(!control_host.empty()); + controls.push_back(std::move(control_host)); + } + guided_hint_cached 
= true; + return controls; } bool load_from_file(const std::string& file_path, int n_threads) { @@ -462,4 +480,4 @@ struct ControlNet : public GGMLRunner { } }; -#endif // __CONTROL_HPP__ \ No newline at end of file +#endif // __CONTROL_HPP__ diff --git a/src/denoiser.hpp b/src/denoiser.hpp index b92ca4e3..077a1b79 100644 --- a/src/denoiser.hpp +++ b/src/denoiser.hpp @@ -5,6 +5,7 @@ #include "ggml_extend.hpp" #include "gits_noise.inl" +#include "tensor.hpp" /*================================================= CompVisDenoiser ==================================================*/ @@ -73,9 +74,9 @@ constexpr double interp(double left, double right, double perc) noexcept { /* This will make the assumption that the reference x and y values are * already sorted in ascending order because they are being generated as * such in the calling function */ -std::vector linear_interp(std::vector new_x, - const std::vector ref_x, - const std::vector ref_y) { +inline std::vector linear_interp(std::vector new_x, + const std::vector ref_x, + const std::vector ref_y) { const size_t len_x = new_x.size(); size_t i = 0; size_t j = 0; @@ -109,7 +110,7 @@ std::vector linear_interp(std::vector new_x, return new_y; } -std::vector linear_space(const float start, const float end, const size_t num_points) { +inline std::vector linear_space(const float start, const float end, const size_t num_points) { std::vector result(num_points); const float inc = (end - start) / (static_cast(num_points - 1)); @@ -124,8 +125,8 @@ std::vector linear_space(const float start, const float end, const size_t return result; } -std::vector log_linear_interpolation(std::vector sigma_in, - const size_t new_len) { +inline std::vector log_linear_interpolation(std::vector sigma_in, + const size_t new_len) { const size_t s_len = sigma_in.size(); std::vector x_vals = linear_space(0.f, 1.f, s_len); std::vector y_vals(s_len); @@ -478,13 +479,16 @@ struct KLOptimalScheduler : SigmaScheduler { }; struct Denoiser { - virtual 
float sigma_min() = 0; - virtual float sigma_max() = 0; - virtual float sigma_to_t(float sigma) = 0; - virtual float t_to_sigma(float t) = 0; - virtual std::vector get_scalings(float sigma) = 0; - virtual ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) = 0; - virtual ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) = 0; + virtual float sigma_min() = 0; + virtual float sigma_max() = 0; + virtual float sigma_to_t(float sigma) = 0; + virtual float t_to_sigma(float t) = 0; + virtual std::vector get_scalings(float sigma) = 0; + virtual sd::Tensor noise_scaling(float sigma, + const sd::Tensor& noise, + const sd::Tensor& latent) = 0; + virtual sd::Tensor inverse_noise_scaling(float sigma, + const sd::Tensor& latent) = 0; virtual std::vector get_sigmas(uint32_t n, int /*image_seq_len*/, scheduler_t scheduler_type, SDVersion version) { auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1); @@ -598,14 +602,15 @@ struct CompVisDenoiser : public Denoiser { return {c_skip, c_out, c_in}; } - // this function will modify noise/latent - ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override { - ggml_ext_tensor_scale_inplace(noise, sigma); - ggml_ext_tensor_add_inplace(latent, noise); - return latent; + virtual sd::Tensor noise_scaling(float sigma, + const sd::Tensor& noise, + const sd::Tensor& latent) override { + GGML_ASSERT(noise.numel() == latent.numel()); + return latent + noise * sigma; } - ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override { + sd::Tensor inverse_noise_scaling(float sigma, const sd::Tensor& latent) override { + SD_UNUSED(sigma); return latent; } }; @@ -644,7 +649,7 @@ struct EDMVDenoiser : public CompVisVDenoiser { } }; -float time_snr_shift(float alpha, float t) { +inline float time_snr_shift(float alpha, float t) { if (alpha == 1.0f) { return t; } @@ -696,21 +701,18 @@ struct DiscreteFlowDenoiser : public 
Denoiser { return {c_skip, c_out, c_in}; } - // this function will modify noise/latent - ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override { - ggml_ext_tensor_scale_inplace(noise, sigma); - ggml_ext_tensor_scale_inplace(latent, 1.0f - sigma); - ggml_ext_tensor_add_inplace(latent, noise); - return latent; + sd::Tensor noise_scaling(float sigma, + const sd::Tensor& noise, + const sd::Tensor& latent) override { + GGML_ASSERT(noise.numel() == latent.numel()); + return latent * (1.0f - sigma) + noise * sigma; } - - ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override { - ggml_ext_tensor_scale_inplace(latent, 1.0f / (1.0f - sigma)); - return latent; + sd::Tensor inverse_noise_scaling(float sigma, const sd::Tensor& latent) override { + return latent * (1.0f / (1.0f - sigma)); } }; -float flux_time_shift(float mu, float sigma, float t) { +inline float flux_time_shift(float mu, float sigma, float t) { return ::expf(mu) / (::expf(mu) + ::powf((1.0f / t - 1.0f), sigma)); } @@ -759,938 +761,289 @@ struct Flux2FlowDenoiser : public FluxFlowDenoiser { } }; -typedef std::function denoise_cb_t; +typedef std::function(const sd::Tensor&, float, int)> denoise_cb_t; // k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t -static bool sample_k_diffusion(sample_method_t method, - denoise_cb_t model, - ggml_context* work_ctx, - ggml_tensor* x, - std::vector sigmas, - std::shared_ptr rng, - float eta) { +static sd::Tensor sample_k_diffusion(sample_method_t method, + denoise_cb_t model, + sd::Tensor x, + std::vector sigmas, + std::shared_ptr rng, + float eta) { size_t steps = sigmas.size() - 1; - // sample_euler_ancestral switch (method) { case EULER_A_SAMPLE_METHOD: { - ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - for (int i = 0; i < steps; i++) { - float sigma = sigmas[i]; - - // denoise - ggml_tensor* denoised = model(x, sigma, i + 1); - if 
(denoised == nullptr) { - return false; + float sigma = sigmas[i]; + auto denoised_opt = model(x, sigma, i + 1); + if (denoised_opt.empty()) { + return {}; } - - // d = (x - denoised) / sigma - { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - - for (int i = 0; i < ggml_nelements(d); i++) { - vec_d[i] = (vec_x[i] - vec_denoised[i]) / sigma; - } - } - - // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i]))); - float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); - - // Euler method - float dt = sigma_down - sigmas[i]; - // x = x + d * dt - { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - - for (int i = 0; i < ggml_nelements(x); i++) { - vec_x[i] = vec_x[i] + vec_d[i] * dt; - } - } - + sd::Tensor denoised = std::move(denoised_opt); + sd::Tensor d = (x - denoised) / sigma; + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i]))); + float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); + float dt = sigma_down - sigmas[i]; + x += d * dt; if (sigmas[i + 1] > 0) { - // x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up - ggml_ext_im_set_randn_f32(noise, rng); - // noise = load_tensor_from_file(work_ctx, "./rand" + std::to_string(i+1) + ".bin"); - { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; - - for (int i = 0; i < ggml_nelements(x); i++) { - vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; - } - } + x += sd::Tensor::randn_like(x, rng) * sigma_up; } } - } break; - case EULER_SAMPLE_METHOD: // Implemented without any sigma churn - { - ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - + return x; + } + case EULER_SAMPLE_METHOD: { 
for (int i = 0; i < steps; i++) { - float sigma = sigmas[i]; - - // denoise - ggml_tensor* denoised = model(x, sigma, i + 1); - if (denoised == nullptr) { - return false; - } - - // d = (x - denoised) / sigma - { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - - for (int j = 0; j < ggml_nelements(d); j++) { - vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma; - } - } - - float dt = sigmas[i + 1] - sigma; - // x = x + d * dt - { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_x[j] + vec_d[j] * dt; - } + float sigma = sigmas[i]; + auto denoised_opt = model(x, sigma, i + 1); + if (denoised_opt.empty()) { + return {}; } + sd::Tensor denoised = std::move(denoised_opt); + sd::Tensor d = (x - denoised) / sigma; + float dt = sigmas[i + 1] - sigma; + x += d * dt; } - } break; + return x; + } case HEUN_SAMPLE_METHOD: { - ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); - for (int i = 0; i < steps; i++) { - // denoise - ggml_tensor* denoised = model(x, sigmas[i], -(i + 1)); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigmas[i], -(i + 1)); + if (denoised_opt.empty()) { + return {}; } - - // d = (x - denoised) / sigma - { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; - } - } - - float dt = sigmas[i + 1] - sigmas[i]; + sd::Tensor denoised = std::move(denoised_opt); + sd::Tensor d = (x - denoised) / sigmas[i]; + float dt = sigmas[i + 1] - sigmas[i]; if (sigmas[i + 1] == 0) { - // Euler step - // x = x + d * dt - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_x[j] + vec_d[j] * dt; - } + x += d 
* dt; } else { - // Heun step - float* vec_d = (float*)d->data; - float* vec_d2 = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x2[j] = vec_x[j] + vec_d[j] * dt; - } - - ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1); - if (denoised == nullptr) { - return false; - } - float* vec_denoised = (float*)denoised->data; - for (int j = 0; j < ggml_nelements(x); j++) { - float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1]; - vec_d[j] = (vec_d[j] + d2) / 2; - vec_x[j] = vec_x[j] + vec_d[j] * dt; + sd::Tensor x2 = x + d * dt; + auto denoised2_opt = model(x2, sigmas[i + 1], i + 1); + if (denoised2_opt.empty()) { + return {}; } + sd::Tensor denoised2 = std::move(denoised2_opt); + d = (d + (x2 - denoised2) / sigmas[i + 1]) / 2.0f; + x += d * dt; } } - } break; + return x; + } case DPM2_SAMPLE_METHOD: { - ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); - for (int i = 0; i < steps; i++) { - // denoise - ggml_tensor* denoised = model(x, sigmas[i], -(i + 1)); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigmas[i], -(i + 1)); + if (denoised_opt.empty()) { + return {}; } - - // d = (x - denoised) / sigma - { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; - } - } - + sd::Tensor denoised = std::move(denoised_opt); + sd::Tensor d = (x - denoised) / sigmas[i]; if (sigmas[i + 1] == 0) { - // Euler step - // x = x + d * dt - float dt = sigmas[i + 1] - sigmas[i]; - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_x[j] + vec_d[j] * dt; - } + float dt = sigmas[i + 1] - sigmas[i]; + x += d * dt; } else { - // DPM-Solver-2 - float sigma_mid = exp(0.5f * 
(log(sigmas[i]) + log(sigmas[i + 1]))); - float dt_1 = sigma_mid - sigmas[i]; - float dt_2 = sigmas[i + 1] - sigmas[i]; - - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x2[j] = vec_x[j] + vec_d[j] * dt_1; - } - - ggml_tensor* denoised = model(x2, sigma_mid, i + 1); - if (denoised == nullptr) { - return false; - } - float* vec_denoised = (float*)denoised->data; - for (int j = 0; j < ggml_nelements(x); j++) { - float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid; - vec_x[j] = vec_x[j] + d2 * dt_2; + float sigma_mid = exp(0.5f * (log(sigmas[i]) + log(sigmas[i + 1]))); + float dt_1 = sigma_mid - sigmas[i]; + float dt_2 = sigmas[i + 1] - sigmas[i]; + sd::Tensor x2 = x + d * dt_1; + auto denoised2_opt = model(x2, sigma_mid, i + 1); + if (denoised2_opt.empty()) { + return {}; } + sd::Tensor denoised2 = std::move(denoised2_opt); + x += ((x2 - denoised2) / sigma_mid) * dt_2; } } - - } break; + return x; + } case DPMPP2S_A_SAMPLE_METHOD: { - ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); - for (int i = 0; i < steps; i++) { - // denoise - ggml_tensor* denoised = model(x, sigmas[i], -(i + 1)); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigmas[i], -(i + 1)); + if (denoised_opt.empty()) { + return {}; } - - // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i]))); - float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); - auto t_fn = [](float sigma) -> float { return -log(sigma); }; - auto sigma_fn = [](float t) -> float { return exp(-t); }; + sd::Tensor denoised = std::move(denoised_opt); + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * 
sigmas[i + 1]) / (sigmas[i] * sigmas[i]))); + float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); + auto t_fn = [](float sigma) -> float { return -log(sigma); }; + auto sigma_fn = [](float t) -> float { return exp(-t); }; if (sigma_down == 0) { - // d = (x - denoised) / sigmas[i]; - // dt = sigma_down - sigmas[i]; - // x += d * dt; - // => x = denoised - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_denoised[j]; - } + x = denoised; } else { - // DPM-Solver++(2S) - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigma_down); - float h = t_next - t; - float s = t + 0.5f * h; - - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; - float* vec_denoised = (float*)denoised->data; - - // First half-step - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x2[j] = (sigma_fn(s) / sigma_fn(t)) * vec_x[j] - (exp(-h * 0.5f) - 1) * vec_denoised[j]; - } - - ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1); - if (denoised == nullptr) { - return false; - } - - // Second half-step - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = (sigma_fn(t_next) / sigma_fn(t)) * vec_x[j] - (exp(-h) - 1) * vec_denoised[j]; + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigma_down); + float h = t_next - t; + float s = t + 0.5f * h; + sd::Tensor x2 = (sigma_fn(s) / sigma_fn(t)) * x - (exp(-h * 0.5f) - 1) * denoised; + auto denoised2_opt = model(x2, sigmas[i + 1], i + 1); + if (denoised2_opt.empty()) { + return {}; } + sd::Tensor denoised2 = std::move(denoised2_opt); + x = (sigma_fn(t_next) / sigma_fn(t)) * (x) - (exp(-h) - 1) * denoised2; } - // Noise addition if (sigmas[i + 1] > 0) { - ggml_ext_im_set_randn_f32(noise, rng); - { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; - - for (int i = 0; i < ggml_nelements(x); i++) { - vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; - } - } + x += 
sd::Tensor::randn_like(x, rng) * sigma_up; } } - } break; - case DPMPP2M_SAMPLE_METHOD: // DPM++ (2M) from Karras et al (2022) - { - ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); - - auto t_fn = [](float sigma) -> float { return -log(sigma); }; - + return x; + } + case DPMPP2M_SAMPLE_METHOD: { + sd::Tensor old_denoised = x; + auto t_fn = [](float sigma) -> float { return -log(sigma); }; for (int i = 0; i < steps; i++) { - // denoise - ggml_tensor* denoised = model(x, sigmas[i], i + 1); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigmas[i], i + 1); + if (denoised_opt.empty()) { + return {}; } - - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float b = exp(-h) - 1.f; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - float* vec_old_denoised = (float*)old_denoised->data; + sd::Tensor denoised = std::move(denoised_opt); + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float b = exp(-h) - 1.f; if (i == 0 || sigmas[i + 1] == 0) { - // Simpler step for the edge cases - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = a * vec_x[j] - b * vec_denoised[j]; - } + x = a * (x)-b * denoised; } else { - float h_last = t - t_fn(sigmas[i - 1]); - float r = h_last / h; - for (int j = 0; j < ggml_nelements(x); j++) { - float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; - vec_x[j] = a * vec_x[j] - b * denoised_d; - } - } - - // old_denoised = denoised - for (int j = 0; j < ggml_nelements(x); j++) { - vec_old_denoised[j] = vec_denoised[j]; + float h_last = t - t_fn(sigmas[i - 1]); + float r = h_last / h; + sd::Tensor denoised_d = (1.f + 1.f / (2.f * r)) * denoised - (1.f / (2.f * r)) * old_denoised; + x = a * (x)-b * denoised_d; } + old_denoised = denoised; } - } break; - case 
DPMPP2Mv2_SAMPLE_METHOD: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457 - { - ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); - - auto t_fn = [](float sigma) -> float { return -log(sigma); }; - + return x; + } + case DPMPP2Mv2_SAMPLE_METHOD: { + sd::Tensor old_denoised = x; + auto t_fn = [](float sigma) -> float { return -log(sigma); }; for (int i = 0; i < steps; i++) { - // denoise - ggml_tensor* denoised = model(x, sigmas[i], i + 1); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigmas[i], i + 1); + if (denoised_opt.empty()) { + return {}; } - - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - float* vec_old_denoised = (float*)old_denoised->data; - + sd::Tensor denoised = std::move(denoised_opt); + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; if (i == 0 || sigmas[i + 1] == 0) { - // Simpler step for the edge cases float b = exp(-h) - 1.f; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = a * vec_x[j] - b * vec_denoised[j]; - } + x = a * (x)-b * denoised; } else { - float h_last = t - t_fn(sigmas[i - 1]); - float h_min = std::min(h_last, h); - float h_max = std::max(h_last, h); - float r = h_max / h_min; - float h_d = (h_max + h_min) / 2.f; - float b = exp(-h_d) - 1.f; - for (int j = 0; j < ggml_nelements(x); j++) { - float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; - vec_x[j] = a * vec_x[j] - b * denoised_d; - } + float h_last = t - t_fn(sigmas[i - 1]); + float h_min = std::min(h_last, h); + float h_max = std::max(h_last, h); + float r = h_max / h_min; + float h_d = (h_max + h_min) / 2.f; + float b = exp(-h_d) - 1.f; + sd::Tensor denoised_d = (1.f + 1.f / (2.f * 
r)) * denoised - (1.f / (2.f * r)) * old_denoised; + x = a * (x)-b * denoised_d; } + old_denoised = denoised; + } + return x; + } + case LCM_SAMPLE_METHOD: { + for (int i = 0; i < steps; i++) { + auto denoised_opt = model(x, sigmas[i], i + 1); + if (denoised_opt.empty()) { + return {}; + } + sd::Tensor denoised = std::move(denoised_opt); - // old_denoised = denoised - for (int j = 0; j < ggml_nelements(x); j++) { - vec_old_denoised[j] = vec_denoised[j]; + x = denoised; + if (sigmas[i + 1] > 0) { + x += sd::Tensor::randn_like(x, rng) * sigmas[i + 1]; } } - } break; - case IPNDM_SAMPLE_METHOD: // iPNDM sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main - { - int max_order = 4; - ggml_tensor* x_next = x; - std::vector buffer_model; - + return x; + } + case IPNDM_SAMPLE_METHOD: { + int max_order = 4; + std::vector> hist = {}; for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; float sigma_next = sigmas[i + 1]; - ggml_tensor* x_cur = x_next; - float* vec_x_cur = (float*)x_cur->data; - float* vec_x_next = (float*)x_next->data; - - // Denoising step - ggml_tensor* denoised = model(x_cur, sigma, i + 1); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigma, i + 1); + if (denoised_opt.empty()) { + return {}; } - float* vec_denoised = (float*)denoised->data; - // d_cur = (x_cur - denoised) / sigma - ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x_cur); - float* vec_d_cur = (float*)d_cur->data; + sd::Tensor denoised = std::move(denoised_opt); - for (int j = 0; j < ggml_nelements(d_cur); j++) { - vec_d_cur[j] = (vec_x_cur[j] - vec_denoised[j]) / sigma; - } + sd::Tensor d_cur = (x - denoised) / sigma; + int order = std::min(max_order, i + 1); + float dt = sigma_next - sigma; - int order = std::min(max_order, i + 1); - - // Calculate vec_x_next based on the order switch (order) { - case 1: // First Euler step - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) 
* vec_d_cur[j]; - } + case 1: + x += d_cur * dt; + break; + case 2: + x += ((3.f * d_cur - hist.back()) / 2.f) * dt; + break; + case 3: + x += ((23.f * d_cur - 16.f * hist[hist.size() - 1] + 5.f * hist[hist.size() - 2]) / 12.f) * dt; + break; + case 4: + x += ((55.f * d_cur - 59.f * hist[hist.size() - 1] + 37.f * hist[hist.size() - 2] - 9.f * hist[hist.size() - 3]) / 24.f) * dt; break; - - case 2: // Use one history point - { - float* vec_d_prev1 = (float*)buffer_model.back()->data; - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * (3 * vec_d_cur[j] - vec_d_prev1[j]) / 2; - } - } break; - - case 3: // Use two history points - { - float* vec_d_prev1 = (float*)buffer_model.back()->data; - float* vec_d_prev2 = (float*)buffer_model[buffer_model.size() - 2]->data; - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * (23 * vec_d_cur[j] - 16 * vec_d_prev1[j] + 5 * vec_d_prev2[j]) / 12; - } - } break; - - case 4: // Use three history points - { - float* vec_d_prev1 = (float*)buffer_model.back()->data; - float* vec_d_prev2 = (float*)buffer_model[buffer_model.size() - 2]->data; - float* vec_d_prev3 = (float*)buffer_model[buffer_model.size() - 3]->data; - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * (55 * vec_d_cur[j] - 59 * vec_d_prev1[j] + 37 * vec_d_prev2[j] - 9 * vec_d_prev3[j]) / 24; - } - } break; } - // Manage buffer_model - if (buffer_model.size() == max_order - 1) { - // Shift elements to the left - for (int k = 0; k < max_order - 2; k++) { - buffer_model[k] = buffer_model[k + 1]; - } - buffer_model.back() = d_cur; // Replace the last element with d_cur - } else { - buffer_model.push_back(d_cur); + if (hist.size() == static_cast(max_order - 1)) { + hist.erase(hist.begin()); } + hist.push_back(std::move(d_cur)); } - } break; - case IPNDM_V_SAMPLE_METHOD: // iPNDM_v sampler from 
https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main - { - int max_order = 4; - std::vector buffer_model; - ggml_tensor* x_next = x; - + return x; + } + case IPNDM_V_SAMPLE_METHOD: { + int max_order = 4; + std::vector> hist = {}; for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; float t_next = sigmas[i + 1]; - // Denoising step - ggml_tensor* denoised = model(x, sigma, i + 1); - float* vec_denoised = (float*)denoised->data; - ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x); - float* vec_d_cur = (float*)d_cur->data; - float* vec_x = (float*)x->data; - - // d_cur = (x - denoised) / sigma - for (int j = 0; j < ggml_nelements(d_cur); j++) { - vec_d_cur[j] = (vec_x[j] - vec_denoised[j]) / sigma; + auto denoised_opt = model(x, sigma, i + 1); + if (denoised_opt.empty()) { + return {}; } + sd::Tensor denoised = std::move(denoised_opt); - int order = std::min(max_order, i + 1); - float h_n = t_next - sigma; - float h_n_1 = (i > 0) ? (sigma - sigmas[i - 1]) : h_n; + sd::Tensor d_cur = (x - denoised) / sigma; + int order = std::min(max_order, i + 1); + float h_n = t_next - sigma; + float h_n_1 = (i > 0) ? (sigma - sigmas[i - 1]) : h_n; switch (order) { - case 1: // First Euler step - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x[j] += vec_d_cur[j] * h_n; - } + case 1: + x += d_cur * h_n; break; - - case 2: { - float* vec_d_prev1 = (float*)buffer_model.back()->data; - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x[j] += h_n * ((2 + (h_n / h_n_1)) * vec_d_cur[j] - (h_n / h_n_1) * vec_d_prev1[j]) / 2; - } + case 2: + x += (((2.f + (h_n / h_n_1)) * d_cur - (h_n / h_n_1) * hist.back()) / 2.f) * h_n; break; - } - - case 3: { - float h_n_2 = (i > 1) ? (sigmas[i - 1] - sigmas[i - 2]) : h_n_1; - float* vec_d_prev1 = (float*)buffer_model.back()->data; - float* vec_d_prev2 = (buffer_model.size() > 1) ? 
(float*)buffer_model[buffer_model.size() - 2]->data : vec_d_prev1; - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x[j] += h_n * ((23 * vec_d_cur[j] - 16 * vec_d_prev1[j] + 5 * vec_d_prev2[j]) / 12); - } + case 3: + x += ((23.f * d_cur - 16.f * hist[hist.size() - 1] + 5.f * hist[hist.size() - 2]) / 12.f) * h_n; break; - } - - case 4: { - float h_n_2 = (i > 1) ? (sigmas[i - 1] - sigmas[i - 2]) : h_n_1; - float h_n_3 = (i > 2) ? (sigmas[i - 2] - sigmas[i - 3]) : h_n_2; - float* vec_d_prev1 = (float*)buffer_model.back()->data; - float* vec_d_prev2 = (buffer_model.size() > 1) ? (float*)buffer_model[buffer_model.size() - 2]->data : vec_d_prev1; - float* vec_d_prev3 = (buffer_model.size() > 2) ? (float*)buffer_model[buffer_model.size() - 3]->data : vec_d_prev2; - for (int j = 0; j < ggml_nelements(x_next); j++) { - vec_x[j] += h_n * ((55 * vec_d_cur[j] - 59 * vec_d_prev1[j] + 37 * vec_d_prev2[j] - 9 * vec_d_prev3[j]) / 24); - } + case 4: + x += ((55.f * d_cur - 59.f * hist[hist.size() - 1] + 37.f * hist[hist.size() - 2] - 9.f * hist[hist.size() - 3]) / 24.f) * h_n; break; - } } - // Manage buffer_model - if (buffer_model.size() == max_order - 1) { - buffer_model.erase(buffer_model.begin()); + if (hist.size() == static_cast(max_order - 1)) { + hist.erase(hist.begin()); } - buffer_model.push_back(d_cur); - - // Prepare the next d tensor - d_cur = ggml_dup_tensor(work_ctx, x_next); + hist.push_back(std::move(d_cur)); } - } break; - case LCM_SAMPLE_METHOD: // Latent Consistency Models - { - ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - - for (int i = 0; i < steps; i++) { - float sigma = sigmas[i]; - - // denoise - ggml_tensor* denoised = model(x, sigma, i + 1); - if (denoised == nullptr) { - return false; - } - - // x = denoised - { - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_denoised[j]; - } - } - - if 
(sigmas[i + 1] > 0) { - // x += sigmas[i + 1] * noise_sampler(sigmas[i], sigmas[i + 1]) - ggml_ext_im_set_randn_f32(noise, rng); - // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin"); - { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_x[j] + sigmas[i + 1] * vec_noise[j]; - } - } - } - } - } break; - case DDIM_TRAILING_SAMPLE_METHOD: // Denoising Diffusion Implicit Models - // with the "trailing" timestep spacing - { - // See J. Song et al., "Denoising Diffusion Implicit - // Models", arXiv:2010.02502 [cs.LG] - // - // DDIM itself needs alphas_cumprod (DDPM, J. Ho et al., - // arXiv:2006.11239 [cs.LG] with k-diffusion's start and - // end beta) (which unfortunately k-diffusion's data - // structure hides from the denoiser), and the sigmas are - // also needed to invert the behavior of CompVisDenoiser - // (k-diffusion's LMSDiscreteSchedulerr) - float beta_start = 0.00085f; - float beta_end = 0.0120f; - std::vector alphas_cumprod; - std::vector compvis_sigmas; - - alphas_cumprod.reserve(TIMESTEPS); - compvis_sigmas.reserve(TIMESTEPS); - for (int i = 0; i < TIMESTEPS; i++) { - alphas_cumprod[i] = - (i == 0 ? 1.0f : alphas_cumprod[i - 1]) * - (1.0f - - std::pow(sqrtf(beta_start) + - (sqrtf(beta_end) - sqrtf(beta_start)) * - ((float)i / (TIMESTEPS - 1)), - 2)); - compvis_sigmas[i] = - std::sqrt((1 - alphas_cumprod[i]) / - alphas_cumprod[i]); - } - - ggml_tensor* pred_original_sample = - ggml_dup_tensor(work_ctx, x); - ggml_tensor* variance_noise = - ggml_dup_tensor(work_ctx, x); - - for (int i = 0; i < steps; i++) { - // The "trailing" DDIM timestep, see S. Lin et al., - // "Common Diffusion Noise Schedulers and Sample Steps - // are Flawed", arXiv:2305.08891 [cs], p. 4, Table - // 2. Most variables below follow Diffusers naming - // - // Diffuser naming vs. Song et al. (2010), p. 5, (12) - // and p. 
16, (16) ( -> ): - // - // - pred_noise_t -> epsilon_theta^(t)(x_t) - // - pred_original_sample -> f_theta^(t)(x_t) or x_0 - // - std_dev_t -> sigma_t (not the LMS sigma) - // - eta -> eta (set to 0 at the moment) - // - pred_sample_direction -> "direction pointing to - // x_t" - // - pred_prev_sample -> "x_t-1" - int timestep = static_cast(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1; - // 1. get previous step value (=t-1) - int prev_timestep = timestep - TIMESTEPS / static_cast(steps); - // The sigma here is chosen to cause the - // CompVisDenoiser to produce t = timestep - float sigma = static_cast(compvis_sigmas[timestep]); - if (i == 0) { - // The function add_noise intializes x to - // Diffusers' latents * sigma (as in Diffusers' - // pipeline) or sample * sigma (Diffusers' - // scheduler), where this sigma = init_noise_sigma - // in Diffusers. For DDPM and DDIM however, - // init_noise_sigma = 1. But the k-diffusion - // model() also evaluates F_theta(c_in(sigma) x; - // ...) instead of the bare U-net F_theta, with - // c_in = 1 / sqrt(sigma^2 + 1), as defined in - // T. Karras et al., "Elucidating the Design Space - // of Diffusion-Based Generative Models", - // arXiv:2206.00364 [cs.CV], p. 3, Table 1. Hence - // the first call has to be prescaled as x <- x / - // (c_in * sigma) with the k-diffusion pipeline - // and CompVisDenoiser. 
- float* vec_x = (float*)x->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] *= std::sqrt(sigma * sigma + 1) / - sigma; - } - } else { - // For the subsequent steps after the first one, - // at this point x = latents or x = sample, and - // needs to be prescaled with x <- sample / c_in - // to compensate for model() applying the scale - // c_in before the U-net F_theta - float* vec_x = (float*)x->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] *= std::sqrt(sigma * sigma + 1); - } - } - // Note (also noise_pred in Diffuser's pipeline) - // model_output = model() is the D(x, sigma) as - // defined in Karras et al. (2022), p. 3, Table 1 and - // p. 8 (7), compare also p. 38 (226) therein. - ggml_tensor* model_output = - model(x, sigma, i + 1); - // Here model_output is still the k-diffusion denoiser - // output, not the U-net output F_theta(c_in(sigma) x; - // ...) in Karras et al. (2022), whereas Diffusers' - // model_output is F_theta(...). Recover the actual - // model_output, which is also referred to as the - // "Karras ODE derivative" d or d_cur in several - // samplers above. - { - float* vec_x = (float*)x->data; - float* vec_model_output = - (float*)model_output->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_model_output[j] = - (vec_x[j] - vec_model_output[j]) * - (1 / sigma); - } - } - // 2. compute alphas, betas - float alpha_prod_t = static_cast(alphas_cumprod[timestep]); - // Note final_alpha_cumprod = alphas_cumprod[0] due to - // trailing timestep spacing - float alpha_prod_t_prev = static_cast(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]); - float beta_prod_t = 1 - alpha_prod_t; - // 3. 
compute predicted original sample from predicted - // noise also called "predicted x_0" of formula (12) - // from https://arxiv.org/pdf/2010.02502.pdf - { - float* vec_x = (float*)x->data; - float* vec_model_output = - (float*)model_output->data; - float* vec_pred_original_sample = - (float*)pred_original_sample->data; - // Note the substitution of latents or sample = x - // * c_in = x / sqrt(sigma^2 + 1) - for (int j = 0; j < ggml_nelements(x); j++) { - vec_pred_original_sample[j] = - (vec_x[j] / std::sqrt(sigma * sigma + 1) - - std::sqrt(beta_prod_t) * - vec_model_output[j]) * - (1 / std::sqrt(alpha_prod_t)); - } - } - // Assuming the "epsilon" prediction type, where below - // pred_epsilon = model_output is inserted, and is not - // defined/copied explicitly. - // - // 5. compute variance: "sigma_t(eta)" -> see formula - // (16) - // - // sigma_t = sqrt((1 - alpha_t-1)/(1 - alpha_t)) * - // sqrt(1 - alpha_t/alpha_t-1) - float beta_prod_t_prev = 1 - alpha_prod_t_prev; - float variance = (beta_prod_t_prev / beta_prod_t) * - (1 - alpha_prod_t / alpha_prod_t_prev); - float std_dev_t = eta * std::sqrt(variance); - // 6. compute "direction pointing to x_t" of formula - // (12) from https://arxiv.org/pdf/2010.02502.pdf - // 7. 
compute x_t without "random noise" of formula - // (12) from https://arxiv.org/pdf/2010.02502.pdf - { - float* vec_model_output = (float*)model_output->data; - float* vec_pred_original_sample = - (float*)pred_original_sample->data; - float* vec_x = (float*)x->data; - for (int j = 0; j < ggml_nelements(x); j++) { - // Two step inner loop without an explicit - // tensor - float pred_sample_direction = - ::sqrtf(1 - alpha_prod_t_prev - - ::powf(std_dev_t, 2)) * - vec_model_output[j]; - vec_x[j] = std::sqrt(alpha_prod_t_prev) * - vec_pred_original_sample[j] + - pred_sample_direction; - } - } - if (eta > 0) { - ggml_ext_im_set_randn_f32(variance_noise, rng); - float* vec_variance_noise = - (float*)variance_noise->data; - float* vec_x = (float*)x->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] += std_dev_t * vec_variance_noise[j]; - } - } - // See the note above: x = latents or sample here, and - // is not scaled by the c_in. For the final output - // this is correct, but for subsequent iterations, x - // needs to be prescaled again, since k-diffusion's - // model() differes from the bare U-net F_theta by the - // factor c_in. - } - } break; - case TCD_SAMPLE_METHOD: // Strategic Stochastic Sampling (Algorithm 4) in - // Trajectory Consistency Distillation - { - // See J. Zheng et al., "Trajectory Consistency - // Distillation: Improved Latent Consistency Distillation - // by Semi-Linear Consistency Function with Trajectory - // Mapping", arXiv:2402.19159 [cs.CV] - float beta_start = 0.00085f; - float beta_end = 0.0120f; - std::vector alphas_cumprod; - std::vector compvis_sigmas; - - alphas_cumprod.reserve(TIMESTEPS); - compvis_sigmas.reserve(TIMESTEPS); - for (int i = 0; i < TIMESTEPS; i++) { - alphas_cumprod[i] = - (i == 0 ? 
1.0f : alphas_cumprod[i - 1]) * - (1.0f - - std::pow(sqrtf(beta_start) + - (sqrtf(beta_end) - sqrtf(beta_start)) * - ((float)i / (TIMESTEPS - 1)), - 2)); - compvis_sigmas[i] = - std::sqrt((1 - alphas_cumprod[i]) / - alphas_cumprod[i]); - } - int original_steps = 50; - - ggml_tensor* pred_original_sample = - ggml_dup_tensor(work_ctx, x); - ggml_tensor* noise = - ggml_dup_tensor(work_ctx, x); - - for (int i = 0; i < steps; i++) { - // Analytic form for TCD timesteps - int timestep = TIMESTEPS - 1 - - (TIMESTEPS / original_steps) * - (int)floor(i * ((float)original_steps / steps)); - // 1. get previous step value - int prev_timestep = i >= steps - 1 ? 0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps)); - // Here timestep_s is tau_n' in Algorithm 4. The _s - // notation appears to be that from C. Lu, - // "DPM-Solver: A Fast ODE Solver for Diffusion - // Probabilistic Model Sampling in Around 10 Steps", - // arXiv:2206.00927 [cs.LG], but this notation is not - // continued in Algorithm 4, where _n' is used. - int timestep_s = - (int)floor((1 - eta) * prev_timestep); - // Begin k-diffusion specific workaround for - // evaluating F_theta(x; ...) from D(x, sigma), same - // as in DDIM (and see there for detailed comments) - float sigma = static_cast(compvis_sigmas[timestep]); - if (i == 0) { - float* vec_x = (float*)x->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] *= std::sqrt(sigma * sigma + 1) / - sigma; - } - } else { - float* vec_x = (float*)x->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] *= std::sqrt(sigma * sigma + 1); - } - } - ggml_tensor* model_output = - model(x, sigma, i + 1); - { - float* vec_x = (float*)x->data; - float* vec_model_output = - (float*)model_output->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_model_output[j] = - (vec_x[j] - vec_model_output[j]) * - (1 / sigma); - } - } - // 2. 
compute alphas, betas - // - // When comparing TCD with DDPM/DDIM note that Zheng - // et al. (2024) follows the DPM-Solver notation for - // alpha. One can find the following comment in the - // original DPM-Solver code - // (https://github.com/LuChengTHU/dpm-solver/): - // "**Important**: Please pay special attention for - // the args for `alphas_cumprod`: The `alphas_cumprod` - // is the \hat{alpha_n} arrays in the notations of - // DDPM. [...] Therefore, the notation \hat{alpha_n} - // is different from the notation alpha_t in - // DPM-Solver. In fact, we have alpha_{t_n} = - // \sqrt{\hat{alpha_n}}, [...]" - float alpha_prod_t = static_cast(alphas_cumprod[timestep]); - float beta_prod_t = 1 - alpha_prod_t; - // Note final_alpha_cumprod = alphas_cumprod[0] since - // TCD is always "trailing" - float alpha_prod_t_prev = static_cast(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]); - // The subscript _s are the only portion in this - // section (2) unique to TCD - float alpha_prod_s = static_cast(alphas_cumprod[timestep_s]); - float beta_prod_s = 1 - alpha_prod_s; - // 3. Compute the predicted noised sample x_s based on - // the model parameterization - // - // This section is also exactly the same as DDIM - { - float* vec_x = (float*)x->data; - float* vec_model_output = - (float*)model_output->data; - float* vec_pred_original_sample = - (float*)pred_original_sample->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_pred_original_sample[j] = - (vec_x[j] / std::sqrt(sigma * sigma + 1) - - std::sqrt(beta_prod_t) * - vec_model_output[j]) * - (1 / std::sqrt(alpha_prod_t)); - } - } - // This consistency function step can be difficult to - // decipher from Algorithm 4, as it is simply stated - // using a consistency function. This step is the - // modified DDIM, i.e. p. 8 (32) in Zheng et - // al. (2024), with eta set to 0 (see the paragraph - // immediately thereafter that states this somewhat - // obliquely). 
- { - float* vec_pred_original_sample = - (float*)pred_original_sample->data; - float* vec_model_output = - (float*)model_output->data; - float* vec_x = (float*)x->data; - for (int j = 0; j < ggml_nelements(x); j++) { - // Substituting x = pred_noised_sample and - // pred_epsilon = model_output - vec_x[j] = - std::sqrt(alpha_prod_s) * - vec_pred_original_sample[j] + - std::sqrt(beta_prod_s) * - vec_model_output[j]; - } - } - // 4. Sample and inject noise z ~ N(0, I) for - // MultiStep Inference Noise is not used on the final - // timestep of the timestep schedule. This also means - // that noise is not used for one-step sampling. Eta - // (referred to as "gamma" in the paper) was - // introduced to control the stochasticity in every - // step. When eta = 0, it represents deterministic - // sampling, whereas eta = 1 indicates full stochastic - // sampling. - if (eta > 0 && i != steps - 1) { - // In this case, x is still pred_noised_sample, - // continue in-place - ggml_ext_im_set_randn_f32(noise, rng); - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; - for (int j = 0; j < ggml_nelements(x); j++) { - // Corresponding to (35) in Zheng et - // al. 
(2024), substituting x = - // pred_noised_sample - vec_x[j] = - std::sqrt(alpha_prod_t_prev / - alpha_prod_s) * - vec_x[j] + - std::sqrt(1 - alpha_prod_t_prev / - alpha_prod_s) * - vec_noise[j]; - } - } - } - } break; - case RES_MULTISTEP_SAMPLE_METHOD: // Res Multistep sampler - { - ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); + return x; + } + case RES_MULTISTEP_SAMPLE_METHOD: { + sd::Tensor old_denoised = x; bool have_old_sigma = false; float old_sigma_down = 0.0f; @@ -1712,10 +1065,11 @@ static bool sample_k_diffusion(sample_method_t method, }; for (int i = 0; i < steps; i++) { - ggml_tensor* denoised = model(x, sigmas[i], i + 1); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigmas[i], i + 1); + if (denoised_opt.empty()) { + return {}; } + sd::Tensor denoised = std::move(denoised_opt); float sigma_from = sigmas[i]; float sigma_to = sigmas[i + 1]; @@ -1737,14 +1091,7 @@ static bool sample_k_diffusion(sample_method_t method, } if (sigma_down == 0.0f || !have_old_sigma) { - float dt = sigma_down - sigma_from; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - float d = (vec_x[j] - vec_denoised[j]) / sigma_from; - vec_x[j] = vec_x[j] + d * dt; - } + x += ((x - denoised) / sigma_from) * (sigma_down - sigma_from); } else { float t = t_fn(sigma_from); float t_old = t_fn(old_sigma_down); @@ -1765,42 +1112,20 @@ static bool sample_k_diffusion(sample_method_t method, b2 = 0.0f; } - float sigma_h = sigma_fn(h); - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - float* vec_old_denoised = (float*)old_denoised->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = sigma_h * vec_x[j] + h * (b1 * vec_denoised[j] + b2 * vec_old_denoised[j]); - } + x = sigma_fn(h) * (x) + h * (b1 * denoised + b2 * old_denoised); } if (sigmas[i + 1] > 0 && sigma_up > 0.0f) 
{ - ggml_ext_im_set_randn_f32(noise, rng); - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_x[j] + vec_noise[j] * sigma_up; - } - } - - float* vec_old_denoised = (float*)old_denoised->data; - float* vec_denoised = (float*)denoised->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_old_denoised[j] = vec_denoised[j]; + x += sd::Tensor::randn_like(x, rng) * sigma_up; } + old_denoised = denoised; old_sigma_down = sigma_down; have_old_sigma = true; } - } break; - case RES_2S_SAMPLE_METHOD: // Res 2s sampler - { - ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - ggml_tensor* x0 = ggml_dup_tensor(work_ctx, x); - ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); - + return x; + } + case RES_2S_SAMPLE_METHOD: { const float c2 = 0.5f; auto t_fn = [](float sigma) -> float { return -logf(sigma); }; auto phi1_fn = [](float t) -> float { @@ -1821,10 +1146,11 @@ static bool sample_k_diffusion(sample_method_t method, float sigma_from = sigmas[i]; float sigma_to = sigmas[i + 1]; - ggml_tensor* denoised = model(x, sigma_from, -(i + 1)); - if (denoised == nullptr) { - return false; + auto denoised_opt = model(x, sigma_from, -(i + 1)); + if (denoised_opt.empty()) { + return {}; } + sd::Tensor denoised = std::move(denoised_opt); float sigma_up = 0.0f; float sigma_down = sigma_to; @@ -1842,17 +1168,9 @@ static bool sample_k_diffusion(sample_method_t method, sigma_down = sigma_down_sq > 0.0f ? 
std::sqrt(sigma_down_sq) : 0.0f; } - float* vec_x = (float*)x->data; - float* vec_x0 = (float*)x0->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x0[j] = vec_x[j]; - } - + sd::Tensor x0 = x; if (sigma_down == 0.0f || sigma_from == 0.0f) { - float* vec_denoised = (float*)denoised->data; - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_denoised[j]; - } + x = denoised; } else { float t = t_fn(sigma_from); float t_next = t_fn(sigma_down); @@ -1864,45 +1182,140 @@ static bool sample_k_diffusion(sample_method_t method, float b2 = phi2_val / c2; float b1 = phi1_val - b2; - float sigma_c2 = expf(-(t + h * c2)); + float sigma_c2 = expf(-(t + h * c2)); + sd::Tensor eps1 = denoised - x0; + sd::Tensor x2 = x0 + eps1 * (h * a21); - float* vec_denoised = (float*)denoised->data; - float* vec_x2 = (float*)x2->data; - for (int j = 0; j < ggml_nelements(x); j++) { - float eps1 = vec_denoised[j] - vec_x0[j]; - vec_x2[j] = vec_x0[j] + h * a21 * eps1; - } - - ggml_tensor* denoised2 = model(x2, sigma_c2, i + 1); - if (denoised2 == nullptr) { - return false; - } - float* vec_denoised2 = (float*)denoised2->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - float eps1 = vec_denoised[j] - vec_x0[j]; - float eps2 = vec_denoised2[j] - vec_x0[j]; - vec_x[j] = vec_x0[j] + h * (b1 * eps1 + b2 * eps2); + auto denoised2_opt = model(x2, sigma_c2, i + 1); + if (denoised2_opt.empty()) { + return {}; } + sd::Tensor denoised2 = std::move(denoised2_opt); + sd::Tensor eps2 = denoised2 - x0; + x = x0 + h * (b1 * eps1 + b2 * eps2); } if (sigmas[i + 1] > 0 && sigma_up > 0.0f) { - ggml_ext_im_set_randn_f32(noise, rng); - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; - - for (int j = 0; j < ggml_nelements(x); j++) { - vec_x[j] = vec_x[j] + vec_noise[j] * sigma_up; - } + x += sd::Tensor::randn_like(x, rng) * sigma_up; } } - } break; + return x; + } + case DDIM_TRAILING_SAMPLE_METHOD: { + float beta_start = 0.00085f; + float beta_end = 0.0120f; + 
std::vector alphas_cumprod(TIMESTEPS); + std::vector compvis_sigmas(TIMESTEPS); + for (int i = 0; i < TIMESTEPS; i++) { + alphas_cumprod[i] = + (i == 0 ? 1.0f : alphas_cumprod[i - 1]) * + (1.0f - + std::pow(sqrtf(beta_start) + + (sqrtf(beta_end) - sqrtf(beta_start)) * + ((float)i / (TIMESTEPS - 1)), + 2)); + compvis_sigmas[i] = + std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]); + } + for (int i = 0; i < steps; i++) { + int timestep = static_cast(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1; + int prev_timestep = timestep - TIMESTEPS / static_cast(steps); + float sigma = static_cast(compvis_sigmas[timestep]); + if (i == 0) { + x *= std::sqrt(sigma * sigma + 1) / sigma; + } else { + x *= std::sqrt(sigma * sigma + 1); + } + + auto model_output_opt = model(x, sigma, i + 1); + if (model_output_opt.empty()) { + return {}; + } + sd::Tensor model_output = std::move(model_output_opt); + model_output = (x - model_output) * (1.0f / sigma); + + float alpha_prod_t = static_cast(alphas_cumprod[timestep]); + float alpha_prod_t_prev = static_cast(prev_timestep >= 0 ? 
alphas_cumprod[prev_timestep] : alphas_cumprod[0]); + float beta_prod_t = 1.0f - alpha_prod_t; + + sd::Tensor pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) - + std::sqrt(beta_prod_t) * model_output) * + (1.0f / std::sqrt(alpha_prod_t)); + + float beta_prod_t_prev = 1.0f - alpha_prod_t_prev; + float variance = (beta_prod_t_prev / beta_prod_t) * + (1.0f - alpha_prod_t / alpha_prod_t_prev); + float std_dev_t = eta * std::sqrt(variance); + + x = std::sqrt(alpha_prod_t_prev) * pred_original_sample + + std::sqrt(1.0f - alpha_prod_t_prev - std::pow(std_dev_t, 2)) * model_output; + + if (eta > 0) { + x += std_dev_t * sd::Tensor::randn_like(x, rng); + } + } + return x; + } + case TCD_SAMPLE_METHOD: { + float beta_start = 0.00085f; + float beta_end = 0.0120f; + std::vector alphas_cumprod(TIMESTEPS); + std::vector compvis_sigmas(TIMESTEPS); + for (int i = 0; i < TIMESTEPS; i++) { + alphas_cumprod[i] = + (i == 0 ? 1.0f : alphas_cumprod[i - 1]) * + (1.0f - + std::pow(sqrtf(beta_start) + + (sqrtf(beta_end) - sqrtf(beta_start)) * + ((float)i / (TIMESTEPS - 1)), + 2)); + compvis_sigmas[i] = + std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]); + } + int original_steps = 50; + for (int i = 0; i < steps; i++) { + int timestep = TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor(i * ((float)original_steps / steps)); + int prev_timestep = i >= steps - 1 ? 
0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps)); + int timestep_s = (int)floor((1 - eta) * prev_timestep); + float sigma = static_cast(compvis_sigmas[timestep]); + + if (i == 0) { + x *= std::sqrt(sigma * sigma + 1) / sigma; + } else { + x *= std::sqrt(sigma * sigma + 1); + } + + auto model_output_opt = model(x, sigma, i + 1); + if (model_output_opt.empty()) { + return {}; + } + sd::Tensor model_output = std::move(model_output_opt); + model_output = (x - model_output) * (1.0f / sigma); + + float alpha_prod_t = static_cast(alphas_cumprod[timestep]); + float beta_prod_t = 1.0f - alpha_prod_t; + float alpha_prod_t_prev = static_cast(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]); + float alpha_prod_s = static_cast(alphas_cumprod[timestep_s]); + float beta_prod_s = 1.0f - alpha_prod_s; + + sd::Tensor pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) - + std::sqrt(beta_prod_t) * model_output) * + (1.0f / std::sqrt(alpha_prod_t)); + + x = std::sqrt(alpha_prod_s) * pred_original_sample + + std::sqrt(beta_prod_s) * model_output; + + if (eta > 0 && i != steps - 1) { + x = std::sqrt(alpha_prod_t_prev / alpha_prod_s) * (x) + + std::sqrt(1.0f - alpha_prod_t_prev / alpha_prod_s) * sd::Tensor::randn_like(x, rng); + } + } + return x; + } default: - LOG_ERROR("Attempting to sample with nonexisting sample method %i", method); - return false; + return {}; } - return true; } #endif // __DENOISER_HPP__ diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index 07d9df89..eb0debff 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -1,37 +1,45 @@ #ifndef __DIFFUSION_MODEL_H__ #define __DIFFUSION_MODEL_H__ +#include #include "anima.hpp" #include "flux.hpp" #include "mmdit.hpp" #include "qwen_image.hpp" +#include "tensor_ggml.hpp" #include "unet.hpp" #include "wan.hpp" #include "z_image.hpp" struct DiffusionParams { - ggml_tensor* x = nullptr; - ggml_tensor* timesteps = 
nullptr; - ggml_tensor* context = nullptr; - ggml_tensor* c_concat = nullptr; - ggml_tensor* y = nullptr; - ggml_tensor* guidance = nullptr; - std::vector ref_latents = {}; - bool increase_ref_index = false; - int num_video_frames = -1; - std::vector controls = {}; - float control_strength = 0.f; - ggml_tensor* vace_context = nullptr; - float vace_strength = 1.f; - std::vector skip_layers = {}; + const sd::Tensor* x = nullptr; + const sd::Tensor* timesteps = nullptr; + const sd::Tensor* context = nullptr; + const sd::Tensor* c_concat = nullptr; + const sd::Tensor* y = nullptr; + const sd::Tensor* t5_ids = nullptr; + const sd::Tensor* t5_weights = nullptr; + const sd::Tensor* guidance = nullptr; + const std::vector>* ref_latents = nullptr; + bool increase_ref_index = false; + int num_video_frames = -1; + const std::vector>* controls = nullptr; + float control_strength = 0.f; + const sd::Tensor* vace_context = nullptr; + float vace_strength = 1.f; + const std::vector* skip_layers = nullptr; }; +template +static inline const sd::Tensor& tensor_or_empty(const sd::Tensor* tensor) { + static const sd::Tensor kEmpty; + return tensor != nullptr ? 
*tensor : kEmpty; +} + struct DiffusionModel { virtual std::string get_desc() = 0; - virtual bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) = 0; + virtual sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) = 0; virtual void alloc_params_buffer() = 0; virtual void free_params_buffer() = 0; virtual void free_compute_buffer() = 0; @@ -93,19 +101,20 @@ struct UNetModel : public DiffusionModel { unet.set_circular_axes(circular_x, circular_y); } - bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) override { + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + static const std::vector> empty_controls; return unet.compute(n_threads, - diffusion_params.x, - diffusion_params.timesteps, - diffusion_params.context, - diffusion_params.c_concat, - diffusion_params.y, + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context), + tensor_or_empty(diffusion_params.c_concat), + tensor_or_empty(diffusion_params.y), diffusion_params.num_video_frames, - diffusion_params.controls, - diffusion_params.control_strength, output, output_ctx); + diffusion_params.controls ? 
*diffusion_params.controls : empty_controls, + diffusion_params.control_strength); } }; @@ -158,18 +167,17 @@ struct MMDiTModel : public DiffusionModel { mmdit.set_circular_axes(circular_x, circular_y); } - bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) override { + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + static const std::vector empty_skip_layers; return mmdit.compute(n_threads, - diffusion_params.x, - diffusion_params.timesteps, - diffusion_params.context, - diffusion_params.y, - output, - output_ctx, - diffusion_params.skip_layers); + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context), + tensor_or_empty(diffusion_params.y), + diffusion_params.skip_layers ? *diffusion_params.skip_layers : empty_skip_layers); } }; @@ -224,22 +232,22 @@ struct FluxModel : public DiffusionModel { flux.set_circular_axes(circular_x, circular_y); } - bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) override { + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + static const std::vector> empty_ref_latents; + static const std::vector empty_skip_layers; return flux.compute(n_threads, - diffusion_params.x, - diffusion_params.timesteps, - diffusion_params.context, - diffusion_params.c_concat, - diffusion_params.y, - diffusion_params.guidance, - diffusion_params.ref_latents, + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context), + tensor_or_empty(diffusion_params.c_concat), + tensor_or_empty(diffusion_params.y), + 
tensor_or_empty(diffusion_params.guidance), + diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents, diffusion_params.increase_ref_index, - output, - output_ctx, - diffusion_params.skip_layers); + diffusion_params.skip_layers ? *diffusion_params.skip_layers : empty_skip_layers); } }; @@ -294,18 +302,16 @@ struct AnimaModel : public DiffusionModel { anima.set_circular_axes(circular_x, circular_y); } - bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) override { + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); return anima.compute(n_threads, - diffusion_params.x, - diffusion_params.timesteps, - diffusion_params.context, - diffusion_params.c_concat, - diffusion_params.y, - output, - output_ctx); + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context), + tensor_or_empty(diffusion_params.t5_ids), + tensor_or_empty(diffusion_params.t5_weights)); } }; @@ -361,21 +367,19 @@ struct WanModel : public DiffusionModel { wan.set_circular_axes(circular_x, circular_y); } - bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) override { + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); return wan.compute(n_threads, - diffusion_params.x, - diffusion_params.timesteps, - diffusion_params.context, - diffusion_params.y, - diffusion_params.c_concat, - nullptr, - diffusion_params.vace_context, - diffusion_params.vace_strength, - output, - output_ctx); + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context), + 
tensor_or_empty(diffusion_params.y), + tensor_or_empty(diffusion_params.c_concat), + sd::Tensor(), + tensor_or_empty(diffusion_params.vace_context), + diffusion_params.vace_strength); } }; @@ -432,18 +436,17 @@ struct QwenImageModel : public DiffusionModel { qwen_image.set_circular_axes(circular_x, circular_y); } - bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) override { + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + static const std::vector> empty_ref_latents; return qwen_image.compute(n_threads, - diffusion_params.x, - diffusion_params.timesteps, - diffusion_params.context, - diffusion_params.ref_latents, - true, // increase_ref_index - output, - output_ctx); + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context), + diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents, + true); } }; @@ -499,18 +502,17 @@ struct ZImageModel : public DiffusionModel { z_image.set_circular_axes(circular_x, circular_y); } - bool compute(int n_threads, - DiffusionParams diffusion_params, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) override { + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + static const std::vector> empty_ref_latents; return z_image.compute(n_threads, - diffusion_params.x, - diffusion_params.timesteps, - diffusion_params.context, - diffusion_params.ref_latents, - true, // increase_ref_index - output, - output_ctx); + *diffusion_params.x, + *diffusion_params.timesteps, + tensor_or_empty(diffusion_params.context), + diffusion_params.ref_latents ? 
*diffusion_params.ref_latents : empty_ref_latents, + true); } }; diff --git a/src/easycache.hpp b/src/easycache.hpp index 3f0287a6..409a464e 100644 --- a/src/easycache.hpp +++ b/src/easycache.hpp @@ -1,10 +1,15 @@ +#ifndef __EASYCACHE_HPP__ +#define __EASYCACHE_HPP__ + #include #include #include #include +#include "condition_cache_utils.hpp" #include "denoiser.hpp" #include "ggml_extend.hpp" +#include "tensor.hpp" struct EasyCacheConfig { bool enabled = false; @@ -19,15 +24,15 @@ struct EasyCacheCacheEntry { struct EasyCacheState { EasyCacheConfig config; - Denoiser* denoiser = nullptr; - float start_sigma = std::numeric_limits::max(); - float end_sigma = 0.0f; - bool initialized = false; - bool initial_step = true; - bool skip_current_step = false; - bool step_active = false; - const SDCondition* anchor_condition = nullptr; - std::unordered_map cache_diffs; + Denoiser* denoiser = nullptr; + float start_sigma = std::numeric_limits::max(); + float end_sigma = 0.0f; + bool initialized = false; + bool initial_step = true; + bool skip_current_step = false; + bool step_active = false; + const void* anchor_condition = nullptr; + std::unordered_map cache_diffs; std::vector prev_input; std::vector prev_output; float output_prev_norm = 0.0f; @@ -120,41 +125,30 @@ struct EasyCacheState { return enabled() && step_active && skip_current_step; } - bool has_cache(const SDCondition* cond) const { + bool has_cache(const void* cond) const { auto it = cache_diffs.find(cond); return it != cache_diffs.end() && !it->second.diff.empty(); } - void update_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) { + void update_cache(const void* cond, const sd::Tensor& input, const sd::Tensor& output) { EasyCacheCacheEntry& entry = cache_diffs[cond]; - size_t ne = static_cast(ggml_nelements(output)); - entry.diff.resize(ne); - float* out_data = (float*)output->data; - float* in_data = (float*)input->data; - for (size_t i = 0; i < ne; ++i) { - entry.diff[i] = out_data[i] - 
in_data[i]; - } + sd::store_condition_cache_diff(&entry.diff, input, output); } - void apply_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) { + void apply_cache(const void* cond, const sd::Tensor& input, sd::Tensor* output) { auto it = cache_diffs.find(cond); if (it == cache_diffs.end() || it->second.diff.empty()) { return; } - copy_ggml_tensor(output, input); - float* out_data = (float*)output->data; - const std::vector& diff = it->second.diff; - for (size_t i = 0; i < diff.size(); ++i) { - out_data[i] += diff[i]; - } + sd::apply_condition_cache_diff(it->second.diff, input, output); } - bool before_condition(const SDCondition* cond, - ggml_tensor* input, - ggml_tensor* output, + bool before_condition(const void* cond, + const sd::Tensor& input, + sd::Tensor* output, float sigma, int step_index) { - if (!enabled() || step_index < 0) { + if (!enabled() || step_index < 0 || output == nullptr) { return false; } if (step_index != current_step_index) { @@ -181,12 +175,12 @@ struct EasyCacheState { if (!has_prev_input || !has_prev_output || !has_cache(cond)) { return false; } - size_t ne = static_cast(ggml_nelements(input)); + size_t ne = static_cast(input.numel()); if (prev_input.size() != ne) { return false; } - float* input_data = (float*)input->data; - last_input_change = 0.0f; + const float* input_data = input.data(); + last_input_change = 0.0f; for (size_t i = 0; i < ne; ++i) { last_input_change += std::fabs(input_data[i] - prev_input[i]); } @@ -211,7 +205,7 @@ struct EasyCacheState { return false; } - void after_condition(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) { + void after_condition(const void* cond, const sd::Tensor& input, const sd::Tensor& output) { if (!step_is_active()) { return; } @@ -220,16 +214,16 @@ struct EasyCacheState { return; } - size_t ne = static_cast(ggml_nelements(input)); - float* in_data = (float*)input->data; + size_t ne = static_cast(input.numel()); + const float* in_data = input.data(); 
prev_input.resize(ne); for (size_t i = 0; i < ne; ++i) { prev_input[i] = in_data[i]; } has_prev_input = true; - float* out_data = (float*)output->data; - float output_change = 0.0f; + const float* out_data = output.data(); + float output_change = 0.0f; if (has_prev_output && prev_output.size() == ne) { for (size_t i = 0; i < ne; ++i) { output_change += std::fabs(out_data[i] - prev_output[i]); @@ -262,4 +256,6 @@ struct EasyCacheState { cumulative_change_rate = 0.0f; has_last_input_change = false; } -}; \ No newline at end of file +}; + +#endif diff --git a/src/esrgan.hpp b/src/esrgan.hpp index efb3aed6..26c46f5b 100644 --- a/src/esrgan.hpp +++ b/src/esrgan.hpp @@ -341,12 +341,12 @@ struct ESRGAN : public GGMLRunner { return success; } - ggml_cgraph* build_graph(ggml_tensor* x) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor) { if (!rrdb_net) return nullptr; constexpr int kGraphNodes = 1 << 16; // 65k ggml_cgraph* gf = new_graph_custom(kGraphNodes); - x = to_backend(x); + ggml_tensor* x = make_input(x_tensor); auto runner_ctx = get_context(); ggml_tensor* out = rrdb_net->forward(&runner_ctx, x); @@ -354,15 +354,12 @@ struct ESRGAN : public GGMLRunner { return gf; } - bool compute(const int n_threads, - ggml_tensor* x, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { - auto get_graph = [&]() -> ggml_cgraph* { - return build_graph(x); - }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + sd::Tensor compute(const int n_threads, + const sd::Tensor& x) { + auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x); }; + auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + return result; } }; -#endif // __ESRGAN_HPP__ \ No newline at end of file +#endif // __ESRGAN_HPP__ diff --git a/src/flux.hpp b/src/flux.hpp index 93b9350a..e6bf002f 100644 --- a/src/flux.hpp +++ b/src/flux.hpp @@ -1178,6 +1178,7 @@ namespace Flux { std::vector pe_vec; std::vector 
mod_index_arange_vec; std::vector dct_vec; + sd::Tensor guidance_tensor; SDVersion version; bool use_mask = false; @@ -1353,29 +1354,42 @@ namespace Flux { return dct; } - ggml_cgraph* build_graph(ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* c_concat, - ggml_tensor* y, - ggml_tensor* guidance, - std::vector ref_latents = {}, - bool increase_ref_index = false, - std::vector skip_layers = {}) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor = {}, + const sd::Tensor& c_concat_tensor = {}, + const sd::Tensor& y_tensor = {}, + const sd::Tensor& guidance_tensor = {}, + const std::vector>& ref_latents_tensor = {}, + bool increase_ref_index = false, + std::vector skip_layers = {}) { + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); + ggml_tensor* context = make_optional_input(context_tensor); + ggml_tensor* c_concat = make_optional_input(c_concat_tensor); + ggml_tensor* y = make_optional_input(y_tensor); + if (flux_params.guidance_embed || flux_params.is_chroma) { + if (!guidance_tensor.empty()) { + this->guidance_tensor = guidance_tensor; + if (flux_params.is_chroma) { + this->guidance_tensor.fill_(0.f); + } + } + } + ggml_tensor* guidance = make_optional_input(this->guidance_tensor); + std::vector ref_latents; + ref_latents.reserve(ref_latents_tensor.size()); + for (const auto& ref_latent_tensor : ref_latents_tensor) { + ref_latents.push_back(make_input(ref_latent_tensor)); + } + GGML_ASSERT(x->ne[3] == 1); ggml_cgraph* gf = new_graph_custom(FLUX_GRAPH_SIZE); ggml_tensor* mod_index_arange = nullptr; ggml_tensor* dct = nullptr; // for chroma radiance - x = to_backend(x); - context = to_backend(context); - if (c_concat != nullptr) { - c_concat = to_backend(c_concat); - } if (flux_params.is_chroma) { - guidance = ggml_set_f32(guidance, 0); - if (!use_mask) { y = nullptr; } @@ -1385,16 +1399,6 @@ namespace Flux { 
mod_index_arange = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, mod_index_arange_vec.size()); set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data()); } - y = to_backend(y); - - timesteps = to_backend(timesteps); - if (flux_params.guidance_embed || flux_params.is_chroma) { - guidance = to_backend(guidance); - } - for (int i = 0; i < ref_latents.size(); i++) { - ref_latents[i] = to_backend(ref_latents[i]); - } - std::set txt_arange_dims; if (sd_version_is_flux2(version)) { txt_arange_dims = {3}; @@ -1455,18 +1459,16 @@ namespace Flux { return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* c_concat, - ggml_tensor* y, - ggml_tensor* guidance, - std::vector ref_latents = {}, - bool increase_ref_index = false, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr, - std::vector skip_layers = std::vector()) { + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context = {}, + const sd::Tensor& c_concat = {}, + const sd::Tensor& y = {}, + const sd::Tensor& guidance = {}, + const std::vector>& ref_latents = {}, + bool increase_ref_index = false, + std::vector skip_layers = std::vector()) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size] @@ -1476,7 +1478,8 @@ namespace Flux { return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + return result; } void test() { @@ -1485,41 +1488,51 @@ namespace Flux { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); { // cpu f16: // cuda 
f16: nan // cuda q8_0: pass - auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 128, 1); + sd::Tensor x({16, 16, 128, 1}); // ggml_set_f32(x, 0.01f); - // auto x = load_tensor_from_file(work_ctx, "chroma_x.bin"); + // auto x = load_tensor_from_file(ctx, "chroma_x.bin"); // print_ggml_tensor(x); std::vector timesteps_vec(1, 1.f); - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); + auto timesteps = sd::Tensor::from_vector(timesteps_vec); std::vector guidance_vec(1, 0.f); - auto guidance = vector_to_ggml_tensor(work_ctx, guidance_vec); + auto guidance = sd::Tensor::from_vector(guidance_vec); - auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 15360, 256, 1); + sd::Tensor context({15360, 256, 1}); // ggml_set_f32(context, 0.01f); - // auto context = load_tensor_from_file(work_ctx, "chroma_context.bin"); + // auto context = load_tensor_from_file(ctx, "chroma_context.bin"); // print_ggml_tensor(context); - // auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, 1); + // auto y = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 768, 1); // ggml_set_f32(y, 0.01f); auto y = nullptr; // print_ggml_tensor(y); - ggml_tensor* out = nullptr; + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - compute(8, x, timesteps, context, nullptr, y, guidance, {}, false, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = compute(8, + x, + timesteps, + context, + {}, + {}, + guidance, + {}, + false); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("flux test done in %lldms", t1 - t0); } } diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index e6b27cc7..859270cb 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include "ggml.h" #include "model.h" +#include "tensor.hpp" #ifdef SD_USE_CUDA #include 
"ggml-cuda.h" @@ -49,6 +51,7 @@ #endif #include "rng.hpp" +#include "tensor_ggml.hpp" #include "util.h" #define EPS 1e-05f @@ -205,14 +208,6 @@ __STATIC_INLINE__ float sd_image_get_f32(sd_image_t image, int64_t iw, int64_t i return value; } -__STATIC_INLINE__ float sd_image_get_f32(sd_image_f32_t image, int64_t iw, int64_t ih, int64_t ic, bool scale = true) { - float value = *(image.data + ih * image.width * image.channel + iw * image.channel + ic); - if (scale) { - value /= 255.f; - } - return value; -} - __STATIC_INLINE__ void print_ggml_tensor(ggml_tensor* tensor, bool shape_only = false, const char* mark = "") { printf("%s (%s): shape(%zu, %zu, %zu, %zu)\n", mark, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); fflush(stdout); @@ -250,6 +245,56 @@ __STATIC_INLINE__ void print_ggml_tensor(ggml_tensor* tensor, bool shape_only = } } +template +__STATIC_INLINE__ void print_sd_tensor(const sd::Tensor& tensor, bool shape_only = false, const char* mark = "") { + printf("%s: shape(", mark); + for (size_t i = 0; i < static_cast(tensor.dim()); ++i) { + printf("%s%lld", i == 0 ? 
"" : ", ", static_cast(tensor.shape()[i])); + } + printf(")\n"); + fflush(stdout); + if (shape_only) { + return; + } + int range = 3; + std::vector shape = tensor.shape(); + while (shape.size() < 4) { + shape.push_back(1); + } + for (int64_t i3 = 0; i3 < shape[3]; i3++) { + if (i3 >= range && i3 + range < shape[3]) { + continue; + } + for (int64_t i2 = 0; i2 < shape[2]; i2++) { + if (i2 >= range && i2 + range < shape[2]) { + continue; + } + for (int64_t i1 = 0; i1 < shape[1]; i1++) { + if (i1 >= range && i1 + range < shape[1]) { + continue; + } + for (int64_t i0 = 0; i0 < shape[0]; i0++) { + if (i0 >= range && i0 + range < shape[0]) { + continue; + } + size_t offset = static_cast(i0 + shape[0] * (i1 + shape[1] * (i2 + shape[2] * i3))); + printf(" [%lld, %lld, %lld, %lld] = ", static_cast(i3), static_cast(i2), static_cast(i1), static_cast(i0)); + if constexpr (std::is_same_v) { + printf("%f\n", tensor[static_cast(offset)]); + } else if constexpr (std::is_same_v) { + printf("%f\n", ggml_fp16_to_fp32(tensor[static_cast(offset)])); + } else if constexpr (std::is_same_v) { + printf("%d\n", tensor[static_cast(offset)]); + } else if constexpr (std::is_same_v) { + printf("%lld\n", static_cast(tensor[static_cast(offset)])); + } + fflush(stdout); + } + } + } + } +} + __STATIC_INLINE__ void ggml_ext_tensor_iter( ggml_tensor* tensor, const std::function& fn) { @@ -475,99 +520,6 @@ __STATIC_INLINE__ void ggml_ext_tensor_apply_mask(ggml_tensor* image_data, } } -__STATIC_INLINE__ void sd_image_f32_to_ggml_tensor(sd_image_f32_t image, - ggml_tensor* tensor, - bool scale = true) { - GGML_ASSERT(image.width == tensor->ne[0]); - GGML_ASSERT(image.height == tensor->ne[1]); - GGML_ASSERT(image.channel == tensor->ne[2]); - GGML_ASSERT(1 == tensor->ne[3]); - GGML_ASSERT(tensor->type == GGML_TYPE_F32); - ggml_ext_tensor_iter(tensor, [&](ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = sd_image_get_f32(image, i0, i1, i2, scale); - 
ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2, i3); - }); -} - -__STATIC_INLINE__ void ggml_ext_tensor_split_2d(ggml_tensor* input, - ggml_tensor* output, - int x, - int y) { - int64_t width = output->ne[0]; - int64_t height = output->ne[1]; - int64_t channels = output->ne[2]; - int64_t ne3 = output->ne[3]; - - int64_t input_width = input->ne[0]; - int64_t input_height = input->ne[1]; - - GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32); - for (int iy = 0; iy < height; iy++) { - for (int ix = 0; ix < width; ix++) { - for (int k = 0; k < channels; k++) { - for (int l = 0; l < ne3; l++) { - float value = ggml_ext_tensor_get_f32(input, (ix + x) % input_width, (iy + y) % input_height, k, l); - ggml_ext_tensor_set_f32(output, value, ix, iy, k, l); - } - } - } - } -} - -// unclamped -> expects x in the range [0-1] -__STATIC_INLINE__ float smootherstep_f32(const float x) { - GGML_ASSERT(x >= 0.f && x <= 1.f); - return x * x * x * (x * (6.0f * x - 15.0f) + 10.0f); -} - -__STATIC_INLINE__ void ggml_ext_tensor_merge_2d(ggml_tensor* input, - ggml_tensor* output, - int x, - int y, - int overlap_x, - int overlap_y, - bool circular_x, - bool circular_y, - int x_skip = 0, - int y_skip = 0) { - int64_t width = input->ne[0]; - int64_t height = input->ne[1]; - int64_t channels = input->ne[2]; - int64_t ne3 = input->ne[3]; - - int64_t img_width = output->ne[0]; - int64_t img_height = output->ne[1]; - - GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32); - for (int iy = y_skip; iy < height; iy++) { - for (int ix = x_skip; ix < width; ix++) { - for (int k = 0; k < channels; k++) { - for (int l = 0; l < ne3; l++) { - float new_value = ggml_ext_tensor_get_f32(input, ix, iy, k, l); - if (overlap_x > 0 || overlap_y > 0) { // blend colors in overlapped area - float old_value = ggml_ext_tensor_get_f32(output, (x + ix) % img_width, (y + iy) % img_height, k, l); - - const float x_f_0 = (circular_x || (overlap_x > 0 && x > 0)) ? 
(ix - x_skip) / float(overlap_x) : 1; - const float x_f_1 = (circular_x || (overlap_x > 0 && x < (img_width - width))) ? (width - ix) / float(overlap_x) : 1; - const float y_f_0 = (circular_y || (overlap_y > 0 && y > 0)) ? (iy - y_skip) / float(overlap_y) : 1; - const float y_f_1 = (circular_y || (overlap_y > 0 && y < (img_height - height))) ? (height - iy) / float(overlap_y) : 1; - - const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f); - const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f); - - ggml_ext_tensor_set_f32( - output, - old_value + new_value * smootherstep_f32(y_f) * smootherstep_f32(x_f), - (x + ix) % img_width, (y + iy) % img_height, k, l); - } else { - ggml_ext_tensor_set_f32(output, new_value, (x + ix) % img_width, (y + iy) % img_height, k, l); - } - } - } - } - } -} - __STATIC_INLINE__ float ggml_ext_tensor_mean(ggml_tensor* src) { float mean = 0.0f; int64_t nelements = ggml_nelements(src); @@ -832,22 +784,102 @@ __STATIC_INLINE__ void sd_tiling_calc_tiles(int& num_tiles_dim, } // Tiling -__STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, - ggml_tensor* output, - const int scale, - const int p_tile_size_x, - const int p_tile_size_y, - const float tile_overlap_factor, - const bool circular_x, - const bool circular_y, - on_tile_process on_processing, - bool slient = false) { - output = ggml_set_f32(output, 0); - int input_width = (int)input->ne[0]; - int input_height = (int)input->ne[1]; - int output_width = (int)output->ne[0]; - int output_height = (int)output->ne[1]; +__STATIC_INLINE__ int64_t sd_tensor_plane_size(const sd::Tensor& tensor) { + GGML_ASSERT(tensor.dim() >= 2); + return tensor.shape()[0] * tensor.shape()[1]; +} + +__STATIC_INLINE__ sd::Tensor sd_tensor_split_2d(const sd::Tensor& input, int width, int height, int x, int y) { + GGML_ASSERT(input.dim() >= 4); + std::vector output_shape = input.shape(); + output_shape[0] = width; + output_shape[1] = height; + sd::Tensor output(std::move(output_shape)); + int64_t 
input_width = input.shape()[0]; + int64_t input_height = input.shape()[1]; + int64_t input_plane = sd_tensor_plane_size(input); + int64_t output_plane = sd_tensor_plane_size(output); + int64_t plane_count = input.numel() / input_plane; + for (int iy = 0; iy < height; iy++) { + for (int ix = 0; ix < width; ix++) { + int64_t src_xy = (ix + x) % input_width + input_width * ((iy + y) % input_height); + int64_t dst_xy = ix + width * iy; + for (int64_t plane = 0; plane < plane_count; ++plane) { + output[plane * output_plane + dst_xy] = input[plane * input_plane + src_xy]; + } + } + } + return output; +} + +__STATIC_INLINE__ void sd_tensor_merge_2d(const sd::Tensor& input, + sd::Tensor* output, + int x, + int y, + int overlap_x, + int overlap_y, + bool circular_x, + bool circular_y, + int x_skip = 0, + int y_skip = 0) { + GGML_ASSERT(output != nullptr); + int64_t width = input.shape()[0]; + int64_t height = input.shape()[1]; + int64_t img_width = output->shape()[0]; + int64_t img_height = output->shape()[1]; + int64_t input_plane = sd_tensor_plane_size(input); + int64_t output_plane = sd_tensor_plane_size(*output); + int64_t plane_count = input.numel() / input_plane; + GGML_ASSERT(output->numel() / output_plane == plane_count); + + // unclamped -> expects x in the range [0-1] + auto smootherstep_f32 = [](const float x) -> float { + GGML_ASSERT(x >= 0.f && x <= 1.f); + return x * x * x * (x * (6.0f * x - 15.0f) + 10.0f); + }; + + for (int iy = y_skip; iy < height; iy++) { + for (int ix = x_skip; ix < width; ix++) { + int64_t src_xy = ix + width * iy; + int64_t ox = (x + ix) % img_width; + int64_t oy = (y + iy) % img_height; + int64_t dst_xy = ox + img_width * oy; + for (int64_t plane = 0; plane < plane_count; ++plane) { + float new_value = input[plane * input_plane + src_xy]; + if (overlap_x > 0 || overlap_y > 0) { + float old_value = (*output)[plane * output_plane + dst_xy]; + const float x_f_0 = (circular_x || (overlap_x > 0 && x > 0)) ? 
(ix - x_skip) / float(overlap_x) : 1.f; + const float x_f_1 = (circular_x || (overlap_x > 0 && x < (img_width - width))) ? (width - ix) / float(overlap_x) : 1.f; + const float y_f_0 = (circular_y || (overlap_y > 0 && y > 0)) ? (iy - y_skip) / float(overlap_y) : 1.f; + const float y_f_1 = (circular_y || (overlap_y > 0 && y < (img_height - height))) ? (height - iy) / float(overlap_y) : 1.f; + const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f); + const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f); + (*output)[plane * output_plane + dst_xy] = + old_value + new_value * smootherstep_f32(y_f) * smootherstep_f32(x_f); + } else { + (*output)[plane * output_plane + dst_xy] = new_value; + } + } + } + } +} + +template +__STATIC_INLINE__ sd::Tensor process_tiles_2d(const sd::Tensor& input, + int output_width, + int output_height, + int scale, + int p_tile_size_x, + int p_tile_size_y, + float tile_overlap_factor, + bool circular_x, + bool circular_y, + Fn&& on_processing, + bool silent = false) { + sd::Tensor output; + int input_width = static_cast(input.shape()[0]); + int input_height = static_cast(input.shape()[1]); GGML_ASSERT(((input_width / output_width) == (input_height / output_height)) && ((output_width / input_width) == (output_height / input_height))); @@ -856,8 +888,7 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, int small_width = output_width; int small_height = output_height; - - bool decode = output_width > input_width; + bool decode = output_width > input_width; if (decode) { small_width = input_width; small_height = input_height; @@ -871,25 +902,16 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, float tile_overlap_factor_y; sd_tiling_calc_tiles(num_tiles_y, tile_overlap_factor_y, small_height, p_tile_size_y, tile_overlap_factor, circular_y); - if (!slient) { - LOG_DEBUG("num tiles : %d, %d ", num_tiles_x, num_tiles_y); - LOG_DEBUG("optimal overlap : %f, %f (targeting %f)", tile_overlap_factor_x, 
tile_overlap_factor_y, tile_overlap_factor); - } - - int tile_overlap_x = (int32_t)(p_tile_size_x * tile_overlap_factor_x); + int tile_overlap_x = static_cast(p_tile_size_x * tile_overlap_factor_x); int non_tile_overlap_x = p_tile_size_x - tile_overlap_x; - - int tile_overlap_y = (int32_t)(p_tile_size_y * tile_overlap_factor_y); + int tile_overlap_y = static_cast(p_tile_size_y * tile_overlap_factor_y); int non_tile_overlap_y = p_tile_size_y - tile_overlap_y; - - int tile_size_x = p_tile_size_x < small_width ? p_tile_size_x : small_width; - int tile_size_y = p_tile_size_y < small_height ? p_tile_size_y : small_height; - + int tile_size_x = p_tile_size_x < small_width ? p_tile_size_x : small_width; + int tile_size_y = p_tile_size_y < small_height ? p_tile_size_y : small_height; int input_tile_size_x = tile_size_x; int input_tile_size_y = tile_size_y; int output_tile_size_x = tile_size_x; int output_tile_size_y = tile_size_y; - if (decode) { output_tile_size_x *= scale; output_tile_size_y *= scale; @@ -898,41 +920,23 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, input_tile_size_y *= scale; } - ggml_init_params params = {}; - params.mem_size += input_tile_size_x * input_tile_size_y * input->ne[2] * input->ne[3] * sizeof(float); // input chunk - params.mem_size += output_tile_size_x * output_tile_size_y * output->ne[2] * output->ne[3] * sizeof(float); // output chunk - params.mem_size += 3 * ggml_tensor_overhead(); - params.mem_buffer = nullptr; - params.no_alloc = false; - - if (!slient) { - LOG_DEBUG("tile work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f); - } - - // draft context - ggml_context* tiles_ctx = ggml_init(params); - if (!tiles_ctx) { - LOG_ERROR("ggml_init() failed"); - return; - } - - // tiling - ggml_tensor* input_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], input->ne[3]); - ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, 
output_tile_size_x, output_tile_size_y, output->ne[2], output->ne[3]); - int num_tiles = num_tiles_x * num_tiles_y; - if (!slient) { + int num_tiles = num_tiles_x * num_tiles_y; + int tile_count = 1; + bool last_y = false; + bool last_x = false; + float last_time = 0.0f; + if (!silent) { + LOG_DEBUG("num tiles : %d, %d ", num_tiles_x, num_tiles_y); + LOG_DEBUG("optimal overlap : %f, %f (targeting %f)", tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor); LOG_DEBUG("processing %i tiles", num_tiles); pretty_progress(0, num_tiles, 0.0f); } - int tile_count = 1; - bool last_y = false, last_x = false; - float last_time = 0.0f; for (int y = 0; y < small_height && !last_y; y += non_tile_overlap_y) { int dy = 0; if (!circular_y && y + tile_size_y >= small_height) { - int _y = y; - y = small_height - tile_size_y; - dy = _y - y; + int original_y = y; + y = small_height - tile_size_y; + dy = original_y - y; if (decode) { dy *= scale; } @@ -941,9 +945,9 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, for (int x = 0; x < small_width && !last_x; x += non_tile_overlap_x) { int dx = 0; if (!circular_x && x + tile_size_x >= small_width) { - int _x = x; - x = small_width - tile_size_x; - dx = _x - x; + int original_x = x; + x = small_width - tile_size_x; + dx = original_x - x; if (decode) { dx *= scale; } @@ -958,38 +962,37 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, int overlap_x_out = decode ? tile_overlap_x * scale : tile_overlap_x; int overlap_y_out = decode ? 
tile_overlap_y * scale : tile_overlap_y; - int64_t t1 = ggml_time_ms(); - ggml_ext_tensor_split_2d(input, input_tile, x_in, y_in); - if (on_processing(input_tile, output_tile, false)) { - ggml_ext_tensor_merge_2d(output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, circular_x, circular_y, dx, dy); + int64_t t1 = ggml_time_ms(); + auto input_tile = sd_tensor_split_2d(input, input_tile_size_x, input_tile_size_y, x_in, y_in); + auto output_tile = on_processing(input_tile); + if (output_tile.empty()) { + return {}; + } + GGML_ASSERT(output_tile.shape()[0] == output_tile_size_x && output_tile.shape()[1] == output_tile_size_y); + if (output.empty()) { + std::vector output_shape = output_tile.shape(); + output_shape[0] = output_width; + output_shape[1] = output_height; + output = sd::Tensor::zeros(std::move(output_shape)); + } + sd_tensor_merge_2d(output_tile, &output, x_out, y_out, overlap_x_out, overlap_y_out, circular_x, circular_y, dx, dy); + if (!silent) { int64_t t2 = ggml_time_ms(); last_time = (t2 - t1) / 1000.0f; pretty_progress(tile_count, num_tiles, last_time); - } else { - LOG_ERROR("Failed to process patch %d at (%d, %d)", tile_count, x, y); } tile_count++; } last_x = false; } - if (!slient) { - if (tile_count < num_tiles) { - pretty_progress(num_tiles, num_tiles, last_time); - } + if (!silent && tile_count < num_tiles) { + pretty_progress(num_tiles, num_tiles, last_time); } - ggml_free(tiles_ctx); -} - -__STATIC_INLINE__ void sd_tiling(ggml_tensor* input, - ggml_tensor* output, - const int scale, - const int tile_size, - const float tile_overlap_factor, - const bool circular_x, - const bool circular_y, - on_tile_process on_processing) { - sd_tiling_non_square(input, output, scale, tile_size, tile_size, tile_overlap_factor, circular_x, circular_y, on_processing); + if (output.empty()) { + return {}; + } + return output; } __STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm_32(ggml_context* ctx, @@ -1588,6 +1591,18 @@ __STATIC_INLINE__ void 
set_timestep_embedding(std::vector timesteps, memcpy(((char*)embedding->data), ((char*)embedding_vec.data()), ggml_nbytes(embedding)); } +__STATIC_INLINE__ void set_timestep_embedding(std::vector timesteps, + sd::Tensor* embedding, + int dim, + int max_period = 10000) { + GGML_ASSERT(embedding != nullptr); + std::vector embedding_vec = timestep_embedding(timesteps, dim, max_period); + if (embedding->numel() != static_cast(embedding_vec.size())) { + embedding->resize({dim, static_cast(timesteps.size())}); + } + std::copy(embedding_vec.begin(), embedding_vec.end(), embedding->values().begin()); +} + __STATIC_INLINE__ ggml_tensor* new_timestep_embedding(ggml_context* ctx, std::vector timesteps, int dim, @@ -1705,6 +1720,32 @@ protected: bool circular_x_enabled = false; bool circular_y_enabled = false; + template + static sd::Tensor take_or_empty(std::optional> tensor) { + if (!tensor.has_value()) { + return {}; + } + return std::move(*tensor); + } + + template + static sd::Tensor restore_trailing_singleton_dims(std::optional> tensor, + size_t expected_dim) { + return restore_trailing_singleton_dims(take_or_empty(std::move(tensor)), expected_dim); + } + + template + static sd::Tensor restore_trailing_singleton_dims(sd::Tensor tensor, + size_t expected_dim) { + if (tensor.empty()) { + return tensor; + } + while (static_cast(tensor.dim()) < expected_dim) { + tensor.unsqueeze_(tensor.dim()); + } + return tensor; + } + void alloc_params_ctx() { ggml_init_params params; params.mem_size = static_cast(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead()); @@ -2042,6 +2083,29 @@ public: backend_tensor_data_map[tensor] = data; } + template + ggml_tensor* make_input(const sd::Tensor& tensor) { + ggml_tensor* input = sd::make_ggml_tensor(compute_ctx, tensor, false); + set_backend_tensor_data(input, tensor.data()); + return input; + } + + template + ggml_tensor* make_optional_input(const sd::Tensor& tensor) { + if (tensor.empty()) { + return nullptr; + } + return make_input(tensor); + } 
+ + template + ggml_tensor* make_optional_input(const sd::Tensor* tensor) { + if (tensor == nullptr) { + return nullptr; + } + return make_input(*tensor); + } + ggml_tensor* to_backend(ggml_tensor* tensor) { GGML_ASSERT(compute_ctx != nullptr); if (tensor == nullptr) { @@ -2070,24 +2134,24 @@ public: return ggml_get_tensor(cache_ctx, name.c_str()); } - bool compute(get_graph_cb_t get_graph, - int n_threads, - bool free_compute_buffer_immediately = true, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) { + template + std::optional> compute(get_graph_cb_t get_graph, + int n_threads, + bool free_compute_buffer_immediately, + bool no_return = false) { if (!offload_params_to_runtime_backend()) { LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str()); - return false; + return std::nullopt; } if (!alloc_compute_buffer(get_graph)) { LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); - return false; + return std::nullopt; } reset_compute_ctx(); ggml_cgraph* gf = get_compute_graph(get_graph); if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { LOG_ERROR("%s alloc compute graph failed", get_desc().c_str()); - return false; + return std::nullopt; } copy_data_to_backend_tensor(); if (ggml_backend_is_cpu(runtime_backend)) { @@ -2097,26 +2161,19 @@ public: ggml_status status = ggml_backend_graph_compute(runtime_backend, gf); if (status != GGML_STATUS_SUCCESS) { LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status)); - return false; + return std::nullopt; } -#ifdef GGML_PERF - ggml_graph_print(gf); -#endif copy_cache_tensors_to_cache_buffer(); - if (output != nullptr) { - auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str()); - if (*output == nullptr && output_ctx != nullptr) { - *output = ggml_dup_tensor(output_ctx, result); - } - if (*output != nullptr) { - ggml_ext_backend_tensor_get_and_sync(runtime_backend, result, (*output)->data, 0, ggml_nbytes(*output)); - } + auto 
result = ggml_get_tensor(compute_ctx, final_result_name.c_str()); + std::optional> output; + if (!no_return) { + output = sd::make_sd_tensor_from_ggml(result); } if (free_compute_buffer_immediately) { free_compute_buffer(); } - return true; + return output; } void set_flash_attention_enabled(bool enabled) { diff --git a/src/latent-preview.h b/src/latent-preview.h index 5078a6bd..7f30734f 100644 --- a/src/latent-preview.h +++ b/src/latent-preview.h @@ -1,6 +1,8 @@ +#include #include #include #include "ggml.h" +#include "tensor.hpp" const float wan_21_latent_rgb_proj[16][3] = { {0.015123f, -0.148418f, 0.479828f}, @@ -232,3 +234,67 @@ void preview_latent_video(uint8_t* buffer, ggml_tensor* latents, const float (*l } } } + +static inline bool preview_latent_tensor_is_video(const sd::Tensor& latents) { + return latents.dim() == 5; +} + +void preview_latent_video(uint8_t* buffer, const sd::Tensor& latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) { + uint32_t latent_width = static_cast(latents.shape()[0]); + uint32_t latent_height = static_cast(latents.shape()[1]); + bool is_video = preview_latent_tensor_is_video(latents); + uint32_t frames = is_video ? static_cast(latents.shape()[2]) : 1; + uint32_t dim = is_video ? 
static_cast(latents.shape()[3]) : static_cast(latents.shape()[2]); + + uint32_t rgb_width = latent_width * patch_size; + uint32_t rgb_height = latent_height * patch_size; + uint32_t unpatched_dim = dim / (patch_size * patch_size); + + for (uint32_t k = 0; k < frames; k++) { + for (uint32_t rgb_x = 0; rgb_x < rgb_width; rgb_x++) { + for (uint32_t rgb_y = 0; rgb_y < rgb_height; rgb_y++) { + uint32_t latent_x = rgb_x / patch_size; + uint32_t latent_y = rgb_y / patch_size; + + uint32_t channel_offset = 0; + if (patch_size > 1) { + channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size)); + } + + size_t pixel_id = k * rgb_width * rgb_height + rgb_y * rgb_width + rgb_x; + auto latent_value = [&](uint32_t latent_channel) -> float { + return is_video + ? latents.values()[latent_x + latent_width * (latent_y + latent_height * (k + frames * latent_channel))] + : latents.values()[latent_x + latent_width * (latent_y + latent_height * latent_channel)]; + }; + + float r = 0.f, g = 0.f, b = 0.f; + if (latent_rgb_proj != nullptr) { + for (uint32_t d = 0; d < unpatched_dim; d++) { + uint32_t latent_channel = d * patch_size * patch_size + channel_offset; + float value = latent_value(latent_channel); + r += value * latent_rgb_proj[d][0]; + g += value * latent_rgb_proj[d][1]; + b += value * latent_rgb_proj[d][2]; + } + } else { + r = latent_value(0); + g = latent_value(1); + b = latent_value(2); + } + if (latent_rgb_bias != nullptr) { + r += latent_rgb_bias[0]; + g += latent_rgb_bias[1]; + b += latent_rgb_bias[2]; + } + r = std::min(1.0f, std::max(0.0f, r * .5f + .5f)); + g = std::min(1.0f, std::max(0.0f, g * .5f + .5f)); + b = std::min(1.0f, std::max(0.0f, b * .5f + .5f)); + + buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255); + buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255); + buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255); + } + } + } +} diff --git a/src/llm.hpp b/src/llm.hpp index 5a9c25c8..c6c29614 100644 --- a/src/llm.hpp +++ b/src/llm.hpp @@ -194,6 +194,7 @@ 
namespace LLM { bool padding = false) { if (add_bos_token) { tokens.insert(tokens.begin(), BOS_TOKEN_ID); + weights.insert(weights.begin(), 1.f); } if (max_length > 0 && padding) { size_t n = static_cast(std::ceil(tokens.size() * 1.f / max_length)); @@ -1180,16 +1181,17 @@ namespace LLM { return hidden_states; } - ggml_cgraph* build_graph(ggml_tensor* input_ids, - ggml_tensor* attention_mask, - std::vector> image_embeds, + ggml_cgraph* build_graph(const sd::Tensor& input_ids_tensor, + const sd::Tensor& attention_mask_tensor, + const std::vector>>& image_embeds_tensor, std::set out_layers) { - ggml_cgraph* gf = ggml_new_graph(compute_ctx); - - input_ids = to_backend(input_ids); - - for (auto& image_embed : image_embeds) { - image_embed.second = to_backend(image_embed.second); + ggml_cgraph* gf = ggml_new_graph(compute_ctx); + ggml_tensor* input_ids = make_input(input_ids_tensor); + std::vector> image_embeds; + image_embeds.reserve(image_embeds_tensor.size()); + for (const auto& [idx, embed_tensor] : image_embeds_tensor) { + ggml_tensor* embed = make_input(embed_tensor); + image_embeds.emplace_back(idx, embed); } int64_t n_tokens = input_ids->ne[0]; @@ -1213,8 +1215,9 @@ namespace LLM { input_pos_vec.size()); set_backend_tensor_data(input_pos, input_pos_vec.data()); - if (attention_mask != nullptr) { - attention_mask = to_backend(attention_mask); + ggml_tensor* attention_mask = nullptr; + if (!attention_mask_tensor.empty()) { + attention_mask = make_input(attention_mask_tensor); } else { attention_mask_vec.resize(n_tokens * n_tokens); for (int i0 = 0; i0 < n_tokens; i0++) { @@ -1239,17 +1242,15 @@ namespace LLM { return gf; } - bool compute(const int n_threads, - ggml_tensor* input_ids, - ggml_tensor* attention_mask, - std::vector> image_embeds, - std::set out_layers, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { + sd::Tensor compute(const int n_threads, + const sd::Tensor& input_ids, + const sd::Tensor& attention_mask, + const std::vector>>& 
image_embeds, + std::set out_layers) { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(input_ids, attention_mask, image_embeds, out_layers); }; - return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); + return take_or_empty(GGMLRunner::compute(get_graph, n_threads, true)); } int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) { @@ -1288,8 +1289,9 @@ namespace LLM { return image; } - ggml_cgraph* build_encode_image_graph(ggml_tensor* image) { - ggml_cgraph* gf = new_graph_custom(LLM_GRAPH_SIZE); + ggml_cgraph* build_encode_image_graph(const sd::Tensor& image_tensor) { + ggml_cgraph* gf = new_graph_custom(LLM_GRAPH_SIZE); + ggml_tensor* image = make_input(image_tensor); GGML_ASSERT(image->ne[1] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); GGML_ASSERT(image->ne[0] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); @@ -1301,8 +1303,6 @@ namespace LLM { int llm_grid_w = grid_w / params.vision.spatial_merge_size; int vit_merger_window_size = params.vision.window_size / params.vision.patch_size / params.vision.spatial_merge_size; - image = to_backend(image); - auto pixel_values = process_image(compute_ctx, image); // window index @@ -1411,14 +1411,12 @@ namespace LLM { return gf; } - void encode_image(const int n_threads, - ggml_tensor* image, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { + sd::Tensor encode_image(const int n_threads, + const sd::Tensor& image) { auto get_graph = [&]() -> ggml_cgraph* { return build_encode_image_graph(image); }; - GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return take_or_empty(GGMLRunner::compute(get_graph, n_threads, false)); } }; @@ -1497,39 +1495,41 @@ namespace LLM { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); bool test_mistral = false; 
bool test_qwen3 = true; bool test_vit = false; bool test_decoder_with_vit = false; if (test_decoder_with_vit) { - ggml_tensor* image_embed = nullptr; + sd::Tensor image_embed; { - auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin"); - print_ggml_tensor(image, false, "image"); - ggml_tensor* out = nullptr; + auto image = sd::load_tensor_from_file_as_tensor("qwen2vl_normalized.bin"); + print_sd_tensor(image, false, "image"); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - model.encode_image(8, image, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = model.encode_image(8, image); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out, false, "image_embed"); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out, false, "image_embed"); image_embed = out; LOG_DEBUG("llm encode_image test done in %lldms", t1 - t0); } std::string placeholder = "<|image_pad|>"; std::string img_prompt = "Picture 1: <|vision_start|>"; // [24669, 220, 16, 25, 220, 151652] - int64_t num_image_tokens = image_embed->ne[1]; + int64_t num_image_tokens = image_embed.shape()[1]; img_prompt.reserve(num_image_tokens * placeholder.size()); for (int i = 0; i < num_image_tokens; i++) { img_prompt += placeholder; } img_prompt += "<|vision_end|>"; - std::vector> image_embeds; + std::vector>> image_embeds; image_embeds.emplace_back(64, image_embed); std::pair prompt_attn_range; @@ -1547,29 +1547,33 @@ namespace LLM { printf("%d ", token); } printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - ggml_tensor* out = nullptr; + auto input_ids = sd::Tensor::from_vector(tokens); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - model.compute(8, input_ids, nullptr, image_embeds, {}, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = model.compute(8, input_ids, sd::Tensor(), image_embeds, {}); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + 
GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("llm test done in %lldms", t1 - t0); } else if (test_vit) { - // auto image = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 280, 280, 3); + // auto image = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 280, 280, 3); // ggml_set_f32(image, 0.f); - auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin"); - print_ggml_tensor(image, false, "image"); - ggml_tensor* out = nullptr; + auto image = sd::load_tensor_from_file_as_tensor("qwen2vl_normalized.bin"); + print_sd_tensor(image, false, "image"); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - model.encode_image(8, image, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = model.encode_image(8, image); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out, false, "out"); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out, false, "out"); - // auto ref_out = load_tensor_from_file(work_ctx, "qwen2vl.bin"); + // auto ref_out = load_tensor_from_file(ctx, "qwen2vl.bin"); // ggml_ext_tensor_diff(ref_out, out, 0.01f); LOG_DEBUG("llm test done in %lldms", t1 - t0); @@ -1587,14 +1591,16 @@ namespace LLM { printf("%d ", token); } printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - ggml_tensor* out = nullptr; + auto input_ids = sd::Tensor::from_vector(tokens); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - model.compute(8, input_ids, nullptr, {}, {10, 20, 30}, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = model.compute(8, input_ids, sd::Tensor(), {}, {10, 20, 30}); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("llm test done in %lldms", t1 - t0); } else if (test_qwen3) { std::pair prompt_attn_range; @@ -1610,14 +1616,16 @@ namespace LLM { printf("%d ", token); } 
printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - ggml_tensor* out = nullptr; + auto input_ids = sd::Tensor::from_vector(tokens); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - model.compute(8, input_ids, nullptr, {}, {35}, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = model.compute(8, input_ids, sd::Tensor(), {}, {35}); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("llm test done in %lldms", t1 - t0); } else { std::pair prompt_attn_range; @@ -1633,14 +1641,16 @@ namespace LLM { printf("%d ", token); } printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - ggml_tensor* out = nullptr; + auto input_ids = sd::Tensor::from_vector(tokens); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - model.compute(8, input_ids, nullptr, {}, {}, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = model.compute(8, input_ids, sd::Tensor(), {}, {}); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("llm test done in %lldms", t1 - t0); } } diff --git a/src/lora.hpp b/src/lora.hpp index 7df04ea2..d4a749ef 100644 --- a/src/lora.hpp +++ b/src/lora.hpp @@ -792,7 +792,7 @@ struct LoraModel : public GGMLRunner { auto get_graph = [&]() -> ggml_cgraph* { return build_lora_graph(model_tensors, version); }; - GGMLRunner::compute(get_graph, n_threads, false); + GGMLRunner::compute(get_graph, n_threads, false, true); stat(); for (auto item : original_tensor_to_final_tensor) { ggml_tensor* original_tensor = item.first; diff --git a/src/mmdit.hpp b/src/mmdit.hpp index 7fbb2b24..e75736c5 100644 --- a/src/mmdit.hpp +++ b/src/mmdit.hpp @@ -836,17 +836,17 @@ struct MMDiTRunner : public GGMLRunner { mmdit.get_param_tensors(tensors, prefix); } - 
ggml_cgraph* build_graph(ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* y, - std::vector skip_layers = std::vector()) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor = {}, + const sd::Tensor& y_tensor = {}, + std::vector skip_layers = std::vector()) { ggml_cgraph* gf = new_graph_custom(MMDIT_GRAPH_SIZE); - x = to_backend(x); - context = to_backend(context); - y = to_backend(y); - timesteps = to_backend(timesteps); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); + ggml_tensor* context = make_optional_input(context_tensor); + ggml_tensor* y = make_optional_input(y_tensor); auto runner_ctx = get_context(); ggml_tensor* out = mmdit.forward(&runner_ctx, @@ -861,14 +861,12 @@ struct MMDiTRunner : public GGMLRunner { return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* y, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr, - std::vector skip_layers = std::vector()) { + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context = {}, + const sd::Tensor& y = {}, + std::vector skip_layers = std::vector()) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size]([N, 154, 4096]) or [1, max_position, hidden_size] @@ -877,7 +875,7 @@ struct MMDiTRunner : public GGMLRunner { return build_graph(x, timesteps, context, y, skip_layers); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); } void test() { @@ -886,35 +884,41 @@ struct MMDiTRunner : public GGMLRunner { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + 
ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); { // cpu f16: pass // cpu f32: pass // cuda f16: pass // cuda f32: pass - auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 128, 128, 16, 1); + sd::Tensor x({128, 128, 16, 1}); std::vector timesteps_vec(1, 999.f); - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); - ggml_set_f32(x, 0.01f); + auto timesteps = sd::Tensor::from_vector(timesteps_vec); + x.fill_(0.01f); // print_ggml_tensor(x); - auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 4096, 154, 1); - ggml_set_f32(context, 0.01f); + sd::Tensor context({4096, 154, 1}); + context.fill_(0.01f); // print_ggml_tensor(context); - auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 2048, 1); - ggml_set_f32(y, 0.01f); + sd::Tensor y({2048, 1}); + y.fill_(0.01f); // print_ggml_tensor(y); - ggml_tensor* out = nullptr; + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - compute(8, x, timesteps, context, y, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = compute(8, + x, + timesteps, + context, + y); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("mmdit test done in %lldms", t1 - t0); } } diff --git a/src/pmid.hpp b/src/pmid.hpp index 30c47325..f19a8c3c 100644 --- a/src/pmid.hpp +++ b/src/pmid.hpp @@ -443,11 +443,10 @@ public: id_encoder2.get_param_tensors(tensors, prefix); } - ggml_cgraph* build_graph( // ggml_allocr* allocr, - ggml_tensor* id_pixel_values, - ggml_tensor* prompt_embeds, - std::vector& class_tokens_mask, - ggml_tensor* id_embeds) { + ggml_cgraph* build_graph(const sd::Tensor& id_pixel_values_tensor, + const sd::Tensor& prompt_embeds_tensor, + std::vector& class_tokens_mask, + const sd::Tensor& id_embeds_tensor = {}) { ctm.clear(); ctmf16.clear(); ctmpos.clear(); @@ -460,16 +459,16 @@ public: ggml_cgraph* gf = ggml_new_graph(compute_ctx); + ggml_tensor* 
id_pixel_values = make_input(id_pixel_values_tensor); + ggml_tensor* prompt_embeds = make_input(prompt_embeds_tensor); + ggml_tensor* id_embeds = make_optional_input(id_embeds_tensor); + int64_t hidden_size = prompt_embeds->ne[0]; int64_t seq_length = prompt_embeds->ne[1]; ggml_type type = GGML_TYPE_F32; ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(runner_ctx.ggml_ctx, type, class_tokens_mask.size()); - ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values); - ggml_tensor* prompt_embeds_d = to_backend(prompt_embeds); - ggml_tensor* id_embeds_d = to_backend(id_embeds); - ggml_tensor* left = nullptr; ggml_tensor* right = nullptr; for (int i = 0; i < class_tokens_mask.size(); i++) { @@ -529,18 +528,18 @@ public: ggml_tensor* updated_prompt_embeds = nullptr; if (pm_version == PM_VERSION_1) updated_prompt_embeds = id_encoder.forward(&runner_ctx, - id_pixel_values_d, - prompt_embeds_d, + id_pixel_values, + prompt_embeds, class_tokens_mask_d, class_tokens_mask_pos, left, right); else if (pm_version == PM_VERSION_2) updated_prompt_embeds = id_encoder2.forward(&runner_ctx, - id_pixel_values_d, - prompt_embeds_d, + id_pixel_values, + prompt_embeds, class_tokens_mask_d, class_tokens_mask_pos, - id_embeds_d, + id_embeds, left, right); ggml_build_forward_expand(gf, updated_prompt_embeds); @@ -548,20 +547,16 @@ public: return gf; } - bool compute(const int n_threads, - ggml_tensor* id_pixel_values, - ggml_tensor* prompt_embeds, - ggml_tensor* id_embeds, - std::vector& class_tokens_mask, - ggml_tensor** updated_prompt_embeds, - ggml_context* output_ctx) { + sd::Tensor compute(const int n_threads, + const sd::Tensor& id_pixel_values, + const sd::Tensor& prompt_embeds, + const sd::Tensor& id_embeds, + std::vector& class_tokens_mask) { auto get_graph = [&]() -> ggml_cgraph* { - // return build_graph(compute_allocr, id_pixel_values, prompt_embeds, class_tokens_mask); return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds); }; - // 
GGMLRunner::compute(get_graph, n_threads, updated_prompt_embeds); - return GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx); + return take_or_empty(GGMLRunner::compute(get_graph, n_threads, true)); } }; diff --git a/src/preprocessing.hpp b/src/preprocessing.hpp index ca05ca22..7c83a289 100644 --- a/src/preprocessing.hpp +++ b/src/preprocessing.hpp @@ -1,179 +1,241 @@ #ifndef __PREPROCESSING_HPP__ #define __PREPROCESSING_HPP__ +#include +#include + #include "ggml_extend.hpp" + #define M_PI_ 3.14159265358979323846f -void convolve(ggml_tensor* input, ggml_tensor* output, ggml_tensor* kernel, int padding) { - ggml_init_params params; - params.mem_size = 80 * input->ne[0] * input->ne[1]; // 20M for 512x512 - params.mem_buffer = nullptr; - params.no_alloc = false; - ggml_context* ctx0 = ggml_init(params); - ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1); - ggml_fp32_to_fp16_row((float*)kernel->data, (ggml_fp16_t*)kernel_fp16->data, ggml_nelements(kernel)); - ggml_tensor* h = ggml_conv_2d(ctx0, kernel_fp16, input, 1, 1, padding, padding, 1, 1); - ggml_cgraph* gf = ggml_new_graph(ctx0); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, h, output)); - ggml_graph_compute_with_ctx(ctx0, gf, 1); - ggml_free(ctx0); +static inline int64_t preprocessing_offset_4d(const sd::Tensor& tensor, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) { + const auto& shape = tensor.shape(); + int64_t n0 = shape.size() > 0 ? shape[0] : 1; + int64_t n1 = shape.size() > 1 ? shape[1] : 1; + int64_t n2 = shape.size() > 2 ? 
shape[2] : 1; + return ((i3 * n2 + i2) * n1 + i1) * n0 + i0; } -void gaussian_kernel(ggml_tensor* kernel) { - int ks_mid = static_cast(kernel->ne[0] / 2); +static inline float preprocessing_get_4d(const sd::Tensor& tensor, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) { + return tensor.values()[static_cast(preprocessing_offset_4d(tensor, i0, i1, i2, i3))]; +} + +static inline void preprocessing_set_4d(sd::Tensor& tensor, float value, int64_t i0, int64_t i1 = 0, int64_t i2 = 0, int64_t i3 = 0) { + tensor.values()[static_cast(preprocessing_offset_4d(tensor, i0, i1, i2, i3))] = value; +} + +static inline sd::Tensor sd_image_to_preprocessing_tensor(sd_image_t image) { + sd::Tensor tensor({static_cast(image.width), static_cast(image.height), static_cast(image.channel), 1}); + for (uint32_t y = 0; y < image.height; ++y) { + for (uint32_t x = 0; x < image.width; ++x) { + for (uint32_t c = 0; c < image.channel; ++c) { + preprocessing_set_4d(tensor, sd_image_get_f32(image, x, y, c), x, y, c, 0); + } + } + } + return tensor; +} + +static inline void preprocessing_tensor_to_sd_image(const sd::Tensor& tensor, uint8_t* image_data) { + GGML_ASSERT(tensor.dim() == 4); + GGML_ASSERT(tensor.shape()[3] == 1); + GGML_ASSERT(image_data != nullptr); + + int width = static_cast(tensor.shape()[0]); + int height = static_cast(tensor.shape()[1]); + int channel = static_cast(tensor.shape()[2]); + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < channel; ++c) { + float value = preprocessing_get_4d(tensor, x, y, c, 0); + value = std::min(1.0f, std::max(0.0f, value)); + image_data[(y * width + x) * channel + c] = static_cast(std::round(value * 255.0f)); + } + } + } +} + +static inline sd::Tensor gaussian_kernel_tensor(int kernel_size) { + sd::Tensor kernel({kernel_size, kernel_size, 1, 1}); + int ks_mid = kernel_size / 2; float sigma = 1.4f; - float normal = 1.f / (2.0f * M_PI_ * powf(sigma, 2.0f)); - for (int y = 0; y < kernel->ne[0]; 
y++) { + float normal = 1.f / (2.0f * M_PI_ * std::pow(sigma, 2.0f)); + for (int y = 0; y < kernel_size; ++y) { float gx = static_cast(-ks_mid + y); - for (int x = 0; x < kernel->ne[1]; x++) { + for (int x = 0; x < kernel_size; ++x) { float gy = static_cast(-ks_mid + x); - float k_ = expf(-((gx * gx + gy * gy) / (2.0f * powf(sigma, 2.0f)))) * normal; - ggml_ext_tensor_set_f32(kernel, k_, x, y); + float k = std::exp(-((gx * gx + gy * gy) / (2.0f * std::pow(sigma, 2.0f)))) * normal; + preprocessing_set_4d(kernel, k, x, y, 0, 0); } } + return kernel; } -void grayscale(ggml_tensor* rgb_img, ggml_tensor* grayscale) { - for (int iy = 0; iy < rgb_img->ne[1]; iy++) { - for (int ix = 0; ix < rgb_img->ne[0]; ix++) { - float r = ggml_ext_tensor_get_f32(rgb_img, ix, iy); - float g = ggml_ext_tensor_get_f32(rgb_img, ix, iy, 1); - float b = ggml_ext_tensor_get_f32(rgb_img, ix, iy, 2); +static inline sd::Tensor convolve_tensor(const sd::Tensor& input, const sd::Tensor& kernel, int padding) { + GGML_ASSERT(input.dim() == 4); + GGML_ASSERT(kernel.dim() == 4); + GGML_ASSERT(input.shape()[3] == 1); + GGML_ASSERT(kernel.shape()[2] == 1); + GGML_ASSERT(kernel.shape()[3] == 1); + + sd::Tensor output(input.shape()); + int64_t width = input.shape()[0]; + int64_t height = input.shape()[1]; + int64_t channels = input.shape()[2]; + int64_t kernel_w = kernel.shape()[0]; + int64_t kernel_h = kernel.shape()[1]; + + for (int64_t c = 0; c < channels; ++c) { + for (int64_t y = 0; y < height; ++y) { + for (int64_t x = 0; x < width; ++x) { + float sum = 0.0f; + for (int64_t ky = 0; ky < kernel_h; ++ky) { + int64_t iy = y + ky - padding; + if (iy < 0 || iy >= height) { + continue; + } + for (int64_t kx = 0; kx < kernel_w; ++kx) { + int64_t ix = x + kx - padding; + if (ix < 0 || ix >= width) { + continue; + } + sum += preprocessing_get_4d(input, ix, iy, c, 0) * preprocessing_get_4d(kernel, kx, ky, 0, 0); + } + } + preprocessing_set_4d(output, sum, x, y, c, 0); + } + } + } + return output; +} + +static 
inline sd::Tensor grayscale_tensor(const sd::Tensor& rgb_img) { + GGML_ASSERT(rgb_img.dim() == 4); + GGML_ASSERT(rgb_img.shape()[2] >= 3); + sd::Tensor grayscale({rgb_img.shape()[0], rgb_img.shape()[1], 1, rgb_img.shape()[3]}); + for (int64_t iy = 0; iy < rgb_img.shape()[1]; ++iy) { + for (int64_t ix = 0; ix < rgb_img.shape()[0]; ++ix) { + float r = preprocessing_get_4d(rgb_img, ix, iy, 0, 0); + float g = preprocessing_get_4d(rgb_img, ix, iy, 1, 0); + float b = preprocessing_get_4d(rgb_img, ix, iy, 2, 0); float gray = 0.2989f * r + 0.5870f * g + 0.1140f * b; - ggml_ext_tensor_set_f32(grayscale, gray, ix, iy); + preprocessing_set_4d(grayscale, gray, ix, iy, 0, 0); } } + return grayscale; } -void prop_hypot(ggml_tensor* x, ggml_tensor* y, ggml_tensor* h) { - int n_elements = static_cast(ggml_nelements(h)); - float* dx = (float*)x->data; - float* dy = (float*)y->data; - float* dh = (float*)h->data; - for (int i = 0; i < n_elements; i++) { - dh[i] = sqrtf(dx[i] * dx[i] + dy[i] * dy[i]); +static inline sd::Tensor tensor_hypot(const sd::Tensor& x, const sd::Tensor& y) { + sd::tensor_check_same_shape(x, y); + sd::Tensor out(x.shape()); + for (int64_t i = 0; i < out.numel(); ++i) { + out[i] = std::sqrt(x[i] * x[i] + y[i] * y[i]); } + return out; } -void prop_arctan2(ggml_tensor* x, ggml_tensor* y, ggml_tensor* h) { - int n_elements = static_cast(ggml_nelements(h)); - float* dx = (float*)x->data; - float* dy = (float*)y->data; - float* dh = (float*)h->data; - for (int i = 0; i < n_elements; i++) { - dh[i] = atan2f(dy[i], dx[i]); +static inline sd::Tensor tensor_arctan2(const sd::Tensor& x, const sd::Tensor& y) { + sd::tensor_check_same_shape(x, y); + sd::Tensor out(x.shape()); + for (int64_t i = 0; i < out.numel(); ++i) { + out[i] = std::atan2(y[i], x[i]); } + return out; } -void normalize_tensor(ggml_tensor* g) { - int n_elements = static_cast(ggml_nelements(g)); - float* dg = (float*)g->data; - float max = -INFINITY; - for (int i = 0; i < n_elements; i++) { - max = dg[i] 
> max ? dg[i] : max; +static inline void normalize_tensor(sd::Tensor* g) { + GGML_ASSERT(g != nullptr); + if (g->empty()) { + return; } - max = 1.0f / max; - for (int i = 0; i < n_elements; i++) { - dg[i] *= max; + float max_value = -std::numeric_limits::infinity(); + for (int64_t i = 0; i < g->numel(); ++i) { + max_value = std::max(max_value, (*g)[i]); } + if (max_value == 0.0f || !std::isfinite(max_value)) { + return; + } + *g *= (1.0f / max_value); } -void non_max_supression(ggml_tensor* result, ggml_tensor* G, ggml_tensor* D) { - for (int iy = 1; iy < result->ne[1] - 1; iy++) { - for (int ix = 1; ix < result->ne[0] - 1; ix++) { - float angle = ggml_ext_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_; - angle = angle < 0.0f ? angle += 180.0f : angle; +static inline sd::Tensor non_max_supression(const sd::Tensor& G, const sd::Tensor& D) { + GGML_ASSERT(G.shape() == D.shape()); + sd::Tensor result = sd::Tensor::zeros(G.shape()); + for (int64_t iy = 1; iy < result.shape()[1] - 1; ++iy) { + for (int64_t ix = 1; ix < result.shape()[0] - 1; ++ix) { + float angle = preprocessing_get_4d(D, ix, iy, 0, 0) * 180.0f / M_PI_; + angle = angle < 0.0f ? 
angle + 180.0f : angle; float q = 1.0f; float r = 1.0f; - // angle 0 - if ((0 >= angle && angle < 22.5f) || (157.5f >= angle && angle <= 180)) { - q = ggml_ext_tensor_get_f32(G, ix, iy + 1); - r = ggml_ext_tensor_get_f32(G, ix, iy - 1); - } - // angle 45 - else if (22.5f >= angle && angle < 67.5f) { - q = ggml_ext_tensor_get_f32(G, ix + 1, iy - 1); - r = ggml_ext_tensor_get_f32(G, ix - 1, iy + 1); - } - // angle 90 - else if (67.5f >= angle && angle < 112.5) { - q = ggml_ext_tensor_get_f32(G, ix + 1, iy); - r = ggml_ext_tensor_get_f32(G, ix - 1, iy); - } - // angle 135 - else if (112.5 >= angle && angle < 157.5f) { - q = ggml_ext_tensor_get_f32(G, ix - 1, iy - 1); - r = ggml_ext_tensor_get_f32(G, ix + 1, iy + 1); + if ((0 >= angle && angle < 22.5f) || (157.5f >= angle && angle <= 180.0f)) { + q = preprocessing_get_4d(G, ix, iy + 1, 0, 0); + r = preprocessing_get_4d(G, ix, iy - 1, 0, 0); + } else if (22.5f >= angle && angle < 67.5f) { + q = preprocessing_get_4d(G, ix + 1, iy - 1, 0, 0); + r = preprocessing_get_4d(G, ix - 1, iy + 1, 0, 0); + } else if (67.5f >= angle && angle < 112.5f) { + q = preprocessing_get_4d(G, ix + 1, iy, 0, 0); + r = preprocessing_get_4d(G, ix - 1, iy, 0, 0); + } else if (112.5f >= angle && angle < 157.5f) { + q = preprocessing_get_4d(G, ix - 1, iy - 1, 0, 0); + r = preprocessing_get_4d(G, ix + 1, iy + 1, 0, 0); } - float cur = ggml_ext_tensor_get_f32(G, ix, iy); - if ((cur >= q) && (cur >= r)) { - ggml_ext_tensor_set_f32(result, cur, ix, iy); - } else { - ggml_ext_tensor_set_f32(result, 0.0f, ix, iy); - } + float cur = preprocessing_get_4d(G, ix, iy, 0, 0); + preprocessing_set_4d(result, (cur >= q && cur >= r) ? 
cur : 0.0f, ix, iy, 0, 0); } } + return result; } -void threshold_hystersis(ggml_tensor* img, float high_threshold, float low_threshold, float weak, float strong) { - int n_elements = static_cast(ggml_nelements(img)); - float* imd = (float*)img->data; - float max = -INFINITY; - for (int i = 0; i < n_elements; i++) { - max = imd[i] > max ? imd[i] : max; +static inline void threshold_hystersis(sd::Tensor* img, float high_threshold, float low_threshold, float weak, float strong) { + GGML_ASSERT(img != nullptr); + if (img->empty()) { + return; } - float ht = max * high_threshold; + float max_value = -std::numeric_limits::infinity(); + for (int64_t i = 0; i < img->numel(); ++i) { + max_value = std::max(max_value, (*img)[i]); + } + + float ht = max_value * high_threshold; float lt = ht * low_threshold; - for (int i = 0; i < n_elements; i++) { - float img_v = imd[i]; - if (img_v >= ht) { // strong pixel - imd[i] = strong; - } else if (img_v <= ht && img_v >= lt) { // strong pixel - imd[i] = weak; + for (int64_t i = 0; i < img->numel(); ++i) { + float img_v = (*img)[i]; + if (img_v >= ht) { + (*img)[i] = strong; + } else if (img_v <= ht && img_v >= lt) { + (*img)[i] = weak; } } - for (int iy = 0; iy < img->ne[1]; iy++) { - for (int ix = 0; ix < img->ne[0]; ix++) { - if (ix >= 3 && ix <= img->ne[0] - 3 && iy >= 3 && iy <= img->ne[1] - 3) { - ggml_ext_tensor_set_f32(img, ggml_ext_tensor_get_f32(img, ix, iy), ix, iy); - } else { - ggml_ext_tensor_set_f32(img, 0.0f, ix, iy); + for (int64_t iy = 0; iy < img->shape()[1]; ++iy) { + for (int64_t ix = 0; ix < img->shape()[0]; ++ix) { + if (!(ix >= 3 && ix <= img->shape()[0] - 3 && iy >= 3 && iy <= img->shape()[1] - 3)) { + preprocessing_set_4d(*img, 0.0f, ix, iy, 0, 0); } } } - // hysteresis - for (int iy = 1; iy < img->ne[1] - 1; iy++) { - for (int ix = 1; ix < img->ne[0] - 1; ix++) { - float imd_v = ggml_ext_tensor_get_f32(img, ix, iy); + for (int64_t iy = 1; iy < img->shape()[1] - 1; ++iy) { + for (int64_t ix = 1; ix < 
img->shape()[0] - 1; ++ix) { + float imd_v = preprocessing_get_4d(*img, ix, iy, 0, 0); if (imd_v == weak) { - if (ggml_ext_tensor_get_f32(img, ix + 1, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix + 1, iy) == strong || - ggml_ext_tensor_get_f32(img, ix, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix, iy + 1) == strong || - ggml_ext_tensor_get_f32(img, ix - 1, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix - 1, iy) == strong) { - ggml_ext_tensor_set_f32(img, strong, ix, iy); - } else { - ggml_ext_tensor_set_f32(img, 0.0f, ix, iy); - } + bool has_strong_neighbor = + preprocessing_get_4d(*img, ix + 1, iy - 1, 0, 0) == strong || + preprocessing_get_4d(*img, ix + 1, iy, 0, 0) == strong || + preprocessing_get_4d(*img, ix, iy - 1, 0, 0) == strong || + preprocessing_get_4d(*img, ix, iy + 1, 0, 0) == strong || + preprocessing_get_4d(*img, ix - 1, iy - 1, 0, 0) == strong || + preprocessing_get_4d(*img, ix - 1, iy, 0, 0) == strong; + preprocessing_set_4d(*img, has_strong_neighbor ? 
strong : 0.0f, ix, iy, 0, 0); } } } } bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) { - ggml_init_params params; - params.mem_size = static_cast(40 * img.width * img.height); // 10MB for 512x512 - params.mem_buffer = nullptr; - params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - - if (!work_ctx) { - LOG_ERROR("ggml_init() failed"); - return false; - } - float kX[9] = { -1, 0, 1, -2, 0, 2, @@ -184,43 +246,33 @@ bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, 0, 0, 0, -1, -2, -1}; - // generate kernel - int kernel_size = 5; - ggml_tensor* gkernel = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kernel_size, kernel_size, 1, 1); - ggml_tensor* sf_kx = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1); - memcpy(sf_kx->data, kX, ggml_nbytes(sf_kx)); - ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1); - memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky)); - gaussian_kernel(gkernel); - ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 3, 1); - ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 1, 1); - ggml_tensor* iX = ggml_dup_tensor(work_ctx, image_gray); - ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray); - ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray); - ggml_tensor* tetha = ggml_dup_tensor(work_ctx, image_gray); - sd_image_to_ggml_tensor(img, image); - grayscale(image, image_gray); - convolve(image_gray, image_gray, gkernel, 2); - convolve(image_gray, iX, sf_kx, 1); - convolve(image_gray, iY, sf_ky, 1); - prop_hypot(iX, iY, G); - normalize_tensor(G); - prop_arctan2(iX, iY, tetha); - non_max_supression(image_gray, G, tetha); - threshold_hystersis(image_gray, high_threshold, low_threshold, weak, strong); - // to RGB channels - for (uint32_t iy = 0; iy < img.height; iy++) { - for (uint32_t ix = 0; ix < img.width; ix++) { - 
float gray = ggml_ext_tensor_get_f32(image_gray, ix, iy); + sd::Tensor gkernel = gaussian_kernel_tensor(5); + sd::Tensor sf_kx({3, 3, 1, 1}, std::vector(kX, kX + 9)); + sd::Tensor sf_ky({3, 3, 1, 1}, std::vector(kY, kY + 9)); + + sd::Tensor image = sd_image_to_preprocessing_tensor(img); + sd::Tensor image_gray = grayscale_tensor(image); + image_gray = convolve_tensor(image_gray, gkernel, 2); + sd::Tensor iX = convolve_tensor(image_gray, sf_kx, 1); + sd::Tensor iY = convolve_tensor(image_gray, sf_ky, 1); + sd::Tensor G = tensor_hypot(iX, iY); + normalize_tensor(&G); + sd::Tensor theta = tensor_arctan2(iX, iY); + image_gray = non_max_supression(G, theta); + threshold_hystersis(&image_gray, high_threshold, low_threshold, weak, strong); + + for (uint32_t iy = 0; iy < img.height; ++iy) { + for (uint32_t ix = 0; ix < img.width; ++ix) { + float gray = preprocessing_get_4d(image_gray, ix, iy, 0, 0); gray = inverse ? 1.0f - gray : gray; - ggml_ext_tensor_set_f32(image, gray, ix, iy); - ggml_ext_tensor_set_f32(image, gray, ix, iy, 1); - ggml_ext_tensor_set_f32(image, gray, ix, iy, 2); + for (uint32_t c = 0; c < img.channel; ++c) { + preprocessing_set_4d(image, gray, ix, iy, c, 0); + } } } - ggml_tensor_to_sd_image(image, img.data); - ggml_free(work_ctx); + + preprocessing_tensor_to_sd_image(image, img.data); return true; } -#endif // __PREPROCESSING_HPP__ \ No newline at end of file +#endif // __PREPROCESSING_HPP__ diff --git a/src/qwen_image.hpp b/src/qwen_image.hpp index 68af0e8e..83c8cec6 100644 --- a/src/qwen_image.hpp +++ b/src/qwen_image.hpp @@ -525,20 +525,21 @@ namespace Qwen { qwen_image.get_param_tensors(tensors, prefix); } - ggml_cgraph* build_graph(ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - std::vector ref_latents = {}, - bool increase_ref_index = false) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor, + const std::vector>& ref_latents_tensor = {}, + bool 
increase_ref_index = false) { + ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); GGML_ASSERT(x->ne[3] == 1); - ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE); - - x = to_backend(x); - context = to_backend(context); - timesteps = to_backend(timesteps); - - for (int i = 0; i < ref_latents.size(); i++) { - ref_latents[i] = to_backend(ref_latents[i]); + GGML_ASSERT(!context_tensor.empty()); + ggml_tensor* context = make_input(context_tensor); + std::vector ref_latents; + ref_latents.reserve(ref_latents_tensor.size()); + for (const auto& ref_latent_tensor : ref_latents_tensor) { + ref_latents.push_back(make_input(ref_latent_tensor)); } pe_vec = Rope::gen_qwen_image_pe(static_cast(x->ne[1]), @@ -600,14 +601,12 @@ namespace Qwen { return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - std::vector ref_latents = {}, - bool increase_ref_index = false, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) { + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context, + const std::vector>& ref_latents = {}, + bool increase_ref_index = false) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size] @@ -615,7 +614,7 @@ namespace Qwen { return build_graph(x, timesteps, context, ref_latents, increase_ref_index); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); } void test() { @@ -624,30 +623,37 @@ namespace Qwen { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); { - // auto x = ggml_new_tensor_4d(work_ctx, 
GGML_TYPE_F32, 16, 16, 16, 1); + // auto x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 16, 1); // ggml_set_f32(x, 0.01f); - auto x = load_tensor_from_file(work_ctx, "./qwen_image_x.bin"); - print_ggml_tensor(x); + auto x = sd::load_tensor_from_file_as_tensor("./qwen_image_x.bin"); + print_sd_tensor(x); std::vector timesteps_vec(1, 1000.f); - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); + auto timesteps = sd::Tensor::from_vector(timesteps_vec); - // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 3584, 256, 1); + // auto context = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3584, 256, 1); // ggml_set_f32(context, 0.01f); - auto context = load_tensor_from_file(work_ctx, "./qwen_image_context.bin"); - print_ggml_tensor(context); + auto context = sd::load_tensor_from_file_as_tensor("./qwen_image_context.bin"); + print_sd_tensor(context); - ggml_tensor* out = nullptr; + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - compute(8, x, timesteps, context, {}, false, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = compute(8, + x, + timesteps, + context, + {}, + false); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("qwen_image test done in %lldms", t1 - t0); } } diff --git a/src/sample-cache.cpp b/src/sample-cache.cpp new file mode 100644 index 00000000..5739178d --- /dev/null +++ b/src/sample-cache.cpp @@ -0,0 +1,361 @@ +#include "sample-cache.h" + +namespace sd_sample { + + static float get_cache_reuse_threshold(const sd_cache_params_t& params) { + float reuse_threshold = params.reuse_threshold; + if (reuse_threshold == INFINITY) { + if (params.mode == SD_CACHE_EASYCACHE) { + reuse_threshold = 0.2f; + } else if (params.mode == SD_CACHE_UCACHE) { + reuse_threshold = 1.0f; + } + } + return std::max(0.0f, reuse_threshold); + } + + bool SampleCacheRuntime::easycache_enabled() const { + return 
mode == SampleCacheMode::EASYCACHE; + } + + bool SampleCacheRuntime::ucache_enabled() const { + return mode == SampleCacheMode::UCACHE; + } + + bool SampleCacheRuntime::cachedit_enabled() const { + return mode == SampleCacheMode::CACHEDIT; + } + + static bool has_valid_cache_percent_range(const sd_cache_params_t& cache_params) { + if (cache_params.mode != SD_CACHE_EASYCACHE && cache_params.mode != SD_CACHE_UCACHE) { + return true; + } + + return cache_params.start_percent >= 0.0f && + cache_params.start_percent < 1.0f && + cache_params.end_percent > 0.0f && + cache_params.end_percent <= 1.0f && + cache_params.start_percent < cache_params.end_percent; + } + + static void init_easycache_runtime(SampleCacheRuntime& runtime, + SDVersion version, + const sd_cache_params_t& cache_params, + Denoiser* denoiser) { + if (!sd_version_is_dit(version)) { + LOG_WARN("EasyCache requested but not supported for this model type"); + return; + } + + EasyCacheConfig config; + config.enabled = true; + config.reuse_threshold = get_cache_reuse_threshold(cache_params); + config.start_percent = cache_params.start_percent; + config.end_percent = cache_params.end_percent; + + runtime.easycache.init(config, denoiser); + if (!runtime.easycache.enabled()) { + LOG_WARN("EasyCache requested but could not be initialized for this run"); + return; + } + + runtime.mode = SampleCacheMode::EASYCACHE; + LOG_INFO("EasyCache enabled - threshold: %.3f, start: %.2f, end: %.2f", + config.reuse_threshold, + config.start_percent, + config.end_percent); + } + + static void init_ucache_runtime(SampleCacheRuntime& runtime, + SDVersion version, + const sd_cache_params_t& cache_params, + Denoiser* denoiser, + const std::vector& sigmas) { + if (!sd_version_is_unet(version)) { + LOG_WARN("UCache requested but not supported for this model type (only UNET models)"); + return; + } + + UCacheConfig config; + config.enabled = true; + config.reuse_threshold = get_cache_reuse_threshold(cache_params); + config.start_percent 
= cache_params.start_percent; + config.end_percent = cache_params.end_percent; + config.error_decay_rate = std::max(0.0f, std::min(1.0f, cache_params.error_decay_rate)); + config.use_relative_threshold = cache_params.use_relative_threshold; + config.reset_error_on_compute = cache_params.reset_error_on_compute; + + runtime.ucache.init(config, denoiser); + if (!runtime.ucache.enabled()) { + LOG_WARN("UCache requested but could not be initialized for this run"); + return; + } + + runtime.ucache.set_sigmas(sigmas); + runtime.mode = SampleCacheMode::UCACHE; + LOG_INFO("UCache enabled - threshold: %.3f, start: %.2f, end: %.2f, decay: %.2f, relative: %s, reset: %s", + config.reuse_threshold, + config.start_percent, + config.end_percent, + config.error_decay_rate, + config.use_relative_threshold ? "true" : "false", + config.reset_error_on_compute ? "true" : "false"); + } + + static void init_cachedit_runtime(SampleCacheRuntime& runtime, + SDVersion version, + const sd_cache_params_t& cache_params, + const std::vector& sigmas) { + if (!sd_version_is_dit(version)) { + LOG_WARN("CacheDIT requested but not supported for this model type (only DiT models)"); + return; + } + + DBCacheConfig dbcfg; + dbcfg.enabled = (cache_params.mode == SD_CACHE_DBCACHE || cache_params.mode == SD_CACHE_CACHE_DIT); + dbcfg.Fn_compute_blocks = cache_params.Fn_compute_blocks; + dbcfg.Bn_compute_blocks = cache_params.Bn_compute_blocks; + dbcfg.residual_diff_threshold = cache_params.residual_diff_threshold; + dbcfg.max_warmup_steps = cache_params.max_warmup_steps; + dbcfg.max_cached_steps = cache_params.max_cached_steps; + dbcfg.max_continuous_cached_steps = cache_params.max_continuous_cached_steps; + if (cache_params.scm_mask != nullptr && strlen(cache_params.scm_mask) > 0) { + dbcfg.steps_computation_mask = parse_scm_mask(cache_params.scm_mask); + } + dbcfg.scm_policy_dynamic = cache_params.scm_policy_dynamic; + + TaylorSeerConfig tcfg; + tcfg.enabled = (cache_params.mode == SD_CACHE_TAYLORSEER || 
cache_params.mode == SD_CACHE_CACHE_DIT); + tcfg.n_derivatives = cache_params.taylorseer_n_derivatives; + tcfg.skip_interval_steps = cache_params.taylorseer_skip_interval; + + runtime.cachedit.init(dbcfg, tcfg); + if (!runtime.cachedit.enabled()) { + LOG_WARN("CacheDIT requested but could not be initialized for this run"); + return; + } + + runtime.cachedit.set_sigmas(sigmas); + runtime.mode = SampleCacheMode::CACHEDIT; + LOG_INFO("CacheDIT enabled - mode: %s, Fn: %d, Bn: %d, threshold: %.3f, warmup: %d", + cache_params.mode == SD_CACHE_CACHE_DIT ? "DBCache+TaylorSeer" : (cache_params.mode == SD_CACHE_DBCACHE ? "DBCache" : "TaylorSeer"), + dbcfg.Fn_compute_blocks, + dbcfg.Bn_compute_blocks, + dbcfg.residual_diff_threshold, + dbcfg.max_warmup_steps); + } + + static void init_spectrum_runtime(SampleCacheRuntime& runtime, + SDVersion version, + const sd_cache_params_t& cache_params, + const std::vector& sigmas) { + if (!sd_version_is_unet(version) && !sd_version_is_dit(version)) { + LOG_WARN("Spectrum requested but not supported for this model type (only UNET and DiT models)"); + return; + } + + SpectrumConfig config; + config.w = cache_params.spectrum_w; + config.m = cache_params.spectrum_m; + config.lam = cache_params.spectrum_lam; + config.window_size = cache_params.spectrum_window_size; + config.flex_window = cache_params.spectrum_flex_window; + config.warmup_steps = cache_params.spectrum_warmup_steps; + config.stop_percent = cache_params.spectrum_stop_percent; + + size_t total_steps = sigmas.size() > 0 ? 
sigmas.size() - 1 : 0; + runtime.spectrum.init(config, total_steps); + runtime.spectrum_enabled = true; + + LOG_INFO("Spectrum enabled - w: %.2f, m: %d, lam: %.2f, window: %d, flex: %.2f, warmup: %d, stop: %.0f%%", + config.w, config.m, config.lam, + config.window_size, config.flex_window, + config.warmup_steps, config.stop_percent * 100.0f); + } + + SampleCacheRuntime init_sample_cache_runtime(SDVersion version, + const sd_cache_params_t* cache_params, + Denoiser* denoiser, + const std::vector& sigmas) { + SampleCacheRuntime runtime; + if (cache_params == nullptr || cache_params->mode == SD_CACHE_DISABLED) { + return runtime; + } + + if (!has_valid_cache_percent_range(*cache_params)) { + LOG_WARN("Cache disabled due to invalid percent range (start=%.3f, end=%.3f)", + cache_params->start_percent, + cache_params->end_percent); + return runtime; + } + + switch (cache_params->mode) { + case SD_CACHE_EASYCACHE: + init_easycache_runtime(runtime, version, *cache_params, denoiser); + break; + case SD_CACHE_UCACHE: + init_ucache_runtime(runtime, version, *cache_params, denoiser, sigmas); + break; + case SD_CACHE_DBCACHE: + case SD_CACHE_TAYLORSEER: + case SD_CACHE_CACHE_DIT: + init_cachedit_runtime(runtime, version, *cache_params, sigmas); + break; + case SD_CACHE_SPECTRUM: + init_spectrum_runtime(runtime, version, *cache_params, sigmas); + break; + default: + break; + } + + return runtime; + } + + SampleStepCacheDispatcher::SampleStepCacheDispatcher(SampleCacheRuntime& runtime, int step, float sigma) + : runtime(runtime), step(step), sigma(sigma), step_index(step > 0 ? 
(step - 1) : -1) { + if (step_index < 0) { + return; + } + + switch (runtime.mode) { + case SampleCacheMode::EASYCACHE: + runtime.easycache.begin_step(step_index, sigma); + break; + case SampleCacheMode::UCACHE: + runtime.ucache.begin_step(step_index, sigma); + break; + case SampleCacheMode::CACHEDIT: + runtime.cachedit.begin_step(step_index, sigma); + break; + case SampleCacheMode::NONE: + break; + } + } + + bool SampleStepCacheDispatcher::before_condition(const void* condition, + const sd::Tensor& input, + sd::Tensor* output) { + if (step_index < 0 || condition == nullptr || output == nullptr) { + return false; + } + + switch (runtime.mode) { + case SampleCacheMode::EASYCACHE: + return runtime.easycache.before_condition(condition, input, output, sigma, step_index); + case SampleCacheMode::UCACHE: + return runtime.ucache.before_condition(condition, input, output, sigma, step_index); + case SampleCacheMode::CACHEDIT: + return runtime.cachedit.before_condition(condition, input, output, sigma, step_index); + case SampleCacheMode::NONE: + return false; + } + + return false; + } + + void SampleStepCacheDispatcher::after_condition(const void* condition, + const sd::Tensor& input, + const sd::Tensor& output) { + if (step_index < 0 || condition == nullptr) { + return; + } + + switch (runtime.mode) { + case SampleCacheMode::EASYCACHE: + runtime.easycache.after_condition(condition, input, output); + break; + case SampleCacheMode::UCACHE: + runtime.ucache.after_condition(condition, input, output); + break; + case SampleCacheMode::CACHEDIT: + runtime.cachedit.after_condition(condition, input, output); + break; + case SampleCacheMode::NONE: + break; + } + } + + bool SampleStepCacheDispatcher::is_step_skipped() const { + switch (runtime.mode) { + case SampleCacheMode::EASYCACHE: + return runtime.easycache.is_step_skipped(); + case SampleCacheMode::UCACHE: + return runtime.ucache.is_step_skipped(); + case SampleCacheMode::CACHEDIT: + return runtime.cachedit.is_step_skipped(); + 
case SampleCacheMode::NONE: + return false; + } + + return false; + } + + void log_sample_cache_summary(const SampleCacheRuntime& runtime, size_t total_steps) { + if (runtime.easycache_enabled()) { + if (runtime.easycache.total_steps_skipped > 0 && total_steps > 0) { + if (runtime.easycache.total_steps_skipped < static_cast(total_steps)) { + double speedup = static_cast(total_steps) / + static_cast(total_steps - runtime.easycache.total_steps_skipped); + LOG_INFO("EasyCache skipped %d/%zu steps (%.2fx estimated speedup)", + runtime.easycache.total_steps_skipped, + total_steps, + speedup); + } else { + LOG_INFO("EasyCache skipped %d/%zu steps", + runtime.easycache.total_steps_skipped, + total_steps); + } + } else if (total_steps > 0) { + LOG_INFO("EasyCache completed without skipping steps"); + } + } + + if (runtime.ucache_enabled()) { + if (runtime.ucache.total_steps_skipped > 0 && total_steps > 0) { + if (runtime.ucache.total_steps_skipped < static_cast(total_steps)) { + double speedup = static_cast(total_steps) / + static_cast(total_steps - runtime.ucache.total_steps_skipped); + LOG_INFO("UCache skipped %d/%zu steps (%.2fx estimated speedup)", + runtime.ucache.total_steps_skipped, + total_steps, + speedup); + } else { + LOG_INFO("UCache skipped %d/%zu steps", + runtime.ucache.total_steps_skipped, + total_steps); + } + } else if (total_steps > 0) { + LOG_INFO("UCache completed without skipping steps"); + } + } + + if (runtime.cachedit_enabled()) { + if (runtime.cachedit.total_steps_skipped > 0 && total_steps > 0) { + if (runtime.cachedit.total_steps_skipped < static_cast(total_steps)) { + double speedup = static_cast(total_steps) / + static_cast(total_steps - runtime.cachedit.total_steps_skipped); + LOG_INFO("CacheDIT skipped %d/%zu steps (%.2fx estimated speedup)", + runtime.cachedit.total_steps_skipped, + total_steps, + speedup); + } else { + LOG_INFO("CacheDIT skipped %d/%zu steps", + runtime.cachedit.total_steps_skipped, + total_steps); + } + } else if 
(total_steps > 0) { + LOG_INFO("CacheDIT completed without skipping steps"); + } + } + + if (runtime.spectrum_enabled && runtime.spectrum.total_steps_skipped > 0 && total_steps > 0) { + double speedup = static_cast(total_steps) / + static_cast(total_steps - runtime.spectrum.total_steps_skipped); + LOG_INFO("Spectrum skipped %d/%zu steps (%.2fx estimated speedup)", + runtime.spectrum.total_steps_skipped, + total_steps, + speedup); + } + } + +} // namespace sd_sample diff --git a/src/sample-cache.h b/src/sample-cache.h new file mode 100644 index 00000000..398ad065 --- /dev/null +++ b/src/sample-cache.h @@ -0,0 +1,61 @@ +#ifndef __SAMPLE_CACHE_H__ +#define __SAMPLE_CACHE_H__ + +#include + +#include "cache_dit.hpp" +#include "denoiser.hpp" +#include "easycache.hpp" +#include "model.h" +#include "spectrum.hpp" +#include "tensor.hpp" +#include "ucache.hpp" +#include "util.h" + +namespace sd_sample { + + enum class SampleCacheMode { + NONE, + EASYCACHE, + UCACHE, + CACHEDIT, + }; + + struct SampleCacheRuntime { + SampleCacheMode mode = SampleCacheMode::NONE; + + EasyCacheState easycache; + UCacheState ucache; + CacheDitConditionState cachedit; + SpectrumState spectrum; + + bool spectrum_enabled = false; + + bool easycache_enabled() const; + bool ucache_enabled() const; + bool cachedit_enabled() const; + }; + + struct SampleStepCacheDispatcher { + SampleCacheRuntime& runtime; + int step; + float sigma; + int step_index; + + SampleStepCacheDispatcher(SampleCacheRuntime& runtime, int step, float sigma); + + bool before_condition(const void* condition, const sd::Tensor& input, sd::Tensor* output); + void after_condition(const void* condition, const sd::Tensor& input, const sd::Tensor& output); + bool is_step_skipped() const; + }; + + SampleCacheRuntime init_sample_cache_runtime(SDVersion version, + const sd_cache_params_t* cache_params, + Denoiser* denoiser, + const std::vector& sigmas); + + void log_sample_cache_summary(const SampleCacheRuntime& runtime, size_t total_steps); 
+ +} // namespace sd_sample + +#endif // __SAMPLE_CACHE_H__ diff --git a/src/spectrum.hpp b/src/spectrum.hpp index 9542a8f3..add1796f 100644 --- a/src/spectrum.hpp +++ b/src/spectrum.hpp @@ -6,6 +6,7 @@ #include #include "ggml_extend.hpp" +#include "tensor.hpp" struct SpectrumConfig { float w = 0.40f; @@ -57,11 +58,8 @@ struct SpectrumState { return (num_cached + 1) % ws != 0; } - void update(const ggml_tensor* denoised) { - int64_t ne = ggml_nelements(denoised); - const float* data = (const float*)denoised->data; - - H_buf.emplace_back(data, data + ne); + void update(const sd::Tensor& denoised) { + H_buf.emplace_back(denoised.data(), denoised.data() + denoised.numel()); T_buf.push_back(taus(cnt)); while ((int)H_buf.size() > K) { @@ -76,13 +74,13 @@ struct SpectrumState { cnt++; } - void predict(ggml_tensor* denoised) { + void predict(sd::Tensor* denoised) { + GGML_ASSERT(denoised != nullptr); int64_t F = (int64_t)H_buf[0].size(); int K_curr = (int)H_buf.size(); int M1 = config.m + 1; float tau_at = taus(cnt); - // Design matrix X: K_curr x M1 (Chebyshev basis) std::vector X(K_curr * M1); for (int i = 0; i < K_curr; i++) { X[i * M1] = 1.0f; @@ -92,7 +90,6 @@ struct SpectrumState { X[i * M1 + j] = 2.0f * T_buf[i] * X[i * M1 + j - 1] - X[i * M1 + j - 2]; } - // x_star: Chebyshev basis at current tau std::vector x_star(M1); x_star[0] = 1.0f; if (M1 > 1) @@ -100,7 +97,6 @@ struct SpectrumState { for (int j = 2; j < M1; j++) x_star[j] = 2.0f * tau_at * x_star[j - 1] - x_star[j - 2]; - // XtX = X^T X + lambda I std::vector XtX(M1 * M1, 0.0f); for (int i = 0; i < M1; i++) { for (int j = 0; j < M1; j++) { @@ -111,7 +107,6 @@ struct SpectrumState { } } - // Cholesky decomposition std::vector L(M1 * M1, 0.0f); if (!cholesky_decompose(XtX.data(), L.data(), M1)) { float trace = 0.0f; @@ -122,18 +117,15 @@ struct SpectrumState { cholesky_decompose(XtX.data(), L.data(), M1); } - // Solve XtX v = x_star std::vector v(M1); cholesky_solve(L.data(), x_star.data(), v.data(), M1); - 
// Prediction weights per history entry std::vector weights(K_curr, 0.0f); for (int k = 0; k < K_curr; k++) for (int j = 0; j < M1; j++) weights[k] += X[k * M1 + j] * v[j]; - // Blend Chebyshev and Taylor predictions - float* out = (float*)denoised->data; + float* out = denoised->data(); float w_cheb = config.w; float w_taylor = 1.0f - w_cheb; const float* h_last = H_buf.back().data(); diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index bbf2f979..a59ff23e 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -8,18 +8,15 @@ #include "util.h" #include "auto_encoder_kl.hpp" -#include "cache_dit.hpp" #include "conditioner.hpp" #include "control.hpp" #include "denoiser.hpp" #include "diffusion_model.hpp" -#include "easycache.hpp" #include "esrgan.hpp" #include "lora.hpp" #include "pmid.hpp" -#include "spectrum.hpp" +#include "sample-cache.h" #include "tae.hpp" -#include "ucache.hpp" #include "vae.hpp" #include "latent-preview.h" @@ -78,7 +75,7 @@ const char* sampling_methods_str[] = { void calculate_alphas_cumprod(float* alphas_cumprod, float linear_start = 0.00085f, - float linear_end = 0.0120, + float linear_end = 0.0120f, int timesteps = TIMESTEPS) { float ls_sqrt = sqrtf(linear_start); float le_sqrt = sqrtf(linear_end); @@ -95,387 +92,14 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) { float reuse_threshold = params.reuse_threshold; if (reuse_threshold == INFINITY) { if (params.mode == SD_CACHE_EASYCACHE) { - reuse_threshold = 0.2; + reuse_threshold = 0.2f; } else if (params.mode == SD_CACHE_UCACHE) { - reuse_threshold = 1.0; + reuse_threshold = 1.0f; } } return std::max(0.0f, reuse_threshold); } -enum class SampleCacheMode { - NONE, - EASYCACHE, - UCACHE, - CACHEDIT, -}; - -struct SampleCacheRuntime { - SampleCacheMode mode = SampleCacheMode::NONE; - - EasyCacheState easycache; - UCacheState ucache; - CacheDitConditionState cachedit; - SpectrumState spectrum; - - bool spectrum_enabled = false; - - 
bool has_step_cache() const { - return mode != SampleCacheMode::NONE; - } - - bool easycache_enabled() const { - return mode == SampleCacheMode::EASYCACHE; - } - - bool ucache_enabled() const { - return mode == SampleCacheMode::UCACHE; - } - - bool cachedit_enabled() const { - return mode == SampleCacheMode::CACHEDIT; - } -}; - -static bool has_valid_cache_percent_range(const sd_cache_params_t& cache_params) { - if (cache_params.mode != SD_CACHE_EASYCACHE && cache_params.mode != SD_CACHE_UCACHE) { - return true; - } - - return cache_params.start_percent >= 0.0f && - cache_params.start_percent < 1.0f && - cache_params.end_percent > 0.0f && - cache_params.end_percent <= 1.0f && - cache_params.start_percent < cache_params.end_percent; -} - -static void init_easycache_runtime(SampleCacheRuntime& runtime, - SDVersion version, - const sd_cache_params_t& cache_params, - Denoiser* denoiser) { - if (!sd_version_is_dit(version)) { - LOG_WARN("EasyCache requested but not supported for this model type"); - return; - } - - EasyCacheConfig config; - config.enabled = true; - config.reuse_threshold = get_cache_reuse_threshold(cache_params); - config.start_percent = cache_params.start_percent; - config.end_percent = cache_params.end_percent; - - runtime.easycache.init(config, denoiser); - if (!runtime.easycache.enabled()) { - LOG_WARN("EasyCache requested but could not be initialized for this run"); - return; - } - - runtime.mode = SampleCacheMode::EASYCACHE; - LOG_INFO("EasyCache enabled - threshold: %.3f, start: %.2f, end: %.2f", - config.reuse_threshold, - config.start_percent, - config.end_percent); -} - -static void init_ucache_runtime(SampleCacheRuntime& runtime, - SDVersion version, - const sd_cache_params_t& cache_params, - Denoiser* denoiser, - const std::vector& sigmas) { - if (!sd_version_is_unet(version)) { - LOG_WARN("UCache requested but not supported for this model type (only UNET models)"); - return; - } - - UCacheConfig config; - config.enabled = true; - 
config.reuse_threshold = get_cache_reuse_threshold(cache_params); - config.start_percent = cache_params.start_percent; - config.end_percent = cache_params.end_percent; - config.error_decay_rate = std::max(0.0f, std::min(1.0f, cache_params.error_decay_rate)); - config.use_relative_threshold = cache_params.use_relative_threshold; - config.reset_error_on_compute = cache_params.reset_error_on_compute; - - runtime.ucache.init(config, denoiser); - if (!runtime.ucache.enabled()) { - LOG_WARN("UCache requested but could not be initialized for this run"); - return; - } - - runtime.ucache.set_sigmas(sigmas); - runtime.mode = SampleCacheMode::UCACHE; - LOG_INFO("UCache enabled - threshold: %.3f, start: %.2f, end: %.2f, decay: %.2f, relative: %s, reset: %s", - config.reuse_threshold, - config.start_percent, - config.end_percent, - config.error_decay_rate, - config.use_relative_threshold ? "true" : "false", - config.reset_error_on_compute ? "true" : "false"); -} - -static void init_cachedit_runtime(SampleCacheRuntime& runtime, - SDVersion version, - const sd_cache_params_t& cache_params, - const std::vector& sigmas) { - if (!sd_version_is_dit(version)) { - LOG_WARN("CacheDIT requested but not supported for this model type (only DiT models)"); - return; - } - - DBCacheConfig dbcfg; - dbcfg.enabled = (cache_params.mode == SD_CACHE_DBCACHE || - cache_params.mode == SD_CACHE_CACHE_DIT); - dbcfg.Fn_compute_blocks = cache_params.Fn_compute_blocks; - dbcfg.Bn_compute_blocks = cache_params.Bn_compute_blocks; - dbcfg.residual_diff_threshold = cache_params.residual_diff_threshold; - dbcfg.max_warmup_steps = cache_params.max_warmup_steps; - dbcfg.max_cached_steps = cache_params.max_cached_steps; - dbcfg.max_continuous_cached_steps = cache_params.max_continuous_cached_steps; - if (cache_params.scm_mask != nullptr && strlen(cache_params.scm_mask) > 0) { - dbcfg.steps_computation_mask = parse_scm_mask(cache_params.scm_mask); - } - dbcfg.scm_policy_dynamic = cache_params.scm_policy_dynamic; - 
- TaylorSeerConfig tcfg; - tcfg.enabled = (cache_params.mode == SD_CACHE_TAYLORSEER || - cache_params.mode == SD_CACHE_CACHE_DIT); - tcfg.n_derivatives = cache_params.taylorseer_n_derivatives; - tcfg.skip_interval_steps = cache_params.taylorseer_skip_interval; - - runtime.cachedit.init(dbcfg, tcfg); - if (!runtime.cachedit.enabled()) { - LOG_WARN("CacheDIT requested but could not be initialized for this run"); - return; - } - - runtime.cachedit.set_sigmas(sigmas); - runtime.mode = SampleCacheMode::CACHEDIT; - LOG_INFO("CacheDIT enabled - mode: %s, Fn: %d, Bn: %d, threshold: %.3f, warmup: %d", - cache_params.mode == SD_CACHE_CACHE_DIT ? "DBCache+TaylorSeer" : (cache_params.mode == SD_CACHE_DBCACHE ? "DBCache" : "TaylorSeer"), - dbcfg.Fn_compute_blocks, - dbcfg.Bn_compute_blocks, - dbcfg.residual_diff_threshold, - dbcfg.max_warmup_steps); -} - -static void init_spectrum_runtime(SampleCacheRuntime& runtime, - SDVersion version, - const sd_cache_params_t& cache_params, - const std::vector& sigmas) { - if (!sd_version_is_unet(version) && !sd_version_is_dit(version)) { - LOG_WARN("Spectrum requested but not supported for this model type (only UNET and DiT models)"); - return; - } - - SpectrumConfig config; - config.w = cache_params.spectrum_w; - config.m = cache_params.spectrum_m; - config.lam = cache_params.spectrum_lam; - config.window_size = cache_params.spectrum_window_size; - config.flex_window = cache_params.spectrum_flex_window; - config.warmup_steps = cache_params.spectrum_warmup_steps; - config.stop_percent = cache_params.spectrum_stop_percent; - - size_t total_steps = sigmas.size() > 0 ? 
sigmas.size() - 1 : 0; - runtime.spectrum.init(config, total_steps); - runtime.spectrum_enabled = true; - - LOG_INFO("Spectrum enabled - w: %.2f, m: %d, lam: %.2f, window: %d, flex: %.2f, warmup: %d, stop: %.0f%%", - config.w, config.m, config.lam, - config.window_size, config.flex_window, - config.warmup_steps, config.stop_percent * 100.0f); -} - -static SampleCacheRuntime init_sample_cache_runtime(SDVersion version, - const sd_cache_params_t* cache_params, - Denoiser* denoiser, - const std::vector& sigmas) { - SampleCacheRuntime runtime; - if (cache_params == nullptr || cache_params->mode == SD_CACHE_DISABLED) { - return runtime; - } - - if (!has_valid_cache_percent_range(*cache_params)) { - LOG_WARN("Cache disabled due to invalid percent range (start=%.3f, end=%.3f)", - cache_params->start_percent, - cache_params->end_percent); - return runtime; - } - - switch (cache_params->mode) { - case SD_CACHE_EASYCACHE: - init_easycache_runtime(runtime, version, *cache_params, denoiser); - break; - case SD_CACHE_UCACHE: - init_ucache_runtime(runtime, version, *cache_params, denoiser, sigmas); - break; - case SD_CACHE_DBCACHE: - case SD_CACHE_TAYLORSEER: - case SD_CACHE_CACHE_DIT: - init_cachedit_runtime(runtime, version, *cache_params, sigmas); - break; - case SD_CACHE_SPECTRUM: - init_spectrum_runtime(runtime, version, *cache_params, sigmas); - break; - default: - break; - } - - return runtime; -} - -struct SampleStepCacheDispatcher { - SampleCacheRuntime& runtime; - int step; - float sigma; - int step_index; - - SampleStepCacheDispatcher(SampleCacheRuntime& runtime, int step, float sigma) - : runtime(runtime), step(step), sigma(sigma), step_index(step > 0 ? 
(step - 1) : -1) { - if (step_index < 0) { - return; - } - - switch (runtime.mode) { - case SampleCacheMode::EASYCACHE: - runtime.easycache.begin_step(step_index, sigma); - break; - case SampleCacheMode::UCACHE: - runtime.ucache.begin_step(step_index, sigma); - break; - case SampleCacheMode::CACHEDIT: - runtime.cachedit.begin_step(step_index, sigma); - break; - case SampleCacheMode::NONE: - break; - } - } - - bool before_condition(const SDCondition* condition, ggml_tensor* input, ggml_tensor* output) { - if (step_index < 0 || condition == nullptr || input == nullptr || output == nullptr) { - return false; - } - - switch (runtime.mode) { - case SampleCacheMode::EASYCACHE: - return runtime.easycache.before_condition(condition, input, output, sigma, step_index); - case SampleCacheMode::UCACHE: - return runtime.ucache.before_condition(condition, input, output, sigma, step_index); - case SampleCacheMode::CACHEDIT: - return runtime.cachedit.before_condition(condition, input, output, sigma, step_index); - case SampleCacheMode::NONE: - return false; - } - - return false; - } - - void after_condition(const SDCondition* condition, ggml_tensor* input, ggml_tensor* output) { - if (step_index < 0 || condition == nullptr || input == nullptr || output == nullptr) { - return; - } - - switch (runtime.mode) { - case SampleCacheMode::EASYCACHE: - runtime.easycache.after_condition(condition, input, output); - break; - case SampleCacheMode::UCACHE: - runtime.ucache.after_condition(condition, input, output); - break; - case SampleCacheMode::CACHEDIT: - runtime.cachedit.after_condition(condition, input, output); - break; - case SampleCacheMode::NONE: - break; - } - } - - bool is_step_skipped() const { - switch (runtime.mode) { - case SampleCacheMode::EASYCACHE: - return runtime.easycache.is_step_skipped(); - case SampleCacheMode::UCACHE: - return runtime.ucache.is_step_skipped(); - case SampleCacheMode::CACHEDIT: - return runtime.cachedit.is_step_skipped(); - case SampleCacheMode::NONE: 
- return false; - } - - return false; - } -}; - -static void log_sample_cache_summary(const SampleCacheRuntime& runtime, size_t total_steps) { - if (runtime.easycache_enabled()) { - if (runtime.easycache.total_steps_skipped > 0 && total_steps > 0) { - if (runtime.easycache.total_steps_skipped < static_cast(total_steps)) { - double speedup = static_cast(total_steps) / - static_cast(total_steps - runtime.easycache.total_steps_skipped); - LOG_INFO("EasyCache skipped %d/%zu steps (%.2fx estimated speedup)", - runtime.easycache.total_steps_skipped, - total_steps, - speedup); - } else { - LOG_INFO("EasyCache skipped %d/%zu steps", - runtime.easycache.total_steps_skipped, - total_steps); - } - } else if (total_steps > 0) { - LOG_INFO("EasyCache completed without skipping steps"); - } - } - - if (runtime.ucache_enabled()) { - if (runtime.ucache.total_steps_skipped > 0 && total_steps > 0) { - if (runtime.ucache.total_steps_skipped < static_cast(total_steps)) { - double speedup = static_cast(total_steps) / - static_cast(total_steps - runtime.ucache.total_steps_skipped); - LOG_INFO("UCache skipped %d/%zu steps (%.2fx estimated speedup)", - runtime.ucache.total_steps_skipped, - total_steps, - speedup); - } else { - LOG_INFO("UCache skipped %d/%zu steps", - runtime.ucache.total_steps_skipped, - total_steps); - } - } else if (total_steps > 0) { - LOG_INFO("UCache completed without skipping steps"); - } - } - - if (runtime.cachedit_enabled()) { - if (runtime.cachedit.total_steps_skipped > 0 && total_steps > 0) { - if (runtime.cachedit.total_steps_skipped < static_cast(total_steps)) { - double speedup = static_cast(total_steps) / - static_cast(total_steps - runtime.cachedit.total_steps_skipped); - LOG_INFO("CacheDIT skipped %d/%zu steps (%.2fx estimated speedup), accum_diff: %.4f", - runtime.cachedit.total_steps_skipped, - total_steps, - speedup, - runtime.cachedit.accumulated_residual_diff); - } else { - LOG_INFO("CacheDIT skipped %d/%zu steps, accum_diff: %.4f", - 
runtime.cachedit.total_steps_skipped, - total_steps, - runtime.cachedit.accumulated_residual_diff); - } - } else if (total_steps > 0) { - LOG_INFO("CacheDIT completed without skipping steps"); - } - } - - if (runtime.spectrum_enabled && runtime.spectrum.total_steps_skipped > 0 && total_steps > 0) { - double speedup = static_cast(total_steps) / - static_cast(total_steps - runtime.spectrum.total_steps_skipped); - LOG_INFO("Spectrum skipped %d/%zu steps (%.2fx estimated speedup)", - runtime.spectrum.total_steps_skipped, - total_steps, - speedup); - } -} - /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { @@ -1279,7 +903,7 @@ public: if (pred_type == PREDICTION_COUNT) { if (sd_version_is_sd2(version)) { // check is_using_v_parameterization_for_sd2 - if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) { + if (is_using_v_parameterization_for_sd2(sd_version_is_inpaint(version))) { pred_type = V_PRED; } else { pred_type = EPS_PRED; @@ -1369,43 +993,31 @@ public: return true; } - bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false) { - ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); - ggml_set_f32(x_t, 0.5); - ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); - ggml_set_f32(c, 0.5); - - ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); - ggml_set_f32(timesteps, 999); - - ggml_tensor* concat = is_inpaint ? 
ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : nullptr; - if (concat != nullptr) { - ggml_set_f32(concat, 0); + bool is_using_v_parameterization_for_sd2(bool is_inpaint = false) { + sd::Tensor x_t = sd::full({8, 8, 4, 1}, 0.5f); + sd::Tensor c = sd::full({1024, 2, 1, 1}, 0.5f); + sd::Tensor steps = sd::full({1}, 999.0f); + sd::Tensor concat; + if (is_inpaint) { + concat = sd::zeros({8, 8, 5, 1}); } - int64_t t0 = ggml_time_ms(); - ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); + int64_t t0 = ggml_time_ms(); + sd::Tensor out; DiffusionParams diffusion_params; - diffusion_params.x = x_t; - diffusion_params.timesteps = timesteps; - diffusion_params.context = c; - diffusion_params.c_concat = concat; - diffusion_model->compute(n_threads, diffusion_params, &out); + diffusion_params.x = &x_t; + diffusion_params.timesteps = &steps; + diffusion_params.context = &c; + if (!concat.empty()) { + diffusion_params.c_concat = &concat; + } + auto out_opt = diffusion_model->compute(n_threads, diffusion_params); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); diffusion_model->free_compute_buffer(); - double result = 0.f; - { - float* vec_x = (float*)x_t->data; - float* vec_out = (float*)out->data; - - int64_t n = ggml_nelements(out); - - for (int i = 0; i < n; i++) { - result += ((double)vec_out[i] - (double)vec_x[i]); - } - result /= n; - } - int64_t t1 = ggml_time_ms(); + double result = static_cast((out - x_t).mean()); + int64_t t1 = ggml_time_ms(); LOG_DEBUG("check is_using_v_parameterization_for_sd2, taking %.2fs", (t1 - t0) * 1.0f / 1000); return result < -1; } @@ -1643,8 +1255,7 @@ public: } } - SDCondition get_pmid_conditon(ggml_context* work_ctx, - sd_pm_params_t pm_params, + SDCondition get_pmid_conditon(sd_pm_params_t pm_params, ConditionerParams& condition_params) { SDCondition id_cond; if (use_pmid) { @@ -1663,60 +1274,60 @@ public: if (pm_params.id_images_count > 0) { int clip_image_size = 224; pmid_model->style_strength = 
pm_params.style_strength; - - auto id_image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, clip_image_size, clip_image_size, 3, pm_params.id_images_count); - - std::vector processed_id_images; + sd::Tensor id_image_tensor; for (int i = 0; i < pm_params.id_images_count; i++) { - sd_image_f32_t id_image = sd_image_t_to_sd_image_f32_t(pm_params.id_images[i]); - sd_image_f32_t processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size); - free(id_image.data); - id_image.data = nullptr; - processed_id_images.push_back(processed_id_image); + auto id_image = sd_image_to_tensor(pm_params.id_images[i]); + auto processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size); + if (id_image_tensor.empty()) { + id_image_tensor = processed_id_image; + } else { + id_image_tensor = sd::ops::concat(id_image_tensor, processed_id_image, 3); + } } - ggml_ext_tensor_iter(id_image_tensor, [&](ggml_tensor* id_image_tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = sd_image_get_f32(processed_id_images[i3], i0, i1, i2, false); - ggml_ext_tensor_set_f32(id_image_tensor, value, i0, i1, i2, i3); - }); - - for (auto& image : processed_id_images) { - free(image.data); - image.data = nullptr; - } - processed_id_images.clear(); - int64_t t0 = ggml_time_ms(); condition_params.num_input_imgs = pm_params.id_images_count; - auto cond_tup = cond_stage_model->get_learned_condition_with_trigger(work_ctx, - n_threads, + auto cond_tup = cond_stage_model->get_learned_condition_with_trigger(n_threads, condition_params); id_cond = std::get<0>(cond_tup); auto class_tokens_mask = std::get<1>(cond_tup); - ggml_tensor* id_embeds = nullptr; + sd::Tensor id_embeds; if (pmv2 && pm_params.id_embed_path != nullptr) { - id_embeds = load_tensor_from_file(work_ctx, pm_params.id_embed_path); + try { + id_embeds = sd::load_tensor_from_file_as_tensor(pm_params.id_embed_path); + } catch (const std::exception&) { + id_embeds = {}; + } } - if (pmv2 && 
id_embeds == nullptr) { + if (pmv2 && id_embeds.empty()) { LOG_WARN("Provided PhotoMaker images, but NO valid ID embeds file for PM v2"); LOG_WARN("Turn off PhotoMaker"); use_pmid = false; } else { - if (pmv2 && pm_params.id_images_count != id_embeds->ne[1]) { - LOG_WARN("PhotoMaker image count (%d) does NOT match ID embeds (%d). You should run face_detect.py again.", pm_params.id_images_count, id_embeds->ne[1]); + if (pmv2 && pm_params.id_images_count != id_embeds.shape()[1]) { + LOG_WARN("PhotoMaker image count (%d) does NOT match ID embeds (%d). You should run face_detect.py again.", pm_params.id_images_count, static_cast(id_embeds.shape()[1])); LOG_WARN("Turn off PhotoMaker"); use_pmid = false; } else { - ggml_tensor* res = nullptr; - pmid_model->compute(n_threads, id_image_tensor, id_cond.c_crossattn, id_embeds, class_tokens_mask, &res, work_ctx); - id_cond.c_crossattn = res; - int64_t t1 = ggml_time_ms(); - LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0); + auto res = pmid_model->compute(n_threads, + id_image_tensor, + id_cond.c_crossattn, + id_embeds, + class_tokens_mask); + if (res.empty()) { + LOG_ERROR("Photomaker ID Stacking failed"); + LOG_WARN("Turn off PhotoMaker"); + use_pmid = false; + } else { + id_cond.c_crossattn = std::move(res); + int64_t t1 = ggml_time_ms(); + LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0); + // Encode input prompt without the trigger word for delayed conditioning + condition_params.text = cond_stage_model->remove_trigger_from_prompt(condition_params.text); + } if (free_params_immediately) { pmid_model->free_params_buffer(); } - // Encode input prompt without the trigger word for delayed conditioning - condition_params.text = cond_stage_model->remove_trigger_from_prompt(work_ctx, condition_params.text); } } } else { @@ -1728,108 +1339,37 @@ public: return id_cond; } - ggml_tensor* get_clip_vision_output(ggml_context* work_ctx, - sd_image_t init_image, - bool return_pooled = true, - int 
clip_skip = -1, - bool zero_out_masked = false) { - ggml_tensor* output = nullptr; + sd::Tensor get_clip_vision_output(const sd::Tensor& image, + bool return_pooled = true, + int clip_skip = -1, + bool zero_out_masked = false) { + sd::Tensor output; if (zero_out_masked) { if (return_pooled) { - output = ggml_new_tensor_1d(work_ctx, - GGML_TYPE_F32, - clip_vision->vision_model.projection_dim); + output = sd::zeros({clip_vision->vision_model.projection_dim}); } else { - output = ggml_new_tensor_2d(work_ctx, - GGML_TYPE_F32, - clip_vision->vision_model.hidden_size, - 257); + output = sd::zeros({clip_vision->vision_model.hidden_size, 257}); } - - ggml_set_f32(output, 0.f); } else { - sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); - sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size, clip_vision->vision_model.image_size); - free(image.data); - image.data = nullptr; - - ggml_tensor* pixel_values = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1); - sd_image_f32_to_ggml_tensor(resized_image, pixel_values, false); - free(resized_image.data); - resized_image.data = nullptr; - - // print_ggml_tensor(pixel_values); - clip_vision->compute(n_threads, pixel_values, return_pooled, clip_skip, &output, work_ctx); - // print_ggml_tensor(c_crossattn); + auto pixel_values = clip_preprocess(image, clip_vision->vision_model.image_size, clip_vision->vision_model.image_size); + auto output_opt = clip_vision->compute(n_threads, pixel_values, return_pooled, clip_skip); + if (output_opt.empty()) { + LOG_ERROR("clip_vision compute failed"); + return {}; + } + output = std::move(output_opt); } return output; } - SDCondition get_svd_condition(ggml_context* work_ctx, - sd_image_t init_image, - int width, - int height, - int fps = 6, - int motion_bucket_id = 127, - float augmentation_level = 0.f, - bool zero_out_masked = false) { - // c_crossattn - int64_t t0 = ggml_time_ms(); - ggml_tensor* 
c_crossattn = get_clip_vision_output(work_ctx, init_image, true, -1, zero_out_masked); - - // c_concat - ggml_tensor* c_concat = nullptr; - { - if (zero_out_masked) { - c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / get_vae_scale_factor(), height / get_vae_scale_factor(), 4, 1); - ggml_set_f32(c_concat, 0.f); - } else { - ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - - if (width != init_image.width || height != init_image.height) { - sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); - sd_image_f32_t resized_image = resize_sd_image_f32_t(image, width, height); - free(image.data); - image.data = nullptr; - sd_image_f32_to_ggml_tensor(resized_image, init_img, false); - free(resized_image.data); - resized_image.data = nullptr; - } else { - sd_image_to_ggml_tensor(init_image, init_img); - } - if (augmentation_level > 0.f) { - ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_img); - ggml_ext_im_set_randn_f32(noise, rng); - // encode_pixels += torch.randn_like(pixels) * augmentation_level - ggml_ext_tensor_scale_inplace(noise, augmentation_level); - ggml_ext_tensor_add_inplace(init_img, noise); - } - c_concat = encode_first_stage(work_ctx, init_img); - } - } - - // y - ggml_tensor* y = nullptr; - { - y = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->get_adm_in_channels()); - int out_dim = 256; - int fps_id = fps - 1; - std::vector timesteps = {(float)fps_id, (float)motion_bucket_id, augmentation_level}; - set_timestep_embedding(timesteps, y, out_dim); - } - int64_t t1 = ggml_time_ms(); - LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0); - return {c_crossattn, y, c_concat}; - } - std::vector process_timesteps(const std::vector& timesteps, - ggml_tensor* init_latent, - ggml_tensor* denoise_mask) { + const sd::Tensor& init_latent, + const sd::Tensor& denoise_mask) { if (diffusion_model->get_desc() == "Wan2.2-TI2V-5B") { - auto new_timesteps = 
std::vector(init_latent->ne[2], timesteps[0]); + auto new_timesteps = std::vector(static_cast(init_latent.shape()[2]), timesteps[0]); - if (denoise_mask != nullptr) { - float value = ggml_ext_tensor_get_f32(denoise_mask, 0, 0, 0, 0); + if (!denoise_mask.empty()) { + float value = denoise_mask.dim() == 5 ? denoise_mask.index(0, 0, 0, 0, 0) : denoise_mask.index(0, 0, 0, 0); if (value == 0.f) { new_timesteps[0] = 0.f; } @@ -1840,40 +1380,19 @@ public: } } - // a = a * mask + b * (1 - mask) - void apply_mask(ggml_tensor* a, ggml_tensor* b, ggml_tensor* mask) { - for (int64_t i0 = 0; i0 < a->ne[0]; i0++) { - for (int64_t i1 = 0; i1 < a->ne[1]; i1++) { - for (int64_t i2 = 0; i2 < a->ne[2]; i2++) { - for (int64_t i3 = 0; i3 < a->ne[3]; i3++) { - float a_value = ggml_ext_tensor_get_f32(a, i0, i1, i2, i3); - float b_value = ggml_ext_tensor_get_f32(b, i0, i1, i2, i3); - float mask_value = ggml_ext_tensor_get_f32(mask, i0 % mask->ne[0], i1 % mask->ne[1], i2 % mask->ne[2], i3 % mask->ne[3]); - ggml_ext_tensor_set_f32(a, a_value * mask_value + b_value * (1 - mask_value), i0, i1, i2, i3); - } - } - } - } - } - - void preview_image(ggml_context* work_ctx, - int step, - ggml_tensor* latents, + void preview_image(int step, + const sd::Tensor& latents, enum SDVersion version, preview_t preview_mode, - ggml_tensor* result, std::function step_callback, void* step_callback_data, bool is_noisy) { - const uint32_t channel = 3; - uint32_t width = static_cast(latents->ne[0]); - uint32_t height = static_cast(latents->ne[1]); - uint32_t dim = static_cast(latents->ne[ggml_n_dims(latents) - 1]); - if (preview_mode == PREVIEW_PROJ) { - int patch_sz = 1; - const float(*latent_rgb_proj)[channel] = nullptr; - float* latent_rgb_bias = nullptr; + int patch_sz = 1; + const float(*latent_rgb_proj)[3] = nullptr; + float* latent_rgb_bias = nullptr; + bool is_video = preview_latent_tensor_is_video(latents); + uint32_t dim = is_video ? 
static_cast(latents.shape()[3]) : static_cast(latents.shape()[2]); if (dim == 128) { if (sd_version_is_flux2(version)) { @@ -1887,12 +1406,9 @@ public: latent_rgb_bias = wan_22_latent_rgb_bias; } else { LOG_WARN("No latent to RGB projection known for this model"); - // unknown model return; } } else if (dim == 16) { - // 16 channels VAE -> Flux or SD3 - if (sd_version_is_sd3(version)) { latent_rgb_proj = sd3_latent_rgb_proj; latent_rgb_bias = sd3_latent_rgb_bias; @@ -1904,12 +1420,9 @@ public: latent_rgb_bias = wan_21_latent_rgb_bias; } else { LOG_WARN("No latent to RGB projection known for this model"); - // unknown model return; } - } else if (dim == 4) { - // 4 channels VAE if (sd_version_is_sdxl(version)) { latent_rgb_proj = sdxl_latent_rgb_proj; latent_rgb_bias = sdxl_latent_rgb_bias; @@ -1917,459 +1430,394 @@ public: latent_rgb_proj = sd_latent_rgb_proj; latent_rgb_bias = sd_latent_rgb_bias; } else { - // unknown model LOG_WARN("No latent to RGB projection known for this model"); return; } - } else if (dim == 3) { - // Do nothing, assuming already RGB latents - } else { + } else if (dim != 3) { LOG_WARN("No latent to RGB projection known for this model"); - // unknown latent space return; } - uint32_t frames = 1; - if (ggml_n_dims(latents) == 4) { - frames = static_cast(latents->ne[2]); - } - - uint32_t img_width = width * patch_sz; - uint32_t img_height = height * patch_sz; - - uint8_t* data = (uint8_t*)malloc(frames * img_width * img_height * channel * sizeof(uint8_t)); + uint32_t frames = is_video ? 
static_cast(latents.shape()[2]) : 1; + uint32_t img_width = static_cast(latents.shape()[0]) * patch_sz; + uint32_t img_height = static_cast(latents.shape()[1]) * patch_sz; + uint8_t* data = (uint8_t*)malloc(frames * img_width * img_height * 3 * sizeof(uint8_t)); + GGML_ASSERT(data != nullptr); preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, patch_sz); sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t)); + GGML_ASSERT(images != nullptr); for (uint32_t i = 0; i < frames; i++) { - images[i] = {img_width, img_height, channel, data + i * img_width * img_height * channel}; + images[i] = {img_width, img_height, 3, data + i * img_width * img_height * 3}; } step_callback(step, frames, images, is_noisy, step_callback_data); free(data); free(images); - } else { - if (preview_mode == PREVIEW_VAE || preview_mode == PREVIEW_TAE) { - if (preview_vae) { - latents = preview_vae->diffusion_to_vae_latents(work_ctx, latents); - result = preview_vae->decode(n_threads, work_ctx, latents, vae_tiling_params, false, circular_x, circular_y, result, true); - } else { - latents = first_stage_model->diffusion_to_vae_latents(work_ctx, latents); - result = first_stage_model->decode(n_threads, work_ctx, latents, vae_tiling_params, false, circular_x, circular_y, result, true); - } + return; + } + + if (preview_mode == PREVIEW_VAE || preview_mode == PREVIEW_TAE) { + sd::Tensor vae_latents; + sd::Tensor decoded; + bool is_video = preview_latent_tensor_is_video(latents); + if (preview_vae) { + vae_latents = preview_vae->diffusion_to_vae_latents(latents); + decoded = preview_vae->decode(n_threads, vae_latents, vae_tiling_params, is_video, circular_x, circular_y, true); } else { + vae_latents = first_stage_model->diffusion_to_vae_latents(latents); + decoded = first_stage_model->decode(n_threads, vae_latents, vae_tiling_params, is_video, circular_x, circular_y, true); + } + if (decoded.empty()) { + LOG_ERROR("preview decode failed at step %d", step); return; } - 
ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f); - uint32_t frames = 1; - if (ggml_n_dims(latents) == 4) { - frames = static_cast(result->ne[2]); - } - + is_video = preview_latent_tensor_is_video(decoded); + uint32_t frames = is_video ? static_cast(decoded.shape()[2]) : 1; sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t)); - // print_ggml_tensor(result,true); - for (size_t i = 0; i < frames; i++) { - images[i].width = static_cast(result->ne[0]); - images[i].height = static_cast(result->ne[1]); - images[i].channel = 3; - images[i].data = ggml_tensor_to_sd_image(result, static_cast(i), ggml_n_dims(latents) == 4); + GGML_ASSERT(images != nullptr); + for (uint32_t i = 0; i < frames; ++i) { + images[i] = tensor_to_sd_image(decoded, static_cast(i)); } step_callback(step, frames, images, is_noisy, step_callback_data); - - ggml_ext_tensor_scale_inplace(result, 0); - for (uint32_t i = 0; i < frames; i++) { + for (uint32_t i = 0; i < frames; ++i) { free(images[i].data); } - free(images); + return; + } + + if (preview_mode != PREVIEW_NONE) { + LOG_WARN("Unsupported preview mode: %d", static_cast(preview_mode)); } } - ggml_tensor* sample(ggml_context* work_ctx, - std::shared_ptr work_diffusion_model, - bool inverse_noise_scaling, - ggml_tensor* init_latent, - ggml_tensor* noise, - SDCondition cond, - SDCondition uncond, - SDCondition img_cond, - ggml_tensor* control_hint, - float control_strength, - sd_guidance_params_t guidance, - float eta, - int shifted_timestep, - sample_method_t method, - const std::vector& sigmas, - int start_merge_step, - SDCondition id_cond, - std::vector ref_latents = {}, - bool increase_ref_index = false, - ggml_tensor* denoise_mask = nullptr, - ggml_tensor* vace_context = nullptr, - float vace_strength = 1.f, - const sd_cache_params_t* cache_params = nullptr) { - if (shifted_timestep > 0 && !sd_version_is_sdxl(version)) { - LOG_WARN("timestep shifting is only supported for SDXL models!"); - shifted_timestep = 0; + std::vector 
prepare_sample_timesteps(float sigma, + int shifted_timestep) { + float t = denoiser->sigma_to_t(sigma); + if (shifted_timestep > 0) { + float shifted_t_float = t * (float(shifted_timestep) / float(TIMESTEPS)); + int64_t shifted_t = static_cast(roundf(shifted_t_float)); + shifted_t = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t)); + LOG_DEBUG("shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma); + return std::vector{(float)shifted_t}; } + if (sd_version_is_anima(version)) { + return std::vector{t / static_cast(TIMESTEPS)}; + } + if (sd_version_is_z_image(version)) { + return std::vector{1000.f - t}; + } + return std::vector{t}; + } + + void adjust_sample_step_scalings(int shifted_timestep, + const std::vector& timesteps_vec, + float c_in, + float* c_skip, + float* c_out) { + GGML_ASSERT(c_skip != nullptr); + GGML_ASSERT(c_out != nullptr); + if (shifted_timestep <= 0) { + return; + } + + int64_t shifted_t_idx = static_cast(roundf(timesteps_vec[0])); + float shifted_sigma = denoiser->t_to_sigma((float)shifted_t_idx); + std::vector shifted_scaling = denoiser->get_scalings(shifted_sigma); + float shifted_c_skip = shifted_scaling[0]; + float shifted_c_out = shifted_scaling[1]; + float shifted_c_in = shifted_scaling[2]; + + *c_skip = shifted_c_skip * c_in / shifted_c_in; + *c_out = shifted_c_out; + } + + struct SamplePreviewContext { + sd_preview_cb_t callback = nullptr; + void* data = nullptr; + preview_t mode = PREVIEW_NONE; + }; + + SamplePreviewContext prepare_sample_preview_context() { + return SamplePreviewContext{sd_get_preview_callback(), + sd_get_preview_callback_data(), + sd_get_preview_mode()}; + } + + void report_sample_progress(int step, size_t total_steps, int64_t t0) { + int64_t t1 = ggml_time_us(); + if (step > 0 || step == -(int)total_steps) { + int showstep = std::abs(step); + pretty_progress(showstep, (int)total_steps, (t1 - t0) / 1000000.f / showstep); + } + } + + void compute_sample_controls(const 
sd::Tensor& control_image, + const sd::Tensor& noised_input, + const sd::Tensor& timesteps_tensor, + const SDCondition& condition, + std::vector>* controls) { + GGML_ASSERT(controls != nullptr); + controls->clear(); + if (control_image.empty() || control_net == nullptr) { + return; + } + + auto control_result = control_net->compute(n_threads, + noised_input, + control_image, + timesteps_tensor, + condition.c_crossattn, + condition.c_vector); + if (!control_result.has_value()) { + LOG_ERROR("controlnet compute failed"); + return; + } + + *controls = std::move(*control_result); + } + + sd::Tensor sample(const std::shared_ptr& work_diffusion_model, + bool inverse_noise_scaling, + const sd::Tensor& init_latent, + sd::Tensor noise, + const SDCondition& cond, + const SDCondition& uncond, + const SDCondition& img_cond, + const SDCondition& id_cond, + const sd::Tensor& control_image, + float control_strength, + const sd_guidance_params_t& guidance, + float eta, + int shifted_timestep, + sample_method_t method, + const std::vector& sigmas, + int start_merge_step, + const std::vector>& ref_latents, + bool increase_ref_index, + const sd::Tensor& denoise_mask, + const sd::Tensor& vace_context, + float vace_strength, + const sd_cache_params_t* cache_params) { std::vector skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count); - - float cfg_scale = guidance.txt_cfg; - if (cfg_scale < 1.f) { - if (cfg_scale == 0.f) { - // Diffusers follow the convention from the original paper - // (https://arxiv.org/abs/2207.12598v1), so many distilled model docs - // recommend 0 as guidance; warn the user that it'll disable prompt folowing - LOG_WARN("unconditioned mode, images won't follow the prompt (use cfg-scale=1 for distilled models)"); - } else { - LOG_WARN("cfg value out of expected range may produce unexpected results"); - } - } - - float img_cfg_scale = std::isfinite(guidance.img_cfg) ? 
guidance.img_cfg : guidance.txt_cfg; + float cfg_scale = guidance.txt_cfg; + float img_cfg_scale = guidance.img_cfg; float slg_scale = guidance.slg.scale; - if (img_cfg_scale != cfg_scale && !sd_version_is_inpaint_or_unet_edit(version)) { - LOG_WARN("2-conditioning CFG is not supported with this model, disabling it for better performance..."); - img_cfg_scale = cfg_scale; + sd_sample::SampleCacheRuntime cache_runtime = sd_sample::init_sample_cache_runtime(version, + cache_params, + denoiser.get(), + sigmas); + size_t steps = sigmas.size() - 1; + bool has_skiplayer = slg_scale != 0.0f && !skip_layers.empty(); + if (has_skiplayer && !sd_version_is_dit(version)) { + has_skiplayer = false; + LOG_WARN("SLG is incompatible with this model type"); } - SampleCacheRuntime cache_runtime = init_sample_cache_runtime(version, cache_params, denoiser.get(), sigmas); + int64_t t0 = ggml_time_us(); + sd::Tensor x_t = !noise.empty() + ? denoiser->noise_scaling(sigmas[0], noise, init_latent) + : init_latent; + sd::Tensor denoised = x_t; + SamplePreviewContext preview = prepare_sample_preview_context(); - size_t steps = sigmas.size() - 1; - ggml_tensor* x = ggml_ext_dup_and_cpy_tensor(work_ctx, init_latent); - - if (noise) { - x = denoiser->noise_scaling(sigmas[0], noise, x); - } - - ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x); - - bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != nullptr; - bool has_img_cond = cfg_scale != img_cfg_scale && img_cond.c_crossattn != nullptr; - bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; - - // denoise wrapper - ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); - ggml_tensor* out_uncond = nullptr; - ggml_tensor* out_skip = nullptr; - ggml_tensor* out_img_cond = nullptr; - - if (has_unconditioned) { - out_uncond = ggml_dup_tensor(work_ctx, x); - } - if (has_skiplayer) { - if (sd_version_is_dit(version)) { - out_skip = ggml_dup_tensor(work_ctx, x); - } else { - has_skiplayer = false; - 
LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]); - } - } - if (has_img_cond) { - out_img_cond = ggml_dup_tensor(work_ctx, x); - } - ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); - - int64_t t0 = ggml_time_us(); - - ggml_tensor* preview_tensor = nullptr; - auto sd_preview_mode = sd_get_preview_mode(); - if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) { - int64_t W = x->ne[0] * get_vae_scale_factor(); - int64_t H = x->ne[1] * get_vae_scale_factor(); - if (ggml_n_dims(x) == 4) { - // assuming video mode (if batch processing gets implemented this will break) - int64_t T = x->ne[2]; - if (sd_version_is_wan(version)) { - T = ((T - 1) * 4) + 1; - } - preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, - W, - H, - T, - 3); - } else { - preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, - W, - H, - 3, - x->ne[3]); - } - } - - auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { - auto sd_preview_cb = sd_get_preview_callback(); - auto sd_preview_cb_data = sd_get_preview_callback_data(); - auto sd_preview_mode = sd_get_preview_mode(); + auto denoise = [&](const sd::Tensor& x, float sigma, int step) -> sd::Tensor { if (step == 1 || step == -1) { pretty_progress(0, (int)steps, 0); } - DiffusionParams diffusion_params; - SampleStepCacheDispatcher step_cache(cache_runtime, step, sigma); - std::vector scaling = denoiser->get_scalings(sigma); GGML_ASSERT(scaling.size() == 3); float c_skip = scaling[0]; float c_out = scaling[1]; float c_in = scaling[2]; - float t = denoiser->sigma_to_t(sigma); - std::vector timesteps_vec; - if (shifted_timestep > 0 && sd_version_is_sdxl(version)) { - float shifted_t_float = t * (float(shifted_timestep) / float(TIMESTEPS)); - int64_t shifted_t = static_cast(roundf(shifted_t_float)); - shifted_t = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t)); - LOG_DEBUG("shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, 
shifted_t, sigma); - timesteps_vec.assign(1, (float)shifted_t); - } else if (sd_version_is_anima(version)) { - // Anima uses normalized flow timesteps. - timesteps_vec.assign(1, t / static_cast(TIMESTEPS)); - } else if (sd_version_is_z_image(version)) { - timesteps_vec.assign(1, 1000.f - t); - } else { - timesteps_vec.assign(1, t); + std::vector timesteps_vec = prepare_sample_timesteps(sigma, shifted_timestep); + timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask); + adjust_sample_step_scalings(shifted_timestep, timesteps_vec, c_in, &c_skip, &c_out); + + sd::Tensor timesteps_tensor({static_cast(timesteps_vec.size())}, timesteps_vec); + sd::Tensor guidance_tensor({1}, std::vector{guidance.distilled_guidance}); + sd::Tensor noised_input = x * c_in; + if (!denoise_mask.empty() && version == VERSION_WAN2_2_TI2V) { + noised_input = noised_input * denoise_mask + init_latent * (1.0f - denoise_mask); } - timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask); - if (cache_runtime.spectrum_enabled && cache_runtime.spectrum.should_predict()) { - cache_runtime.spectrum.predict(denoised); - - if (denoise_mask != nullptr) { - apply_mask(denoised, init_latent, denoise_mask); + cache_runtime.spectrum.predict(&denoised); + if (!denoise_mask.empty()) { + denoised = denoised * denoise_mask + init_latent * (1.0f - denoise_mask); } - - if (sd_preview_cb != nullptr && sd_should_preview_denoised()) { - if (step % sd_get_preview_interval() == 0) { - preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, false); - } - } - - int64_t t1 = ggml_time_us(); - if (step > 0 || step == -(int)steps) { - int showstep = std::abs(step); - pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep); + if (sd_should_preview_denoised() && preview.callback != nullptr) { + preview_image(step, denoised, version, preview.mode, preview.callback, preview.data, false); } + 
report_sample_progress(step, steps, t0); return denoised; } - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); - std::vector guidance_vec(1, guidance.distilled_guidance); - auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); - - copy_ggml_tensor(noised_input, input); - // noised_input = noised_input * c_in - ggml_ext_tensor_scale_inplace(noised_input, c_in); - - if (denoise_mask != nullptr && version == VERSION_WAN2_2_TI2V) { - apply_mask(noised_input, init_latent, denoise_mask); - } - if (sd_preview_cb != nullptr && sd_should_preview_noisy()) { - if (step % sd_get_preview_interval() == 0) { - preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, true); - } + if (sd_should_preview_noisy() && preview.callback != nullptr) { + preview_image(step, noised_input, version, preview.mode, preview.callback, preview.data, true); } - std::vector controls; - - if (control_hint != nullptr && control_net != nullptr) { - if (control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector)) { - controls = control_net->controls; - } else { - LOG_ERROR("controlnet compute failed"); - } - // print_ggml_tensor(controls[12]); - // GGML_ASSERT(0); - } - - diffusion_params.x = noised_input; - diffusion_params.timesteps = timesteps; - diffusion_params.guidance = guidance_tensor; - diffusion_params.ref_latents = ref_latents; + sd::Tensor cond_out; + sd::Tensor uncond_out; + sd::Tensor img_cond_out; + sd::Tensor skip_cond_out; + sd_sample::SampleStepCacheDispatcher step_cache(cache_runtime, step, sigma); + std::vector> controls; + DiffusionParams diffusion_params; + diffusion_params.x = &noised_input; + diffusion_params.timesteps = ×teps_tensor; + diffusion_params.guidance = &guidance_tensor; + diffusion_params.ref_latents = &ref_latents; diffusion_params.increase_ref_index = increase_ref_index; - diffusion_params.controls = controls; + 
diffusion_params.controls = &controls; diffusion_params.control_strength = control_strength; - diffusion_params.vace_context = vace_context; + diffusion_params.vace_context = vace_context.empty() ? nullptr : &vace_context; diffusion_params.vace_strength = vace_strength; + diffusion_params.skip_layers = nullptr; - auto run_diffusion_condition = [&](const SDCondition* condition, ggml_tensor** output_tensor) -> bool { - if (step_cache.before_condition(condition, diffusion_params.x, *output_tensor)) { - return true; + compute_sample_controls(control_image, + noised_input, + timesteps_tensor, + cond, + &controls); + + auto run_condition = [&](const SDCondition& condition, + const sd::Tensor* c_concat_override = nullptr, + const std::vector* local_skip_layers = nullptr) -> sd::Tensor { + diffusion_params.context = condition.c_crossattn.empty() ? nullptr : &condition.c_crossattn; + diffusion_params.c_concat = c_concat_override != nullptr ? c_concat_override : (condition.c_concat.empty() ? nullptr : &condition.c_concat); + diffusion_params.y = condition.c_vector.empty() ? nullptr : &condition.c_vector; + diffusion_params.t5_ids = condition.c_t5_ids.empty() ? nullptr : &condition.c_t5_ids; + diffusion_params.t5_weights = condition.c_t5_weights.empty() ? 
nullptr : &condition.c_t5_weights; + diffusion_params.skip_layers = local_skip_layers; + + sd::Tensor cached_output; + if (step_cache.before_condition(&condition, noised_input, &cached_output)) { + return std::move(cached_output); } - if (!work_diffusion_model->compute(n_threads, - diffusion_params, - output_tensor)) { + auto output_opt = work_diffusion_model->compute(n_threads, diffusion_params); + if (output_opt.empty()) { LOG_ERROR("diffusion model compute failed"); - return false; + return sd::Tensor(); } - step_cache.after_condition(condition, diffusion_params.x, *output_tensor); - return true; + step_cache.after_condition(&condition, noised_input, output_opt); + return output_opt; }; - const SDCondition* active_condition = nullptr; - ggml_tensor** active_output = &out_cond; if (start_merge_step == -1 || step <= start_merge_step) { - // cond - diffusion_params.context = cond.c_crossattn; - diffusion_params.c_concat = cond.c_concat; - diffusion_params.y = cond.c_vector; - active_condition = &cond; + cond_out = run_condition(cond); + if (cond_out.empty()) { + return {}; + } } else { - diffusion_params.context = id_cond.c_crossattn; - diffusion_params.c_concat = cond.c_concat; - diffusion_params.y = id_cond.c_vector; - active_condition = &id_cond; - } - - if (!run_diffusion_condition(active_condition, active_output)) { - return nullptr; - } - - bool current_step_skipped = step_cache.is_step_skipped(); - - float* negative_data = nullptr; - if (has_unconditioned) { - // uncond - if (!current_step_skipped && control_hint != nullptr && control_net != nullptr) { - if (control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector)) { - controls = control_net->controls; - } else { - LOG_ERROR("controlnet compute failed"); - } + GGML_ASSERT(!id_cond.empty()); + cond_out = run_condition(id_cond, + cond.c_concat.empty() ? 
nullptr : &cond.c_concat); + if (cond_out.empty()) { + return {}; } - current_step_skipped = step_cache.is_step_skipped(); - diffusion_params.controls = controls; - diffusion_params.context = uncond.c_crossattn; - diffusion_params.c_concat = uncond.c_concat; - diffusion_params.y = uncond.c_vector; - if (!run_diffusion_condition(&uncond, &out_uncond)) { - return nullptr; - } - negative_data = (float*)out_uncond->data; } - float* img_cond_data = nullptr; - if (has_img_cond) { - diffusion_params.context = img_cond.c_crossattn; - diffusion_params.c_concat = img_cond.c_concat; - diffusion_params.y = img_cond.c_vector; - if (!run_diffusion_condition(&img_cond, &out_img_cond)) { - return nullptr; + if (!uncond.empty()) { + if (!step_cache.is_step_skipped()) { + compute_sample_controls(control_image, + noised_input, + timesteps_tensor, + uncond, + &controls); + } + uncond_out = run_condition(uncond); + if (uncond_out.empty()) { + return {}; } - img_cond_data = (float*)out_img_cond->data; } - - int step_count = static_cast(sigmas.size()); - bool is_skiplayer_step = has_skiplayer && step > (int)(guidance.slg.layer_start * step_count) && step < (int)(guidance.slg.layer_end * step_count); - float* skip_layer_data = has_skiplayer ? (float*)out_skip->data : nullptr; + if (!img_cond.empty()) { + img_cond_out = run_condition(img_cond, + cond.c_concat.empty() ? 
nullptr : &cond.c_concat); + if (img_cond_out.empty()) { + return {}; + } + } + bool is_skiplayer_step = has_skiplayer && + step > (int)(guidance.slg.layer_start * static_cast(sigmas.size())) && + step < (int)(guidance.slg.layer_end * static_cast(sigmas.size())); if (is_skiplayer_step) { LOG_DEBUG("Skipping layers at step %d\n", step); if (!step_cache.is_step_skipped()) { - // skip layer (same as conditioned) - diffusion_params.context = cond.c_crossattn; - diffusion_params.c_concat = cond.c_concat; - diffusion_params.y = cond.c_vector; - diffusion_params.skip_layers = skip_layers; - if (!work_diffusion_model->compute(n_threads, - diffusion_params, - &out_skip)) { - LOG_ERROR("diffusion model compute failed"); - return nullptr; + skip_cond_out = run_condition(cond, + cond.c_concat.empty() ? nullptr : &cond.c_concat, + &skip_layers); + if (skip_cond_out.empty()) { + return {}; } } - skip_layer_data = (float*)out_skip->data; - } - float* vec_denoised = (float*)denoised->data; - float* vec_input = (float*)input->data; - float* positive_data = (float*)out_cond->data; - int ne_elements = (int)ggml_nelements(denoised); - - if (shifted_timestep > 0 && sd_version_is_sdxl(version)) { - int64_t shifted_t_idx = static_cast(roundf(timesteps_vec[0])); - float shifted_sigma = denoiser->t_to_sigma((float)shifted_t_idx); - std::vector shifted_scaling = denoiser->get_scalings(shifted_sigma); - float shifted_c_skip = shifted_scaling[0]; - float shifted_c_out = shifted_scaling[1]; - float shifted_c_in = shifted_scaling[2]; - - c_skip = shifted_c_skip * c_in / shifted_c_in; - c_out = shifted_c_out; } - for (int i = 0; i < ne_elements; i++) { - float latent_result = positive_data[i]; - if (has_unconditioned) { - // out_uncond + cfg_scale * (out_cond - out_uncond) - if (has_img_cond) { - // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond) - latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + 
cfg_scale * (positive_data[i] - img_cond_data[i]); - } else { - // img_cfg_scale == cfg_scale - latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); - } - } else if (has_img_cond) { - // img_cfg_scale == 1 - latent_result = img_cond_data[i] + cfg_scale * (positive_data[i] - img_cond_data[i]); + GGML_ASSERT(!cond_out.empty()); + sd::Tensor latent_result = cond_out; + if (!uncond_out.empty()) { + if (!img_cond_out.empty()) { + latent_result = uncond_out + + img_cfg_scale * (img_cond_out - uncond_out) + + cfg_scale * (cond_out - img_cond_out); + } else { + latent_result = uncond_out + cfg_scale * (cond_out - uncond_out); } - if (is_skiplayer_step) { - latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale; - } - // v = latent_result, eps = latent_result - // denoised = (v * c_out + input * c_skip) or (input + eps * c_out) - vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; + } else if (!img_cond_out.empty()) { + latent_result = img_cond_out + cfg_scale * (cond_out - img_cond_out); } + if (is_skiplayer_step && !skip_cond_out.empty()) { + latent_result += (cond_out - skip_cond_out) * slg_scale; + } + denoised = latent_result * c_out + x * c_skip; if (cache_runtime.spectrum_enabled) { cache_runtime.spectrum.update(denoised); } - - if (denoise_mask != nullptr) { - apply_mask(denoised, init_latent, denoise_mask); + if (!denoise_mask.empty()) { + denoised = denoised * denoise_mask + init_latent * (1.0f - denoise_mask); } - - if (sd_preview_cb != nullptr && sd_should_preview_denoised()) { - if (step % sd_get_preview_interval() == 0) { - preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, false); - } - } - - int64_t t1 = ggml_time_us(); - if (step > 0 || step == -(int)steps) { - int showstep = std::abs(step); - pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep); - // LOG_INFO("step %d sampling completed taking 
%.2fs", step, (t1 - t0) * 1.0f / 1000000); + if (sd_should_preview_denoised() && preview.callback != nullptr) { + preview_image(step, denoised, version, preview.mode, preview.callback, preview.data, false); } + report_sample_progress(step, steps, t0); return denoised; }; - if (!sample_k_diffusion(method, denoise, work_ctx, x, sigmas, sampler_rng, eta)) { + auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta); + if (x0_opt.empty()) { LOG_ERROR("Diffusion model sampling failed"); if (control_net) { control_net->free_control_ctx(); control_net->free_compute_buffer(); } - diffusion_model->free_compute_buffer(); - return NULL; + if (work_diffusion_model) { + work_diffusion_model->free_compute_buffer(); + } + return {}; } - size_t total_steps = sigmas.size() > 0 ? sigmas.size() - 1 : 0; - log_sample_cache_summary(cache_runtime, total_steps); - + auto x0 = std::move(x0_opt); + sd_sample::log_sample_cache_summary(cache_runtime, steps); if (inverse_noise_scaling) { - x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); + x0 = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x0); } if (control_net) { control_net->free_control_ctx(); control_net->free_compute_buffer(); } - work_diffusion_model->free_compute_buffer(); - return x; + if (work_diffusion_model) { + work_diffusion_model->free_compute_buffer(); + } + return x0; } int get_vae_scale_factor() { @@ -2409,11 +1857,10 @@ public: return (h / vae_scale_factor) * (w / vae_scale_factor); } - ggml_tensor* generate_init_latent(ggml_context* work_ctx, - int width, - int height, - int frames = 1, - bool video = false) { + sd::Tensor generate_init_latent(int width, + int height, + int frames = 1, + bool video = false) { int vae_scale_factor = get_vae_scale_factor(); int W = width / vae_scale_factor; int H = height / vae_scale_factor; @@ -2422,34 +1869,35 @@ public: T = ((T - 1) / 4) + 1; } int C = get_latent_channel(); - ggml_tensor* init_latent; if (video) { - init_latent = 
ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, T, C); - } else { - init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); + return sd::zeros({W, H, T, C, 1}); } - ggml_set_f32(init_latent, 0.f); - return init_latent; + return sd::zeros({W, H, C, 1}); } - ggml_tensor* encode_to_vae_latents(ggml_context* work_ctx, ggml_tensor* x) { - ggml_tensor* vae_output = first_stage_model->encode(n_threads, work_ctx, x, vae_tiling_params, circular_x, circular_y); - ggml_tensor* latents = first_stage_model->vae_output_to_latents(work_ctx, vae_output, rng); + sd::Tensor encode_to_vae_latents(const sd::Tensor& x) { + auto latents = first_stage_model->encode(n_threads, x, vae_tiling_params, circular_x, circular_y); + if (latents.empty()) { + return {}; + } + latents = first_stage_model->vae_output_to_latents(latents, rng); return latents; } - ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { - ggml_tensor* latents = encode_to_vae_latents(work_ctx, x); + sd::Tensor encode_first_stage(const sd::Tensor& x) { + auto latents = encode_to_vae_latents(x); + if (latents.empty()) { + return {}; + } if (version != VERSION_SD1_PIX2PIX) { - latents = first_stage_model->vae_to_diffuison_latents(work_ctx, latents); + latents = first_stage_model->vae_to_diffusion_latents(latents); } return latents; } - ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode_video = false) { - x = first_stage_model->diffusion_to_vae_latents(work_ctx, x); - x = first_stage_model->decode(n_threads, work_ctx, x, vae_tiling_params, decode_video, circular_x, circular_y); - return x; + sd::Tensor decode_first_stage(const sd::Tensor& x, bool decode_video = false) { + auto latents = first_stage_model->diffusion_to_vae_latents(x); + return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y); } void set_flow_shift(float flow_shift = INFINITY) { @@ -2966,667 +2414,216 @@ enum scheduler_t 
sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_me return DISCRETE_SCHEDULER; } -sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, - ggml_context* work_ctx, - ggml_tensor* init_latent, - std::string prompt, - std::string negative_prompt, - int clip_skip, - sd_guidance_params_t guidance, - float eta, - int shifted_timestep, - int width, - int height, - enum sample_method_t sample_method, - const std::vector& sigmas, - int64_t seed, - int batch_count, - sd_image_t control_image, - float control_strength, - sd_pm_params_t pm_params, - std::vector ref_images, - std::vector ref_latents, - bool increase_ref_index, - ggml_tensor* concat_latent = nullptr, - ggml_tensor* denoise_mask = nullptr, - const sd_cache_params_t* cache_params = nullptr) { - if (seed < 0) { - // Generally, when using the provided command line, the seed is always >0. - // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library - // by a third party with a seed <0, let's incorporate randomization here. 
- srand((int)time(nullptr)); - seed = rand(); +static int64_t resolve_seed(int64_t seed) { + if (seed >= 0) { + return seed; } - - if (!std::isfinite(guidance.img_cfg)) { - guidance.img_cfg = guidance.txt_cfg; - } - - int sample_steps = static_cast(sigmas.size() - 1); - - int64_t t0 = ggml_time_ms(); - - ConditionerParams condition_params; - condition_params.text = prompt; - condition_params.clip_skip = clip_skip; - condition_params.width = width; - condition_params.height = height; - condition_params.ref_images = ref_images; - condition_params.adm_in_channels = static_cast(sd_ctx->sd->diffusion_model->get_adm_in_channels()); - - // Photo Maker - SDCondition id_cond = sd_ctx->sd->get_pmid_conditon(work_ctx, pm_params, condition_params); - - // Get learned condition - condition_params.zero_out_masked = false; - SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, - sd_ctx->sd->n_threads, - condition_params); - - SDCondition uncond; - if (guidance.txt_cfg != 1.0 || - (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) { - bool zero_out_masked = false; - if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) { - zero_out_masked = true; - } - condition_params.text = negative_prompt; - condition_params.zero_out_masked = zero_out_masked; - uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, - sd_ctx->sd->n_threads, - condition_params); - } - int64_t t1 = ggml_time_ms(); - LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0); - - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->cond_stage_model->free_params_buffer(); - } - - // Control net hint - ggml_tensor* image_hint = nullptr; - if (control_image.data != nullptr) { - image_hint = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - sd_image_to_ggml_tensor(control_image, image_hint); - } - - // Sample - 
std::vector final_latents; // collect latents to decode - int C = sd_ctx->sd->get_latent_channel(); - int W = width / sd_ctx->sd->get_vae_scale_factor(); - int H = height / sd_ctx->sd->get_vae_scale_factor(); - - ggml_tensor* control_latent = nullptr; - if (sd_version_is_control(sd_ctx->sd->version) && image_hint != nullptr) { - control_latent = sd_ctx->sd->encode_first_stage(work_ctx, image_hint); - ggml_ext_tensor_scale_inplace(control_latent, control_strength); - } - - if (sd_version_is_inpaint(sd_ctx->sd->version)) { - int64_t mask_channels = 1; - if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - mask_channels = 8 * 8; // flatten the whole mask - } else if (sd_ctx->sd->version == VERSION_FLEX_2) { - mask_channels = 1 + init_latent->ne[2]; - } - auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1); - // no mask, set the whole image as masked - for (int64_t x = 0; x < empty_latent->ne[0]; x++) { - for (int64_t y = 0; y < empty_latent->ne[1]; y++) { - if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - // TODO: this might be wrong - for (int64_t c = 0; c < init_latent->ne[2]; c++) { - ggml_ext_tensor_set_f32(empty_latent, 0, x, y, c); - } - for (int64_t c = init_latent->ne[2]; c < empty_latent->ne[2]; c++) { - ggml_ext_tensor_set_f32(empty_latent, 1, x, y, c); - } - } else if (sd_ctx->sd->version == VERSION_FLEX_2) { - for (int64_t c = 0; c < empty_latent->ne[2]; c++) { - // 0x16,1x1,0x16 - ggml_ext_tensor_set_f32(empty_latent, c == init_latent->ne[2], x, y, c); - } - } else { - ggml_ext_tensor_set_f32(empty_latent, 1, x, y, 0); - for (int64_t c = 1; c < empty_latent->ne[2]; c++) { - ggml_ext_tensor_set_f32(empty_latent, 0, x, y, c); - } - } - } - } - - if (sd_ctx->sd->version == VERSION_FLEX_2 && control_latent != nullptr && sd_ctx->sd->control_net == nullptr) { - bool no_inpaint = concat_latent == nullptr; - if (no_inpaint) { - concat_latent = ggml_new_tensor_4d(work_ctx, 
GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1); - } - // fill in the control image here - for (int64_t x = 0; x < control_latent->ne[0]; x++) { - for (int64_t y = 0; y < control_latent->ne[1]; y++) { - if (no_inpaint) { - for (int64_t c = 0; c < concat_latent->ne[2] - control_latent->ne[2]; c++) { - // 0x16,1x1,0x16 - ggml_ext_tensor_set_f32(concat_latent, c == init_latent->ne[2], x, y, c); - } - } - for (int64_t c = 0; c < control_latent->ne[2]; c++) { - float v = ggml_ext_tensor_get_f32(control_latent, x, y, c); - ggml_ext_tensor_set_f32(concat_latent, v, x, y, concat_latent->ne[2] - control_latent->ne[2] + c); - } - } - } - } else if (concat_latent == nullptr) { - concat_latent = empty_latent; - } - cond.c_concat = concat_latent; - uncond.c_concat = empty_latent; - denoise_mask = nullptr; - } else if (sd_version_is_unet_edit(sd_ctx->sd->version)) { - auto empty_latent = ggml_dup_tensor(work_ctx, init_latent); - ggml_set_f32(empty_latent, 0); - uncond.c_concat = empty_latent; - cond.c_concat = ref_latents[0]; - if (cond.c_concat == nullptr) { - cond.c_concat = empty_latent; - } - } else if (sd_version_is_control(sd_ctx->sd->version)) { - auto empty_latent = ggml_dup_tensor(work_ctx, init_latent); - ggml_set_f32(empty_latent, 0); - uncond.c_concat = empty_latent; - if (sd_ctx->sd->control_net == nullptr) { - cond.c_concat = control_latent; - } - if (cond.c_concat == nullptr) { - cond.c_concat = empty_latent; - } - } - SDCondition img_cond; - if (uncond.c_crossattn != nullptr && - (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) { - img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat); - } - for (int b = 0; b < batch_count; b++) { - int64_t sampling_start = ggml_time_ms(); - int64_t cur_seed = seed + b; - LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed); - - sd_ctx->sd->rng->manual_seed(cur_seed); - 
sd_ctx->sd->sampler_rng->manual_seed(cur_seed); - ggml_tensor* x_t = init_latent; - ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); - ggml_ext_im_set_randn_f32(noise, sd_ctx->sd->rng); - - int start_merge_step = -1; - if (sd_ctx->sd->use_pmid) { - start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * sample_steps); - // if (start_merge_step > 30) - // start_merge_step = 30; - LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step); - } - - ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, - sd_ctx->sd->diffusion_model, - true, - x_t, - noise, - cond, - uncond, - img_cond, - image_hint, - control_strength, - guidance, - eta, - shifted_timestep, - sample_method, - sigmas, - start_merge_step, - id_cond, - ref_latents, - increase_ref_index, - denoise_mask, - nullptr, - 1.0f, - cache_params); - int64_t sampling_end = ggml_time_ms(); - if (x_0 != nullptr) { - // print_ggml_tensor(x_0); - LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); - final_latents.push_back(x_0); - } else { - LOG_ERROR("sampling for image %d/%d failed after %.2fs", b + 1, batch_count, (sampling_end - sampling_start) * 1.0f / 1000); - } - } - - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } - int64_t t3 = ggml_time_ms(); - LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000); - - // Decode to image - LOG_INFO("decoding %zu latents", final_latents.size()); - std::vector decoded_images; // collect decoded images - for (size_t i = 0; i < final_latents.size(); i++) { - t1 = ggml_time_ms(); - ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */); - // print_ggml_tensor(img); - if (img != nullptr) { - decoded_images.push_back(img); - } - int64_t t2 = ggml_time_ms(); - LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000); - } - - 
int64_t t4 = ggml_time_ms(); - LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->first_stage_model->free_params_buffer(); - } - - sd_ctx->sd->lora_stat(); - - sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t)); - if (result_images == nullptr) { - ggml_free(work_ctx); - return nullptr; - } - memset(result_images, 0, batch_count * sizeof(sd_image_t)); - - for (size_t i = 0; i < decoded_images.size(); i++) { - result_images[i].width = width; - result_images[i].height = height; - result_images[i].channel = 3; - result_images[i].data = ggml_tensor_to_sd_image(decoded_images[i]); - } - ggml_free(work_ctx); - - return result_images; + srand((int)time(nullptr)); + return rand(); } -sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) { - sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params; +static enum sample_method_t resolve_sample_method(sd_ctx_t* sd_ctx, enum sample_method_t sample_method) { + if (sample_method == SAMPLE_METHOD_COUNT) { + return sd_get_default_sample_method(sd_ctx); + } + return sample_method; +} - int width = sd_img_gen_params->width; - int height = sd_img_gen_params->height; +static scheduler_t resolve_scheduler(sd_ctx_t* sd_ctx, + scheduler_t scheduler, + enum sample_method_t sample_method) { + if (scheduler == SCHEDULER_COUNT) { + return sd_get_default_scheduler(sd_ctx, sample_method); + } + return scheduler; +} - int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor(); - int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor(); - int spatial_multiple = vae_scale_factor * diffusion_model_down_factor; +struct GenerationRequest { + std::string prompt; + std::string negative_prompt; + int width = -1; + int height = -1; + int clip_skip = -1; + int vae_scale_factor = -1; + int diffusion_model_down_factor = -1; + int64_t seed = -1; + bool use_uncond = false; 
+ bool use_img_cond = false; + bool use_high_noise_uncond = false; + bool use_high_noise_img_cond = false; + const sd_cache_params_t* cache_params = nullptr; + int batch_count = 1; + int shifted_timestep = 0; + float strength = 1.f; + float control_strength = 0.f; + float eta = 0.f; + bool increase_ref_index = false; + bool auto_resize_ref_image = false; + sd_guidance_params_t guidance = {}; + sd_guidance_params_t high_noise_guidance = {}; + sd_pm_params_t pm_params = {}; + int frames = -1; + float vace_strength = 1.f; + + GenerationRequest(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) { + prompt = SAFE_STR(sd_img_gen_params->prompt); + negative_prompt = SAFE_STR(sd_img_gen_params->negative_prompt); + width = sd_img_gen_params->width; + height = sd_img_gen_params->height; + vae_scale_factor = sd_ctx->sd->get_vae_scale_factor(); + diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor(); + seed = sd_img_gen_params->seed; + batch_count = sd_img_gen_params->batch_count; + clip_skip = sd_img_gen_params->clip_skip; + shifted_timestep = sd_img_gen_params->sample_params.shifted_timestep; + strength = sd_img_gen_params->strength; + control_strength = sd_img_gen_params->control_strength; + eta = sd_img_gen_params->sample_params.eta; + increase_ref_index = sd_img_gen_params->increase_ref_index; + auto_resize_ref_image = sd_img_gen_params->auto_resize_ref_image; + guidance = sd_img_gen_params->sample_params.guidance; + pm_params = sd_img_gen_params->pm_params; + cache_params = &sd_img_gen_params->cache; + resolve(sd_ctx); + } + + GenerationRequest(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) { + prompt = SAFE_STR(sd_vid_gen_params->prompt); + negative_prompt = SAFE_STR(sd_vid_gen_params->negative_prompt); + width = sd_vid_gen_params->width; + height = sd_vid_gen_params->height; + frames = (sd_vid_gen_params->video_frames - 1) / 4 * 4 + 1; + clip_skip = sd_vid_gen_params->clip_skip; + vae_scale_factor = 
sd_ctx->sd->get_vae_scale_factor(); + diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor(); + seed = sd_vid_gen_params->seed; + cache_params = &sd_vid_gen_params->cache; + vace_strength = sd_vid_gen_params->vace_strength; + guidance = sd_vid_gen_params->sample_params.guidance; + high_noise_guidance = sd_vid_gen_params->high_noise_sample_params.guidance; + resolve(sd_ctx); + } + + void align_generation_request_size() { + int spatial_multiple = vae_scale_factor * diffusion_model_down_factor; + int width_offset = align_up_offset(width, spatial_multiple); + int height_offset = align_up_offset(height, spatial_multiple); + if (width_offset <= 0 && height_offset <= 0) { + return; + } + + int original_width = width; + int original_height = height; - int width_offset = align_up_offset(width, spatial_multiple); - int height_offset = align_up_offset(height, spatial_multiple); - if (width_offset > 0 || height_offset > 0) { width += width_offset; height += height_offset; - LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_img_gen_params->width, sd_img_gen_params->height, width, height, spatial_multiple); + LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", + original_width, + original_height, + width, + height, + spatial_multiple); } - bool circular_x = sd_ctx->sd->circular_x; - bool circular_y = sd_ctx->sd->circular_y; - - if (!sd_img_gen_params->vae_tiling_params.enabled) { - if (sd_ctx->sd->first_stage_model) { - sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); - } - if (sd_ctx->sd->preview_vae) { - sd_ctx->sd->preview_vae->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); - } - } else { - int tile_size_x, tile_size_y; - float _overlap; - int latent_size_x = width / sd_ctx->sd->get_vae_scale_factor(); - int latent_size_y = height / sd_ctx->sd->get_vae_scale_factor(); - sd_ctx->sd->first_stage_model->get_tile_sizes(tile_size_x, tile_size_y, _overlap, 
sd_img_gen_params->vae_tiling_params, latent_size_x, latent_size_y); - - // force disable circular padding for vae if tiling is enabled unless latent is smaller than tile size - // otherwise it will cause artifacts at the edges of the tiles - sd_ctx->sd->circular_x = sd_ctx->sd->circular_x && (tile_size_x >= latent_size_x); - sd_ctx->sd->circular_y = sd_ctx->sd->circular_y && (tile_size_y >= latent_size_y); - - if (sd_ctx->sd->first_stage_model) { - sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); - } - if (sd_ctx->sd->preview_vae) { - sd_ctx->sd->preview_vae->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + static void resolve_guidance(sd_ctx_t* sd_ctx, + sd_guidance_params_t* guidance, + bool* use_uncond, + bool* use_img_cond, + const char* stage_name = nullptr) { + GGML_ASSERT(guidance != nullptr); + GGML_ASSERT(use_uncond != nullptr); + GGML_ASSERT(use_img_cond != nullptr); + // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond) + // img_cfg == txt_cfg means that img_cfg is not used + if (!std::isfinite(guidance->img_cfg)) { + guidance->img_cfg = guidance->txt_cfg; } - // disable circular tiling if it's enabled for the VAE - sd_ctx->sd->circular_x = circular_x && (tile_size_x < latent_size_x); - sd_ctx->sd->circular_y = circular_y && (tile_size_y < latent_size_y); - } - - LOG_DEBUG("generate_image %dx%d", width, height); - if (sd_ctx == nullptr || sd_img_gen_params == nullptr) { - return nullptr; - } - - ggml_init_params params; - params.mem_size = static_cast(1024 * 1024) * 1024; // 1G - params.mem_buffer = nullptr; - params.no_alloc = false; - // LOG_DEBUG("mem_size %u ", params.mem_size); - - ggml_context* work_ctx = ggml_init(params); - if (!work_ctx) { - LOG_ERROR("ggml_init() failed"); - return nullptr; - } - - int64_t seed = sd_img_gen_params->seed; - if (seed < 0) { - srand((int)time(nullptr)); - seed = rand(); - } - 
sd_ctx->sd->rng->manual_seed(seed); - sd_ctx->sd->sampler_rng->manual_seed(seed); - - size_t t0 = ggml_time_ms(); - - sd_ctx->sd->set_flow_shift(sd_img_gen_params->sample_params.flow_shift); - - // Apply lora - sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count); - - enum sample_method_t sample_method = sd_img_gen_params->sample_params.sample_method; - if (sample_method == SAMPLE_METHOD_COUNT) { - sample_method = sd_get_default_sample_method(sd_ctx); - } - LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - - int sample_steps = sd_img_gen_params->sample_params.sample_steps; - std::vector sigmas; - if (sd_img_gen_params->sample_params.custom_sigmas_count > 0) { - sigmas = std::vector(sd_img_gen_params->sample_params.custom_sigmas, - sd_img_gen_params->sample_params.custom_sigmas + sd_img_gen_params->sample_params.custom_sigmas_count); - if (sample_steps != sigmas.size() - 1) { - sample_steps = static_cast(sigmas.size()) - 1; - LOG_WARN("sample_steps != custom_sigmas_count - 1, set sample_steps to %d", sample_steps); + if (!sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version)) { + guidance->img_cfg = guidance->txt_cfg; } - } else { - scheduler_t scheduler = sd_img_gen_params->sample_params.scheduler; - if (scheduler == SCHEDULER_COUNT) { - scheduler = sd_get_default_scheduler(sd_ctx, sample_method); + + if (guidance->txt_cfg != 1.f) { + *use_uncond = true; } - sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps, - sd_ctx->sd->get_image_seq_len(height, width), - scheduler, - sd_ctx->sd->version); - } - ggml_tensor* init_latent = nullptr; - ggml_tensor* concat_latent = nullptr; - ggml_tensor* denoise_mask = nullptr; - if (sd_img_gen_params->init_image.data) { - LOG_INFO("IMG2IMG"); + if (guidance->img_cfg != guidance->txt_cfg) { + *use_img_cond = true; + *use_uncond = true; + } - size_t t_enc = static_cast(sample_steps * sd_img_gen_params->strength); - if (t_enc == sample_steps) - t_enc--; - LOG_INFO("target 
t_enc is %zu steps", t_enc); - std::vector sigma_sched; - sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end()); - sigmas = sigma_sched; - - ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1); - - sd_image_to_ggml_tensor(sd_img_gen_params->mask_image, mask_img); - sd_image_to_ggml_tensor(sd_img_gen_params->init_image, init_img); - - if (sd_version_is_inpaint(sd_ctx->sd->version)) { - int64_t mask_channels = 1; - if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - mask_channels = vae_scale_factor * vae_scale_factor; // flatten the whole mask - } else if (sd_ctx->sd->version == VERSION_FLEX_2) { - mask_channels = 1 + sd_ctx->sd->get_latent_channel(); - } - ggml_tensor* masked_latent = nullptr; - - if (sd_ctx->sd->version != VERSION_FLEX_2) { - // most inpaint models mask before vae - ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - ggml_ext_tensor_apply_mask(init_img, mask_img, masked_img); - masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); - init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + if (guidance->txt_cfg < 1.f) { + const char* prefix = stage_name == nullptr ? 
"" : stage_name; + if (guidance->txt_cfg == 0.f) { + LOG_WARN("%sunconditioned mode, images won't follow the prompt (use cfg-scale=1 for distilled models)", + prefix); } else { - // mask after vae - init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], 1); - ggml_ext_tensor_apply_mask(init_latent, mask_img, masked_latent, 0.); - } - concat_latent = ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - masked_latent->ne[0], - masked_latent->ne[1], - mask_channels + masked_latent->ne[2], - 1); - for (int ix = 0; ix < masked_latent->ne[0]; ix++) { - for (int iy = 0; iy < masked_latent->ne[1]; iy++) { - int mx = ix * vae_scale_factor; - int my = iy * vae_scale_factor; - if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - for (int k = 0; k < masked_latent->ne[2]; k++) { - float v = ggml_ext_tensor_get_f32(masked_latent, ix, iy, k); - ggml_ext_tensor_set_f32(concat_latent, v, ix, iy, k); - } - // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image - for (int x = 0; x < vae_scale_factor; x++) { - for (int y = 0; y < vae_scale_factor; y++) { - float m = ggml_ext_tensor_get_f32(mask_img, mx + x, my + y); - // TODO: check if the way the mask is flattened is correct (is it supposed to be x*vae_scale_factor+y or x+vae_scale_factor*y?) 
- // python code was using "b (h vae_scale_factor) (w vae_scale_factor) -> b (vae_scale_factor vae_scale_factor) h w" - ggml_ext_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * vae_scale_factor + y); - } - } - } else if (sd_ctx->sd->version == VERSION_FLEX_2) { - float m = ggml_ext_tensor_get_f32(mask_img, mx, my); - // masked image - for (int k = 0; k < masked_latent->ne[2]; k++) { - float v = ggml_ext_tensor_get_f32(masked_latent, ix, iy, k); - ggml_ext_tensor_set_f32(concat_latent, v, ix, iy, k); - } - // downsampled mask - ggml_ext_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2]); - // control (todo: support this) - for (int k = 0; k < masked_latent->ne[2]; k++) { - ggml_ext_tensor_set_f32(concat_latent, 0, ix, iy, masked_latent->ne[2] + 1 + k); - } - } else { - float m = ggml_ext_tensor_get_f32(mask_img, mx, my); - ggml_ext_tensor_set_f32(concat_latent, m, ix, iy, 0); - for (int k = 0; k < masked_latent->ne[2]; k++) { - float v = ggml_ext_tensor_get_f32(masked_latent, ix, iy, k); - ggml_ext_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels); - } - } - } - } - } else { - init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - } - - { - // LOG_WARN("Inpainting with a base model is not great"); - denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / vae_scale_factor, height / vae_scale_factor, 1, 1); - for (int ix = 0; ix < denoise_mask->ne[0]; ix++) { - for (int iy = 0; iy < denoise_mask->ne[1]; iy++) { - int mx = ix * vae_scale_factor; - int my = iy * vae_scale_factor; - float m = ggml_ext_tensor_get_f32(mask_img, mx, my); - ggml_ext_tensor_set_f32(denoise_mask, m, ix, iy); - } + LOG_WARN("%scfg value out of expected range may produce unexpected results", prefix); } } - } else { - LOG_INFO("TXT2IMG"); - if (sd_version_is_inpaint(sd_ctx->sd->version)) { - LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask"); - } - init_latent = 
sd_ctx->sd->generate_init_latent(work_ctx, width, height); } - sd_guidance_params_t guidance = sd_img_gen_params->sample_params.guidance; - std::vector ref_images; - for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) { - ref_images.push_back(&sd_img_gen_params->ref_images[i]); - } + void resolve(sd_ctx_t* sd_ctx) { + align_generation_request_size(); + seed = resolve_seed(seed); - std::vector empty_image_data; - sd_image_t empty_image = {(uint32_t)width, (uint32_t)height, 3, nullptr}; - if (ref_images.empty() && sd_version_is_unet_edit(sd_ctx->sd->version)) { - LOG_WARN("This model needs at least one reference image; using an empty reference"); - empty_image_data.resize(width * height * 3); - ref_images.push_back(&empty_image); - empty_image.data = empty_image_data.data(); - guidance.img_cfg = 0.f; - } - - if (ref_images.size() > 0) { - LOG_INFO("EDIT mode"); - } - - std::vector ref_latents; - for (int i = 0; i < ref_images.size(); i++) { - ggml_tensor* img; - if (sd_img_gen_params->auto_resize_ref_image) { - LOG_DEBUG("auto resize ref images"); - sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]); - int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height); - double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height); - double vae_height = vae_width * ref_image.height / ref_image.width; - - int factor = 16; - if (sd_version_is_qwen_image(sd_ctx->sd->version)) { - factor = 32; - } - - vae_height = round(vae_height / factor) * factor; - vae_width = round(vae_width / factor) * factor; - - sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast(vae_width), static_cast(vae_height)); - free(ref_image.data); - ref_image.data = nullptr; - - LOG_DEBUG("resize vae ref image %d from %dx%d to %dx%d", i, ref_image.height, ref_image.width, resized_image.height, resized_image.width); - - img = ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - resized_image.width, - resized_image.height, - 3, - 1); - 
sd_image_f32_to_ggml_tensor(resized_image, img); - free(resized_image.data); - resized_image.data = nullptr; - } else { - img = ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - ref_images[i]->width, - ref_images[i]->height, - 3, - 1); - sd_image_to_ggml_tensor(*ref_images[i], img); + resolve_guidance(sd_ctx, &guidance, &use_uncond, &use_img_cond); + if (sd_ctx->sd->high_noise_diffusion_model) { + resolve_guidance(sd_ctx, + &high_noise_guidance, + &use_high_noise_uncond, + &use_high_noise_img_cond, + "high noise: "); } - // print_ggml_tensor(img, false, "img"); - - ggml_tensor* latent = sd_ctx->sd->encode_first_stage(work_ctx, img); - ref_latents.push_back(latent); - } - - if (sd_img_gen_params->init_image.data != nullptr || sd_img_gen_params->ref_images_count > 0) { - size_t t1 = ggml_time_ms(); - LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - } - - sd_image_t* result_images = generate_image_internal(sd_ctx, - work_ctx, - init_latent, - SAFE_STR(sd_img_gen_params->prompt), - SAFE_STR(sd_img_gen_params->negative_prompt), - sd_img_gen_params->clip_skip, - guidance, - sd_img_gen_params->sample_params.eta, - sd_img_gen_params->sample_params.shifted_timestep, - width, - height, - sample_method, - sigmas, - seed, - sd_img_gen_params->batch_count, - sd_img_gen_params->control_image, - sd_img_gen_params->control_strength, - sd_img_gen_params->pm_params, - ref_images, - ref_latents, - sd_img_gen_params->increase_ref_index, - concat_latent, - denoise_mask, - &sd_img_gen_params->cache); - - // restore circular params - sd_ctx->sd->circular_x = circular_x; - sd_ctx->sd->circular_y = circular_y; - - size_t t2 = ggml_time_ms(); - - LOG_INFO("generate_image completed in %.2fs", (t2 - t0) * 1.0f / 1000); - - return result_images; -} - -SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out) { - if (sd_ctx == nullptr || sd_vid_gen_params == nullptr) { - return nullptr; - } - 
sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params; - - std::string prompt = SAFE_STR(sd_vid_gen_params->prompt); - std::string negative_prompt = SAFE_STR(sd_vid_gen_params->negative_prompt); - - int width = sd_vid_gen_params->width; - int height = sd_vid_gen_params->height; - int frames = sd_vid_gen_params->video_frames; - frames = (frames - 1) / 4 * 4 + 1; - int sample_steps = sd_vid_gen_params->sample_params.sample_steps; - - int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor(); - int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor(); - int spatial_multiple = vae_scale_factor * diffusion_model_down_factor; - - int width_offset = align_up_offset(width, spatial_multiple); - int height_offset = align_up_offset(height, spatial_multiple); - if (width_offset > 0 || height_offset > 0) { - width += width_offset; - height += height_offset; - LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_vid_gen_params->width, sd_vid_gen_params->height, width, height, spatial_multiple); - } - LOG_INFO("generate_video %dx%dx%d", width, height, frames); - - sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift); - - enum sample_method_t sample_method = sd_vid_gen_params->sample_params.sample_method; - if (sample_method == SAMPLE_METHOD_COUNT) { - sample_method = sd_get_default_sample_method(sd_ctx); - } - LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - - int high_noise_sample_steps = 0; - if (sd_ctx->sd->high_noise_diffusion_model) { - high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps; - } - - int total_steps = sample_steps; - - if (high_noise_sample_steps > 0) { - total_steps += high_noise_sample_steps; + if (shifted_timestep > 0 && !sd_version_is_sdxl(sd_ctx->sd->version)) { + LOG_WARN("timestep shifting is only supported for SDXL models!"); + shifted_timestep = 0; + } } +}; +struct SamplePlan { + enum sample_method_t sample_method = SAMPLE_METHOD_COUNT; + 
enum sample_method_t high_noise_sample_method = SAMPLE_METHOD_COUNT; + int sample_steps = 0; + int high_noise_sample_steps = 0; + int total_steps = 0; + float moe_boundary = 0.f; + int start_merge_step = -1; std::vector sigmas; - if (sd_vid_gen_params->sample_params.custom_sigmas_count > 0) { - sigmas = std::vector(sd_vid_gen_params->sample_params.custom_sigmas, - sd_vid_gen_params->sample_params.custom_sigmas + sd_vid_gen_params->sample_params.custom_sigmas_count); - if (total_steps != sigmas.size() - 1) { + + SamplePlan(sd_ctx_t* sd_ctx, + const sd_img_gen_params_t* sd_img_gen_params, + const GenerationRequest& request) { + sample_method = sd_img_gen_params->sample_params.sample_method; + sample_steps = sd_img_gen_params->sample_params.sample_steps; + resolve(sd_ctx, &request, &sd_img_gen_params->sample_params); + } + + SamplePlan(sd_ctx_t* sd_ctx, + const sd_vid_gen_params_t* sd_vid_gen_params, + const GenerationRequest& request) { + sample_method = sd_vid_gen_params->sample_params.sample_method; + sample_steps = sd_vid_gen_params->sample_params.sample_steps; + if (sd_ctx->sd->high_noise_diffusion_model) { + high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps; + high_noise_sample_method = sd_vid_gen_params->high_noise_sample_params.sample_method; + } + moe_boundary = sd_vid_gen_params->moe_boundary; + resolve(sd_ctx, &request, &sd_vid_gen_params->sample_params); + } + + void resolve(sd_ctx_t* sd_ctx, + const GenerationRequest* request, + const sd_sample_params_t* sample_params) { + sample_method = resolve_sample_method(sd_ctx, sample_method); + + total_steps = sample_steps + std::max(0, high_noise_sample_steps); + + if (sample_params->custom_sigmas_count > 0) { + sigmas = std::vector(sample_params->custom_sigmas, + sample_params->custom_sigmas + sample_params->custom_sigmas_count); total_steps = static_cast(sigmas.size()) - 1; LOG_WARN("total_steps != custom_sigmas_count - 1, set total_steps to %d", total_steps); if (sample_steps 
>= total_steps) { @@ -3637,60 +2634,559 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s high_noise_sample_steps = total_steps - sample_steps; LOG_WARN("total_steps != custom_sigmas_count - 1, set high_noise_sample_steps to %d", high_noise_sample_steps); } + } else { + scheduler_t scheduler = resolve_scheduler(sd_ctx, + sample_params->scheduler, + sample_method); + sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps, + sd_ctx->sd->get_image_seq_len(request->height, request->width), + scheduler, + sd_ctx->sd->version); } - } else { - scheduler_t scheduler = sd_vid_gen_params->sample_params.scheduler; - if (scheduler == SCHEDULER_COUNT) { - scheduler = sd_get_default_scheduler(sd_ctx, sample_method); - } - sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps, - 0, - scheduler, - sd_ctx->sd->version); - } - if (high_noise_sample_steps < 0) { - // timesteps �?sigmas for Flow models (like wan2.2 a14b) - for (size_t i = 0; i < sigmas.size(); ++i) { - if (sigmas[i] < sd_vid_gen_params->moe_boundary) { - high_noise_sample_steps = static_cast(i); - break; + if (high_noise_sample_steps < 0) { + for (size_t i = 0; i < sigmas.size(); ++i) { + if (sigmas[i] < moe_boundary) { + high_noise_sample_steps = static_cast(i); + break; + } } + LOG_DEBUG("switching from high noise model at step %d", high_noise_sample_steps); } - LOG_DEBUG("switching from high noise model at step %d", high_noise_sample_steps); + + LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); + if (high_noise_sample_steps > 0) { + high_noise_sample_method = resolve_sample_method(sd_ctx, + high_noise_sample_method); + LOG_INFO("sampling(high noise) using %s method", sampling_methods_str[high_noise_sample_method]); + } + + if (sd_ctx->sd->use_pmid) { + start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * total_steps); + LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step); + } + } +}; + +struct ImageGenerationLatents { + 
sd::Tensor init_latent; + sd::Tensor concat_latent; + sd::Tensor uncond_concat_latent; + sd::Tensor control_image; + std::vector> ref_images; + std::vector> ref_latents; + sd::Tensor denoise_mask; + sd::Tensor clip_vision_output; + sd::Tensor vace_context; + int64_t ref_image_num = 0; +}; + +struct ImageGenerationEmbeds { + SDCondition cond; + SDCondition uncond; + SDCondition img_cond; + SDCondition id_cond; +}; + +struct CircularAxesState { + bool circular_x = false; + bool circular_y = false; +}; + +static CircularAxesState configure_image_vae_axes(sd_ctx_t* sd_ctx, + const sd_img_gen_params_t* sd_img_gen_params, + const GenerationRequest& request) { + CircularAxesState original_axes = {sd_ctx->sd->circular_x, sd_ctx->sd->circular_y}; + + if (!sd_img_gen_params->vae_tiling_params.enabled) { + if (sd_ctx->sd->first_stage_model) { + sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + } + if (sd_ctx->sd->preview_vae) { + sd_ctx->sd->preview_vae->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + } + return original_axes; } - ggml_init_params params; - params.mem_size = static_cast(1024 * 1024) * 1024; // 1G - params.mem_buffer = nullptr; - params.no_alloc = false; - // LOG_DEBUG("mem_size %u ", params.mem_size); + int tile_size_x, tile_size_y; + float overlap; + int latent_size_x = request.width / request.vae_scale_factor; + int latent_size_y = request.height / request.vae_scale_factor; + sd_ctx->sd->first_stage_model->get_tile_sizes(tile_size_x, + tile_size_y, + overlap, + sd_img_gen_params->vae_tiling_params, + latent_size_x, + latent_size_y); - ggml_context* work_ctx = ggml_init(params); - if (!work_ctx) { - LOG_ERROR("ggml_init() failed"); + sd_ctx->sd->circular_x = sd_ctx->sd->circular_x && (tile_size_x >= latent_size_x); + sd_ctx->sd->circular_y = sd_ctx->sd->circular_y && (tile_size_y >= latent_size_y); + + if (sd_ctx->sd->first_stage_model) { + 
sd_ctx->sd->first_stage_model->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + } + if (sd_ctx->sd->preview_vae) { + sd_ctx->sd->preview_vae->set_circular_axes(sd_ctx->sd->circular_x, sd_ctx->sd->circular_y); + } + + sd_ctx->sd->circular_x = original_axes.circular_x && (tile_size_x < latent_size_x); + sd_ctx->sd->circular_y = original_axes.circular_y && (tile_size_y < latent_size_y); + + return original_axes; +} + +static void restore_image_vae_axes(sd_ctx_t* sd_ctx, const CircularAxesState& original_axes) { + sd_ctx->sd->circular_x = original_axes.circular_x; + sd_ctx->sd->circular_y = original_axes.circular_y; +} + +class ImageVaeAxesGuard { +private: + sd_ctx_t* sd_ctx = nullptr; + CircularAxesState original_axes; + +public: + ImageVaeAxesGuard(sd_ctx_t* sd_ctx, + const sd_img_gen_params_t* sd_img_gen_params, + const GenerationRequest& request) + : sd_ctx(sd_ctx), + original_axes(configure_image_vae_axes(sd_ctx, sd_img_gen_params, request)) {} + + ~ImageVaeAxesGuard() { + restore_image_vae_axes(sd_ctx, original_axes); + } + + ImageVaeAxesGuard(const ImageVaeAxesGuard&) = delete; + ImageVaeAxesGuard& operator=(const ImageVaeAxesGuard&) = delete; +}; + +static std::optional prepare_image_generation_latents(sd_ctx_t* sd_ctx, + const sd_img_gen_params_t* sd_img_gen_params, + GenerationRequest* request, + SamplePlan* plan) { + int64_t prepare_start_ms = ggml_time_ms(); + + sd::Tensor init_image_tensor; + sd::Tensor control_image_tensor; + sd::Tensor mask_image_tensor; + + if (sd_img_gen_params->init_image.data != nullptr) { + LOG_INFO("IMG2IMG"); + + if (request->strength < 1.f) { + size_t t_enc = static_cast(plan->sample_steps * request->strength); + if (t_enc == static_cast(plan->sample_steps)) { + t_enc--; + } + LOG_INFO("target t_enc is %zu steps", t_enc); + std::vector sigma_sched; + sigma_sched.assign(plan->sigmas.begin() + plan->sample_steps - t_enc - 1, plan->sigmas.end()); + plan->sigmas = std::move(sigma_sched); + plan->sample_steps = 
static_cast(plan->sigmas.size() - 1); + } + + init_image_tensor = sd_image_to_tensor(sd_img_gen_params->init_image, request->width, request->height); + } + + if (sd_img_gen_params->mask_image.data != nullptr) { + mask_image_tensor = sd_image_to_tensor(sd_img_gen_params->mask_image, request->width, request->height); + mask_image_tensor = sd::ops::round(mask_image_tensor); + } + + if (sd_img_gen_params->control_image.data != nullptr) { + control_image_tensor = sd_image_to_tensor(sd_img_gen_params->control_image, request->width, request->height); + } + + if (init_image_tensor.empty() || mask_image_tensor.empty()) { + if (sd_version_is_inpaint(sd_ctx->sd->version)) { + LOG_WARN("inpainting model requires both an init image and a mask image."); + } + } + + if (mask_image_tensor.empty()) { + mask_image_tensor = sd::full({request->width, request->height, 1, 1}, 1.f); + } + + sd::Tensor latent_mask = sd::ops::interpolate(mask_image_tensor, + {request->width / request->vae_scale_factor, + request->height / request->vae_scale_factor, + 1, + 1}); + + sd::Tensor init_latent; + sd::Tensor control_latent; + if (init_image_tensor.empty()) { + init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height); + } else { + init_latent = sd_ctx->sd->encode_first_stage(init_image_tensor); + if (init_latent.empty()) { + LOG_ERROR("failed to encode init image"); + return std::nullopt; + } + } + + if (!control_image_tensor.empty() && !sd_ctx->sd->vae_decode_only) { + control_latent = sd_ctx->sd->encode_first_stage(control_image_tensor); + if (control_latent.empty()) { + LOG_ERROR("failed to encode control image"); + return std::nullopt; + } + } + + std::vector> ref_images; + for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) { + ref_images.push_back(sd_image_to_tensor(sd_img_gen_params->ref_images[i])); + } + + if (ref_images.empty() && sd_version_is_unet_edit(sd_ctx->sd->version)) { + LOG_WARN("This model needs at least one reference image; using an empty 
reference"); + ref_images.push_back(sd::zeros({request->width, request->height, 3, 1})); + request->guidance.img_cfg = request->guidance.txt_cfg; + } + + if (!ref_images.empty()) { + LOG_INFO("EDIT mode"); + } + + std::vector> ref_latents; + for (size_t i = 0; i < ref_images.size(); i++) { + sd::Tensor ref_latent; + if (request->auto_resize_ref_image) { + LOG_DEBUG("auto resize ref images"); + int vae_image_size = std::min(1024 * 1024, request->width * request->height); + double vae_width = sqrt(vae_image_size * ref_images[i].shape()[0] / ref_images[i].shape()[1]); + double vae_height = vae_width * ref_images[i].shape()[1] / ref_images[i].shape()[0]; + + int factor = sd_version_is_qwen_image(sd_ctx->sd->version) ? 32 : 16; + vae_height = round(vae_height / factor) * factor; + vae_width = round(vae_width / factor) * factor; + + auto resized_ref_img = sd::ops::interpolate(ref_images[i], + {static_cast(vae_width), static_cast(vae_height), 3, 1}); + + LOG_DEBUG("resize vae ref image %d from %" PRId64 "x%" PRId64 " to %" PRId64 "x%" PRId64, + static_cast(i), + ref_images[i].shape()[1], + ref_images[i].shape()[0], + resized_ref_img.shape()[1], + resized_ref_img.shape()[0]); + + ref_latent = sd_ctx->sd->encode_first_stage(resized_ref_img); + } else { + ref_latent = sd_ctx->sd->encode_first_stage(ref_images[i]); + } + if (ref_latent.empty()) { + LOG_ERROR("failed to encode reference image %d", static_cast(i)); + return std::nullopt; + } + + ref_latents.push_back(std::move(ref_latent)); + } + + sd::Tensor concat_latent; + sd::Tensor uncond_concat_latent; + if (sd_version_is_inpaint(sd_ctx->sd->version)) { + sd::Tensor masked_init_latent; + + if (sd_ctx->sd->version != VERSION_FLEX_2) { + if (!init_image_tensor.empty()) { + auto masked_image = ((1.0f - mask_image_tensor) * (init_image_tensor - 0.5f)) + 0.5f; + masked_init_latent = sd_ctx->sd->encode_first_stage(masked_image); + if (masked_init_latent.empty()) { + LOG_ERROR("failed to encode masked init image"); + return 
std::nullopt; + } + } else { + masked_init_latent = sd::Tensor::zeros_like(init_latent); + } + } else { + masked_init_latent = ((1.0f - latent_mask) * init_latent); + } + + auto uncond_masked_init_latent = sd::Tensor::zeros_like(masked_init_latent); + + if (sd_ctx->sd->version == VERSION_FLUX_FILL) { + auto mask = mask_image_tensor.reshape({request->vae_scale_factor, + request->width / request->vae_scale_factor, + request->vae_scale_factor, + request->height / request->vae_scale_factor}); + mask = mask.permute({1, 3, 0, 2}).reshape({request->width / request->vae_scale_factor, request->height / request->vae_scale_factor, request->vae_scale_factor * request->vae_scale_factor, 1}); + + concat_latent = sd::ops::concat(masked_init_latent, mask, 2); + uncond_concat_latent = sd::ops::concat(uncond_masked_init_latent, mask, 2); + } else if (sd_ctx->sd->version == VERSION_FLEX_2) { + concat_latent = sd::ops::concat(masked_init_latent, latent_mask, 2); + if (!control_latent.empty()) { + concat_latent = sd::ops::concat(concat_latent, control_latent, 2); + } else { + concat_latent = sd::ops::concat(concat_latent, sd::Tensor::zeros_like(masked_init_latent), 2); + } + + uncond_concat_latent = sd::ops::concat(uncond_masked_init_latent, latent_mask, 2); + uncond_concat_latent = sd::ops::concat(uncond_concat_latent, sd::Tensor::zeros_like(masked_init_latent), 2); + } else { // SD1.x SD2.x SDXL inpaint + concat_latent = sd::ops::concat(latent_mask, masked_init_latent, 2); + uncond_concat_latent = sd::ops::concat(latent_mask, uncond_masked_init_latent, 2); + } + } + if (sd_version_is_unet_edit(sd_ctx->sd->version)) { + concat_latent = sd::ops::interpolate(ref_latents[0], init_latent.shape()); + uncond_concat_latent = sd::Tensor::zeros_like(concat_latent); + } + if (sd_version_is_control(sd_ctx->sd->version)) { + if (!control_latent.empty()) { + concat_latent = control_latent; + } else { + concat_latent = sd::Tensor::zeros_like(init_latent); + } + uncond_concat_latent = 
sd::Tensor::zeros_like(concat_latent); + } + + if (sd_img_gen_params->init_image.data != nullptr || sd_img_gen_params->ref_images_count > 0) { + int64_t t1 = ggml_time_ms(); + LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - prepare_start_ms) * 1.0f / 1000); + } + + ImageGenerationLatents latents; + latents.init_latent = std::move(init_latent); + latents.concat_latent = std::move(concat_latent); + latents.uncond_concat_latent = std::move(uncond_concat_latent); + latents.control_image = std::move(control_image_tensor); + latents.ref_images = std::move(ref_images); + latents.ref_latents = std::move(ref_latents); + + if (sd_version_is_inpaint(sd_ctx->sd->version)) { + latents.denoise_mask = std::move(latent_mask); + } + + return latents; +} + +static std::optional prepare_image_generation_embeds(sd_ctx_t* sd_ctx, + const sd_img_gen_params_t* sd_img_gen_params, + GenerationRequest* request, + SamplePlan* plan, + ImageGenerationLatents* latents) { + ConditionerParams condition_params; + condition_params.text = request->prompt; + condition_params.clip_skip = request->clip_skip; + condition_params.width = request->width; + condition_params.height = request->height; + condition_params.ref_images = &latents->ref_images; + condition_params.adm_in_channels = static_cast(sd_ctx->sd->diffusion_model->get_adm_in_channels()); + + auto id_cond = sd_ctx->sd->get_pmid_conditon(request->pm_params, condition_params); + int64_t prepare_start_ms = ggml_time_ms(); + condition_params.zero_out_masked = false; + auto cond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads, + condition_params); + if (cond.c_concat.empty()) { + cond.c_concat = latents->concat_latent; // TODO: optimize + } + + SDCondition uncond; + if (request->use_uncond || request->use_high_noise_uncond) { + bool zero_out_masked = false; + if (sd_version_is_sdxl(sd_ctx->sd->version) && + request->negative_prompt.empty() && + !sd_ctx->sd->is_using_edm_v_parameterization) { + 
zero_out_masked = true; + } + condition_params.text = request->negative_prompt; + condition_params.zero_out_masked = zero_out_masked; + uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads, + condition_params); + if (uncond.c_concat.empty()) { + uncond.c_concat = latents->uncond_concat_latent; // TODO: optimize + } + } + + int64_t t1 = ggml_time_ms(); + LOG_INFO("get_learned_condition completed, taking %.2fs", (t1 - prepare_start_ms) * 1.0f / 1000); + + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->cond_stage_model->free_params_buffer(); + } + + ImageGenerationEmbeds embeds; + if (request->use_img_cond) { + embeds.img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat); + } + embeds.cond = std::move(cond); + embeds.uncond = std::move(uncond); + embeds.id_cond = std::move(id_cond); + + return embeds; +} + +static sd_image_t* decode_image_outputs(sd_ctx_t* sd_ctx, + const GenerationRequest& request, + const std::vector>& final_latents) { + if (final_latents.size() != static_cast(request.batch_count)) { + LOG_ERROR("expected %d latents, got %zu", request.batch_count, final_latents.size()); + return nullptr; + } + LOG_INFO("decoding %zu latents", final_latents.size()); + std::vector> decoded_images; + int64_t t0 = ggml_time_ms(); + + for (size_t i = 0; i < final_latents.size(); i++) { + int64_t t1 = ggml_time_ms(); + sd::Tensor image = sd_ctx->sd->decode_first_stage(final_latents[i]); + if (image.empty()) { + LOG_ERROR("decode_first_stage failed for latent %" PRId64, i + 1); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->first_stage_model->free_params_buffer(); + } + return nullptr; + } + decoded_images.push_back(std::move(image)); + int64_t t2 = ggml_time_ms(); + LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000); + } + + int64_t t4 = ggml_time_ms(); + LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t0) * 1.0f / 1000); + if 
(sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->first_stage_model->free_params_buffer(); + } + + sd_image_t* result_images = (sd_image_t*)calloc(request.batch_count, sizeof(sd_image_t)); + if (result_images == nullptr) { + return nullptr; + } + memset(result_images, 0, request.batch_count * sizeof(sd_image_t)); + + for (size_t i = 0; i < decoded_images.size(); i++) { + result_images[i] = tensor_to_sd_image(decoded_images[i]); + } + + return result_images; +} + +SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) { + if (sd_ctx == nullptr || sd_img_gen_params == nullptr) { return nullptr; } - int64_t seed = sd_vid_gen_params->seed; - if (seed < 0) { - seed = (int)time(nullptr); + int64_t t0 = ggml_time_ms(); + sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params; + GenerationRequest request(sd_ctx, sd_img_gen_params); + LOG_INFO("generate_image %dx%d", request.width, request.height); + + sd_ctx->sd->rng->manual_seed(request.seed); + sd_ctx->sd->sampler_rng->manual_seed(request.seed); + sd_ctx->sd->set_flow_shift(sd_img_gen_params->sample_params.flow_shift); + sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count); + + ImageVaeAxesGuard axes_guard(sd_ctx, sd_img_gen_params, request); + + SamplePlan plan(sd_ctx, sd_img_gen_params, request); + auto latents_opt = prepare_image_generation_latents(sd_ctx, + sd_img_gen_params, + &request, + &plan); + if (!latents_opt.has_value()) { + return nullptr; + } + ImageGenerationLatents latents = std::move(*latents_opt); + + auto embeds_opt = prepare_image_generation_embeds(sd_ctx, + sd_img_gen_params, + &request, + &plan, + &latents); + if (!embeds_opt.has_value()) { + return nullptr; + } + ImageGenerationEmbeds embeds = std::move(*embeds_opt); + + std::vector> final_latents; + int64_t denoise_start = ggml_time_ms(); + for (int b = 0; b < request.batch_count; b++) { + int64_t sampling_start = ggml_time_ms(); + int64_t cur_seed = 
request.seed + b; + LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, request.batch_count, cur_seed); + + sd_ctx->sd->rng->manual_seed(cur_seed); + sd_ctx->sd->sampler_rng->manual_seed(cur_seed); + sd::Tensor noise = sd::randn_like(latents.init_latent, sd_ctx->sd->rng); + + sd::Tensor x_0 = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model, + true, + latents.init_latent, + std::move(noise), + embeds.cond, + embeds.uncond, + embeds.img_cond, + embeds.id_cond, + latents.control_image, + request.control_strength, + request.guidance, + request.eta, + request.shifted_timestep, + plan.sample_method, + plan.sigmas, + plan.start_merge_step, + latents.ref_latents, + request.increase_ref_index, + latents.denoise_mask, + sd::Tensor(), + 1.f, + request.cache_params); + int64_t sampling_end = ggml_time_ms(); + if (!x_0.empty()) { + LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); + final_latents.push_back(std::move(x_0)); + continue; + } + + LOG_ERROR("sampling for image %d/%d failed after %.2fs", + b + 1, + request.batch_count, + (sampling_end - sampling_start) * 1.0f / 1000); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->diffusion_model->free_params_buffer(); + } + return nullptr; + } + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->diffusion_model->free_params_buffer(); + } + int64_t denoise_end = ggml_time_ms(); + LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", + final_latents.size(), + (denoise_end - denoise_start) * 1.0f / 1000); + + auto result = decode_image_outputs(sd_ctx, request, final_latents); + if (result == nullptr) { + return nullptr; } - sd_ctx->sd->rng->manual_seed(seed); - sd_ctx->sd->sampler_rng->manual_seed(seed); + sd_ctx->sd->lora_stat(); - int64_t t0 = ggml_time_ms(); + int64_t t1 = ggml_time_ms(); + LOG_INFO("generate_image completed in %.2fs", (t1 - t0) * 1.0f / 1000); + return result; +} - // Apply lora - sd_ctx->sd->apply_loras(sd_vid_gen_params->loras, 
sd_vid_gen_params->lora_count); +static std::optional prepare_video_generation_latents(sd_ctx_t* sd_ctx, + const sd_vid_gen_params_t* sd_vid_gen_params, + GenerationRequest* request) { + ImageGenerationLatents latents; + int64_t prepare_start_ms = ggml_time_ms(); + + sd::Tensor start_image; + sd::Tensor end_image; + + if (sd_vid_gen_params->init_image.data) { + start_image = sd_image_to_tensor(sd_vid_gen_params->init_image, request->width, request->height); + } + + if (sd_vid_gen_params->end_image.data) { + end_image = sd_image_to_tensor(sd_vid_gen_params->end_image, request->width, request->height); + } - ggml_tensor* init_latent = nullptr; - ggml_tensor* clip_vision_output = nullptr; - ggml_tensor* concat_latent = nullptr; - ggml_tensor* denoise_mask = nullptr; - ggml_tensor* vace_context = nullptr; - int64_t ref_image_num = 0; // for vace if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B" || sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-I2V-14B" || sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-1.3B" || @@ -3700,331 +3196,370 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B" || sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-1.3B" || sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-FLF2V-14B") { - if (sd_vid_gen_params->init_image.data) { - clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, -2); + if (!start_image.empty()) { + auto clip_vision_output = sd_ctx->sd->get_clip_vision_output(start_image, false, -2); + if (clip_vision_output.empty()) { + LOG_ERROR("failed to compute clip vision output for init image"); + return std::nullopt; + } + latents.clip_vision_output = std::move(clip_vision_output); } else { - clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, -2, true); + latents.clip_vision_output = 
sd_ctx->sd->get_clip_vision_output(start_image, false, -2, true); } if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-FLF2V-14B") { - ggml_tensor* end_image_clip_vision_output = nullptr; - if (sd_vid_gen_params->end_image.data) { - end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->end_image, false, -2); + sd::Tensor end_image_clip_vision_output; + if (!end_image.empty()) { + end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(end_image, false, -2); + if (end_image_clip_vision_output.empty()) { + LOG_ERROR("failed to compute clip vision output for end image"); + return std::nullopt; + } } else { - end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->end_image, false, -2, true); + end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(end_image, false, -2, true); } - clip_vision_output = ggml_ext_tensor_concat(work_ctx, clip_vision_output, end_image_clip_vision_output, 1); + latents.clip_vision_output = sd::ops::concat(latents.clip_vision_output, end_image_clip_vision_output, 1); } int64_t t1 = ggml_time_ms(); - LOG_INFO("get_clip_vision_output completed, taking %" PRId64 " ms", t1 - t0); + LOG_INFO("get_clip_vision_output completed, taking %" PRId64 " ms", t1 - prepare_start_ms); } - int64_t t1 = ggml_time_ms(); - ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 3); - ggml_ext_tensor_iter(image, [&](ggml_tensor* image, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = 0.5f; - if (i2 == 0 && sd_vid_gen_params->init_image.data) { // start image - value = *(sd_vid_gen_params->init_image.data + i1 * width * 3 + i0 * 3 + i3); - value /= 255.f; - } else if (i2 == frames - 1 && sd_vid_gen_params->end_image.data) { - value = *(sd_vid_gen_params->end_image.data + i1 * width * 3 + i0 * 3 + i3); - value /= 255.f; - } - ggml_ext_tensor_set_f32(image, value, i0, i1, i2, i3); - }); + int64_t t1 = 
ggml_time_ms(); + sd::Tensor image = sd::full({request->width, request->height, request->frames, 3, 1}, 0.5f); + if (!start_image.empty()) { + sd::ops::slice_assign(&image, 2, 0, 1, start_image.unsqueeze(2)); + } + if (!end_image.empty()) { + sd::ops::slice_assign(&image, 2, request->frames - 1, request->frames, end_image.unsqueeze(2)); + } - concat_latent = sd_ctx->sd->encode_first_stage(work_ctx, image); // [b*c, t, h/vae_scale_factor, w/vae_scale_factor] + auto concat_latent = sd_ctx->sd->encode_first_stage(image); // [b, c, t, h/vae_scale_factor, w/vae_scale_factor] + if (concat_latent.empty()) { + LOG_ERROR("failed to encode video conditioning frames"); + return std::nullopt; + } + latents.concat_latent = std::move(concat_latent); int64_t t2 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1); - ggml_tensor* concat_mask = ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - concat_latent->ne[0], - concat_latent->ne[1], - concat_latent->ne[2], - 4); // [b*4, t, w/vae_scale_factor, h/vae_scale_factor] - ggml_ext_tensor_iter(concat_mask, [&](ggml_tensor* concat_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = 0.0f; - if (i2 == 0 && sd_vid_gen_params->init_image.data) { // start image - value = 1.0f; - } else if (i2 == frames - 1 && sd_vid_gen_params->end_image.data && i3 == 3) { - value = 1.0f; - } - ggml_ext_tensor_set_f32(concat_mask, value, i0, i1, i2, i3); - }); - - concat_latent = ggml_ext_tensor_concat(work_ctx, concat_mask, concat_latent, 3); // [b*(c+4), t, h/vae_scale_factor, w/vae_scale_factor] - } else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-TI2V-5B" && sd_vid_gen_params->init_image.data) { + sd::Tensor concat_mask = sd::zeros({latents.concat_latent.shape()[0], + latents.concat_latent.shape()[1], + latents.concat_latent.shape()[2], + 4, + 1}); // [b, 4, t, h/vae_scale_factor, w/vae_scale_factor] + if (!start_image.empty()) { + sd::ops::fill_slice(&concat_mask, 2, 0, 1, 1.0f); + } + 
if (!end_image.empty()) { + auto last_channel = sd::ops::slice(concat_mask, 3, 3, 4); + sd::ops::fill_slice(&last_channel, 2, last_channel.shape()[2] - 1, last_channel.shape()[2], 1.0f); + sd::ops::slice_assign(&concat_mask, 3, 3, 4, last_channel); + } + latents.concat_latent = sd::ops::concat(concat_mask, latents.concat_latent, 3); // [b, 4+c, t, h/vae_scale_factor, w/vae_scale_factor] + } else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-TI2V-5B" && !start_image.empty()) { LOG_INFO("IMG2VID"); - int64_t t1 = ggml_time_ms(); - ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - sd_image_to_ggml_tensor(sd_vid_gen_params->init_image, init_img); - init_img = ggml_reshape_4d(work_ctx, init_img, width, height, 1, 3); + int64_t t1 = ggml_time_ms(); + auto init_img = start_image.reshape({start_image.shape()[0], start_image.shape()[1], 1, start_image.shape()[2], 1}); + auto init_image_latent = sd_ctx->sd->encode_first_stage(init_img); // [b, c, 1, h/vae_scale_factor, w/vae_scale_factor] + if (init_image_latent.empty()) { + LOG_ERROR("failed to encode init video frame"); + return std::nullopt; + } - auto init_image_latent = sd_ctx->sd->encode_to_vae_latents(work_ctx, init_img); // [b*c, 1, h/16, w/16] + latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true); // [b, c, t, h/vae_scale_factor, w/vae_scale_factor] + sd::ops::slice_assign(&latents.init_latent, 2, 0, init_image_latent.shape()[2], init_image_latent); - init_latent = sd_ctx->sd->generate_init_latent(work_ctx, width, height, frames, true); - denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], 1); - ggml_set_f32(denoise_mask, 1.f); - - init_latent = sd_ctx->sd->first_stage_model->diffusion_to_vae_latents(work_ctx, init_latent); - - ggml_ext_tensor_iter(init_image_latent, [&](ggml_tensor* t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - 
float value = ggml_ext_tensor_get_f32(t, i0, i1, i2, i3); - ggml_ext_tensor_set_f32(init_latent, value, i0, i1, i2, i3); - if (i3 == 0) { - ggml_ext_tensor_set_f32(denoise_mask, 0.f, i0, i1, i2, i3); - } - }); - - init_latent = sd_ctx->sd->first_stage_model->vae_to_diffuison_latents(work_ctx, init_latent); + latents.denoise_mask = sd::full({latents.init_latent.shape()[0], latents.init_latent.shape()[1], latents.init_latent.shape()[2], 1, 1}, 1.f); + sd::ops::fill_slice(&latents.denoise_mask, 2, 0, init_image_latent.shape()[2], 0.0f); int64_t t2 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1); } else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-VACE-1.3B" || sd_ctx->sd->diffusion_model->get_desc() == "Wan2.x-VACE-14B") { LOG_INFO("VACE"); - int64_t t1 = ggml_time_ms(); - ggml_tensor* ref_image_latent = nullptr; - if (sd_vid_gen_params->init_image.data) { - ggml_tensor* ref_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - sd_image_to_ggml_tensor(sd_vid_gen_params->init_image, ref_img); - ref_img = ggml_reshape_4d(work_ctx, ref_img, width, height, 1, 3); - - ref_image_latent = sd_ctx->sd->encode_first_stage(work_ctx, ref_img); // [b*c, 1, h/16, w/16] - auto zero_latent = ggml_dup_tensor(work_ctx, ref_image_latent); - ggml_set_f32(zero_latent, 0.f); - ref_image_latent = ggml_ext_tensor_concat(work_ctx, ref_image_latent, zero_latent, 3); // [b*2*c, 1, h/16, w/16] + int64_t t1 = ggml_time_ms(); + sd::Tensor ref_image_latent; + if (!start_image.empty()) { + auto ref_img = start_image.reshape({start_image.shape()[0], start_image.shape()[1], 1, start_image.shape()[2], 1}); + auto encoded_ref = sd_ctx->sd->encode_first_stage(ref_img); // [b, c, 1, h/vae_scale_factor, w/vae_scale_factor] + if (encoded_ref.empty()) { + LOG_ERROR("failed to encode VACE reference image"); + return std::nullopt; + } + ref_image_latent = sd::ops::concat(encoded_ref, sd::zeros(encoded_ref.shape()), 3); // [b, 2*c, 1, 
h/vae_scale_factor, w/vae_scale_factor] } - ggml_tensor* control_video = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 3); - ggml_ext_tensor_iter(control_video, [&](ggml_tensor* control_video, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = 0.5f; - if (i2 < sd_vid_gen_params->control_frames_size) { - value = sd_image_get_f32(sd_vid_gen_params->control_frames[i2], i0, i1, i3); - } - ggml_ext_tensor_set_f32(control_video, value, i0, i1, i2, i3); - }); - ggml_tensor* mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 1); - ggml_set_f32(mask, 1.0f); - ggml_tensor* inactive = ggml_dup_tensor(work_ctx, control_video); - ggml_tensor* reactive = ggml_dup_tensor(work_ctx, control_video); + sd::Tensor control_video = sd::full({request->width, request->height, request->frames, 3, 1}, 0.5f); + int64_t control_frame_count = std::min(request->frames, sd_vid_gen_params->control_frames_size); + for (int64_t i = 0; i < control_frame_count; ++i) { + auto control_frame = sd_image_to_tensor(sd_vid_gen_params->control_frames[i], request->width, request->height); + sd::ops::slice_assign(&control_video, 2, i, i + 1, control_frame.unsqueeze(2)); + } - ggml_ext_tensor_iter(control_video, [&](ggml_tensor* t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float control_video_value = ggml_ext_tensor_get_f32(t, i0, i1, i2, i3) - 0.5f; - float mask_value = ggml_ext_tensor_get_f32(mask, i0, i1, i2, 0); - float inactive_value = (control_video_value * (1.f - mask_value)) + 0.5f; - float reactive_value = (control_video_value * mask_value) + 0.5f; + sd::Tensor mask = sd::full({request->width, request->height, request->frames, 1, 1}, 1.0f); - ggml_ext_tensor_set_f32(inactive, inactive_value, i0, i1, i2, i3); - ggml_ext_tensor_set_f32(reactive, reactive_value, i0, i1, i2, i3); - }); + control_video = control_video - 0.5f; + sd::Tensor inactive = control_video * (1.0f - mask) + 0.5f; + sd::Tensor reactive = control_video * mask + 0.5f; 
- inactive = sd_ctx->sd->encode_first_stage(work_ctx, inactive); // [b*c, t, h/vae_scale_factor, w/vae_scale_factor] - reactive = sd_ctx->sd->encode_first_stage(work_ctx, reactive); // [b*c, t, h/vae_scale_factor, w/vae_scale_factor] + inactive = sd_ctx->sd->encode_first_stage(inactive); // [b, c, t, h/vae_scale_factor, w/vae_scale_factor] + if (inactive.empty()) { + LOG_ERROR("failed to encode VACE inactive context"); + return std::nullopt; + } - int64_t length = inactive->ne[2]; - if (ref_image_latent) { + reactive = sd_ctx->sd->encode_first_stage(reactive); // [b, c, t, h/vae_scale_factor, w/vae_scale_factor] + if (reactive.empty()) { + LOG_ERROR("failed to encode VACE reactive context"); + return std::nullopt; + } + + int64_t length = inactive.shape()[2]; + if (!ref_image_latent.empty()) { length += 1; - frames = static_cast((length - 1) * 4 + 1); - ref_image_num = 1; + request->frames = static_cast((length - 1) * 4 + 1); + latents.ref_image_num = 1; } - vace_context = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, inactive->ne[0], inactive->ne[1], length, 96); // [b*96, t, h/vae_scale_factor, w/vae_scale_factor] - ggml_ext_tensor_iter(vace_context, [&](ggml_tensor* vace_context, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value; - if (i3 < 32) { - if (ref_image_latent && i2 == 0) { - value = ggml_ext_tensor_get_f32(ref_image_latent, i0, i1, 0, i3); - } else { - if (i3 < 16) { - value = ggml_ext_tensor_get_f32(inactive, i0, i1, i2 - ref_image_num, i3); - } else { - value = ggml_ext_tensor_get_f32(reactive, i0, i1, i2 - ref_image_num, i3 - 16); - } - } - } else { // mask - if (ref_image_latent && i2 == 0) { - value = 0.f; - } else { - int64_t vae_stride = vae_scale_factor; - int64_t mask_height_index = i1 * vae_stride + (i3 - 32) / vae_stride; - int64_t mask_width_index = i0 * vae_stride + (i3 - 32) % vae_stride; - value = ggml_ext_tensor_get_f32(mask, mask_width_index, mask_height_index, i2 - ref_image_num, 0); - } - } - 
ggml_ext_tensor_set_f32(vace_context, value, i0, i1, i2, i3); - }); - int64_t t2 = ggml_time_ms(); + auto vace_context = sd::ops::concat(inactive, reactive, 3); // [b, 2*c, t, h/vae_scale_factor, w/vae_scale_factor] + + mask = sd::full({request->width, request->height, inactive.shape()[2], 1, 1}, 1.0f); + auto mask_context = mask.reshape({request->vae_scale_factor, + inactive.shape()[0], + request->vae_scale_factor, + inactive.shape()[1], + inactive.shape()[2]}); // [t, h/vae_scale_factor, vae_scale_factor, w/vae_scale_factor, vae_scale_factor] + mask_context = mask_context.permute({1, 3, 4, 0, 2}) // [vae_scale_factor, vae_scale_factor, t, h/vae_scale_factor, w/vae_scale_factor] + .reshape({inactive.shape()[0], + inactive.shape()[1], + inactive.shape()[2], + request->vae_scale_factor * request->vae_scale_factor}); // [vae_scale_factor*vae_scale_factor, t, h/vae_scale_factor, w/vae_scale_factor] + + if (!ref_image_latent.empty()) { + vace_context = sd::ops::concat(ref_image_latent, vace_context, 2); // [b, 2*c, t+1, h/vae_scale_factor, w/vae_scale_factor] + auto mask_pad = sd::zeros({mask_context.shape()[0], + mask_context.shape()[1], + 1, + mask_context.shape()[3]}); // [vae_scale_factor*vae_scale_factor, 1, h/vae_scale_factor, w/vae_scale_factor] + mask_context = sd::ops::concat(mask_pad, mask_context, 2); // [vae_scale_factor*vae_scale_factor, t + 1, h/vae_scale_factor, w/vae_scale_factor] + } + + mask_context.unsqueeze_(mask_context.dim()); // [b, vae_scale_factor*vae_scale_factor, t + 1 or t, h/vae_scale_factor, w/vae_scale_factor] + + latents.vace_context = sd::ops::concat(vace_context, mask_context, 3); // [b, 2*c + vae_scale_factor*vae_scale_factor, t + 1 or t, h/vae_scale_factor, w/vae_scale_factor] + int64_t t2 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1); } - if (init_latent == nullptr) { - init_latent = sd_ctx->sd->generate_init_latent(work_ctx, width, height, frames, true); + if 
(latents.init_latent.empty()) { + latents.init_latent = sd_ctx->sd->generate_init_latent(request->width, request->height, request->frames, true); } - // Get learned condition + return latents; +} + +static ImageGenerationEmbeds prepare_video_generation_embeds(sd_ctx_t* sd_ctx, + const sd_vid_gen_params_t* sd_vid_gen_params, + const GenerationRequest& request, + const ImageGenerationLatents& latents) { + ImageGenerationEmbeds embeds; ConditionerParams condition_params; - condition_params.clip_skip = sd_vid_gen_params->clip_skip; + condition_params.clip_skip = request.clip_skip; + condition_params.text = request.prompt; condition_params.zero_out_masked = true; - condition_params.text = prompt; - int64_t t1 = ggml_time_ms(); - SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, - sd_ctx->sd->n_threads, - condition_params); - cond.c_concat = concat_latent; - cond.c_vector = clip_vision_output; - SDCondition uncond; - if (sd_vid_gen_params->sample_params.guidance.txt_cfg != 1.0 || sd_vid_gen_params->high_noise_sample_params.guidance.txt_cfg != 1.0) { - condition_params.text = negative_prompt; - uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, - sd_ctx->sd->n_threads, - condition_params); - uncond.c_concat = concat_latent; - uncond.c_vector = clip_vision_output; + int64_t prepare_start_ms = ggml_time_ms(); + embeds.cond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads, + condition_params); + embeds.cond.c_concat = latents.concat_latent; + embeds.cond.c_vector = latents.clip_vision_output; + if (request.use_uncond) { + condition_params.text = request.negative_prompt; + embeds.uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads, + condition_params); + embeds.uncond.c_concat = latents.concat_latent; + embeds.uncond.c_vector = latents.clip_vision_output; } - int64_t t2 = ggml_time_ms(); - LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t2 - t1); + + 
int64_t t1 = ggml_time_ms(); + LOG_INFO("get_learned_condition completed, taking %.2fs", (t1 - prepare_start_ms) * 1.0f / 1000); if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->cond_stage_model->free_params_buffer(); } + return embeds; +} - int W = width / vae_scale_factor; - int H = height / vae_scale_factor; - int T = static_cast(init_latent->ne[2]); - int C = sd_ctx->sd->get_latent_channel(); - - ggml_tensor* final_latent; - ggml_tensor* x_t = init_latent; - ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, T, C); - ggml_ext_im_set_randn_f32(noise, sd_ctx->sd->rng); - // High Noise Sample - if (high_noise_sample_steps > 0) { - LOG_DEBUG("sample(high noise) %dx%dx%d", W, H, T); - enum sample_method_t high_noise_sample_method = sd_vid_gen_params->high_noise_sample_params.sample_method; - if (high_noise_sample_method == SAMPLE_METHOD_COUNT) { - high_noise_sample_method = sd_get_default_sample_method(sd_ctx); - } - LOG_INFO("sampling(high noise) using %s method", sampling_methods_str[high_noise_sample_method]); - - int64_t sampling_start = ggml_time_ms(); - - std::vector high_noise_sigmas = std::vector(sigmas.begin(), sigmas.begin() + high_noise_sample_steps + 1); - sigmas = std::vector(sigmas.begin() + high_noise_sample_steps, sigmas.end()); - - x_t = sd_ctx->sd->sample(work_ctx, - sd_ctx->sd->high_noise_diffusion_model, - false, - x_t, - noise, - cond, - uncond, - {}, - nullptr, - 0, - sd_vid_gen_params->high_noise_sample_params.guidance, - sd_vid_gen_params->high_noise_sample_params.eta, - sd_vid_gen_params->high_noise_sample_params.shifted_timestep, - high_noise_sample_method, - high_noise_sigmas, - -1, - {}, - {}, - false, - denoise_mask, - vace_context, - sd_vid_gen_params->vace_strength, - &sd_vid_gen_params->cache); - - int64_t sampling_end = ggml_time_ms(); - LOG_INFO("sampling(high noise) completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately) { - 
sd_ctx->sd->high_noise_diffusion_model->free_params_buffer(); - } - noise = nullptr; +static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx, + const sd::Tensor& final_latent, + int* num_frames_out) { + if (final_latent.empty()) { + LOG_ERROR("no latent video to decode"); + return nullptr; } - - // Sample - { - LOG_DEBUG("sample %dx%dx%d", W, H, T); - int64_t sampling_start = ggml_time_ms(); - - final_latent = sd_ctx->sd->sample(work_ctx, - sd_ctx->sd->diffusion_model, - true, - x_t, - noise, - cond, - uncond, - {}, - nullptr, - 0, - sd_vid_gen_params->sample_params.guidance, - sd_vid_gen_params->sample_params.eta, - sd_vid_gen_params->sample_params.shifted_timestep, - sample_method, - sigmas, - -1, - {}, - {}, - false, - denoise_mask, - vace_context, - sd_vid_gen_params->vace_strength, - &sd_vid_gen_params->cache); - - int64_t sampling_end = ggml_time_ms(); - LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } - } - - if (ref_image_num > 0) { - ggml_tensor* trim_latent = ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - final_latent->ne[0], - final_latent->ne[1], - final_latent->ne[2] - ref_image_num, - final_latent->ne[3]); - ggml_ext_tensor_iter(trim_latent, [&](ggml_tensor* trim_latent, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = ggml_ext_tensor_get_f32(final_latent, i0, i1, i2 + ref_image_num, i3); - ggml_ext_tensor_set_f32(trim_latent, value, i0, i1, i2, i3); - }); - final_latent = trim_latent; - } - - int64_t t4 = ggml_time_ms(); - LOG_INFO("generating latent video completed, taking %.2fs", (t4 - t2) * 1.0f / 1000); - ggml_tensor* vid = sd_ctx->sd->decode_first_stage(work_ctx, final_latent, true); - int64_t t5 = ggml_time_ms(); + int64_t t4 = ggml_time_ms(); + sd::Tensor vid = sd_ctx->sd->decode_first_stage(final_latent, true); + int64_t t5 = ggml_time_ms(); LOG_INFO("decode_first_stage 
completed, taking %.2fs", (t5 - t4) * 1.0f / 1000); if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->first_stage_model->free_params_buffer(); } - - sd_ctx->sd->lora_stat(); - - sd_image_t* result_images = (sd_image_t*)calloc(vid->ne[2], sizeof(sd_image_t)); - if (result_images == nullptr) { - ggml_free(work_ctx); + if (vid.empty()) { + LOG_ERROR("decode_first_stage failed for video"); return nullptr; } - *num_frames_out = static_cast(vid->ne[2]); - for (int64_t i = 0; i < vid->ne[2]; i++) { - result_images[i].width = static_cast(vid->ne[0]); - result_images[i].height = static_cast(vid->ne[1]); - result_images[i].channel = 3; - result_images[i].data = ggml_tensor_to_sd_image(vid, static_cast(i), true); + sd_image_t* result_images = (sd_image_t*)calloc(vid.shape()[2], sizeof(sd_image_t)); + if (result_images == nullptr) { + return nullptr; + } + if (num_frames_out != nullptr) { + *num_frames_out = static_cast(vid.shape()[2]); } - ggml_free(work_ctx); - LOG_INFO("generate_video completed in %.2fs", (t5 - t0) * 1.0f / 1000); + for (int64_t i = 0; i < vid.shape()[2]; i++) { + result_images[i] = tensor_to_sd_image(vid, static_cast(i)); + } return result_images; } + +SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out) { + if (sd_ctx == nullptr || sd_vid_gen_params == nullptr) { + return nullptr; + } + if (num_frames_out != nullptr) { + *num_frames_out = 0; + } + int64_t t0 = ggml_time_ms(); + sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params; + GenerationRequest request(sd_ctx, sd_vid_gen_params); + sd_ctx->sd->rng->manual_seed(request.seed); + sd_ctx->sd->sampler_rng->manual_seed(request.seed); + sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift); + sd_ctx->sd->apply_loras(sd_vid_gen_params->loras, sd_vid_gen_params->lora_count); + + SamplePlan plan(sd_ctx, sd_vid_gen_params, request); + auto latent_inputs_opt = prepare_video_generation_latents(sd_ctx, 
sd_vid_gen_params, &request); + if (!latent_inputs_opt.has_value()) { + return nullptr; + } + ImageGenerationLatents latents = std::move(*latent_inputs_opt); + ImageGenerationEmbeds embeds = prepare_video_generation_embeds(sd_ctx, + sd_vid_gen_params, + request, + latents); + LOG_INFO("generate_video %dx%dx%d", + request.width, + request.height, + request.frames); + + int64_t latent_start = ggml_time_ms(); + int W = request.width / request.vae_scale_factor; + int H = request.height / request.vae_scale_factor; + int T = static_cast(latents.init_latent.shape()[2]); + + sd::Tensor x_t = latents.init_latent; + sd::Tensor noise = sd::Tensor::randn_like(x_t, sd_ctx->sd->rng); + + if (plan.high_noise_sample_steps > 0) { + LOG_DEBUG("sample(high noise) %dx%dx%d", W, H, T); + + int64_t sampling_start = ggml_time_ms(); + std::vector high_noise_sigmas(plan.sigmas.begin(), plan.sigmas.begin() + plan.high_noise_sample_steps + 1); + plan.sigmas = std::vector(plan.sigmas.begin() + plan.high_noise_sample_steps, plan.sigmas.end()); + + sd::Tensor x_t_sampled = sd_ctx->sd->sample(sd_ctx->sd->high_noise_diffusion_model, + false, + x_t, + std::move(noise), + embeds.cond, + request.use_high_noise_uncond ? 
embeds.uncond : SDCondition(), + embeds.img_cond, + embeds.id_cond, + sd::Tensor(), + 0.f, + request.high_noise_guidance, + sd_vid_gen_params->high_noise_sample_params.eta, + request.shifted_timestep, + plan.high_noise_sample_method, + high_noise_sigmas, + -1, + std::vector>{}, + false, + latents.denoise_mask, + latents.vace_context, + request.vace_strength, + request.cache_params); + int64_t sampling_end = ggml_time_ms(); + if (x_t_sampled.empty()) { + LOG_ERROR("sampling(high noise) failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->high_noise_diffusion_model->free_params_buffer(); + } + return nullptr; + } + + x_t = std::move(x_t_sampled); + noise = {}; + LOG_INFO("sampling(high noise) completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->high_noise_diffusion_model->free_params_buffer(); + } + } + + LOG_DEBUG("sample %dx%dx%d", W, H, T); + int64_t sampling_start = ggml_time_ms(); + sd::Tensor final_latent = sd_ctx->sd->sample(sd_ctx->sd->diffusion_model, + true, + x_t, + std::move(noise), + embeds.cond, + request.use_uncond ? 
embeds.uncond : SDCondition(), + embeds.img_cond, + embeds.id_cond, + sd::Tensor(), + 0.f, + sd_vid_gen_params->sample_params.guidance, + sd_vid_gen_params->sample_params.eta, + sd_vid_gen_params->sample_params.shifted_timestep, + plan.sample_method, + plan.sigmas, + -1, + std::vector>{}, + false, + latents.denoise_mask, + latents.vace_context, + request.vace_strength, + request.cache_params); + + int64_t sampling_end = ggml_time_ms(); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->diffusion_model->free_params_buffer(); + } + if (final_latent.empty()) { + LOG_ERROR("sampling failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); + return nullptr; + } + LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); + + if (latents.ref_image_num > 0) { + final_latent = sd::ops::slice(final_latent, 2, latents.ref_image_num, final_latent.shape()[2]); + } + + int64_t latent_end = ggml_time_ms(); + LOG_INFO("generating latent video completed, taking %.2fs", (latent_end - latent_start) * 1.0f / 1000); + + auto result = decode_video_outputs(sd_ctx, final_latent, num_frames_out); + if (result == nullptr) { + return nullptr; + } + + sd_ctx->sd->lora_stat(); + + int64_t t1 = ggml_time_ms(); + LOG_INFO("generate_video completed in %.2fs", (t1 - t0) * 1.0f / 1000); + return result; +} diff --git a/src/t5.hpp b/src/t5.hpp index 5f8c99dd..f64d0b6d 100644 --- a/src/t5.hpp +++ b/src/t5.hpp @@ -1,1038 +1,1036 @@ -#ifndef __T5_HPP__ -#define __T5_HPP__ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "darts.h" -#include "ggml_extend.hpp" -#include "json.hpp" -#include "model.h" -#include "vocab/vocab.h" - -// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h -// and https://github.com/google/sentencepiece/blob/master/src/unigram_model.h. 
-// Original License: https://github.com/google/sentencepiece/blob/master/LICENSE -// -// Since tokenization is not the bottleneck in SD, performance was not a major consideration -// during the migration. -class MetaspacePreTokenizer { -private: - std::string replacement; - bool add_prefix_space; - -public: - MetaspacePreTokenizer(const std::string replacement = " ", bool add_prefix_space = true) - : replacement(replacement), add_prefix_space(add_prefix_space) {} - - std::string tokenize(const std::string& input) const { - std::string tokens; - std::stringstream ss(input); - - if (add_prefix_space) { - tokens += replacement; - } - - std::string token; - bool firstToken = true; - while (std::getline(ss, token, ' ')) { - if (!firstToken) - tokens += replacement + token; - else - tokens += token; - - firstToken = false; - } - - return tokens; - } -}; - -using EncodeResult = std::vector>; -class T5UniGramTokenizer { -public: - enum Status { - OK, - NO_PIECES_LOADED, - NO_ENTRY_FOUND, - BUILD_DOUBLE_ARRAY_FAILED, - PIECE_ALREADY_DEFINED, - INVLIAD_JSON - }; - -protected: - MetaspacePreTokenizer pre_tokenizer; - - // all pairs - std::vector> piece_score_pairs; - - float min_score_ = 0.0; - float max_score_ = 0.0; - std::unique_ptr trie_; - - // Maximum size of the return value of Trie, which corresponds - // to the maximum size of shared common prefix in the sentence pieces. - int trie_results_size_; - // unknown id. - int unk_id_ = 2; - std::string eos_token_ = ""; - int eos_id_ = 1; - int pad_id_ = 0; - // status. 
- Status status_ = OK; - - float kUnkPenalty = 10.0; - - std::string replacement; - bool add_prefix_space = true; - - void InitializePieces(const std::string& json_str) { - nlohmann::json data; - - try { - data = nlohmann::json::parse(json_str); - } catch (const nlohmann::json::parse_error&) { - status_ = INVLIAD_JSON; - return; - } - if (!data.contains("model")) { - status_ = INVLIAD_JSON; - return; - } - nlohmann::json model = data["model"]; - if (!model.contains("vocab")) { - status_ = INVLIAD_JSON; - return; - } - if (model.contains("unk_id")) { - unk_id_ = model["unk_id"]; - } - - replacement = data["pre_tokenizer"]["replacement"]; - add_prefix_space = data["pre_tokenizer"]["add_prefix_space"]; - - pre_tokenizer = MetaspacePreTokenizer(replacement, add_prefix_space); - - for (const auto& item : model["vocab"]) { - if (item.size() != 2 || !item[0].is_string() || !item[1].is_number_float()) { - status_ = INVLIAD_JSON; - return; - } - std::string piece = item[0]; - if (piece.empty()) { - piece = ""; - } - float score = item[1]; - piece_score_pairs.emplace_back(piece, score); - } - } - - // Builds a Trie index. - void BuildTrie(std::vector>* pieces) { - if (status_ != OK) - return; - - if (pieces->empty()) { - status_ = NO_PIECES_LOADED; - return; - } - - // sort by sentencepiece since DoubleArray::build() - // only accepts sorted strings. - sort(pieces->begin(), pieces->end()); - - // Makes key/value set for DoubleArrayTrie. - std::vector key(pieces->size()); - std::vector value(pieces->size()); - for (size_t i = 0; i < pieces->size(); ++i) { - // LOG_DEBUG("%s %d", (*pieces)[i].first.c_str(), (*pieces)[i].second); - key[i] = (*pieces)[i].first.data(); // sorted piece. 
- value[i] = (*pieces)[i].second; // vocab_id - } - - trie_ = std::unique_ptr(new Darts::DoubleArray()); - if (trie_->build(key.size(), const_cast(&key[0]), nullptr, - &value[0]) != 0) { - status_ = BUILD_DOUBLE_ARRAY_FAILED; - return; - } - - // Computes the maximum number of shared prefixes in the trie. - const int kMaxTrieResultsSize = 1024; - std::vector results( - kMaxTrieResultsSize); - trie_results_size_ = 0; - for (const auto& p : *pieces) { - const size_t num_nodes = trie_->commonPrefixSearch( - p.first.data(), results.data(), results.size(), p.first.size()); - trie_results_size_ = std::max(trie_results_size_, static_cast(num_nodes)); - } - - if (trie_results_size_ == 0) - status_ = NO_ENTRY_FOUND; - } - - // Non-virtual (inlined) implementation for faster execution. - inline float GetScoreInlined(int id) const { - return piece_score_pairs[id].second; - } - - inline bool IsUnusedInlined(int id) const { - return false; // TODO - } - - inline bool IsUserDefinedInlined(int id) const { - return false; // TODO - } - - inline size_t OneCharLen(const char* src) const { - return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4]; - } - - // The optimized Viterbi encode. - // Main differences from the original function: - // 1. Memorizes the best path at each postion so far, - // 2. No need to store the Lattice nodes, - // 3. Works in utf-8 directly, - // 4. Defines a new struct with fewer fields than Lattice, - // 5. Does not depend on `class Lattice` nor call `SetSentence()`, - // `PopulateNodes()`, or `Viterbi()`. It does everything in one function. - // For detailed explanations please see the comments inside the function body. - EncodeResult EncodeOptimized(const std::string& normalized) const { - // An optimized Viterbi algorithm for unigram language models. Benchmarking - // results show that it generates almost identical outputs and achieves 2.1x - // speedup on average for 102 languages compared to the original - // implementation. 
It's based on the following three ideas: - // - // 1. Because it uses the *unigram* model: - // best_score(x1, x2, �? xt) = best_score(x1, x2, �? x{t-1}) + score(xt) - // Deciding the best path (and score) can be decoupled into two isolated - // terms: (a) the best path ended before the last token `best_score(x1, x2, �? - // x{t-1})`, and (b) the last token and its `score(xt)`. The two terms are - // not related to each other at all. - // - // Therefore, we can compute once and store the *best_path ending at - // each character position*. In this way, when we know best_path_ends_at[M], - // we can reuse it to compute all the best_path_ends_at_[...] where the last - // token starts at the same character position M. - // - // This improves the time complexity from O(n*k*k) to O(n*k) because it - // eliminates the extra loop of recomputing the best path ending at the same - // position, where n is the input length and k is the maximum number of tokens - // that can be recognized starting at each position. - // - // 2. Again, because it uses the *unigram* model, we don’t need to actually - // store the lattice nodes. We still recognize all the tokens and lattice - // nodes from the input, but along identifying them, we use and discard them - // on the fly. There is no need to actually store them for best path Viterbi - // decoding. The only thing we need to store is the best_path ending at - // each character position. - // - // This improvement reduces the things needed to store in memory from O(n*k) - // to O(n), where n is the input length and k is the maximum number of tokens - // that can be recognized starting at each position. - // - // It also avoids the need of dynamic-size lattice node pool, because the - // number of things to store is fixed as n. - // - // 3. SentencePiece is designed to work with unicode, taking utf-8 encoding - // inputs. In the original implementation, the lattice positions are based on - // unicode positions. 
A mapping from unicode position to the utf-8 position is - // maintained to recover the utf-8 string piece. - // - // We found that it is sufficient and beneficial to directly work with utf-8 - // positions: - // - // Firstly, it saves the conversion and mapping between unicode positions and - // utf-8 positions. - // - // Secondly, it reduces the number of fields we need to maintain in the - // node/path structure. Specifically, there are 8 fields defined in - // `Lattice::Node` used by the original encoder, but here in the optimized - // encoder we only need to define 3 fields in `BestPathNode`. - - if (status() != OK || normalized.empty()) { - return {}; - } - // Represents the last node of the best path. - struct BestPathNode { - int id = -1; // The vocab id. (maybe -1 for UNK) - float best_path_score = - 0; // The total score of the best path ending at this node. - int starts_at = - -1; // The starting position (in utf-8) of this node. The entire best - // path can be constructed by backtracking along this link. - }; - const int size = static_cast(normalized.size()); - const float unk_score = min_score() - kUnkPenalty; - // The ends are exclusive. - std::vector best_path_ends_at(size + 1); - // Generate lattice on-the-fly (not stored) and update best_path_ends_at. - int starts_at = 0; - while (starts_at < size) { - std::size_t node_pos = 0; - std::size_t key_pos = starts_at; - const auto best_path_score_till_here = - best_path_ends_at[starts_at].best_path_score; - bool has_single_node = false; - const int mblen = - std::min(static_cast(OneCharLen(normalized.data() + starts_at)), - size - starts_at); - while (key_pos < size) { - const int ret = - trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1); - if (ret == -2) - break; - if (ret >= 0) { - if (IsUnusedInlined(ret)) - continue; - // Update the best path node. 
- auto& target_node = best_path_ends_at[key_pos]; - const auto length = (key_pos - starts_at); - // User defined symbol receives extra bonus to always be selected. - const auto score = IsUserDefinedInlined(ret) - ? (length * max_score_ - 0.1) - : GetScoreInlined(ret); - const auto candidate_best_path_score = - score + best_path_score_till_here; - if (target_node.starts_at == -1 || - candidate_best_path_score > target_node.best_path_score) { - target_node.best_path_score = static_cast(candidate_best_path_score); - target_node.starts_at = starts_at; - target_node.id = ret; - } - if (!has_single_node && length == mblen) { - has_single_node = true; - } - } - } - if (!has_single_node) { - auto& target_node = best_path_ends_at[starts_at + mblen]; - const auto candidate_best_path_score = - unk_score + best_path_score_till_here; - if (target_node.starts_at == -1 || - candidate_best_path_score > target_node.best_path_score) { - target_node.best_path_score = candidate_best_path_score; - target_node.starts_at = starts_at; - target_node.id = unk_id_; - } - } - // Move by one unicode character. - starts_at += mblen; - } - // Backtrack to identify the best path. 
- EncodeResult results; - int ends_at = size; - while (ends_at > 0) { - const auto& node = best_path_ends_at[ends_at]; - results.emplace_back( - normalized.substr(node.starts_at, ends_at - node.starts_at), node.id); - ends_at = node.starts_at; - } - std::reverse(results.begin(), results.end()); - return results; - } - -public: - explicit T5UniGramTokenizer(bool is_umt5 = false) { - if (is_umt5) { - InitializePieces(load_umt5_tokenizer_json()); - } else { - InitializePieces(load_t5_tokenizer_json()); - } - - min_score_ = FLT_MAX; - max_score_ = FLT_MIN; - - std::vector> pieces; - for (int i = 0; i < piece_score_pairs.size(); i++) { - const auto& sp = piece_score_pairs[i]; - - min_score_ = std::min(min_score_, sp.second); - max_score_ = std::max(max_score_, sp.second); - - pieces.emplace_back(sp.first, i); - } - - BuildTrie(&pieces); - } - ~T5UniGramTokenizer(){}; - - std::string Normalize(const std::string& input) const { - // Ref: https://github.com/huggingface/tokenizers/blob/1ff56c0c70b045f0cd82da1af9ac08cd4c7a6f9f/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py#L29 - // TODO: nmt-nfkc - std::string normalized = std::regex_replace(input, std::regex(" {2,}"), " "); - return normalized; - } - - std::vector Encode(const std::string& input, bool append_eos_if_not_present = true) const { - std::string normalized = Normalize(input); - normalized = pre_tokenizer.tokenize(normalized); - EncodeResult result = EncodeOptimized(normalized); - if (result.size() > 0 && append_eos_if_not_present) { - auto item = result[result.size() - 1]; - if (item.first != eos_token_) { - result.emplace_back(eos_token_, eos_id_); - } - } - std::vector tokens; - for (auto item : result) { - tokens.push_back(item.second); - } - return tokens; - } - - void pad_tokens(std::vector& tokens, - std::vector& weights, - std::vector* attention_mask, - size_t max_length = 0, - bool padding = false) { - if (max_length > 0 && padding) { - size_t orig_token_num = tokens.size() - 
1; - size_t n = static_cast(std::ceil(orig_token_num * 1.0 / (max_length - 1))); - if (n == 0) { - n = 1; - } - size_t length = max_length * n; - LOG_DEBUG("token length: %llu", length); - std::vector new_tokens; - std::vector new_weights; - std::vector new_attention_mask; - int token_idx = 0; - for (int i = 0; i < length; i++) { - if (token_idx >= orig_token_num) { - break; - } - if (attention_mask != nullptr) { - new_attention_mask.push_back(0.0); - } - if (i % max_length == max_length - 1) { - new_tokens.push_back(eos_id_); - new_weights.push_back(1.0); - } else { - new_tokens.push_back(tokens[token_idx]); - new_weights.push_back(weights[token_idx]); - token_idx++; - } - } - - new_tokens.push_back(eos_id_); - new_weights.push_back(1.0); - if (attention_mask != nullptr) { - new_attention_mask.push_back(0.0); - } - - tokens = new_tokens; - weights = new_weights; - if (attention_mask != nullptr) { - *attention_mask = new_attention_mask; - } - - if (padding) { - int pad_token_id = pad_id_; - tokens.insert(tokens.end(), length - tokens.size(), pad_token_id); - weights.insert(weights.end(), length - weights.size(), 1.0); - if (attention_mask != nullptr) { - // maybe keep some padding tokens unmasked? - attention_mask->insert(attention_mask->end(), length - attention_mask->size(), -HUGE_VALF); - } - } - } - } - - // Returns the minimum score in sentence pieces. - // min_score() - 10 is used for the cost of unknown sentence. - float min_score() const { return min_score_; } - - // Returns the maximum score in sentence pieces. - // max_score() is used for the cost of user defined symbols. 
- float max_score() const { return max_score_; } - - Status status() const { return status_; } -}; - -class T5LayerNorm : public UnaryBlock { -protected: - int64_t hidden_size; - float eps; - - void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { - enum ggml_type wtype = GGML_TYPE_F32; - params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); - } - -public: - T5LayerNorm(int64_t hidden_size, - float eps = 1e-06f) - : hidden_size(hidden_size), - eps(eps) {} - - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { - ggml_tensor* w = params["weight"]; - x = ggml_rms_norm(ctx->ggml_ctx, x, eps); - x = ggml_mul(ctx->ggml_ctx, x, w); - return x; - } -}; - -struct T5DenseActDense : public UnaryBlock { -public: - T5DenseActDense(int64_t model_dim, int64_t ff_dim) { - blocks["wi"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); - blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { - // x: [N, n_token, model_dim] - auto wi = std::dynamic_pointer_cast(blocks["wi"]); - auto wo = std::dynamic_pointer_cast(blocks["wo"]); - - x = wi->forward(ctx, x); - x = ggml_relu_inplace(ctx->ggml_ctx, x); - x = wo->forward(ctx, x); - return x; - } -}; - -struct T5DenseGatedActDense : public UnaryBlock { -public: - T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) { - blocks["wi_0"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); - blocks["wi_1"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); - float scale = 1.f / 32.f; - // The purpose of the scale here is to prevent NaN issues on some backends(CUDA, ...). 
- blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false, false, false, scale)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { - // x: [N, n_token, model_dim] - auto wi_0 = std::dynamic_pointer_cast(blocks["wi_0"]); - auto wi_1 = std::dynamic_pointer_cast(blocks["wi_1"]); - auto wo = std::dynamic_pointer_cast(blocks["wo"]); - - auto hidden_gelu = ggml_ext_gelu(ctx->ggml_ctx, wi_0->forward(ctx, x), true); - auto hidden_linear = wi_1->forward(ctx, x); - x = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear); - x = wo->forward(ctx, x); - return x; - } -}; - -struct T5LayerFF : public UnaryBlock { -public: - T5LayerFF(int64_t model_dim, int64_t ff_dim) { - blocks["DenseReluDense"] = std::shared_ptr(new T5DenseGatedActDense(model_dim, ff_dim)); - blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { - // x: [N, n_token, model_dim] - auto DenseReluDense = std::dynamic_pointer_cast(blocks["DenseReluDense"]); - auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); - - auto forwarded_states = layer_norm->forward(ctx, x); - forwarded_states = DenseReluDense->forward(ctx, forwarded_states); - x = ggml_add_inplace(ctx->ggml_ctx, forwarded_states, x); - return x; - } -}; - -class T5Attention : public GGMLBlock { -protected: - int64_t model_dim; - int64_t inner_dim; - int64_t num_heads; - bool using_relative_attention_bias; - int64_t relative_attention_num_buckets = 32; - int64_t relative_attention_max_distance = 128; - -public: - T5Attention(int64_t model_dim, - int64_t inner_dim, - int64_t num_heads, - bool using_relative_attention_bias = false) - : model_dim(model_dim), - inner_dim(inner_dim), - num_heads(num_heads), - using_relative_attention_bias(using_relative_attention_bias) { - blocks["q"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); - blocks["k"] = std::shared_ptr(new Linear(model_dim, inner_dim, 
false)); - blocks["v"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); - blocks["o"] = std::shared_ptr(new Linear(inner_dim, model_dim, false)); - if (using_relative_attention_bias) { - blocks["relative_attention_bias"] = std::shared_ptr(new Embedding(relative_attention_num_buckets, num_heads)); - } - } - - ggml_tensor* compute_bias(GGMLRunnerContext* ctx, - ggml_tensor* relative_position_bucket) { - auto relative_attention_bias = std::dynamic_pointer_cast(blocks["relative_attention_bias"]); - - auto values = relative_attention_bias->forward(ctx, relative_position_bucket); // shape (query_length, key_length, num_heads) - values = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, values, 2, 0, 1, 3)); // shape (1, num_heads, query_length, key_length) - return values; - } - - // x: [N, n_token, model_dim] - std::pair forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - auto q_proj = std::dynamic_pointer_cast(blocks["q"]); - auto k_proj = std::dynamic_pointer_cast(blocks["k"]); - auto v_proj = std::dynamic_pointer_cast(blocks["v"]); - auto out_proj = std::dynamic_pointer_cast(blocks["o"]); - - int64_t n_head = num_heads; - int64_t d_head = inner_dim / n_head; - - auto q = q_proj->forward(ctx, x); - auto k = k_proj->forward(ctx, x); - auto v = v_proj->forward(ctx, x); - - if (using_relative_attention_bias && relative_position_bucket != nullptr) { - past_bias = compute_bias(ctx, relative_position_bucket); - } - if (past_bias != nullptr) { - if (mask != nullptr) { - mask = ggml_repeat(ctx->ggml_ctx, mask, past_bias); - mask = ggml_add(ctx->ggml_ctx, mask, past_bias); - } else { - mask = past_bias; - } - } - - k = ggml_ext_scale(ctx->ggml_ctx, k, ::sqrtf(static_cast(d_head)), true); - - x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask); // [N, n_token, d_head * n_head] - - x = out_proj->forward(ctx, x); // 
[N, n_token, model_dim] - return {x, past_bias}; - } -}; - -struct T5LayerSelfAttention : public GGMLBlock { -public: - T5LayerSelfAttention(int64_t model_dim, - int64_t inner_dim, - int64_t ff_dim, - int64_t num_heads, - bool using_relative_attention_bias) { - blocks["SelfAttention"] = std::shared_ptr(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias)); - blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); - } - - std::pair forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - // x: [N, n_token, model_dim] - auto SelfAttention = std::dynamic_pointer_cast(blocks["SelfAttention"]); - auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); - - auto normed_hidden_state = layer_norm->forward(ctx, x); - auto ret = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket); - auto output = ret.first; - past_bias = ret.second; - - x = ggml_add_inplace(ctx->ggml_ctx, output, x); - return {x, past_bias}; - } -}; - -struct T5Block : public GGMLBlock { -public: - T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) { - blocks["layer.0"] = std::shared_ptr(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias)); - blocks["layer.1"] = std::shared_ptr(new T5LayerFF(model_dim, ff_dim)); - } - - std::pair forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - // x: [N, n_token, model_dim] - auto layer_0 = std::dynamic_pointer_cast(blocks["layer.0"]); - auto layer_1 = std::dynamic_pointer_cast(blocks["layer.1"]); - - auto ret = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket); - x = ret.first; - past_bias = ret.second; - x = 
layer_1->forward(ctx, x); - return {x, past_bias}; - } -}; - -struct T5Stack : public GGMLBlock { - int64_t num_layers; - -public: - T5Stack(int64_t num_layers, - int64_t model_dim, - int64_t inner_dim, - int64_t ff_dim, - int64_t num_heads, - bool relative_attention = true) - : num_layers(num_layers) { - for (int i = 0; i < num_layers; i++) { - blocks["block." + std::to_string(i)] = std::shared_ptr(new T5Block(model_dim, inner_dim, ff_dim, num_heads, (!relative_attention || i == 0))); - } - - blocks["final_layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* attention_mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - // x: [N, n_token, model_dim] - for (int i = 0; i < num_layers; i++) { - auto block = std::dynamic_pointer_cast(blocks["block." + std::to_string(i)]); - - auto ret = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); - x = ret.first; - past_bias = ret.second; - } - - auto final_layer_norm = std::dynamic_pointer_cast(blocks["final_layer_norm"]); - - x = final_layer_norm->forward(ctx, x); - return x; - } -}; - -struct T5Params { - int64_t num_layers = 24; - int64_t model_dim = 4096; - int64_t ff_dim = 10240; - int64_t num_heads = 64; - int64_t vocab_size = 32128; - bool relative_attention = true; -}; - -struct T5 : public GGMLBlock { - T5Params params; - -public: - T5() {} - T5(T5Params params) - : params(params) { - blocks["encoder"] = std::shared_ptr(new T5Stack(params.num_layers, - params.model_dim, - params.model_dim, - params.ff_dim, - params.num_heads, - params.relative_attention)); - blocks["shared"] = std::shared_ptr(new Embedding(params.vocab_size, - params.model_dim)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, - ggml_tensor* input_ids, - ggml_tensor* past_bias = nullptr, - ggml_tensor* attention_mask = nullptr, - ggml_tensor* relative_position_bucket = 
nullptr) { - // input_ids: [N, n_token] - - auto shared = std::dynamic_pointer_cast(blocks["shared"]); - auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); - - auto x = shared->forward(ctx, input_ids); - x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); - return x; - } -}; - -struct T5Runner : public GGMLRunner { - T5Params params; - T5 model; - std::vector relative_position_bucket_vec; - - T5Runner(ggml_backend_t backend, - bool offload_params_to_cpu, - const String2TensorStorage& tensor_storage_map, - const std::string prefix, - bool is_umt5 = false) - : GGMLRunner(backend, offload_params_to_cpu) { - if (is_umt5) { - params.vocab_size = 256384; - params.relative_attention = false; - } - model = T5(params); - model.init(params_ctx, tensor_storage_map, prefix); - } - - std::string get_desc() override { - return "t5"; - } - - void get_param_tensors(std::map& tensors, const std::string prefix) { - model.get_param_tensors(tensors, prefix); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, - ggml_tensor* input_ids, - ggml_tensor* relative_position_bucket, - ggml_tensor* attention_mask = nullptr) { - size_t N = input_ids->ne[1]; - size_t n_token = input_ids->ne[0]; - - auto hidden_states = model.forward(ctx, input_ids, nullptr, attention_mask, relative_position_bucket); // [N, n_token, model_dim] - return hidden_states; - } - - ggml_cgraph* build_graph(ggml_tensor* input_ids, - ggml_tensor* attention_mask = nullptr) { - ggml_cgraph* gf = ggml_new_graph(compute_ctx); - - input_ids = to_backend(input_ids); - attention_mask = to_backend(attention_mask); - - relative_position_bucket_vec = compute_relative_position_bucket(static_cast(input_ids->ne[0]), static_cast(input_ids->ne[0])); - - // for (int i = 0; i < relative_position_bucket_vec.size(); i++) { - // if (i % 77 == 0) { - // printf("\n"); - // } - // printf("%d ", relative_position_bucket_vec[i]); - // } - - auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx, - 
GGML_TYPE_I32, - input_ids->ne[0], - input_ids->ne[0]); - set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data()); - - auto runner_ctx = get_context(); - ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask); - - ggml_build_forward_expand(gf, hidden_states); - - return gf; - } - - bool compute(const int n_threads, - ggml_tensor* input_ids, - ggml_tensor* attention_mask, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { - auto get_graph = [&]() -> ggml_cgraph* { - return build_graph(input_ids, attention_mask); - }; - return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); - } - - static std::vector _relative_position_bucket(const std::vector& relative_position, - bool bidirectional = true, - int num_buckets = 32, - int max_distance = 128) { - std::vector relative_buckets(relative_position.size(), 0); - std::vector abs_relative_position = relative_position; - - if (bidirectional) { - num_buckets = num_buckets / 2; - for (size_t i = 0; i < relative_position.size(); ++i) { - if (relative_position[i] > 0) { - relative_buckets[i] += num_buckets; - } - abs_relative_position[i] = std::abs(relative_position[i]); - } - } else { - for (size_t i = 0; i < relative_position.size(); ++i) { - abs_relative_position[i] = std::max(-relative_position[i], 0); - } - } - - int max_exact = num_buckets / 2; - std::vector relative_position_if_large(relative_position.size(), 0); - - for (size_t i = 0; i < relative_position.size(); ++i) { - if (abs_relative_position[i] < max_exact) { - relative_buckets[i] += abs_relative_position[i]; - } else { - float log_pos = std::log(static_cast(abs_relative_position[i]) / max_exact); - float log_base = std::log(static_cast(max_distance) / max_exact); - relative_position_if_large[i] = max_exact + static_cast((log_pos / log_base) * (num_buckets - max_exact)); - relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets 
- 1); - relative_buckets[i] += relative_position_if_large[i]; - } - } - - return relative_buckets; - } - - std::vector compute_relative_position_bucket(int query_length, - int key_length) { - std::vector context_position(query_length); - std::vector memory_position(key_length); - - for (int i = 0; i < query_length; ++i) { - context_position[i] = i; - } - for (int i = 0; i < key_length; ++i) { - memory_position[i] = i; - } - - std::vector> relative_position(query_length, std::vector(key_length, 0)); - for (int i = 0; i < query_length; ++i) { - for (int j = 0; j < key_length; ++j) { - relative_position[i][j] = memory_position[j] - context_position[i]; - } - } - - std::vector relative_position_bucket; - for (int i = 0; i < query_length; ++i) { - std::vector result = _relative_position_bucket(relative_position[i], true); - relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end()); - } - - return relative_position_bucket; - } -}; - -struct T5Embedder { - T5UniGramTokenizer tokenizer; - T5Runner model; - - T5Embedder(ggml_backend_t backend, - bool offload_params_to_cpu, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "", - bool is_umt5 = false) - : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) { - } - - void get_param_tensors(std::map& tensors, const std::string prefix) { - model.get_param_tensors(tensors, prefix); - } - - void alloc_params_buffer() { - model.alloc_params_buffer(); - } - - std::tuple, std::vector, std::vector> tokenize(std::string text, - size_t max_length = 0, - bool padding = false) { - auto parsed_attention = parse_prompt_attention(text); - - { - std::stringstream ss; - ss << "["; - for (const auto& item : parsed_attention) { - ss << "['" << item.first << "', " << item.second << "], "; - } - ss << "]"; - LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); - } - - std::vector tokens; - std::vector weights; - for (const 
auto& item : parsed_attention) { - const std::string& curr_text = item.first; - float curr_weight = item.second; - std::vector curr_tokens = tokenizer.Encode(curr_text, false); - tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); - weights.insert(weights.end(), curr_tokens.size(), curr_weight); - } - - int EOS_TOKEN_ID = 1; - tokens.push_back(EOS_TOKEN_ID); - weights.push_back(1.0); - - std::vector attention_mask; - - tokenizer.pad_tokens(tokens, weights, &attention_mask, max_length, padding); - - // for (int i = 0; i < tokens.size(); i++) { - // std::cout << tokens[i] << ":" << weights[i] << ", "; - // } - // std::cout << std::endl; - - return {tokens, weights, attention_mask}; - } - - void test() { - ggml_init_params params; - params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB - params.mem_buffer = nullptr; - params.no_alloc = false; - - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); - - { - std::string text("a lovely cat"); - // std::string text("一只可爱的�?); // umt5 chinease test - auto tokens_and_weights = tokenize(text, 512, true); - std::vector& tokens = std::get<0>(tokens_and_weights); - std::vector& weights = std::get<1>(tokens_and_weights); - std::vector& masks = std::get<2>(tokens_and_weights); - for (auto token : tokens) { - printf("%d ", token); - } - printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - auto attention_mask = vector_to_ggml_tensor(work_ctx, masks); - ggml_tensor* out = nullptr; - - int64_t t0 = ggml_time_ms(); - model.compute(8, input_ids, attention_mask, &out, work_ctx); - int64_t t1 = ggml_time_ms(); - - print_ggml_tensor(out); - LOG_DEBUG("t5 test done in %lldms", t1 - t0); - } - } - - static void load_from_file_and_test(const std::string& file_path) { - // cpu f16: pass - // cpu f32: pass - // cuda f16: pass - // cuda f32: pass - // cuda q8_0: pass - // ggml_backend_t backend = ggml_backend_cuda_init(0); - ggml_backend_t backend = 
ggml_backend_cpu_init(); - ggml_type model_data_type = GGML_TYPE_F16; - - ModelLoader model_loader; - if (!model_loader.init_from_file_and_convert_name(file_path)) { - LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); - return; - } - - auto& tensor_storage_map = model_loader.get_tensor_storage_map(); - for (auto& [name, tensor_storage] : tensor_storage_map) { - if (ends_with(name, "weight")) { - tensor_storage.expected_type = model_data_type; - } - } - - std::shared_ptr t5 = std::make_shared(backend, false, tensor_storage_map, "", true); - - t5->alloc_params_buffer(); - std::map tensors; - t5->get_param_tensors(tensors, ""); - - bool success = model_loader.load_tensors(tensors); - - if (!success) { - LOG_ERROR("load tensors from model loader failed"); - return; - } - - LOG_INFO("t5 model loaded"); - t5->test(); - } -}; - -#endif // __T5_HPP__ \ No newline at end of file +#ifndef __T5_HPP__ +#define __T5_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "darts.h" +#include "ggml_extend.hpp" +#include "json.hpp" +#include "model.h" +#include "vocab/vocab.h" + +// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h +// and https://github.com/google/sentencepiece/blob/master/src/unigram_model.h. +// Original License: https://github.com/google/sentencepiece/blob/master/LICENSE +// +// Since tokenization is not the bottleneck in SD, performance was not a major consideration +// during the migration. 
+class MetaspacePreTokenizer { +private: + std::string replacement; + bool add_prefix_space; + +public: + MetaspacePreTokenizer(const std::string replacement = " ", bool add_prefix_space = true) + : replacement(replacement), add_prefix_space(add_prefix_space) {} + + std::string tokenize(const std::string& input) const { + std::string tokens; + std::stringstream ss(input); + + if (add_prefix_space) { + tokens += replacement; + } + + std::string token; + bool firstToken = true; + while (std::getline(ss, token, ' ')) { + if (!firstToken) + tokens += replacement + token; + else + tokens += token; + + firstToken = false; + } + + return tokens; + } +}; + +using EncodeResult = std::vector>; +class T5UniGramTokenizer { +public: + enum Status { + OK, + NO_PIECES_LOADED, + NO_ENTRY_FOUND, + BUILD_DOUBLE_ARRAY_FAILED, + PIECE_ALREADY_DEFINED, + INVLIAD_JSON + }; + +protected: + MetaspacePreTokenizer pre_tokenizer; + + // all pairs + std::vector> piece_score_pairs; + + float min_score_ = 0.0; + float max_score_ = 0.0; + std::unique_ptr trie_; + + // Maximum size of the return value of Trie, which corresponds + // to the maximum size of shared common prefix in the sentence pieces. + int trie_results_size_; + // unknown id. + int unk_id_ = 2; + std::string eos_token_ = ""; + int eos_id_ = 1; + int pad_id_ = 0; + // status. 
+ Status status_ = OK; + + float kUnkPenalty = 10.0; + + std::string replacement; + bool add_prefix_space = true; + + void InitializePieces(const std::string& json_str) { + nlohmann::json data; + + try { + data = nlohmann::json::parse(json_str); + } catch (const nlohmann::json::parse_error&) { + status_ = INVLIAD_JSON; + return; + } + if (!data.contains("model")) { + status_ = INVLIAD_JSON; + return; + } + nlohmann::json model = data["model"]; + if (!model.contains("vocab")) { + status_ = INVLIAD_JSON; + return; + } + if (model.contains("unk_id")) { + unk_id_ = model["unk_id"]; + } + + replacement = data["pre_tokenizer"]["replacement"]; + add_prefix_space = data["pre_tokenizer"]["add_prefix_space"]; + + pre_tokenizer = MetaspacePreTokenizer(replacement, add_prefix_space); + + for (const auto& item : model["vocab"]) { + if (item.size() != 2 || !item[0].is_string() || !item[1].is_number_float()) { + status_ = INVLIAD_JSON; + return; + } + std::string piece = item[0]; + if (piece.empty()) { + piece = ""; + } + float score = item[1]; + piece_score_pairs.emplace_back(piece, score); + } + } + + // Builds a Trie index. + void BuildTrie(std::vector>* pieces) { + if (status_ != OK) + return; + + if (pieces->empty()) { + status_ = NO_PIECES_LOADED; + return; + } + + // sort by sentencepiece since DoubleArray::build() + // only accepts sorted strings. + sort(pieces->begin(), pieces->end()); + + // Makes key/value set for DoubleArrayTrie. + std::vector key(pieces->size()); + std::vector value(pieces->size()); + for (size_t i = 0; i < pieces->size(); ++i) { + // LOG_DEBUG("%s %d", (*pieces)[i].first.c_str(), (*pieces)[i].second); + key[i] = (*pieces)[i].first.data(); // sorted piece. 
+ value[i] = (*pieces)[i].second; // vocab_id + } + + trie_ = std::unique_ptr(new Darts::DoubleArray()); + if (trie_->build(key.size(), const_cast(&key[0]), nullptr, + &value[0]) != 0) { + status_ = BUILD_DOUBLE_ARRAY_FAILED; + return; + } + + // Computes the maximum number of shared prefixes in the trie. + const int kMaxTrieResultsSize = 1024; + std::vector results( + kMaxTrieResultsSize); + trie_results_size_ = 0; + for (const auto& p : *pieces) { + const size_t num_nodes = trie_->commonPrefixSearch( + p.first.data(), results.data(), results.size(), p.first.size()); + trie_results_size_ = std::max(trie_results_size_, static_cast(num_nodes)); + } + + if (trie_results_size_ == 0) + status_ = NO_ENTRY_FOUND; + } + + // Non-virtual (inlined) implementation for faster execution. + inline float GetScoreInlined(int id) const { + return piece_score_pairs[id].second; + } + + inline bool IsUnusedInlined(int id) const { + return false; // TODO + } + + inline bool IsUserDefinedInlined(int id) const { + return false; // TODO + } + + inline size_t OneCharLen(const char* src) const { + return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4]; + } + + // The optimized Viterbi encode. + // Main differences from the original function: + // 1. Memorizes the best path at each postion so far, + // 2. No need to store the Lattice nodes, + // 3. Works in utf-8 directly, + // 4. Defines a new struct with fewer fields than Lattice, + // 5. Does not depend on `class Lattice` nor call `SetSentence()`, + // `PopulateNodes()`, or `Viterbi()`. It does everything in one function. + // For detailed explanations please see the comments inside the function body. + EncodeResult EncodeOptimized(const std::string& normalized) const { + // An optimized Viterbi algorithm for unigram language models. Benchmarking + // results show that it generates almost identical outputs and achieves 2.1x + // speedup on average for 102 languages compared to the original + // implementation. 
It's based on the following three ideas: + // + // 1. Because it uses the *unigram* model: + // best_score(x1, x2, ... xt) = best_score(x1, x2, ... x{t-1}) + score(xt) + // Deciding the best path (and score) can be decoupled into two isolated + // terms: (a) the best path ended before the last token `best_score(x1, x2, ...)` + // x{t-1})`, and (b) the last token and its `score(xt)`. The two terms are + // not related to each other at all. + // + // Therefore, we can compute once and store the *best_path ending at + // each character position*. In this way, when we know best_path_ends_at[M], + // we can reuse it to compute all the best_path_ends_at_[...] where the last + // token starts at the same character position M. + // + // This improves the time complexity from O(n*k*k) to O(n*k) because it + // eliminates the extra loop of recomputing the best path ending at the same + // position, where n is the input length and k is the maximum number of tokens + // that can be recognized starting at each position. + // + // 2. Again, because it uses the *unigram* model, we don't need to actually + // store the lattice nodes. We still recognize all the tokens and lattice + // nodes from the input, but along identifying them, we use and discard them + // on the fly. There is no need to actually store them for best path Viterbi + // decoding. The only thing we need to store is the best_path ending at + // each character position. + // + // This improvement reduces the things needed to store in memory from O(n*k) + // to O(n), where n is the input length and k is the maximum number of tokens + // that can be recognized starting at each position. + // + // It also avoids the need of dynamic-size lattice node pool, because the + // number of things to store is fixed as n. + // + // 3. SentencePiece is designed to work with unicode, taking utf-8 encoding + // inputs. In the original implementation, the lattice positions are based on + // unicode positions. 
A mapping from unicode position to the utf-8 position is + // maintained to recover the utf-8 string piece. + // + // We found that it is sufficient and beneficial to directly work with utf-8 + // positions: + // + // Firstly, it saves the conversion and mapping between unicode positions and + // utf-8 positions. + // + // Secondly, it reduces the number of fields we need to maintain in the + // node/path structure. Specifically, there are 8 fields defined in + // `Lattice::Node` used by the original encoder, but here in the optimized + // encoder we only need to define 3 fields in `BestPathNode`. + + if (status() != OK || normalized.empty()) { + return {}; + } + // Represents the last node of the best path. + struct BestPathNode { + int id = -1; // The vocab id. (maybe -1 for UNK) + float best_path_score = + 0; // The total score of the best path ending at this node. + int starts_at = + -1; // The starting position (in utf-8) of this node. The entire best + // path can be constructed by backtracking along this link. + }; + const int size = static_cast(normalized.size()); + const float unk_score = min_score() - kUnkPenalty; + // The ends are exclusive. + std::vector best_path_ends_at(size + 1); + // Generate lattice on-the-fly (not stored) and update best_path_ends_at. + int starts_at = 0; + while (starts_at < size) { + std::size_t node_pos = 0; + std::size_t key_pos = starts_at; + const auto best_path_score_till_here = + best_path_ends_at[starts_at].best_path_score; + bool has_single_node = false; + const int mblen = + std::min(static_cast(OneCharLen(normalized.data() + starts_at)), + size - starts_at); + while (key_pos < size) { + const int ret = + trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1); + if (ret == -2) + break; + if (ret >= 0) { + if (IsUnusedInlined(ret)) + continue; + // Update the best path node. 
+ auto& target_node = best_path_ends_at[key_pos]; + const auto length = (key_pos - starts_at); + // User defined symbol receives extra bonus to always be selected. + const auto score = IsUserDefinedInlined(ret) + ? (length * max_score_ - 0.1) + : GetScoreInlined(ret); + const auto candidate_best_path_score = + score + best_path_score_till_here; + if (target_node.starts_at == -1 || + candidate_best_path_score > target_node.best_path_score) { + target_node.best_path_score = static_cast(candidate_best_path_score); + target_node.starts_at = starts_at; + target_node.id = ret; + } + if (!has_single_node && length == mblen) { + has_single_node = true; + } + } + } + if (!has_single_node) { + auto& target_node = best_path_ends_at[starts_at + mblen]; + const auto candidate_best_path_score = + unk_score + best_path_score_till_here; + if (target_node.starts_at == -1 || + candidate_best_path_score > target_node.best_path_score) { + target_node.best_path_score = candidate_best_path_score; + target_node.starts_at = starts_at; + target_node.id = unk_id_; + } + } + // Move by one unicode character. + starts_at += mblen; + } + // Backtrack to identify the best path. 
+ EncodeResult results; + int ends_at = size; + while (ends_at > 0) { + const auto& node = best_path_ends_at[ends_at]; + results.emplace_back( + normalized.substr(node.starts_at, ends_at - node.starts_at), node.id); + ends_at = node.starts_at; + } + std::reverse(results.begin(), results.end()); + return results; + } + +public: + explicit T5UniGramTokenizer(bool is_umt5 = false) { + if (is_umt5) { + InitializePieces(load_umt5_tokenizer_json()); + } else { + InitializePieces(load_t5_tokenizer_json()); + } + + min_score_ = FLT_MAX; + max_score_ = FLT_MIN; + + std::vector> pieces; + for (int i = 0; i < piece_score_pairs.size(); i++) { + const auto& sp = piece_score_pairs[i]; + + min_score_ = std::min(min_score_, sp.second); + max_score_ = std::max(max_score_, sp.second); + + pieces.emplace_back(sp.first, i); + } + + BuildTrie(&pieces); + } + ~T5UniGramTokenizer(){}; + + std::string Normalize(const std::string& input) const { + // Ref: https://github.com/huggingface/tokenizers/blob/1ff56c0c70b045f0cd82da1af9ac08cd4c7a6f9f/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py#L29 + // TODO: nmt-nfkc + std::string normalized = std::regex_replace(input, std::regex(" {2,}"), " "); + return normalized; + } + + std::vector Encode(const std::string& input, bool append_eos_if_not_present = true) const { + std::string normalized = Normalize(input); + normalized = pre_tokenizer.tokenize(normalized); + EncodeResult result = EncodeOptimized(normalized); + if (result.size() > 0 && append_eos_if_not_present) { + auto item = result[result.size() - 1]; + if (item.first != eos_token_) { + result.emplace_back(eos_token_, eos_id_); + } + } + std::vector tokens; + for (auto item : result) { + tokens.push_back(item.second); + } + return tokens; + } + + void pad_tokens(std::vector& tokens, + std::vector& weights, + std::vector* attention_mask, + size_t max_length = 0, + bool padding = false) { + if (max_length > 0 && padding) { + size_t orig_token_num = tokens.size() - 
1; + size_t n = static_cast(std::ceil(orig_token_num * 1.0 / (max_length - 1))); + if (n == 0) { + n = 1; + } + size_t length = max_length * n; + LOG_DEBUG("token length: %llu", length); + std::vector new_tokens; + std::vector new_weights; + std::vector new_attention_mask; + int token_idx = 0; + for (int i = 0; i < length; i++) { + if (token_idx >= orig_token_num) { + break; + } + if (attention_mask != nullptr) { + new_attention_mask.push_back(0.0); + } + if (i % max_length == max_length - 1) { + new_tokens.push_back(eos_id_); + new_weights.push_back(1.0); + } else { + new_tokens.push_back(tokens[token_idx]); + new_weights.push_back(weights[token_idx]); + token_idx++; + } + } + + new_tokens.push_back(eos_id_); + new_weights.push_back(1.0); + if (attention_mask != nullptr) { + new_attention_mask.push_back(0.0); + } + + tokens = new_tokens; + weights = new_weights; + if (attention_mask != nullptr) { + *attention_mask = new_attention_mask; + } + + if (padding) { + int pad_token_id = pad_id_; + tokens.insert(tokens.end(), length - tokens.size(), pad_token_id); + weights.insert(weights.end(), length - weights.size(), 1.0); + if (attention_mask != nullptr) { + // maybe keep some padding tokens unmasked? + attention_mask->insert(attention_mask->end(), length - attention_mask->size(), -HUGE_VALF); + } + } + } + } + + // Returns the minimum score in sentence pieces. + // min_score() - 10 is used for the cost of unknown sentence. + float min_score() const { return min_score_; } + + // Returns the maximum score in sentence pieces. + // max_score() is used for the cost of user defined symbols. 
+ float max_score() const { return max_score_; } + + Status status() const { return status_; } +}; + +class T5LayerNorm : public UnaryBlock { +protected: + int64_t hidden_size; + float eps; + + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + enum ggml_type wtype = GGML_TYPE_F32; + params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); + } + +public: + T5LayerNorm(int64_t hidden_size, + float eps = 1e-06f) + : hidden_size(hidden_size), + eps(eps) {} + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + ggml_tensor* w = params["weight"]; + x = ggml_rms_norm(ctx->ggml_ctx, x, eps); + x = ggml_mul(ctx->ggml_ctx, x, w); + return x; + } +}; + +struct T5DenseActDense : public UnaryBlock { +public: + T5DenseActDense(int64_t model_dim, int64_t ff_dim) { + blocks["wi"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); + blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + // x: [N, n_token, model_dim] + auto wi = std::dynamic_pointer_cast(blocks["wi"]); + auto wo = std::dynamic_pointer_cast(blocks["wo"]); + + x = wi->forward(ctx, x); + x = ggml_relu_inplace(ctx->ggml_ctx, x); + x = wo->forward(ctx, x); + return x; + } +}; + +struct T5DenseGatedActDense : public UnaryBlock { +public: + T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) { + blocks["wi_0"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); + blocks["wi_1"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); + float scale = 1.f / 32.f; + // The purpose of the scale here is to prevent NaN issues on some backends(CUDA, ...). 
+ blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false, false, false, scale)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + // x: [N, n_token, model_dim] + auto wi_0 = std::dynamic_pointer_cast(blocks["wi_0"]); + auto wi_1 = std::dynamic_pointer_cast(blocks["wi_1"]); + auto wo = std::dynamic_pointer_cast(blocks["wo"]); + + auto hidden_gelu = ggml_ext_gelu(ctx->ggml_ctx, wi_0->forward(ctx, x), true); + auto hidden_linear = wi_1->forward(ctx, x); + x = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear); + x = wo->forward(ctx, x); + return x; + } +}; + +struct T5LayerFF : public UnaryBlock { +public: + T5LayerFF(int64_t model_dim, int64_t ff_dim) { + blocks["DenseReluDense"] = std::shared_ptr(new T5DenseGatedActDense(model_dim, ff_dim)); + blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + // x: [N, n_token, model_dim] + auto DenseReluDense = std::dynamic_pointer_cast(blocks["DenseReluDense"]); + auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); + + auto forwarded_states = layer_norm->forward(ctx, x); + forwarded_states = DenseReluDense->forward(ctx, forwarded_states); + x = ggml_add_inplace(ctx->ggml_ctx, forwarded_states, x); + return x; + } +}; + +class T5Attention : public GGMLBlock { +protected: + int64_t model_dim; + int64_t inner_dim; + int64_t num_heads; + bool using_relative_attention_bias; + int64_t relative_attention_num_buckets = 32; + int64_t relative_attention_max_distance = 128; + +public: + T5Attention(int64_t model_dim, + int64_t inner_dim, + int64_t num_heads, + bool using_relative_attention_bias = false) + : model_dim(model_dim), + inner_dim(inner_dim), + num_heads(num_heads), + using_relative_attention_bias(using_relative_attention_bias) { + blocks["q"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); + blocks["k"] = std::shared_ptr(new Linear(model_dim, inner_dim, 
false)); + blocks["v"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); + blocks["o"] = std::shared_ptr(new Linear(inner_dim, model_dim, false)); + if (using_relative_attention_bias) { + blocks["relative_attention_bias"] = std::shared_ptr(new Embedding(relative_attention_num_buckets, num_heads)); + } + } + + ggml_tensor* compute_bias(GGMLRunnerContext* ctx, + ggml_tensor* relative_position_bucket) { + auto relative_attention_bias = std::dynamic_pointer_cast(blocks["relative_attention_bias"]); + + auto values = relative_attention_bias->forward(ctx, relative_position_bucket); // shape (query_length, key_length, num_heads) + values = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, values, 2, 0, 1, 3)); // shape (1, num_heads, query_length, key_length) + return values; + } + + // x: [N, n_token, model_dim] + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + auto q_proj = std::dynamic_pointer_cast(blocks["q"]); + auto k_proj = std::dynamic_pointer_cast(blocks["k"]); + auto v_proj = std::dynamic_pointer_cast(blocks["v"]); + auto out_proj = std::dynamic_pointer_cast(blocks["o"]); + + int64_t n_head = num_heads; + int64_t d_head = inner_dim / n_head; + + auto q = q_proj->forward(ctx, x); + auto k = k_proj->forward(ctx, x); + auto v = v_proj->forward(ctx, x); + + if (using_relative_attention_bias && relative_position_bucket != nullptr) { + past_bias = compute_bias(ctx, relative_position_bucket); + } + if (past_bias != nullptr) { + if (mask != nullptr) { + mask = ggml_repeat(ctx->ggml_ctx, mask, past_bias); + mask = ggml_add(ctx->ggml_ctx, mask, past_bias); + } else { + mask = past_bias; + } + } + + k = ggml_ext_scale(ctx->ggml_ctx, k, ::sqrtf(static_cast(d_head)), true); + + x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask); // [N, n_token, d_head * n_head] + + x = out_proj->forward(ctx, x); // 
[N, n_token, model_dim] + return {x, past_bias}; + } +}; + +struct T5LayerSelfAttention : public GGMLBlock { +public: + T5LayerSelfAttention(int64_t model_dim, + int64_t inner_dim, + int64_t ff_dim, + int64_t num_heads, + bool using_relative_attention_bias) { + blocks["SelfAttention"] = std::shared_ptr(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias)); + blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); + } + + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + // x: [N, n_token, model_dim] + auto SelfAttention = std::dynamic_pointer_cast(blocks["SelfAttention"]); + auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); + + auto normed_hidden_state = layer_norm->forward(ctx, x); + auto ret = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket); + auto output = ret.first; + past_bias = ret.second; + + x = ggml_add_inplace(ctx->ggml_ctx, output, x); + return {x, past_bias}; + } +}; + +struct T5Block : public GGMLBlock { +public: + T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) { + blocks["layer.0"] = std::shared_ptr(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias)); + blocks["layer.1"] = std::shared_ptr(new T5LayerFF(model_dim, ff_dim)); + } + + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + // x: [N, n_token, model_dim] + auto layer_0 = std::dynamic_pointer_cast(blocks["layer.0"]); + auto layer_1 = std::dynamic_pointer_cast(blocks["layer.1"]); + + auto ret = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket); + x = ret.first; + past_bias = ret.second; + x = 
layer_1->forward(ctx, x); + return {x, past_bias}; + } +}; + +struct T5Stack : public GGMLBlock { + int64_t num_layers; + +public: + T5Stack(int64_t num_layers, + int64_t model_dim, + int64_t inner_dim, + int64_t ff_dim, + int64_t num_heads, + bool relative_attention = true) + : num_layers(num_layers) { + for (int i = 0; i < num_layers; i++) { + blocks["block." + std::to_string(i)] = std::shared_ptr(new T5Block(model_dim, inner_dim, ff_dim, num_heads, (!relative_attention || i == 0))); + } + + blocks["final_layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* attention_mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + // x: [N, n_token, model_dim] + for (int i = 0; i < num_layers; i++) { + auto block = std::dynamic_pointer_cast(blocks["block." + std::to_string(i)]); + + auto ret = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); + x = ret.first; + past_bias = ret.second; + } + + auto final_layer_norm = std::dynamic_pointer_cast(blocks["final_layer_norm"]); + + x = final_layer_norm->forward(ctx, x); + return x; + } +}; + +struct T5Params { + int64_t num_layers = 24; + int64_t model_dim = 4096; + int64_t ff_dim = 10240; + int64_t num_heads = 64; + int64_t vocab_size = 32128; + bool relative_attention = true; +}; + +struct T5 : public GGMLBlock { + T5Params params; + +public: + T5() {} + T5(T5Params params) + : params(params) { + blocks["encoder"] = std::shared_ptr(new T5Stack(params.num_layers, + params.model_dim, + params.model_dim, + params.ff_dim, + params.num_heads, + params.relative_attention)); + blocks["shared"] = std::shared_ptr(new Embedding(params.vocab_size, + params.model_dim)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* past_bias = nullptr, + ggml_tensor* attention_mask = nullptr, + ggml_tensor* relative_position_bucket = 
nullptr) { + // input_ids: [N, n_token] + + auto shared = std::dynamic_pointer_cast(blocks["shared"]); + auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); + + auto x = shared->forward(ctx, input_ids); + x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); + return x; + } +}; + +struct T5Runner : public GGMLRunner { + T5Params params; + T5 model; + std::vector relative_position_bucket_vec; + + T5Runner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map, + const std::string prefix, + bool is_umt5 = false) + : GGMLRunner(backend, offload_params_to_cpu) { + if (is_umt5) { + params.vocab_size = 256384; + params.relative_attention = false; + } + model = T5(params); + model.init(params_ctx, tensor_storage_map, prefix); + } + + std::string get_desc() override { + return "t5"; + } + + void get_param_tensors(std::map& tensors, const std::string prefix) { + model.get_param_tensors(tensors, prefix); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* relative_position_bucket, + ggml_tensor* attention_mask = nullptr) { + size_t N = input_ids->ne[1]; + size_t n_token = input_ids->ne[0]; + + auto hidden_states = model.forward(ctx, input_ids, nullptr, attention_mask, relative_position_bucket); // [N, n_token, model_dim] + return hidden_states; + } + + ggml_cgraph* build_graph(const sd::Tensor& input_ids_tensor, + const sd::Tensor& attention_mask_tensor = {}) { + ggml_cgraph* gf = ggml_new_graph(compute_ctx); + ggml_tensor* input_ids = make_input(input_ids_tensor); + ggml_tensor* attention_mask = attention_mask_tensor.empty() ? 
nullptr : make_input(attention_mask_tensor); + + relative_position_bucket_vec = compute_relative_position_bucket(static_cast(input_ids->ne[0]), static_cast(input_ids->ne[0])); + + // for (int i = 0; i < relative_position_bucket_vec.size(); i++) { + // if (i % 77 == 0) { + // printf("\n"); + // } + // printf("%d ", relative_position_bucket_vec[i]); + // } + + auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx, + GGML_TYPE_I32, + input_ids->ne[0], + input_ids->ne[0]); + set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data()); + + auto runner_ctx = get_context(); + ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask); + + ggml_build_forward_expand(gf, hidden_states); + + return gf; + } + + sd::Tensor compute(const int n_threads, + const sd::Tensor& input_ids, + const sd::Tensor& attention_mask) { + auto get_graph = [&]() -> ggml_cgraph* { + return build_graph(input_ids, attention_mask); + }; + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, true), 3); + } + + static std::vector _relative_position_bucket(const std::vector& relative_position, + bool bidirectional = true, + int num_buckets = 32, + int max_distance = 128) { + std::vector relative_buckets(relative_position.size(), 0); + std::vector abs_relative_position = relative_position; + + if (bidirectional) { + num_buckets = num_buckets / 2; + for (size_t i = 0; i < relative_position.size(); ++i) { + if (relative_position[i] > 0) { + relative_buckets[i] += num_buckets; + } + abs_relative_position[i] = std::abs(relative_position[i]); + } + } else { + for (size_t i = 0; i < relative_position.size(); ++i) { + abs_relative_position[i] = std::max(-relative_position[i], 0); + } + } + + int max_exact = num_buckets / 2; + std::vector relative_position_if_large(relative_position.size(), 0); + + for (size_t i = 0; i < relative_position.size(); ++i) { + if (abs_relative_position[i] < max_exact) { + 
relative_buckets[i] += abs_relative_position[i]; + } else { + float log_pos = std::log(static_cast(abs_relative_position[i]) / max_exact); + float log_base = std::log(static_cast(max_distance) / max_exact); + relative_position_if_large[i] = max_exact + static_cast((log_pos / log_base) * (num_buckets - max_exact)); + relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets - 1); + relative_buckets[i] += relative_position_if_large[i]; + } + } + + return relative_buckets; + } + + std::vector compute_relative_position_bucket(int query_length, + int key_length) { + std::vector context_position(query_length); + std::vector memory_position(key_length); + + for (int i = 0; i < query_length; ++i) { + context_position[i] = i; + } + for (int i = 0; i < key_length; ++i) { + memory_position[i] = i; + } + + std::vector> relative_position(query_length, std::vector(key_length, 0)); + for (int i = 0; i < query_length; ++i) { + for (int j = 0; j < key_length; ++j) { + relative_position[i][j] = memory_position[j] - context_position[i]; + } + } + + std::vector relative_position_bucket; + for (int i = 0; i < query_length; ++i) { + std::vector result = _relative_position_bucket(relative_position[i], true); + relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end()); + } + + return relative_position_bucket; + } +}; + +struct T5Embedder { + T5UniGramTokenizer tokenizer; + T5Runner model; + + T5Embedder(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + bool is_umt5 = false) + : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) { + } + + void get_param_tensors(std::map& tensors, const std::string prefix) { + model.get_param_tensors(tensors, prefix); + } + + void alloc_params_buffer() { + model.alloc_params_buffer(); + } + + std::tuple, std::vector, std::vector> tokenize(std::string text, + 
size_t max_length = 0, + bool padding = false) { + auto parsed_attention = parse_prompt_attention(text); + + { + std::stringstream ss; + ss << "["; + for (const auto& item : parsed_attention) { + ss << "['" << item.first << "', " << item.second << "], "; + } + ss << "]"; + LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); + } + + std::vector tokens; + std::vector weights; + for (const auto& item : parsed_attention) { + const std::string& curr_text = item.first; + float curr_weight = item.second; + std::vector curr_tokens = tokenizer.Encode(curr_text, false); + tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); + weights.insert(weights.end(), curr_tokens.size(), curr_weight); + } + + int EOS_TOKEN_ID = 1; + tokens.push_back(EOS_TOKEN_ID); + weights.push_back(1.0); + + std::vector attention_mask; + + tokenizer.pad_tokens(tokens, weights, &attention_mask, max_length, padding); + + // for (int i = 0; i < tokens.size(); i++) { + // std::cout << tokens[i] << ":" << weights[i] << ", "; + // } + // std::cout << std::endl; + + return {tokens, weights, attention_mask}; + } + + void test() { + ggml_init_params params; + params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB + params.mem_buffer = nullptr; + params.no_alloc = false; + + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); + + { + std::string text("a lovely cat"); + auto tokens_and_weights = tokenize(text, 512, true); + std::vector& tokens = std::get<0>(tokens_and_weights); + std::vector& weights = std::get<1>(tokens_and_weights); + std::vector& masks = std::get<2>(tokens_and_weights); + for (auto token : tokens) { + printf("%d ", token); + } + printf("\n"); + auto input_ids = sd::Tensor::from_vector(tokens); + auto attention_mask = sd::Tensor::from_vector(masks); + sd::Tensor out; + + int64_t t0 = ggml_time_ms(); + auto out_opt = model.compute(8, input_ids, attention_mask); + int64_t t1 = ggml_time_ms(); + + GGML_ASSERT(!out_opt.empty()); + out = 
std::move(out_opt); + print_sd_tensor(out); + LOG_DEBUG("t5 test done in %lldms", t1 - t0); + } + } + + static void load_from_file_and_test(const std::string& file_path) { + // cpu f16: pass + // cpu f32: pass + // cuda f16: pass + // cuda f32: pass + // cuda q8_0: pass + // ggml_backend_t backend = ggml_backend_cuda_init(0); + ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_type model_data_type = GGML_TYPE_F16; + + ModelLoader model_loader; + if (!model_loader.init_from_file_and_convert_name(file_path)) { + LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); + return; + } + + auto& tensor_storage_map = model_loader.get_tensor_storage_map(); + for (auto& [name, tensor_storage] : tensor_storage_map) { + if (ends_with(name, "weight")) { + tensor_storage.expected_type = model_data_type; + } + } + + std::shared_ptr t5 = std::make_shared(backend, false, tensor_storage_map, "", true); + + t5->alloc_params_buffer(); + std::map tensors; + t5->get_param_tensors(tensors, ""); + + bool success = model_loader.load_tensors(tensors); + + if (!success) { + LOG_ERROR("load tensors from model loader failed"); + return; + } + + LOG_INFO("t5 model loaded"); + t5->test(); + } +}; + +#endif // __T5_HPP__ diff --git a/src/tae.hpp b/src/tae.hpp index 3df09e4e..0a0ca682 100644 --- a/src/tae.hpp +++ b/src/tae.hpp @@ -562,41 +562,40 @@ struct TinyImageAutoEncoder : public VAE { taesd.get_param_tensors(tensors, prefix); } - ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) { + sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override { + SD_UNUSED(rng); return vae_output; } - ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) { - return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) override { + return latents; } - ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, 
ggml_tensor* latents) { - return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) override { + return latents; } int get_encoder_output_channels(int input_channels) { return taesd.z_channels; } - ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) { + ggml_cgraph* build_graph(const sd::Tensor& z_tensor, bool decode_graph) { ggml_cgraph* gf = ggml_new_graph(compute_ctx); - z = to_backend(z); + ggml_tensor* z = make_input(z_tensor); auto runner_ctx = get_context(); ggml_tensor* out = decode_graph ? taesd.decode(&runner_ctx, z) : taesd.encode(&runner_ctx, z); ggml_build_forward_expand(gf, out); return gf; } - bool _compute(const int n_threads, - ggml_tensor* z, - bool decode_graph, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { + sd::Tensor _compute(const int n_threads, + const sd::Tensor& z_tensor, + bool decode_graph) override { auto get_graph = [&]() -> ggml_cgraph* { - return build_graph(z, decode_graph); + return build_graph(z_tensor, decode_graph); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), z_tensor.dim()); } }; @@ -625,42 +624,41 @@ struct TinyVideoAutoEncoder : public VAE { taehv.get_param_tensors(tensors, prefix); } - ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) { + sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override { + SD_UNUSED(rng); return vae_output; } - ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) { - return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) override { + return latents; } - ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) { - return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + 
sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) override { + return latents; } int get_encoder_output_channels(int input_channels) { return taehv.z_channels; } - ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) { + ggml_cgraph* build_graph(const sd::Tensor& z_tensor, bool decode_graph) { ggml_cgraph* gf = ggml_new_graph(compute_ctx); - z = to_backend(z); + ggml_tensor* z = make_input(z_tensor); auto runner_ctx = get_context(); ggml_tensor* out = decode_graph ? taehv.decode(&runner_ctx, z) : taehv.encode(&runner_ctx, z); ggml_build_forward_expand(gf, out); return gf; } - bool _compute(const int n_threads, - ggml_tensor* z, - bool decode_graph, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { + sd::Tensor _compute(const int n_threads, + const sd::Tensor& z_tensor, + bool decode_graph) override { auto get_graph = [&]() -> ggml_cgraph* { - return build_graph(z, decode_graph); + return build_graph(z_tensor, decode_graph); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), z_tensor.dim()); } }; -#endif // __TAE_HPP__ \ No newline at end of file +#endif // __TAE_HPP__ diff --git a/src/tensor.hpp b/src/tensor.hpp new file mode 100644 index 00000000..33a2bdea --- /dev/null +++ b/src/tensor.hpp @@ -0,0 +1,1249 @@ +#ifndef __SD_TENSOR_HPP__ +#define __SD_TENSOR_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rng.hpp" + +namespace sd { + + template + class Tensor; + + inline std::vector tensor_unravel_index(int64_t flat, const std::vector& shape); + + [[noreturn]] inline void tensor_throw_invalid_argument(const std::string& message) { + std::fprintf(stderr, "sd::Tensor error: %s\n", message.c_str()); + std::fflush(stderr); + throw std::invalid_argument(message); + } + + inline std::string 
tensor_shape_to_string(const std::vector& shape) { + std::ostringstream oss; + oss << "["; + for (size_t i = 0; i < shape.size(); ++i) { + if (i != 0) { + oss << ", "; + } + oss << shape[i]; + } + oss << "]"; + return oss.str(); + } + + inline int64_t tensor_numel(const std::vector& shape) { + if (shape.empty()) { + return 0; + } + int64_t numel = 1; + for (int64_t dim : shape) { + if (dim < 0) { + tensor_throw_invalid_argument("Tensor shape must be non-negative, got shape=" + + tensor_shape_to_string(shape)); + } + numel *= dim; + } + return numel; + } + + template + class Tensor { + public: + Tensor() = default; + + explicit Tensor(std::vector shape) + : data_(static_cast(tensor_numel(shape))), shape_(std::move(shape)) { + } + + Tensor(std::vector shape, std::vector data) + : data_(std::move(data)), shape_(std::move(shape)) { + if (static_cast(data_.size()) != tensor_numel(shape_)) { + tensor_throw_invalid_argument("Tensor data size does not match shape: data.size()=" + + std::to_string(data_.size()) + ", shape=" + + tensor_shape_to_string(shape_) + ", numel=" + + std::to_string(tensor_numel(shape_))); + } + } + + const std::vector& shape() const { + return shape_; + } + + int64_t dim() const { + return static_cast(shape_.size()); + } + + int64_t numel() const { + return static_cast(data_.size()); + } + + bool empty() const { + return data_.empty(); + } + + T* data() { + return data_.data(); + } + + const T* data() const { + return data_.data(); + } + + std::vector& values() { + return data_; + } + + const std::vector& values() const { + return data_; + } + + void resize(std::vector shape) { + shape_ = std::move(shape); + data_.resize(static_cast(tensor_numel(shape_))); + } + + Tensor& reshape_(std::vector shape) { + if (tensor_numel(shape) != numel()) { + tensor_throw_invalid_argument("Tensor reshape changes element count: from shape=" + + tensor_shape_to_string(shape_) + " (numel=" + + std::to_string(numel()) + ") to shape=" + + tensor_shape_to_string(shape) + 
" (numel=" + + std::to_string(tensor_numel(shape)) + ")"); + } + shape_ = std::move(shape); + return *this; + } + + Tensor reshape(std::vector shape) const { + Tensor result = *this; + result.reshape_(std::move(shape)); + return result; + } + + Tensor& squeeze_() { + std::vector new_shape; + new_shape.reserve(shape_.size()); + for (int64_t dim : shape_) { + if (dim != 1) { + new_shape.push_back(dim); + } + } + shape_ = std::move(new_shape); + return *this; + } + + Tensor& squeeze_(size_t dim) { + if (dim >= shape_.size()) { + tensor_throw_invalid_argument("Tensor squeeze dimension out of range: dim=" + + std::to_string(dim) + ", shape=" + + tensor_shape_to_string(shape_)); + } + if (shape_[dim] != 1) { + tensor_throw_invalid_argument("Tensor squeeze requires dimension size 1: dim=" + + std::to_string(dim) + ", shape=" + + tensor_shape_to_string(shape_)); + } + shape_.erase(shape_.begin() + static_cast(dim)); + return *this; + } + + Tensor squeeze() const { + Tensor result = *this; + result.squeeze_(); + return result; + } + + Tensor squeeze(size_t dim) const { + Tensor result = *this; + result.squeeze_(dim); + return result; + } + + Tensor& unsqueeze_(size_t dim) { + if (dim > shape_.size()) { + tensor_throw_invalid_argument("Tensor unsqueeze dimension out of range: dim=" + + std::to_string(dim) + ", shape=" + + tensor_shape_to_string(shape_)); + } + shape_.insert(shape_.begin() + static_cast(dim), 1); + return *this; + } + + Tensor unsqueeze(size_t dim) const { + Tensor result = *this; + result.unsqueeze_(dim); + return result; + } + + Tensor permute(const std::vector& dims) const { + if (dims.size() != static_cast(dim())) { + tensor_throw_invalid_argument("Tensor permute requires one dimension index per axis: tensor_shape=" + + tensor_shape_to_string(shape_) + ", dims_size=" + + std::to_string(dims.size())); + } + + std::vector seen(dims.size(), false); + std::vector out_shape(dims.size(), 1); + for (size_t i = 0; i < dims.size(); ++i) { + size_t dim_index = 
dims[i]; + if (dim_index >= dims.size() || seen[dim_index]) { + tensor_throw_invalid_argument("Tensor permute dimensions must be a valid permutation: tensor_shape=" + + tensor_shape_to_string(shape_)); + } + seen[dim_index] = true; + out_shape[i] = shape_[dim_index]; + } + + Tensor result(out_shape); + if (result.numel() == 0) { + return result; + } + + for (int64_t flat = 0; flat < result.numel(); ++flat) { + std::vector out_coord = tensor_unravel_index(flat, out_shape); + std::vector src_coord(static_cast(dim()), 0); + for (size_t i = 0; i < dims.size(); ++i) { + src_coord[dims[i]] = out_coord[i]; + } + result[flat] = index(src_coord); + } + + return result; + } + + Tensor& permute_(const std::vector& dims) { + *this = permute(dims); + return *this; + } + + void fill_(const T& value) { + std::fill(data_.begin(), data_.end(), value); + } + + Tensor& masked_fill_(const Tensor& mask, const T& value); + + T mean() const; + + static Tensor zeros(std::vector shape) { + return Tensor(std::move(shape)); + } + + static Tensor zeros_like(const Tensor& other) { + return zeros(other.shape()); + } + + static Tensor ones(std::vector shape) { + return full(std::move(shape), static_cast(1)); + } + + static Tensor ones_like(const Tensor& other) { + return ones(other.shape()); + } + + static Tensor full(std::vector shape, const T& value) { + Tensor tensor(std::move(shape)); + tensor.fill_(value); + return tensor; + } + + static Tensor randn(std::vector shape, const std::shared_ptr& rng) { + static_assert(std::is_same_v, "Tensor::randn currently requires Tensor"); + if (!rng) { + tensor_throw_invalid_argument("Tensor randn requires a valid RNG"); + } + const uint32_t size = static_cast(tensor_numel(shape)); + return Tensor(std::move(shape), rng->randn(size)); + } + + static Tensor randn_like(const Tensor& other, const std::shared_ptr& rng) { + return randn(other.shape(), rng); + } + + static Tensor from_vector(std::vector data) { + const int64_t size = static_cast(data.size()); + 
return Tensor({size}, std::move(data)); + } + + T& index(const std::vector& coord) { + return data_.at(offset_of(coord)); + } + + const T& index(const std::vector& coord) const { + return data_.at(offset_of(coord)); + } + + template && ...)>> + T& index(Indices... indices) { + return index(std::vector{static_cast(indices)...}); + } + + template && ...)>> + const T& index(Indices... indices) const { + return index(std::vector{static_cast(indices)...}); + } + + T& operator[](int64_t index) { + return data_.at(static_cast(index)); + } + + const T& operator[](int64_t index) const { + return data_.at(static_cast(index)); + } + + private: + size_t offset_of(const std::vector& coord) const { + if (coord.size() != shape_.size()) { + tensor_throw_invalid_argument("Tensor index rank mismatch: coord_rank=" + + std::to_string(coord.size()) + ", shape=" + + tensor_shape_to_string(shape_)); + } + size_t offset = 0; + size_t stride = 1; + for (size_t i = 0; i < shape_.size(); ++i) { + if (coord[i] < 0 || coord[i] >= shape_[i]) { + tensor_throw_invalid_argument("Tensor index out of range: shape=" + + tensor_shape_to_string(shape_)); + } + offset += static_cast(coord[i]) * stride; + stride *= static_cast(shape_[i]); + } + return offset; + } + + std::vector data_; + std::vector shape_; + }; + + template + inline T Tensor::mean() const { + if (empty()) { + return T{}; + } + T sum = T{}; + for (const T& value : data_) { + sum += value; + } + return sum / static_cast(numel()); + } + + template <> + inline float Tensor::mean() const { + if (empty()) { + return 0.0f; + } + double sum = 0.0; + for (float value : data_) { + sum += static_cast(value); + } + return static_cast(sum / static_cast(numel())); + } + + template + inline void tensor_check_same_shape(const Tensor& lhs, const Tensor& rhs) { + if (lhs.shape() != rhs.shape()) { + tensor_throw_invalid_argument("Tensor shapes must match: lhs_shape=" + + tensor_shape_to_string(lhs.shape()) + ", rhs_shape=" + + 
tensor_shape_to_string(rhs.shape())); + } + } + + inline std::vector tensor_broadcast_shape(const std::vector& lhs, const std::vector& rhs) { + size_t ndim = std::max(lhs.size(), rhs.size()); + std::vector shape(ndim, 1); + for (size_t i = 0; i < ndim; ++i) { + int64_t lhs_dim = lhs.size() > i ? lhs[i] : 1; + int64_t rhs_dim = rhs.size() > i ? rhs[i] : 1; + if (lhs_dim != rhs_dim && lhs_dim != 1 && rhs_dim != 1) { + tensor_throw_invalid_argument("Tensor shapes are not broadcastable: lhs_shape=" + + tensor_shape_to_string(lhs) + ", rhs_shape=" + + tensor_shape_to_string(rhs)); + } + shape[i] = std::max(lhs_dim, rhs_dim); + } + return shape; + } + + inline std::vector tensor_unravel_index(int64_t flat, const std::vector& shape) { + std::vector coord(shape.size(), 0); + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] <= 0) { + tensor_throw_invalid_argument("Tensor unravel_index requires positive shape: shape=" + + tensor_shape_to_string(shape)); + } + coord[i] = flat % shape[i]; + flat /= shape[i]; + } + return coord; + } + + inline std::vector tensor_compute_strides(const std::vector& shape) { + std::vector strides(shape.size(), 1); + int64_t stride = 1; + for (size_t i = 0; i < shape.size(); ++i) { + strides[i] = stride; + stride *= shape[i]; + } + return strides; + } + + template + inline void tensor_for_each_broadcast_offset(const std::vector& out_shape, + const std::vector& lhs_shape_raw, + const std::vector& lhs_strides_raw, + const std::vector& rhs_shape_raw, + const std::vector& rhs_strides_raw, + F&& fn) { + const size_t ndim = out_shape.size(); + std::vector out_strides = tensor_compute_strides(out_shape); + std::vector lhs_shape(ndim, 1); + std::vector lhs_strides(ndim, 0); + std::vector rhs_shape(ndim, 1); + std::vector rhs_strides(ndim, 0); + + for (size_t i = 0; i < lhs_shape_raw.size(); ++i) { + lhs_shape[i] = lhs_shape_raw[i]; + lhs_strides[i] = lhs_strides_raw[i]; + } + for (size_t i = 0; i < rhs_shape_raw.size(); ++i) { + rhs_shape[i] = 
rhs_shape_raw[i]; + rhs_strides[i] = rhs_strides_raw[i]; + } + + const int64_t numel = tensor_numel(out_shape); + for (int64_t flat = 0; flat < numel; ++flat) { + int64_t remaining = flat; + int64_t lhs_offset = 0; + int64_t rhs_offset = 0; + for (size_t i = ndim; i-- > 0;) { + int64_t coord = remaining / out_strides[i]; + remaining %= out_strides[i]; + if (lhs_shape[i] != 1) { + lhs_offset += coord * lhs_strides[i]; + } + if (rhs_shape[i] != 1) { + rhs_offset += coord * rhs_strides[i]; + } + } + fn(flat, lhs_offset, rhs_offset); + } + } + + template + inline Tensor& Tensor::masked_fill_(const Tensor& mask, const T& value) { + if (empty()) { + return *this; + } + tensor_broadcast_shape(shape_, mask.shape()); + const std::vector data_strides = tensor_compute_strides(shape_); + const std::vector mask_strides = tensor_compute_strides(mask.shape()); + const uint8_t* mask_data = mask.data(); + tensor_for_each_broadcast_offset(shape_, + shape_, + data_strides, + mask.shape(), + mask_strides, + [&](int64_t, int64_t data_offset, int64_t mask_offset) { + if (mask_data[mask_offset] != 0) { + data_[static_cast(data_offset)] = value; + } + }); + return *this; + } + + template ::value>> + inline Tensor operator<(const Tensor& lhs, Scalar rhs) { + Tensor result(lhs.shape()); + const T value = static_cast(rhs); + for (int64_t i = 0; i < lhs.numel(); ++i) { + result[i] = lhs[i] < value ? 1 : 0; + } + return result; + } + + template ::value>> + inline Tensor operator<(Scalar lhs, const Tensor& rhs) { + Tensor result(rhs.shape()); + const T value = static_cast(lhs); + for (int64_t i = 0; i < rhs.numel(); ++i) { + result[i] = value < rhs[i] ? 
1 : 0; + } + return result; + } + + template + inline Tensor operator<(const Tensor& lhs, const Tensor& rhs) { + const std::vector out_shape = tensor_broadcast_shape(lhs.shape(), rhs.shape()); + Tensor result(out_shape); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* lhs_data = lhs.data(); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(out_shape, + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t flat, int64_t lhs_offset, int64_t rhs_offset) { + result[flat] = lhs_data[lhs_offset] < rhs_data[rhs_offset] ? 1 : 0; + }); + return result; + } + + template + inline Tensor& operator+=(Tensor& lhs, const Tensor& rhs) { + if (lhs.shape() == rhs.shape()) { + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] += rhs[i]; + } + return lhs; + } + tensor_broadcast_shape(lhs.shape(), rhs.shape()); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(lhs.shape(), + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t, int64_t lhs_offset, int64_t rhs_offset) { + lhs[static_cast(lhs_offset)] += rhs_data[rhs_offset]; + }); + return lhs; + } + + template ::value>> + inline Tensor& operator+=(Tensor& lhs, Scalar rhs) { + const T value = static_cast(rhs); + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] += value; + } + return lhs; + } + + template + inline Tensor& operator-=(Tensor& lhs, const Tensor& rhs) { + if (lhs.shape() == rhs.shape()) { + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] -= rhs[i]; + } + return lhs; + } + tensor_broadcast_shape(lhs.shape(), rhs.shape()); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* rhs_data = rhs.data(); + 
tensor_for_each_broadcast_offset(lhs.shape(), + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t, int64_t lhs_offset, int64_t rhs_offset) { + lhs[static_cast(lhs_offset)] -= rhs_data[rhs_offset]; + }); + return lhs; + } + + template ::value>> + inline Tensor& operator-=(Tensor& lhs, Scalar rhs) { + const T value = static_cast(rhs); + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] -= value; + } + return lhs; + } + + template + inline Tensor& operator*=(Tensor& lhs, const Tensor& rhs) { + if (lhs.shape() == rhs.shape()) { + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] *= rhs[i]; + } + return lhs; + } + tensor_broadcast_shape(lhs.shape(), rhs.shape()); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(lhs.shape(), + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t, int64_t lhs_offset, int64_t rhs_offset) { + lhs[static_cast(lhs_offset)] *= rhs_data[rhs_offset]; + }); + return lhs; + } + + template ::value>> + inline Tensor& operator*=(Tensor& lhs, Scalar rhs) { + const T value = static_cast(rhs); + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] *= value; + } + return lhs; + } + + template + inline Tensor& operator/=(Tensor& lhs, const Tensor& rhs) { + if (lhs.shape() == rhs.shape()) { + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] /= rhs[i]; + } + return lhs; + } + tensor_broadcast_shape(lhs.shape(), rhs.shape()); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(lhs.shape(), + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t, int64_t lhs_offset, int64_t rhs_offset) { + lhs[static_cast(lhs_offset)] /= rhs_data[rhs_offset]; + }); + return lhs; + } + + 
template ::value>> + inline Tensor& operator/=(Tensor& lhs, Scalar rhs) { + const T value = static_cast(rhs); + for (int64_t i = 0; i < lhs.numel(); ++i) { + lhs[i] /= value; + } + return lhs; + } + + template + inline Tensor operator+(Tensor lhs, const Tensor& rhs) { + if (lhs.shape() != rhs.shape()) { + const std::vector out_shape = tensor_broadcast_shape(lhs.shape(), rhs.shape()); + Tensor result(out_shape); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* lhs_data = lhs.data(); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(out_shape, + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t flat, int64_t lhs_offset, int64_t rhs_offset) { + result[flat] = lhs_data[lhs_offset] + rhs_data[rhs_offset]; + }); + return result; + } + lhs += rhs; + return lhs; + } + + template ::value>> + inline Tensor operator+(Tensor lhs, Scalar rhs) { + lhs += rhs; + return lhs; + } + + template ::value>> + inline Tensor operator+(Scalar lhs, Tensor rhs) { + rhs += lhs; + return rhs; + } + + template + inline Tensor operator-(Tensor lhs, const Tensor& rhs) { + if (lhs.shape() != rhs.shape()) { + const std::vector out_shape = tensor_broadcast_shape(lhs.shape(), rhs.shape()); + Tensor result(out_shape); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* lhs_data = lhs.data(); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(out_shape, + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t flat, int64_t lhs_offset, int64_t rhs_offset) { + result[flat] = lhs_data[lhs_offset] - rhs_data[rhs_offset]; + }); + return result; + } + lhs -= rhs; + return lhs; + } + + template ::value>> + inline Tensor operator-(Tensor lhs, Scalar rhs) { + lhs -= rhs; + return lhs; + } + + template ::value>> + inline Tensor 
operator-(Scalar lhs, const Tensor& rhs) { + Tensor result = rhs; + const T value = static_cast(lhs); + for (int64_t i = 0; i < result.numel(); ++i) { + result[i] = value - result[i]; + } + return result; + } + + template + inline Tensor operator*(Tensor lhs, const Tensor& rhs) { + if (lhs.shape() != rhs.shape()) { + const std::vector out_shape = tensor_broadcast_shape(lhs.shape(), rhs.shape()); + Tensor result(out_shape); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* lhs_data = lhs.data(); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(out_shape, + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t flat, int64_t lhs_offset, int64_t rhs_offset) { + result[flat] = lhs_data[lhs_offset] * rhs_data[rhs_offset]; + }); + return result; + } + lhs *= rhs; + return lhs; + } + + template ::value>> + inline Tensor operator*(Tensor lhs, Scalar rhs) { + lhs *= rhs; + return lhs; + } + + template ::value>> + inline Tensor operator*(Scalar lhs, Tensor rhs) { + rhs *= lhs; + return rhs; + } + + template + inline Tensor operator/(Tensor lhs, const Tensor& rhs) { + if (lhs.shape() != rhs.shape()) { + const std::vector out_shape = tensor_broadcast_shape(lhs.shape(), rhs.shape()); + Tensor result(out_shape); + const std::vector lhs_strides = tensor_compute_strides(lhs.shape()); + const std::vector rhs_strides = tensor_compute_strides(rhs.shape()); + const T* lhs_data = lhs.data(); + const T* rhs_data = rhs.data(); + tensor_for_each_broadcast_offset(out_shape, + lhs.shape(), + lhs_strides, + rhs.shape(), + rhs_strides, + [&](int64_t flat, int64_t lhs_offset, int64_t rhs_offset) { + result[flat] = lhs_data[lhs_offset] / rhs_data[rhs_offset]; + }); + return result; + } + lhs /= rhs; + return lhs; + } + + template ::value>> + inline Tensor operator/(Tensor lhs, Scalar rhs) { + lhs /= rhs; + return lhs; + } + + template ::value>> + inline 
Tensor operator/(Scalar lhs, const Tensor& rhs) { + Tensor result = rhs; + const T value = static_cast(lhs); + for (int64_t i = 0; i < result.numel(); ++i) { + result[i] = value / result[i]; + } + return result; + } + + template + inline Tensor operator-(const Tensor& tensor) { + Tensor result = tensor; + for (int64_t i = 0; i < result.numel(); ++i) { + result[i] = -result[i]; + } + return result; + } + + template + inline Tensor zeros(std::vector shape) { + return Tensor::zeros(std::move(shape)); + } + + template + inline Tensor full(std::vector shape, const T& value) { + return Tensor::full(std::move(shape), value); + } + + template + inline Tensor randn(std::vector shape, const std::shared_ptr& rng) { + return Tensor::randn(std::move(shape), rng); + } + + template + inline Tensor randn_like(const Tensor& tensor, const std::shared_ptr& rng) { + return Tensor::randn(tensor.shape(), rng); + } + + template + inline std::vector tensor_to_vector(const Tensor& tensor) { + return tensor.values(); + } + + namespace ops { + enum class InterpolateMode { + Nearest, + }; + + inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) { + if (index < 0) { + index += dim_size; + } + return index; + } + + template + inline std::pair resolve_slice_bounds(const Tensor& input, + size_t dim, + int64_t start, + int64_t end) { + if (dim >= static_cast(input.dim())) { + tensor_throw_invalid_argument("Tensor slice dimension out of range: dim=" + + std::to_string(dim) + ", rank=" + + std::to_string(input.dim()) + ", input_shape=" + + tensor_shape_to_string(input.shape())); + } + + int64_t dim_size = input.shape()[dim]; + start = normalize_slice_bound(start, dim_size); + end = normalize_slice_bound(end, dim_size); + + if (start < 0 || start > dim_size || end < 0 || end > dim_size || start > end) { + tensor_throw_invalid_argument("Tensor slice bounds out of range: dim=" + + std::to_string(dim) + ", start=" + + std::to_string(start) + ", end=" + + std::to_string(end) + ", 
input_shape=" + + tensor_shape_to_string(input.shape())); + } + + return {start, end}; + } + + template + inline Tensor exp(const Tensor& input) { + Tensor output(input.shape()); + for (int64_t i = 0; i < input.numel(); ++i) { + output[i] = static_cast(std::exp(static_cast(input[i]))); + } + return output; + } + + template + inline Tensor clamp(const Tensor& input, const T& min_value, const T& max_value) { + if (min_value > max_value) { + tensor_throw_invalid_argument("Tensor clamp requires min_value <= max_value"); + } + Tensor output(input.shape()); + for (int64_t i = 0; i < input.numel(); ++i) { + output[i] = std::clamp(input[i], min_value, max_value); + } + return output; + } + + template + inline Tensor round(const Tensor& input) { + Tensor output(input.shape()); + for (int64_t i = 0; i < input.numel(); ++i) { + output[i] = static_cast(std::round(static_cast(input[i]))); + } + return output; + } + + template + inline Tensor slice(const Tensor& input, + size_t dim, + int64_t start, + int64_t end) { + auto [resolved_start, resolved_end] = resolve_slice_bounds(input, dim, start, end); + std::vector out_shape = input.shape(); + out_shape[dim] = resolved_end - resolved_start; + + Tensor output(out_shape); + if (output.numel() == 0) { + return output; + } + + int64_t inner = 1; + for (size_t i = 0; i < dim; ++i) { + inner *= input.shape()[i]; + } + + int64_t outer = 1; + for (size_t i = dim + 1; i < static_cast(input.dim()); ++i) { + outer *= input.shape()[i]; + } + + int64_t src_chunk = (resolved_end - resolved_start) * inner; + int64_t src_stride = input.shape()[dim] * inner; + for (int64_t i = 0; i < outer; ++i) { + const int64_t src_offset = i * src_stride + resolved_start * inner; + const int64_t dst_offset = i * src_chunk; + std::copy_n(input.data() + src_offset, src_chunk, output.data() + dst_offset); + } + + return output; + } + + template + inline Tensor narrow(const Tensor& input, + size_t dim, + int64_t start, + int64_t length) { + if (length < 0) { + 
tensor_throw_invalid_argument("Tensor narrow requires non-negative length: length=" + + std::to_string(length) + ", input_shape=" + + tensor_shape_to_string(input.shape())); + } + return slice(input, dim, start, start + length); + } + + template + inline void slice_assign(Tensor* dst, + size_t dim, + int64_t start, + int64_t end, + const Tensor& src) { + if (dst == nullptr) { + tensor_throw_invalid_argument("Tensor slice_assign requires non-null dst"); + } + + auto [resolved_start, resolved_end] = resolve_slice_bounds(*dst, dim, start, end); + if (src.dim() != dst->dim()) { + tensor_throw_invalid_argument("Tensor slice_assign requires matching rank: dst_shape=" + + tensor_shape_to_string(dst->shape()) + ", src_shape=" + + tensor_shape_to_string(src.shape())); + } + + std::vector expected_shape = dst->shape(); + expected_shape[dim] = resolved_end - resolved_start; + if (src.shape() != expected_shape) { + tensor_throw_invalid_argument("Tensor slice_assign requires matching source shape: dst_shape=" + + tensor_shape_to_string(dst->shape()) + ", src_shape=" + + tensor_shape_to_string(src.shape()) + ", expected_src_shape=" + + tensor_shape_to_string(expected_shape)); + } + + if (src.numel() == 0) { + return; + } + + int64_t inner = 1; + for (size_t i = 0; i < dim; ++i) { + inner *= dst->shape()[i]; + } + + int64_t outer = 1; + for (size_t i = dim + 1; i < static_cast(dst->dim()); ++i) { + outer *= dst->shape()[i]; + } + + int64_t dst_chunk = (resolved_end - resolved_start) * inner; + int64_t dst_stride = dst->shape()[dim] * inner; + for (int64_t i = 0; i < outer; ++i) { + const int64_t dst_offset = i * dst_stride + resolved_start * inner; + const int64_t src_offset = i * dst_chunk; + std::copy_n(src.data() + src_offset, dst_chunk, dst->data() + dst_offset); + } + } + + template + inline void fill_slice(Tensor* dst, + size_t dim, + int64_t start, + int64_t end, + const T& value) { + if (dst == nullptr) { + tensor_throw_invalid_argument("Tensor fill_slice requires 
non-null dst"); + } + + auto [resolved_start, resolved_end] = resolve_slice_bounds(*dst, dim, start, end); + int64_t inner = 1; + for (size_t i = 0; i < dim; ++i) { + inner *= dst->shape()[i]; + } + + int64_t outer = 1; + for (size_t i = dim + 1; i < static_cast(dst->dim()); ++i) { + outer *= dst->shape()[i]; + } + + int64_t chunk = (resolved_end - resolved_start) * inner; + int64_t stride = dst->shape()[dim] * inner; + for (int64_t i = 0; i < outer; ++i) { + const int64_t offset = i * stride + resolved_start * inner; + std::fill_n(dst->data() + offset, chunk, value); + } + } + + template + inline Tensor interpolate(const Tensor& input, + std::vector output_shape, + InterpolateMode mode = InterpolateMode::Nearest, + bool align_corners = false) { + if (mode != InterpolateMode::Nearest) { + tensor_throw_invalid_argument("Only nearest interpolate mode is implemented, got mode=" + + std::to_string(static_cast(mode))); + } + if (align_corners) { + tensor_throw_invalid_argument("align_corners is not supported for nearest interpolate: input_shape=" + + tensor_shape_to_string(input.shape()) + ", output_shape=" + + tensor_shape_to_string(output_shape)); + } + if (input.shape() == output_shape) { + return input; + } + if (input.dim() != static_cast(output_shape.size())) { + tensor_throw_invalid_argument("Tensor interpolate requires matching rank: input_dim=" + + std::to_string(input.dim()) + ", output_dim=" + + std::to_string(output_shape.size()) + ", input_shape=" + + tensor_shape_to_string(input.shape()) + ", output_shape=" + + tensor_shape_to_string(output_shape)); + } + for (size_t i = 0; i < output_shape.size(); ++i) { + if (output_shape[i] <= 0) { + tensor_throw_invalid_argument("Tensor interpolate output shape must be positive: input_shape=" + + tensor_shape_to_string(input.shape()) + ", output_shape=" + + tensor_shape_to_string(output_shape)); + } + if (input.shape()[i] <= 0) { + tensor_throw_invalid_argument("Tensor interpolate input shape must be positive: 
input_shape=" + + tensor_shape_to_string(input.shape()) + ", output_shape=" + + tensor_shape_to_string(output_shape)); + } + } + + Tensor output(std::move(output_shape)); + for (int64_t flat = 0; flat < output.numel(); ++flat) { + std::vector output_coord = tensor_unravel_index(flat, output.shape()); + std::vector input_coord(static_cast(input.dim()), 0); + for (size_t i = 0; i < static_cast(input.dim()); ++i) { + input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i]; + } + output[flat] = input.index(input_coord); + } + + return output; + } + + template + inline Tensor interpolate(const Tensor& input, + const std::optional>& size, + const std::optional>& scale_factor, + InterpolateMode mode = InterpolateMode::Nearest, + bool align_corners = false) { + if (mode != InterpolateMode::Nearest) { + tensor_throw_invalid_argument("Only nearest interpolate mode is implemented, got mode=" + + std::to_string(static_cast(mode))); + } + if (align_corners) { + tensor_throw_invalid_argument("align_corners is not supported for nearest interpolate: input_shape=" + + tensor_shape_to_string(input.shape())); + } + if (size.has_value() == scale_factor.has_value()) { + tensor_throw_invalid_argument("Tensor interpolate requires exactly one of size or scale_factor: input_shape=" + + tensor_shape_to_string(input.shape())); + } + + std::vector output_shape = input.shape(); + if (size.has_value()) { + if (size->empty() || size->size() > output_shape.size()) { + tensor_throw_invalid_argument("Tensor interpolate size must target low dimensions: input_shape=" + + tensor_shape_to_string(input.shape()) + ", size_rank=" + + std::to_string(size->size())); + } + for (size_t i = 0; i < size->size(); ++i) { + if ((*size)[i] <= 0) { + tensor_throw_invalid_argument("Tensor interpolate size must be positive: input_shape=" + + tensor_shape_to_string(input.shape()) + ", size=" + + tensor_shape_to_string(*size)); + } + output_shape[i] = (*size)[i]; + } + } else { + if 
(scale_factor->empty() || scale_factor->size() > output_shape.size()) { + tensor_throw_invalid_argument("Tensor interpolate scale_factor must target low dimensions: input_shape=" + + tensor_shape_to_string(input.shape()) + ", scale_factor_rank=" + + std::to_string(scale_factor->size())); + } + for (size_t i = 0; i < scale_factor->size(); ++i) { + if ((*scale_factor)[i] <= 0.0) { + tensor_throw_invalid_argument("Tensor interpolate scale_factor must be positive: input_shape=" + + tensor_shape_to_string(input.shape())); + } + output_shape[i] = static_cast( + std::floor(static_cast(output_shape[i]) * (*scale_factor)[i])); + if (output_shape[i] <= 0) { + tensor_throw_invalid_argument("Tensor interpolate output shape must be positive: input_shape=" + + tensor_shape_to_string(input.shape()) + ", output_shape=" + + tensor_shape_to_string(output_shape)); + } + } + } + + return interpolate(input, std::move(output_shape), mode, align_corners); + } + + template + inline Tensor interpolate(const Tensor& input, + const std::optional>& size, + double scale_factor, + InterpolateMode mode = InterpolateMode::Nearest, + bool align_corners = false) { + return interpolate(input, + size, + std::vector(size.has_value() ? 
size->size() : input.dim(), scale_factor), + mode, + align_corners); + } + + template + inline Tensor concat(const Tensor& lhs, const Tensor& rhs, size_t dim) { + if (lhs.dim() != rhs.dim()) { + tensor_throw_invalid_argument("Tensor concat requires same rank: lhs_dim=" + + std::to_string(lhs.dim()) + ", rhs_dim=" + + std::to_string(rhs.dim()) + ", lhs_shape=" + + tensor_shape_to_string(lhs.shape()) + ", rhs_shape=" + + tensor_shape_to_string(rhs.shape())); + } + if (dim >= static_cast(lhs.dim())) { + tensor_throw_invalid_argument("Tensor concat dimension out of range: dim=" + + std::to_string(dim) + ", rank=" + + std::to_string(lhs.dim()) + ", lhs_shape=" + + tensor_shape_to_string(lhs.shape())); + } + std::vector out_shape = lhs.shape(); + for (size_t i = 0; i < static_cast(lhs.dim()); ++i) { + if (i == dim) { + continue; + } + if (lhs.shape()[i] != rhs.shape()[i]) { + tensor_throw_invalid_argument("Tensor concat requires matching non-concat dimensions: dim=" + + std::to_string(dim) + ", lhs_shape=" + + tensor_shape_to_string(lhs.shape()) + ", rhs_shape=" + + tensor_shape_to_string(rhs.shape())); + } + } + out_shape[dim] += rhs.shape()[dim]; + + Tensor out(out_shape); + int64_t inner = 1; + for (size_t i = 0; i < dim; ++i) { + inner *= lhs.shape()[i]; + } + + int64_t outer = 1; + for (size_t i = dim + 1; i < static_cast(lhs.dim()); ++i) { + outer *= lhs.shape()[i]; + } + + int64_t lhs_chunk = lhs.shape()[dim] * inner; + int64_t rhs_chunk = rhs.shape()[dim] * inner; + int64_t out_chunk = lhs_chunk + rhs_chunk; + + for (int64_t i = 0; i < outer; ++i) { + int64_t lhs_offset = i * lhs_chunk; + int64_t rhs_offset = i * rhs_chunk; + int64_t out_offset = i * out_chunk; + + std::copy_n(lhs.data() + lhs_offset, lhs_chunk, out.data() + out_offset); + std::copy_n(rhs.data() + rhs_offset, rhs_chunk, out.data() + out_offset + lhs_chunk); + } + return out; + } + + template + inline std::vector> chunk(const Tensor& tensor, int64_t chunks, size_t dim) { + if (chunks <= 0) { + 
tensor_throw_invalid_argument("Tensor chunk requires chunks > 0: chunks=" + + std::to_string(chunks) + ", tensor_shape=" + + tensor_shape_to_string(tensor.shape())); + } + if (dim >= static_cast(tensor.dim())) { + tensor_throw_invalid_argument("Tensor chunk dimension out of range: dim=" + + std::to_string(dim) + ", rank=" + + std::to_string(tensor.dim()) + ", tensor_shape=" + + tensor_shape_to_string(tensor.shape())); + } + + const int64_t dim_size = tensor.shape()[dim]; + if (dim_size == 0) { + return {}; + } + if (dim_size % chunks != 0) { + tensor_throw_invalid_argument("Tensor chunk requires the dimension size to be divisible by chunks: dim=" + + std::to_string(dim) + ", dim_size=" + + std::to_string(dim_size) + ", chunks=" + + std::to_string(chunks) + ", tensor_shape=" + + tensor_shape_to_string(tensor.shape())); + } + + const int64_t chunk_size = dim_size / chunks; + int64_t inner = 1; + for (size_t i = 0; i < dim; ++i) { + inner *= tensor.shape()[i]; + } + + int64_t outer = 1; + for (size_t i = dim + 1; i < static_cast(tensor.dim()); ++i) { + outer *= tensor.shape()[i]; + } + + std::vector> parts; + parts.reserve(static_cast(chunks)); + + for (int64_t start = 0; start < dim_size; start += chunk_size) { + std::vector part_shape = tensor.shape(); + part_shape[dim] = chunk_size; + Tensor part(part_shape); + + const int64_t src_chunk = chunk_size * inner; + const int64_t dst_chunk = src_chunk; + for (int64_t i = 0; i < outer; ++i) { + const int64_t src_offset = (i * dim_size + start) * inner; + const int64_t dst_offset = i * dst_chunk; + std::copy_n(tensor.data() + src_offset, src_chunk, part.data() + dst_offset); + } + + parts.push_back(std::move(part)); + } + + return parts; + } + + } // namespace ops + +} // namespace sd + +#endif diff --git a/src/tensor_ggml.hpp b/src/tensor_ggml.hpp new file mode 100644 index 00000000..493a958c --- /dev/null +++ b/src/tensor_ggml.hpp @@ -0,0 +1,127 @@ +#ifndef __SD_TENSOR_GGML_HPP__ +#define __SD_TENSOR_GGML_HPP__ + 
+#include +#include +#include +#include +#include +#include + +#include "ggml.h" +#include "tensor.hpp" + +namespace sd { + + template + struct GGMLTypeTraits; + + template <> + struct GGMLTypeTraits { + static constexpr ggml_type type = GGML_TYPE_F32; + }; + + template <> + struct GGMLTypeTraits { + static constexpr ggml_type type = GGML_TYPE_F16; + }; + + template <> + struct GGMLTypeTraits { + static constexpr ggml_type type = GGML_TYPE_I32; + }; + + template <> + struct GGMLTypeTraits { + static constexpr ggml_type type = GGML_TYPE_I64; + }; + + inline std::vector shape_from_ggml(const ggml_tensor* tensor) { + std::vector shape; + shape.reserve(static_cast(ggml_n_dims(tensor))); + for (int i = 0; i < ggml_n_dims(tensor); ++i) { + shape.push_back(tensor->ne[i]); + } + return shape; + } + + template + inline Tensor make_sd_tensor_from_ggml(const ggml_tensor* tensor) { + if (tensor == nullptr) { + return {}; + } + if (tensor->type != GGMLTypeTraits::type) { + GGML_ABORT("ggml tensor type does not match sd::Tensor type"); + } + Tensor result(shape_from_ggml(tensor)); + if (tensor->buffer != nullptr) { + ggml_backend_tensor_get(tensor, result.data(), 0, ggml_nbytes(tensor)); + } else { + std::memcpy(result.data(), tensor->data, ggml_nbytes(tensor)); + } + return result; + } + + template + inline ggml_tensor* make_ggml_tensor(ggml_context* ctx, const Tensor& tensor, bool copy_data = true) { + GGML_ASSERT(tensor.dim() > 0 && tensor.dim() <= 5); + + int n_dims = std::min(static_cast(tensor.dim()), GGML_MAX_DIMS); + + std::array ne = {1, 1, 1, 1}; + for (int64_t i = 0; i < n_dims; ++i) { + ne[static_cast(i)] = tensor.shape()[static_cast(i)]; + } + + if (tensor.dim() == 5) { + ne[3] *= tensor.shape()[4]; + } + + ggml_tensor* result = ggml_new_tensor(ctx, GGMLTypeTraits::type, n_dims, ne.data()); + if (copy_data && tensor.numel() > 0) { + std::memcpy(result->data, tensor.data(), static_cast(ggml_nbytes(result))); + } + return result; + } + + template + inline Tensor 
load_tensor_from_file_as_tensor(const std::string& file_path) { + std::ifstream file(file_path, std::ios::binary); + if (!file.is_open()) { + throw std::runtime_error("failed to open tensor file: " + file_path); + } + + int32_t n_dims = 0; + int32_t length = 0; + int32_t ttype = 0; + file.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + file.read(reinterpret_cast(&length), sizeof(length)); + file.read(reinterpret_cast(&ttype), sizeof(ttype)); + if (!file.good()) { + throw std::runtime_error("incomplete tensor file header: " + file_path); + } + if (static_cast(ttype) != GGMLTypeTraits::type) { + throw std::invalid_argument("tensor file type does not match requested sd::Tensor type"); + } + + std::vector shape(4, 1); + for (int i = 0; i < n_dims; ++i) { + int32_t dim = 1; + file.read(reinterpret_cast(&dim), sizeof(dim)); + shape[static_cast(i)] = dim; + } + std::string name(static_cast(length), '\0'); + file.read(name.data(), length); + + shape.resize(static_cast(n_dims)); + Tensor tensor(shape); + file.read(reinterpret_cast(tensor.data()), static_cast(tensor.numel() * sizeof(T))); + if (!file.good()) { + throw std::runtime_error("incomplete tensor file data: " + file_path); + } + return tensor; + } + +} // namespace sd + +#endif diff --git a/src/tokenize_util.cpp b/src/tokenize_util.cpp index 22cf8ae2..33fdad26 100644 --- a/src/tokenize_util.cpp +++ b/src/tokenize_util.cpp @@ -1,993 +1,993 @@ -#include -#include -#include -#include - -#include "tokenize_util.h" - -bool is_number(char32_t ch) { - return (ch >= U'0' && ch <= U'9'); -} - -bool is_letter(char32_t ch) { - static const struct { char32_t start, end; } ranges[] = { - {0x41, 0x5A}, - {0x61, 0x7A}, - {0xAA, 0xAA}, - {0xB5, 0xB5}, - {0xBA, 0xBA}, - {0xC0, 0xD6}, - {0xD8, 0xF6}, - {0xF8, 0x2C1}, - {0x2C6, 0x2D1}, - {0x2E0, 0x2E4}, - {0x2EC, 0x2EC}, - {0x2EE, 0x2EE}, - {0x370, 0x374}, - {0x376, 0x377}, - {0x37A, 0x37D}, - {0x37F, 0x37F}, - {0x386, 0x386}, - {0x388, 0x38A}, - {0x38C, 0x38C}, - {0x38E, 0x3A1}, - 
{0x3A3, 0x3F5}, - {0x3F7, 0x481}, - {0x48A, 0x52F}, - {0x531, 0x556}, - {0x559, 0x559}, - {0x560, 0x588}, - {0x5D0, 0x5EA}, - {0x5EF, 0x5F2}, - {0x620, 0x64A}, - {0x66E, 0x66F}, - {0x671, 0x6D3}, - {0x6D5, 0x6D5}, - {0x6E5, 0x6E6}, - {0x6EE, 0x6EF}, - {0x6FA, 0x6FC}, - {0x6FF, 0x6FF}, - {0x710, 0x710}, - {0x712, 0x72F}, - {0x74D, 0x7A5}, - {0x7B1, 0x7B1}, - {0x7CA, 0x7EA}, - {0x7F4, 0x7F5}, - {0x7FA, 0x7FA}, - {0x800, 0x815}, - {0x81A, 0x81A}, - {0x824, 0x824}, - {0x828, 0x828}, - {0x840, 0x858}, - {0x860, 0x86A}, - {0x870, 0x887}, - {0x889, 0x88F}, - {0x8A0, 0x8C9}, - {0x904, 0x939}, - {0x93D, 0x93D}, - {0x950, 0x950}, - {0x958, 0x961}, - {0x971, 0x980}, - {0x985, 0x98C}, - {0x98F, 0x990}, - {0x993, 0x9A8}, - {0x9AA, 0x9B0}, - {0x9B2, 0x9B2}, - {0x9B6, 0x9B9}, - {0x9BD, 0x9BD}, - {0x9CE, 0x9CE}, - {0x9DC, 0x9DD}, - {0x9DF, 0x9E1}, - {0x9F0, 0x9F1}, - {0x9FC, 0x9FC}, - {0xA05, 0xA0A}, - {0xA0F, 0xA10}, - {0xA13, 0xA28}, - {0xA2A, 0xA30}, - {0xA32, 0xA33}, - {0xA35, 0xA36}, - {0xA38, 0xA39}, - {0xA59, 0xA5C}, - {0xA5E, 0xA5E}, - {0xA72, 0xA74}, - {0xA85, 0xA8D}, - {0xA8F, 0xA91}, - {0xA93, 0xAA8}, - {0xAAA, 0xAB0}, - {0xAB2, 0xAB3}, - {0xAB5, 0xAB9}, - {0xABD, 0xABD}, - {0xAD0, 0xAD0}, - {0xAE0, 0xAE1}, - {0xAF9, 0xAF9}, - {0xB05, 0xB0C}, - {0xB0F, 0xB10}, - {0xB13, 0xB28}, - {0xB2A, 0xB30}, - {0xB32, 0xB33}, - {0xB35, 0xB39}, - {0xB3D, 0xB3D}, - {0xB5C, 0xB5D}, - {0xB5F, 0xB61}, - {0xB71, 0xB71}, - {0xB83, 0xB83}, - {0xB85, 0xB8A}, - {0xB8E, 0xB90}, - {0xB92, 0xB95}, - {0xB99, 0xB9A}, - {0xB9C, 0xB9C}, - {0xB9E, 0xB9F}, - {0xBA3, 0xBA4}, - {0xBA8, 0xBAA}, - {0xBAE, 0xBB9}, - {0xBD0, 0xBD0}, - {0xC05, 0xC0C}, - {0xC0E, 0xC10}, - {0xC12, 0xC28}, - {0xC2A, 0xC39}, - {0xC3D, 0xC3D}, - {0xC58, 0xC5A}, - {0xC5C, 0xC5D}, - {0xC60, 0xC61}, - {0xC80, 0xC80}, - {0xC85, 0xC8C}, - {0xC8E, 0xC90}, - {0xC92, 0xCA8}, - {0xCAA, 0xCB3}, - {0xCB5, 0xCB9}, - {0xCBD, 0xCBD}, - {0xCDC, 0xCDE}, - {0xCE0, 0xCE1}, - {0xCF1, 0xCF2}, - {0xD04, 0xD0C}, - {0xD0E, 0xD10}, - {0xD12, 0xD3A}, - 
{0xD3D, 0xD3D}, - {0xD4E, 0xD4E}, - {0xD54, 0xD56}, - {0xD5F, 0xD61}, - {0xD7A, 0xD7F}, - {0xD85, 0xD96}, - {0xD9A, 0xDB1}, - {0xDB3, 0xDBB}, - {0xDBD, 0xDBD}, - {0xDC0, 0xDC6}, - {0xE01, 0xE30}, - {0xE32, 0xE33}, - {0xE40, 0xE46}, - {0xE81, 0xE82}, - {0xE84, 0xE84}, - {0xE86, 0xE8A}, - {0xE8C, 0xEA3}, - {0xEA5, 0xEA5}, - {0xEA7, 0xEB0}, - {0xEB2, 0xEB3}, - {0xEBD, 0xEBD}, - {0xEC0, 0xEC4}, - {0xEC6, 0xEC6}, - {0xEDC, 0xEDF}, - {0xF00, 0xF00}, - {0xF40, 0xF47}, - {0xF49, 0xF6C}, - {0xF88, 0xF8C}, - {0x1000, 0x102A}, - {0x103F, 0x103F}, - {0x1050, 0x1055}, - {0x105A, 0x105D}, - {0x1061, 0x1061}, - {0x1065, 0x1066}, - {0x106E, 0x1070}, - {0x1075, 0x1081}, - {0x108E, 0x108E}, - {0x10A0, 0x10C5}, - {0x10C7, 0x10C7}, - {0x10CD, 0x10CD}, - {0x10D0, 0x10FA}, - {0x10FC, 0x1248}, - {0x124A, 0x124D}, - {0x1250, 0x1256}, - {0x1258, 0x1258}, - {0x125A, 0x125D}, - {0x1260, 0x1288}, - {0x128A, 0x128D}, - {0x1290, 0x12B0}, - {0x12B2, 0x12B5}, - {0x12B8, 0x12BE}, - {0x12C0, 0x12C0}, - {0x12C2, 0x12C5}, - {0x12C8, 0x12D6}, - {0x12D8, 0x1310}, - {0x1312, 0x1315}, - {0x1318, 0x135A}, - {0x1380, 0x138F}, - {0x13A0, 0x13F5}, - {0x13F8, 0x13FD}, - {0x1401, 0x166C}, - {0x166F, 0x167F}, - {0x1681, 0x169A}, - {0x16A0, 0x16EA}, - {0x16F1, 0x16F8}, - {0x1700, 0x1711}, - {0x171F, 0x1731}, - {0x1740, 0x1751}, - {0x1760, 0x176C}, - {0x176E, 0x1770}, - {0x1780, 0x17B3}, - {0x17D7, 0x17D7}, - {0x17DC, 0x17DC}, - {0x1820, 0x1878}, - {0x1880, 0x1884}, - {0x1887, 0x18A8}, - {0x18AA, 0x18AA}, - {0x18B0, 0x18F5}, - {0x1900, 0x191E}, - {0x1950, 0x196D}, - {0x1970, 0x1974}, - {0x1980, 0x19AB}, - {0x19B0, 0x19C9}, - {0x1A00, 0x1A16}, - {0x1A20, 0x1A54}, - {0x1AA7, 0x1AA7}, - {0x1B05, 0x1B33}, - {0x1B45, 0x1B4C}, - {0x1B83, 0x1BA0}, - {0x1BAE, 0x1BAF}, - {0x1BBA, 0x1BE5}, - {0x1C00, 0x1C23}, - {0x1C4D, 0x1C4F}, - {0x1C5A, 0x1C7D}, - {0x1C80, 0x1C8A}, - {0x1C90, 0x1CBA}, - {0x1CBD, 0x1CBF}, - {0x1CE9, 0x1CEC}, - {0x1CEE, 0x1CF3}, - {0x1CF5, 0x1CF6}, - {0x1CFA, 0x1CFA}, - {0x1D00, 0x1DBF}, - {0x1E00, 
0x1F15}, - {0x1F18, 0x1F1D}, - {0x1F20, 0x1F45}, - {0x1F48, 0x1F4D}, - {0x1F50, 0x1F57}, - {0x1F59, 0x1F59}, - {0x1F5B, 0x1F5B}, - {0x1F5D, 0x1F5D}, - {0x1F5F, 0x1F7D}, - {0x1F80, 0x1FB4}, - {0x1FB6, 0x1FBC}, - {0x1FBE, 0x1FBE}, - {0x1FC2, 0x1FC4}, - {0x1FC6, 0x1FCC}, - {0x1FD0, 0x1FD3}, - {0x1FD6, 0x1FDB}, - {0x1FE0, 0x1FEC}, - {0x1FF2, 0x1FF4}, - {0x1FF6, 0x1FFC}, - {0x2071, 0x2071}, - {0x207F, 0x207F}, - {0x2090, 0x209C}, - {0x2102, 0x2102}, - {0x2107, 0x2107}, - {0x210A, 0x2113}, - {0x2115, 0x2115}, - {0x2119, 0x211D}, - {0x2124, 0x2124}, - {0x2126, 0x2126}, - {0x2128, 0x2128}, - {0x212A, 0x212D}, - {0x212F, 0x2139}, - {0x213C, 0x213F}, - {0x2145, 0x2149}, - {0x214E, 0x214E}, - {0x2183, 0x2184}, - {0x2C00, 0x2CE4}, - {0x2CEB, 0x2CEE}, - {0x2CF2, 0x2CF3}, - {0x2D00, 0x2D25}, - {0x2D27, 0x2D27}, - {0x2D2D, 0x2D2D}, - {0x2D30, 0x2D67}, - {0x2D6F, 0x2D6F}, - {0x2D80, 0x2D96}, - {0x2DA0, 0x2DA6}, - {0x2DA8, 0x2DAE}, - {0x2DB0, 0x2DB6}, - {0x2DB8, 0x2DBE}, - {0x2DC0, 0x2DC6}, - {0x2DC8, 0x2DCE}, - {0x2DD0, 0x2DD6}, - {0x2DD8, 0x2DDE}, - {0x2E2F, 0x2E2F}, - {0x3005, 0x3006}, - {0x3031, 0x3035}, - {0x303B, 0x303C}, - {0x3041, 0x3096}, - {0x309D, 0x309F}, - {0x30A1, 0x30FA}, - {0x30FC, 0x30FF}, - {0x3105, 0x312F}, - {0x3131, 0x318E}, - {0x31A0, 0x31BF}, - {0x31F0, 0x31FF}, - {0x3400, 0x4DBF}, - {0x4E00, 0xA48C}, - {0xA4D0, 0xA4FD}, - {0xA500, 0xA60C}, - {0xA610, 0xA61F}, - {0xA62A, 0xA62B}, - {0xA640, 0xA66E}, - {0xA67F, 0xA69D}, - {0xA6A0, 0xA6E5}, - {0xA717, 0xA71F}, - {0xA722, 0xA788}, - {0xA78B, 0xA7DC}, - {0xA7F1, 0xA801}, - {0xA803, 0xA805}, - {0xA807, 0xA80A}, - {0xA80C, 0xA822}, - {0xA840, 0xA873}, - {0xA882, 0xA8B3}, - {0xA8F2, 0xA8F7}, - {0xA8FB, 0xA8FB}, - {0xA8FD, 0xA8FE}, - {0xA90A, 0xA925}, - {0xA930, 0xA946}, - {0xA960, 0xA97C}, - {0xA984, 0xA9B2}, - {0xA9CF, 0xA9CF}, - {0xA9E0, 0xA9E4}, - {0xA9E6, 0xA9EF}, - {0xA9FA, 0xA9FE}, - {0xAA00, 0xAA28}, - {0xAA40, 0xAA42}, - {0xAA44, 0xAA4B}, - {0xAA60, 0xAA76}, - {0xAA7A, 0xAA7A}, - {0xAA7E, 0xAAAF}, - {0xAAB1, 
0xAAB1}, - {0xAAB5, 0xAAB6}, - {0xAAB9, 0xAABD}, - {0xAAC0, 0xAAC0}, - {0xAAC2, 0xAAC2}, - {0xAADB, 0xAADD}, - {0xAAE0, 0xAAEA}, - {0xAAF2, 0xAAF4}, - {0xAB01, 0xAB06}, - {0xAB09, 0xAB0E}, - {0xAB11, 0xAB16}, - {0xAB20, 0xAB26}, - {0xAB28, 0xAB2E}, - {0xAB30, 0xAB5A}, - {0xAB5C, 0xAB69}, - {0xAB70, 0xABE2}, - {0xAC00, 0xD7A3}, - {0xD7B0, 0xD7C6}, - {0xD7CB, 0xD7FB}, - {0xF900, 0xFA6D}, - {0xFA70, 0xFAD9}, - {0xFB00, 0xFB06}, - {0xFB13, 0xFB17}, - {0xFB1D, 0xFB1D}, - {0xFB1F, 0xFB28}, - {0xFB2A, 0xFB36}, - {0xFB38, 0xFB3C}, - {0xFB3E, 0xFB3E}, - {0xFB40, 0xFB41}, - {0xFB43, 0xFB44}, - {0xFB46, 0xFBB1}, - {0xFBD3, 0xFD3D}, - {0xFD50, 0xFD8F}, - {0xFD92, 0xFDC7}, - {0xFDF0, 0xFDFB}, - {0xFE70, 0xFE74}, - {0xFE76, 0xFEFC}, - {0xFF21, 0xFF3A}, - {0xFF41, 0xFF5A}, - {0xFF66, 0xFFBE}, - {0xFFC2, 0xFFC7}, - {0xFFCA, 0xFFCF}, - {0xFFD2, 0xFFD7}, - {0xFFDA, 0xFFDC}, - {0x10000, 0x1000B}, - {0x1000D, 0x10026}, - {0x10028, 0x1003A}, - {0x1003C, 0x1003D}, - {0x1003F, 0x1004D}, - {0x10050, 0x1005D}, - {0x10080, 0x100FA}, - {0x10280, 0x1029C}, - {0x102A0, 0x102D0}, - {0x10300, 0x1031F}, - {0x1032D, 0x10340}, - {0x10342, 0x10349}, - {0x10350, 0x10375}, - {0x10380, 0x1039D}, - {0x103A0, 0x103C3}, - {0x103C8, 0x103CF}, - {0x10400, 0x1049D}, - {0x104B0, 0x104D3}, - {0x104D8, 0x104FB}, - {0x10500, 0x10527}, - {0x10530, 0x10563}, - {0x10570, 0x1057A}, - {0x1057C, 0x1058A}, - {0x1058C, 0x10592}, - {0x10594, 0x10595}, - {0x10597, 0x105A1}, - {0x105A3, 0x105B1}, - {0x105B3, 0x105B9}, - {0x105BB, 0x105BC}, - {0x105C0, 0x105F3}, - {0x10600, 0x10736}, - {0x10740, 0x10755}, - {0x10760, 0x10767}, - {0x10780, 0x10785}, - {0x10787, 0x107B0}, - {0x107B2, 0x107BA}, - {0x10800, 0x10805}, - {0x10808, 0x10808}, - {0x1080A, 0x10835}, - {0x10837, 0x10838}, - {0x1083C, 0x1083C}, - {0x1083F, 0x10855}, - {0x10860, 0x10876}, - {0x10880, 0x1089E}, - {0x108E0, 0x108F2}, - {0x108F4, 0x108F5}, - {0x10900, 0x10915}, - {0x10920, 0x10939}, - {0x10940, 0x10959}, - {0x10980, 0x109B7}, - {0x109BE, 0x109BF}, - 
{0x10A00, 0x10A00}, - {0x10A10, 0x10A13}, - {0x10A15, 0x10A17}, - {0x10A19, 0x10A35}, - {0x10A60, 0x10A7C}, - {0x10A80, 0x10A9C}, - {0x10AC0, 0x10AC7}, - {0x10AC9, 0x10AE4}, - {0x10B00, 0x10B35}, - {0x10B40, 0x10B55}, - {0x10B60, 0x10B72}, - {0x10B80, 0x10B91}, - {0x10C00, 0x10C48}, - {0x10C80, 0x10CB2}, - {0x10CC0, 0x10CF2}, - {0x10D00, 0x10D23}, - {0x10D4A, 0x10D65}, - {0x10D6F, 0x10D85}, - {0x10E80, 0x10EA9}, - {0x10EB0, 0x10EB1}, - {0x10EC2, 0x10EC7}, - {0x10F00, 0x10F1C}, - {0x10F27, 0x10F27}, - {0x10F30, 0x10F45}, - {0x10F70, 0x10F81}, - {0x10FB0, 0x10FC4}, - {0x10FE0, 0x10FF6}, - {0x11003, 0x11037}, - {0x11071, 0x11072}, - {0x11075, 0x11075}, - {0x11083, 0x110AF}, - {0x110D0, 0x110E8}, - {0x11103, 0x11126}, - {0x11144, 0x11144}, - {0x11147, 0x11147}, - {0x11150, 0x11172}, - {0x11176, 0x11176}, - {0x11183, 0x111B2}, - {0x111C1, 0x111C4}, - {0x111DA, 0x111DA}, - {0x111DC, 0x111DC}, - {0x11200, 0x11211}, - {0x11213, 0x1122B}, - {0x1123F, 0x11240}, - {0x11280, 0x11286}, - {0x11288, 0x11288}, - {0x1128A, 0x1128D}, - {0x1128F, 0x1129D}, - {0x1129F, 0x112A8}, - {0x112B0, 0x112DE}, - {0x11305, 0x1130C}, - {0x1130F, 0x11310}, - {0x11313, 0x11328}, - {0x1132A, 0x11330}, - {0x11332, 0x11333}, - {0x11335, 0x11339}, - {0x1133D, 0x1133D}, - {0x11350, 0x11350}, - {0x1135D, 0x11361}, - {0x11380, 0x11389}, - {0x1138B, 0x1138B}, - {0x1138E, 0x1138E}, - {0x11390, 0x113B5}, - {0x113B7, 0x113B7}, - {0x113D1, 0x113D1}, - {0x113D3, 0x113D3}, - {0x11400, 0x11434}, - {0x11447, 0x1144A}, - {0x1145F, 0x11461}, - {0x11480, 0x114AF}, - {0x114C4, 0x114C5}, - {0x114C7, 0x114C7}, - {0x11580, 0x115AE}, - {0x115D8, 0x115DB}, - {0x11600, 0x1162F}, - {0x11644, 0x11644}, - {0x11680, 0x116AA}, - {0x116B8, 0x116B8}, - {0x11700, 0x1171A}, - {0x11740, 0x11746}, - {0x11800, 0x1182B}, - {0x118A0, 0x118DF}, - {0x118FF, 0x11906}, - {0x11909, 0x11909}, - {0x1190C, 0x11913}, - {0x11915, 0x11916}, - {0x11918, 0x1192F}, - {0x1193F, 0x1193F}, - {0x11941, 0x11941}, - {0x119A0, 0x119A7}, - {0x119AA, 0x119D0}, 
- {0x119E1, 0x119E1}, - {0x119E3, 0x119E3}, - {0x11A00, 0x11A00}, - {0x11A0B, 0x11A32}, - {0x11A3A, 0x11A3A}, - {0x11A50, 0x11A50}, - {0x11A5C, 0x11A89}, - {0x11A9D, 0x11A9D}, - {0x11AB0, 0x11AF8}, - {0x11BC0, 0x11BE0}, - {0x11C00, 0x11C08}, - {0x11C0A, 0x11C2E}, - {0x11C40, 0x11C40}, - {0x11C72, 0x11C8F}, - {0x11D00, 0x11D06}, - {0x11D08, 0x11D09}, - {0x11D0B, 0x11D30}, - {0x11D46, 0x11D46}, - {0x11D60, 0x11D65}, - {0x11D67, 0x11D68}, - {0x11D6A, 0x11D89}, - {0x11D98, 0x11D98}, - {0x11DB0, 0x11DDB}, - {0x11EE0, 0x11EF2}, - {0x11F02, 0x11F02}, - {0x11F04, 0x11F10}, - {0x11F12, 0x11F33}, - {0x11FB0, 0x11FB0}, - {0x12000, 0x12399}, - {0x12480, 0x12543}, - {0x12F90, 0x12FF0}, - {0x13000, 0x1342F}, - {0x13441, 0x13446}, - {0x13460, 0x143FA}, - {0x14400, 0x14646}, - {0x16100, 0x1611D}, - {0x16800, 0x16A38}, - {0x16A40, 0x16A5E}, - {0x16A70, 0x16ABE}, - {0x16AD0, 0x16AED}, - {0x16B00, 0x16B2F}, - {0x16B40, 0x16B43}, - {0x16B63, 0x16B77}, - {0x16B7D, 0x16B8F}, - {0x16D40, 0x16D6C}, - {0x16E40, 0x16E7F}, - {0x16EA0, 0x16EB8}, - {0x16EBB, 0x16ED3}, - {0x16F00, 0x16F4A}, - {0x16F50, 0x16F50}, - {0x16F93, 0x16F9F}, - {0x16FE0, 0x16FE1}, - {0x16FE3, 0x16FE3}, - {0x16FF2, 0x16FF3}, - {0x17000, 0x18CD5}, - {0x18CFF, 0x18D1E}, - {0x18D80, 0x18DF2}, - {0x1AFF0, 0x1AFF3}, - {0x1AFF5, 0x1AFFB}, - {0x1AFFD, 0x1AFFE}, - {0x1B000, 0x1B122}, - {0x1B132, 0x1B132}, - {0x1B150, 0x1B152}, - {0x1B155, 0x1B155}, - {0x1B164, 0x1B167}, - {0x1B170, 0x1B2FB}, - {0x1BC00, 0x1BC6A}, - {0x1BC70, 0x1BC7C}, - {0x1BC80, 0x1BC88}, - {0x1BC90, 0x1BC99}, - {0x1D400, 0x1D454}, - {0x1D456, 0x1D49C}, - {0x1D49E, 0x1D49F}, - {0x1D4A2, 0x1D4A2}, - {0x1D4A5, 0x1D4A6}, - {0x1D4A9, 0x1D4AC}, - {0x1D4AE, 0x1D4B9}, - {0x1D4BB, 0x1D4BB}, - {0x1D4BD, 0x1D4C3}, - {0x1D4C5, 0x1D505}, - {0x1D507, 0x1D50A}, - {0x1D50D, 0x1D514}, - {0x1D516, 0x1D51C}, - {0x1D51E, 0x1D539}, - {0x1D53B, 0x1D53E}, - {0x1D540, 0x1D544}, - {0x1D546, 0x1D546}, - {0x1D54A, 0x1D550}, - {0x1D552, 0x1D6A5}, - {0x1D6A8, 0x1D6C0}, - {0x1D6C2, 
0x1D6DA}, - {0x1D6DC, 0x1D6FA}, - {0x1D6FC, 0x1D714}, - {0x1D716, 0x1D734}, - {0x1D736, 0x1D74E}, - {0x1D750, 0x1D76E}, - {0x1D770, 0x1D788}, - {0x1D78A, 0x1D7A8}, - {0x1D7AA, 0x1D7C2}, - {0x1D7C4, 0x1D7CB}, - {0x1DF00, 0x1DF1E}, - {0x1DF25, 0x1DF2A}, - {0x1E030, 0x1E06D}, - {0x1E100, 0x1E12C}, - {0x1E137, 0x1E13D}, - {0x1E14E, 0x1E14E}, - {0x1E290, 0x1E2AD}, - {0x1E2C0, 0x1E2EB}, - {0x1E4D0, 0x1E4EB}, - {0x1E5D0, 0x1E5ED}, - {0x1E5F0, 0x1E5F0}, - {0x1E6C0, 0x1E6DE}, - {0x1E6E0, 0x1E6E2}, - {0x1E6E4, 0x1E6E5}, - {0x1E6E7, 0x1E6ED}, - {0x1E6F0, 0x1E6F4}, - {0x1E6FE, 0x1E6FF}, - {0x1E7E0, 0x1E7E6}, - {0x1E7E8, 0x1E7EB}, - {0x1E7ED, 0x1E7EE}, - {0x1E7F0, 0x1E7FE}, - {0x1E800, 0x1E8C4}, - {0x1E900, 0x1E943}, - {0x1E94B, 0x1E94B}, - {0x1EE00, 0x1EE03}, - {0x1EE05, 0x1EE1F}, - {0x1EE21, 0x1EE22}, - {0x1EE24, 0x1EE24}, - {0x1EE27, 0x1EE27}, - {0x1EE29, 0x1EE32}, - {0x1EE34, 0x1EE37}, - {0x1EE39, 0x1EE39}, - {0x1EE3B, 0x1EE3B}, - {0x1EE42, 0x1EE42}, - {0x1EE47, 0x1EE47}, - {0x1EE49, 0x1EE49}, - {0x1EE4B, 0x1EE4B}, - {0x1EE4D, 0x1EE4F}, - {0x1EE51, 0x1EE52}, - {0x1EE54, 0x1EE54}, - {0x1EE57, 0x1EE57}, - {0x1EE59, 0x1EE59}, - {0x1EE5B, 0x1EE5B}, - {0x1EE5D, 0x1EE5D}, - {0x1EE5F, 0x1EE5F}, - {0x1EE61, 0x1EE62}, - {0x1EE64, 0x1EE64}, - {0x1EE67, 0x1EE6A}, - {0x1EE6C, 0x1EE72}, - {0x1EE74, 0x1EE77}, - {0x1EE79, 0x1EE7C}, - {0x1EE7E, 0x1EE7E}, - {0x1EE80, 0x1EE89}, - {0x1EE8B, 0x1EE9B}, - {0x1EEA1, 0x1EEA3}, - {0x1EEA5, 0x1EEA9}, - {0x1EEAB, 0x1EEBB}, - {0x20000, 0x2A6DF}, - {0x2A700, 0x2B81D}, - {0x2B820, 0x2CEAD}, - {0x2CEB0, 0x2EBE0}, - {0x2EBF0, 0x2EE5D}, - {0x2F800, 0x2FA1D}, - {0x30000, 0x3134A}, - {0x31350, 0x33479}, - }; - - for (const auto& r : ranges) { - if (ch >= r.start && ch <= r.end) - return true; - } - return false; -} - -bool is_space(char32_t cp) { - switch (cp) { - case 0x0009: // TAB \t - case 0x000A: // LF \n - case 0x000B: // VT - case 0x000C: // FF - case 0x000D: // CR \r - case 0x0020: // Space - case 0x00A0: // No-Break Space - case 0x1680: // Ogham 
Space Mark - case 0x2000: // En Quad - case 0x2001: // Em Quad - case 0x2002: // En Space - case 0x2003: // Em Space - case 0x2004: // Three-Per-Em Space - case 0x2005: // Four-Per-Em Space - case 0x2006: // Six-Per-Em Space - case 0x2007: // Figure Space - case 0x2008: // Punctuation Space - case 0x2009: // Thin Space - case 0x200A: // Hair Space - case 0x202F: // Narrow No-Break Space - case 0x205F: // Medium Mathematical Space - case 0x3000: // Ideographic Space - return true; - default: - return false; - } -} - -std::string str_to_lower(const std::string& input) { - std::string result = input; - std::transform(result.begin(), result.end(), result.begin(), - [](unsigned char c) { return std::tolower(c); }); - return result; -} - -// UTF-8 -> Unicode code points -std::vector utf8_to_codepoints(const std::string& str) { - std::vector codepoints; - size_t i = 0; - while (i < str.size()) { - unsigned char c = str[i]; - char32_t cp = 0; - size_t extra_bytes = 0; - - if ((c & 0x80) == 0) - cp = c; - else if ((c & 0xE0) == 0xC0) { - cp = c & 0x1F; - extra_bytes = 1; - } else if ((c & 0xF0) == 0xE0) { - cp = c & 0x0F; - extra_bytes = 2; - } else if ((c & 0xF8) == 0xF0) { - cp = c & 0x07; - extra_bytes = 3; - } else { - ++i; - continue; - } // Invalid UTF-8 - - if (i + extra_bytes >= str.size()) - break; - - for (size_t j = 1; j <= extra_bytes; ++j) - cp = (cp << 6) | (str[i + j] & 0x3F); - - codepoints.push_back(cp); - i += 1 + extra_bytes; - } - return codepoints; -} - -// Unicode code point -> UTF-8 -std::string codepoint_to_utf8(char32_t cp) { - std::string out; - if (cp <= 0x7F) - out.push_back(static_cast(cp)); - else if (cp <= 0x7FF) { - out.push_back(static_cast(0xC0 | (cp >> 6))); - out.push_back(static_cast(0x80 | (cp & 0x3F))); - } else if (cp <= 0xFFFF) { - out.push_back(static_cast(0xE0 | (cp >> 12))); - out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); - out.push_back(static_cast(0x80 | (cp & 0x3F))); - } else { - out.push_back(static_cast(0xF0 | (cp 
>> 18))); - out.push_back(static_cast(0x80 | ((cp >> 12) & 0x3F))); - out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); - out.push_back(static_cast(0x80 | (cp & 0x3F))); - } - return out; -} - -bool starts_with(const std::vector& text, - const std::vector& prefix, - std::size_t index) { - if (index > text.size()) { - return false; - } - if (prefix.size() > text.size() - index) { - return false; - } - return std::equal(prefix.begin(), prefix.end(), text.begin() + index); -} - -// mistral: [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+ -// qwen2: (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+ -std::vector token_split(const std::string& text) { - std::vector tokens; - auto cps = utf8_to_codepoints(text); - size_t i = 0; - - while (i < cps.size()) { - char32_t cp = cps[i]; - - // `(?i:'s|'t|'re|'ve|'m|'ll|'d)` - if (cp == U'\'' && i + 1 < cps.size()) { - std::string next = str_to_lower(codepoint_to_utf8(cps[i + 1])); - if (next == "s" || next == "t" || next == "m") { - tokens.push_back("'" + next); - i += 2; - continue; - } - if (i + 2 < cps.size()) { - next += str_to_lower(codepoint_to_utf8(cps[i + 2])); - if (next == "re" || next == "ve" || next == "ll" || next == "d") { - tokens.push_back("'" + next); - i += 3; - continue; - } - } - } - - // `\p{N}` - if (is_number(cp)) { - tokens.push_back(codepoint_to_utf8(cp)); - ++i; - continue; - } - - // `[^\r\n\p{L}\p{N}]?\p{L}+` - { - // `[^\r\n\p{L}\p{N}]\p{L}+` - if (!is_letter(cp) && cp != U'\r' && cp != U'\n' && i + 1 < cps.size() && is_letter(cps[i + 1])) { - std::string token = codepoint_to_utf8(cp); - ++i; - - while (i < cps.size() && is_letter(cps[i])) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - tokens.push_back(token); - continue; - } - - // `\p{L}+` - if (is_letter(cp)) { - 
std::string token = codepoint_to_utf8(cp); - ++i; - while (i < cps.size() && is_letter(cps[i])) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - tokens.push_back(token); - continue; - } - } - - // ` ?[^\s\p{L}\p{N}]+[\r\n]*` - { - // ` [^\s\p{L}\p{N}]+[\r\n]*` - if (cp == U' ' && i + 1 < cps.size() && !isspace(cps[i + 1]) && !is_letter(cps[i + 1]) && !is_number(cps[i + 1])) { - std::string token = codepoint_to_utf8(cp); - token += codepoint_to_utf8(cps[i + 1]); - i += 2; - - while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - - while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - - tokens.push_back(token); - continue; - } - - // `[^\s\p{L}\p{N}]+[\r\n]*` - std::string token; - if (!is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { - std::string token = codepoint_to_utf8(cp); - ++i; - - while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - - while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - - tokens.push_back(token); - continue; - } - } - - // `\s*[\r\n]+|\s+(?!\S)|\s+` - if (is_space(cp)) { - std::string token; - bool saw_new_line = false; - - while (i < cps.size() && is_space(cps[i])) { - token += codepoint_to_utf8(cps[i]); - - if (cps[i] == U'\r' || cps[i] == U'\n') { - saw_new_line = true; - } else { - if (saw_new_line) { - break; - } - } - - ++i; - } - - tokens.push_back(token); - continue; - } - - // skip - ++i; - } - - return tokens; -} - -std::vector split_with_special_tokens( - const std::string& text, - const std::vector& special_tokens) { - std::vector result; - size_t pos = 0; - size_t text_len = text.size(); - - while (pos < text_len) { - size_t next_pos = text_len; - std::string matched_token; - - for (const auto& token : special_tokens) 
{ - size_t token_pos = text.find(token, pos); - if (token_pos != std::string::npos && token_pos < next_pos) { - next_pos = token_pos; - matched_token = token; - } - } - - if (next_pos > pos) { - result.push_back(text.substr(pos, next_pos - pos)); - } - - if (!matched_token.empty()) { - result.push_back(matched_token); - pos = next_pos + matched_token.size(); - } else { - break; - } - } - - return result; -} - -// int main() { -// std::string text = "I'm testing C++ token_split function. 你好,世界! 123"; -// auto tokens = token_split(text); - -// for (const auto& t : tokens) { -// std::cout << "[" << t << "] "; -// } -// std::cout << "\n"; -// return 0; -// } +#include +#include +#include +#include + +#include "tokenize_util.h" + +bool is_number(char32_t ch) { + return (ch >= U'0' && ch <= U'9'); +} + +bool is_letter(char32_t ch) { + static const struct { char32_t start, end; } ranges[] = { + {0x41, 0x5A}, + {0x61, 0x7A}, + {0xAA, 0xAA}, + {0xB5, 0xB5}, + {0xBA, 0xBA}, + {0xC0, 0xD6}, + {0xD8, 0xF6}, + {0xF8, 0x2C1}, + {0x2C6, 0x2D1}, + {0x2E0, 0x2E4}, + {0x2EC, 0x2EC}, + {0x2EE, 0x2EE}, + {0x370, 0x374}, + {0x376, 0x377}, + {0x37A, 0x37D}, + {0x37F, 0x37F}, + {0x386, 0x386}, + {0x388, 0x38A}, + {0x38C, 0x38C}, + {0x38E, 0x3A1}, + {0x3A3, 0x3F5}, + {0x3F7, 0x481}, + {0x48A, 0x52F}, + {0x531, 0x556}, + {0x559, 0x559}, + {0x560, 0x588}, + {0x5D0, 0x5EA}, + {0x5EF, 0x5F2}, + {0x620, 0x64A}, + {0x66E, 0x66F}, + {0x671, 0x6D3}, + {0x6D5, 0x6D5}, + {0x6E5, 0x6E6}, + {0x6EE, 0x6EF}, + {0x6FA, 0x6FC}, + {0x6FF, 0x6FF}, + {0x710, 0x710}, + {0x712, 0x72F}, + {0x74D, 0x7A5}, + {0x7B1, 0x7B1}, + {0x7CA, 0x7EA}, + {0x7F4, 0x7F5}, + {0x7FA, 0x7FA}, + {0x800, 0x815}, + {0x81A, 0x81A}, + {0x824, 0x824}, + {0x828, 0x828}, + {0x840, 0x858}, + {0x860, 0x86A}, + {0x870, 0x887}, + {0x889, 0x88F}, + {0x8A0, 0x8C9}, + {0x904, 0x939}, + {0x93D, 0x93D}, + {0x950, 0x950}, + {0x958, 0x961}, + {0x971, 0x980}, + {0x985, 0x98C}, + {0x98F, 0x990}, + {0x993, 0x9A8}, + {0x9AA, 0x9B0}, + {0x9B2, 0x9B2}, 
+ {0x9B6, 0x9B9}, + {0x9BD, 0x9BD}, + {0x9CE, 0x9CE}, + {0x9DC, 0x9DD}, + {0x9DF, 0x9E1}, + {0x9F0, 0x9F1}, + {0x9FC, 0x9FC}, + {0xA05, 0xA0A}, + {0xA0F, 0xA10}, + {0xA13, 0xA28}, + {0xA2A, 0xA30}, + {0xA32, 0xA33}, + {0xA35, 0xA36}, + {0xA38, 0xA39}, + {0xA59, 0xA5C}, + {0xA5E, 0xA5E}, + {0xA72, 0xA74}, + {0xA85, 0xA8D}, + {0xA8F, 0xA91}, + {0xA93, 0xAA8}, + {0xAAA, 0xAB0}, + {0xAB2, 0xAB3}, + {0xAB5, 0xAB9}, + {0xABD, 0xABD}, + {0xAD0, 0xAD0}, + {0xAE0, 0xAE1}, + {0xAF9, 0xAF9}, + {0xB05, 0xB0C}, + {0xB0F, 0xB10}, + {0xB13, 0xB28}, + {0xB2A, 0xB30}, + {0xB32, 0xB33}, + {0xB35, 0xB39}, + {0xB3D, 0xB3D}, + {0xB5C, 0xB5D}, + {0xB5F, 0xB61}, + {0xB71, 0xB71}, + {0xB83, 0xB83}, + {0xB85, 0xB8A}, + {0xB8E, 0xB90}, + {0xB92, 0xB95}, + {0xB99, 0xB9A}, + {0xB9C, 0xB9C}, + {0xB9E, 0xB9F}, + {0xBA3, 0xBA4}, + {0xBA8, 0xBAA}, + {0xBAE, 0xBB9}, + {0xBD0, 0xBD0}, + {0xC05, 0xC0C}, + {0xC0E, 0xC10}, + {0xC12, 0xC28}, + {0xC2A, 0xC39}, + {0xC3D, 0xC3D}, + {0xC58, 0xC5A}, + {0xC5C, 0xC5D}, + {0xC60, 0xC61}, + {0xC80, 0xC80}, + {0xC85, 0xC8C}, + {0xC8E, 0xC90}, + {0xC92, 0xCA8}, + {0xCAA, 0xCB3}, + {0xCB5, 0xCB9}, + {0xCBD, 0xCBD}, + {0xCDC, 0xCDE}, + {0xCE0, 0xCE1}, + {0xCF1, 0xCF2}, + {0xD04, 0xD0C}, + {0xD0E, 0xD10}, + {0xD12, 0xD3A}, + {0xD3D, 0xD3D}, + {0xD4E, 0xD4E}, + {0xD54, 0xD56}, + {0xD5F, 0xD61}, + {0xD7A, 0xD7F}, + {0xD85, 0xD96}, + {0xD9A, 0xDB1}, + {0xDB3, 0xDBB}, + {0xDBD, 0xDBD}, + {0xDC0, 0xDC6}, + {0xE01, 0xE30}, + {0xE32, 0xE33}, + {0xE40, 0xE46}, + {0xE81, 0xE82}, + {0xE84, 0xE84}, + {0xE86, 0xE8A}, + {0xE8C, 0xEA3}, + {0xEA5, 0xEA5}, + {0xEA7, 0xEB0}, + {0xEB2, 0xEB3}, + {0xEBD, 0xEBD}, + {0xEC0, 0xEC4}, + {0xEC6, 0xEC6}, + {0xEDC, 0xEDF}, + {0xF00, 0xF00}, + {0xF40, 0xF47}, + {0xF49, 0xF6C}, + {0xF88, 0xF8C}, + {0x1000, 0x102A}, + {0x103F, 0x103F}, + {0x1050, 0x1055}, + {0x105A, 0x105D}, + {0x1061, 0x1061}, + {0x1065, 0x1066}, + {0x106E, 0x1070}, + {0x1075, 0x1081}, + {0x108E, 0x108E}, + {0x10A0, 0x10C5}, + {0x10C7, 0x10C7}, + {0x10CD, 0x10CD}, + {0x10D0, 
0x10FA}, + {0x10FC, 0x1248}, + {0x124A, 0x124D}, + {0x1250, 0x1256}, + {0x1258, 0x1258}, + {0x125A, 0x125D}, + {0x1260, 0x1288}, + {0x128A, 0x128D}, + {0x1290, 0x12B0}, + {0x12B2, 0x12B5}, + {0x12B8, 0x12BE}, + {0x12C0, 0x12C0}, + {0x12C2, 0x12C5}, + {0x12C8, 0x12D6}, + {0x12D8, 0x1310}, + {0x1312, 0x1315}, + {0x1318, 0x135A}, + {0x1380, 0x138F}, + {0x13A0, 0x13F5}, + {0x13F8, 0x13FD}, + {0x1401, 0x166C}, + {0x166F, 0x167F}, + {0x1681, 0x169A}, + {0x16A0, 0x16EA}, + {0x16F1, 0x16F8}, + {0x1700, 0x1711}, + {0x171F, 0x1731}, + {0x1740, 0x1751}, + {0x1760, 0x176C}, + {0x176E, 0x1770}, + {0x1780, 0x17B3}, + {0x17D7, 0x17D7}, + {0x17DC, 0x17DC}, + {0x1820, 0x1878}, + {0x1880, 0x1884}, + {0x1887, 0x18A8}, + {0x18AA, 0x18AA}, + {0x18B0, 0x18F5}, + {0x1900, 0x191E}, + {0x1950, 0x196D}, + {0x1970, 0x1974}, + {0x1980, 0x19AB}, + {0x19B0, 0x19C9}, + {0x1A00, 0x1A16}, + {0x1A20, 0x1A54}, + {0x1AA7, 0x1AA7}, + {0x1B05, 0x1B33}, + {0x1B45, 0x1B4C}, + {0x1B83, 0x1BA0}, + {0x1BAE, 0x1BAF}, + {0x1BBA, 0x1BE5}, + {0x1C00, 0x1C23}, + {0x1C4D, 0x1C4F}, + {0x1C5A, 0x1C7D}, + {0x1C80, 0x1C8A}, + {0x1C90, 0x1CBA}, + {0x1CBD, 0x1CBF}, + {0x1CE9, 0x1CEC}, + {0x1CEE, 0x1CF3}, + {0x1CF5, 0x1CF6}, + {0x1CFA, 0x1CFA}, + {0x1D00, 0x1DBF}, + {0x1E00, 0x1F15}, + {0x1F18, 0x1F1D}, + {0x1F20, 0x1F45}, + {0x1F48, 0x1F4D}, + {0x1F50, 0x1F57}, + {0x1F59, 0x1F59}, + {0x1F5B, 0x1F5B}, + {0x1F5D, 0x1F5D}, + {0x1F5F, 0x1F7D}, + {0x1F80, 0x1FB4}, + {0x1FB6, 0x1FBC}, + {0x1FBE, 0x1FBE}, + {0x1FC2, 0x1FC4}, + {0x1FC6, 0x1FCC}, + {0x1FD0, 0x1FD3}, + {0x1FD6, 0x1FDB}, + {0x1FE0, 0x1FEC}, + {0x1FF2, 0x1FF4}, + {0x1FF6, 0x1FFC}, + {0x2071, 0x2071}, + {0x207F, 0x207F}, + {0x2090, 0x209C}, + {0x2102, 0x2102}, + {0x2107, 0x2107}, + {0x210A, 0x2113}, + {0x2115, 0x2115}, + {0x2119, 0x211D}, + {0x2124, 0x2124}, + {0x2126, 0x2126}, + {0x2128, 0x2128}, + {0x212A, 0x212D}, + {0x212F, 0x2139}, + {0x213C, 0x213F}, + {0x2145, 0x2149}, + {0x214E, 0x214E}, + {0x2183, 0x2184}, + {0x2C00, 0x2CE4}, + {0x2CEB, 0x2CEE}, + {0x2CF2, 
0x2CF3}, + {0x2D00, 0x2D25}, + {0x2D27, 0x2D27}, + {0x2D2D, 0x2D2D}, + {0x2D30, 0x2D67}, + {0x2D6F, 0x2D6F}, + {0x2D80, 0x2D96}, + {0x2DA0, 0x2DA6}, + {0x2DA8, 0x2DAE}, + {0x2DB0, 0x2DB6}, + {0x2DB8, 0x2DBE}, + {0x2DC0, 0x2DC6}, + {0x2DC8, 0x2DCE}, + {0x2DD0, 0x2DD6}, + {0x2DD8, 0x2DDE}, + {0x2E2F, 0x2E2F}, + {0x3005, 0x3006}, + {0x3031, 0x3035}, + {0x303B, 0x303C}, + {0x3041, 0x3096}, + {0x309D, 0x309F}, + {0x30A1, 0x30FA}, + {0x30FC, 0x30FF}, + {0x3105, 0x312F}, + {0x3131, 0x318E}, + {0x31A0, 0x31BF}, + {0x31F0, 0x31FF}, + {0x3400, 0x4DBF}, + {0x4E00, 0xA48C}, + {0xA4D0, 0xA4FD}, + {0xA500, 0xA60C}, + {0xA610, 0xA61F}, + {0xA62A, 0xA62B}, + {0xA640, 0xA66E}, + {0xA67F, 0xA69D}, + {0xA6A0, 0xA6E5}, + {0xA717, 0xA71F}, + {0xA722, 0xA788}, + {0xA78B, 0xA7DC}, + {0xA7F1, 0xA801}, + {0xA803, 0xA805}, + {0xA807, 0xA80A}, + {0xA80C, 0xA822}, + {0xA840, 0xA873}, + {0xA882, 0xA8B3}, + {0xA8F2, 0xA8F7}, + {0xA8FB, 0xA8FB}, + {0xA8FD, 0xA8FE}, + {0xA90A, 0xA925}, + {0xA930, 0xA946}, + {0xA960, 0xA97C}, + {0xA984, 0xA9B2}, + {0xA9CF, 0xA9CF}, + {0xA9E0, 0xA9E4}, + {0xA9E6, 0xA9EF}, + {0xA9FA, 0xA9FE}, + {0xAA00, 0xAA28}, + {0xAA40, 0xAA42}, + {0xAA44, 0xAA4B}, + {0xAA60, 0xAA76}, + {0xAA7A, 0xAA7A}, + {0xAA7E, 0xAAAF}, + {0xAAB1, 0xAAB1}, + {0xAAB5, 0xAAB6}, + {0xAAB9, 0xAABD}, + {0xAAC0, 0xAAC0}, + {0xAAC2, 0xAAC2}, + {0xAADB, 0xAADD}, + {0xAAE0, 0xAAEA}, + {0xAAF2, 0xAAF4}, + {0xAB01, 0xAB06}, + {0xAB09, 0xAB0E}, + {0xAB11, 0xAB16}, + {0xAB20, 0xAB26}, + {0xAB28, 0xAB2E}, + {0xAB30, 0xAB5A}, + {0xAB5C, 0xAB69}, + {0xAB70, 0xABE2}, + {0xAC00, 0xD7A3}, + {0xD7B0, 0xD7C6}, + {0xD7CB, 0xD7FB}, + {0xF900, 0xFA6D}, + {0xFA70, 0xFAD9}, + {0xFB00, 0xFB06}, + {0xFB13, 0xFB17}, + {0xFB1D, 0xFB1D}, + {0xFB1F, 0xFB28}, + {0xFB2A, 0xFB36}, + {0xFB38, 0xFB3C}, + {0xFB3E, 0xFB3E}, + {0xFB40, 0xFB41}, + {0xFB43, 0xFB44}, + {0xFB46, 0xFBB1}, + {0xFBD3, 0xFD3D}, + {0xFD50, 0xFD8F}, + {0xFD92, 0xFDC7}, + {0xFDF0, 0xFDFB}, + {0xFE70, 0xFE74}, + {0xFE76, 0xFEFC}, + {0xFF21, 0xFF3A}, + {0xFF41, 
0xFF5A}, + {0xFF66, 0xFFBE}, + {0xFFC2, 0xFFC7}, + {0xFFCA, 0xFFCF}, + {0xFFD2, 0xFFD7}, + {0xFFDA, 0xFFDC}, + {0x10000, 0x1000B}, + {0x1000D, 0x10026}, + {0x10028, 0x1003A}, + {0x1003C, 0x1003D}, + {0x1003F, 0x1004D}, + {0x10050, 0x1005D}, + {0x10080, 0x100FA}, + {0x10280, 0x1029C}, + {0x102A0, 0x102D0}, + {0x10300, 0x1031F}, + {0x1032D, 0x10340}, + {0x10342, 0x10349}, + {0x10350, 0x10375}, + {0x10380, 0x1039D}, + {0x103A0, 0x103C3}, + {0x103C8, 0x103CF}, + {0x10400, 0x1049D}, + {0x104B0, 0x104D3}, + {0x104D8, 0x104FB}, + {0x10500, 0x10527}, + {0x10530, 0x10563}, + {0x10570, 0x1057A}, + {0x1057C, 0x1058A}, + {0x1058C, 0x10592}, + {0x10594, 0x10595}, + {0x10597, 0x105A1}, + {0x105A3, 0x105B1}, + {0x105B3, 0x105B9}, + {0x105BB, 0x105BC}, + {0x105C0, 0x105F3}, + {0x10600, 0x10736}, + {0x10740, 0x10755}, + {0x10760, 0x10767}, + {0x10780, 0x10785}, + {0x10787, 0x107B0}, + {0x107B2, 0x107BA}, + {0x10800, 0x10805}, + {0x10808, 0x10808}, + {0x1080A, 0x10835}, + {0x10837, 0x10838}, + {0x1083C, 0x1083C}, + {0x1083F, 0x10855}, + {0x10860, 0x10876}, + {0x10880, 0x1089E}, + {0x108E0, 0x108F2}, + {0x108F4, 0x108F5}, + {0x10900, 0x10915}, + {0x10920, 0x10939}, + {0x10940, 0x10959}, + {0x10980, 0x109B7}, + {0x109BE, 0x109BF}, + {0x10A00, 0x10A00}, + {0x10A10, 0x10A13}, + {0x10A15, 0x10A17}, + {0x10A19, 0x10A35}, + {0x10A60, 0x10A7C}, + {0x10A80, 0x10A9C}, + {0x10AC0, 0x10AC7}, + {0x10AC9, 0x10AE4}, + {0x10B00, 0x10B35}, + {0x10B40, 0x10B55}, + {0x10B60, 0x10B72}, + {0x10B80, 0x10B91}, + {0x10C00, 0x10C48}, + {0x10C80, 0x10CB2}, + {0x10CC0, 0x10CF2}, + {0x10D00, 0x10D23}, + {0x10D4A, 0x10D65}, + {0x10D6F, 0x10D85}, + {0x10E80, 0x10EA9}, + {0x10EB0, 0x10EB1}, + {0x10EC2, 0x10EC7}, + {0x10F00, 0x10F1C}, + {0x10F27, 0x10F27}, + {0x10F30, 0x10F45}, + {0x10F70, 0x10F81}, + {0x10FB0, 0x10FC4}, + {0x10FE0, 0x10FF6}, + {0x11003, 0x11037}, + {0x11071, 0x11072}, + {0x11075, 0x11075}, + {0x11083, 0x110AF}, + {0x110D0, 0x110E8}, + {0x11103, 0x11126}, + {0x11144, 0x11144}, + {0x11147, 
0x11147}, + {0x11150, 0x11172}, + {0x11176, 0x11176}, + {0x11183, 0x111B2}, + {0x111C1, 0x111C4}, + {0x111DA, 0x111DA}, + {0x111DC, 0x111DC}, + {0x11200, 0x11211}, + {0x11213, 0x1122B}, + {0x1123F, 0x11240}, + {0x11280, 0x11286}, + {0x11288, 0x11288}, + {0x1128A, 0x1128D}, + {0x1128F, 0x1129D}, + {0x1129F, 0x112A8}, + {0x112B0, 0x112DE}, + {0x11305, 0x1130C}, + {0x1130F, 0x11310}, + {0x11313, 0x11328}, + {0x1132A, 0x11330}, + {0x11332, 0x11333}, + {0x11335, 0x11339}, + {0x1133D, 0x1133D}, + {0x11350, 0x11350}, + {0x1135D, 0x11361}, + {0x11380, 0x11389}, + {0x1138B, 0x1138B}, + {0x1138E, 0x1138E}, + {0x11390, 0x113B5}, + {0x113B7, 0x113B7}, + {0x113D1, 0x113D1}, + {0x113D3, 0x113D3}, + {0x11400, 0x11434}, + {0x11447, 0x1144A}, + {0x1145F, 0x11461}, + {0x11480, 0x114AF}, + {0x114C4, 0x114C5}, + {0x114C7, 0x114C7}, + {0x11580, 0x115AE}, + {0x115D8, 0x115DB}, + {0x11600, 0x1162F}, + {0x11644, 0x11644}, + {0x11680, 0x116AA}, + {0x116B8, 0x116B8}, + {0x11700, 0x1171A}, + {0x11740, 0x11746}, + {0x11800, 0x1182B}, + {0x118A0, 0x118DF}, + {0x118FF, 0x11906}, + {0x11909, 0x11909}, + {0x1190C, 0x11913}, + {0x11915, 0x11916}, + {0x11918, 0x1192F}, + {0x1193F, 0x1193F}, + {0x11941, 0x11941}, + {0x119A0, 0x119A7}, + {0x119AA, 0x119D0}, + {0x119E1, 0x119E1}, + {0x119E3, 0x119E3}, + {0x11A00, 0x11A00}, + {0x11A0B, 0x11A32}, + {0x11A3A, 0x11A3A}, + {0x11A50, 0x11A50}, + {0x11A5C, 0x11A89}, + {0x11A9D, 0x11A9D}, + {0x11AB0, 0x11AF8}, + {0x11BC0, 0x11BE0}, + {0x11C00, 0x11C08}, + {0x11C0A, 0x11C2E}, + {0x11C40, 0x11C40}, + {0x11C72, 0x11C8F}, + {0x11D00, 0x11D06}, + {0x11D08, 0x11D09}, + {0x11D0B, 0x11D30}, + {0x11D46, 0x11D46}, + {0x11D60, 0x11D65}, + {0x11D67, 0x11D68}, + {0x11D6A, 0x11D89}, + {0x11D98, 0x11D98}, + {0x11DB0, 0x11DDB}, + {0x11EE0, 0x11EF2}, + {0x11F02, 0x11F02}, + {0x11F04, 0x11F10}, + {0x11F12, 0x11F33}, + {0x11FB0, 0x11FB0}, + {0x12000, 0x12399}, + {0x12480, 0x12543}, + {0x12F90, 0x12FF0}, + {0x13000, 0x1342F}, + {0x13441, 0x13446}, + {0x13460, 0x143FA}, + 
{0x14400, 0x14646}, + {0x16100, 0x1611D}, + {0x16800, 0x16A38}, + {0x16A40, 0x16A5E}, + {0x16A70, 0x16ABE}, + {0x16AD0, 0x16AED}, + {0x16B00, 0x16B2F}, + {0x16B40, 0x16B43}, + {0x16B63, 0x16B77}, + {0x16B7D, 0x16B8F}, + {0x16D40, 0x16D6C}, + {0x16E40, 0x16E7F}, + {0x16EA0, 0x16EB8}, + {0x16EBB, 0x16ED3}, + {0x16F00, 0x16F4A}, + {0x16F50, 0x16F50}, + {0x16F93, 0x16F9F}, + {0x16FE0, 0x16FE1}, + {0x16FE3, 0x16FE3}, + {0x16FF2, 0x16FF3}, + {0x17000, 0x18CD5}, + {0x18CFF, 0x18D1E}, + {0x18D80, 0x18DF2}, + {0x1AFF0, 0x1AFF3}, + {0x1AFF5, 0x1AFFB}, + {0x1AFFD, 0x1AFFE}, + {0x1B000, 0x1B122}, + {0x1B132, 0x1B132}, + {0x1B150, 0x1B152}, + {0x1B155, 0x1B155}, + {0x1B164, 0x1B167}, + {0x1B170, 0x1B2FB}, + {0x1BC00, 0x1BC6A}, + {0x1BC70, 0x1BC7C}, + {0x1BC80, 0x1BC88}, + {0x1BC90, 0x1BC99}, + {0x1D400, 0x1D454}, + {0x1D456, 0x1D49C}, + {0x1D49E, 0x1D49F}, + {0x1D4A2, 0x1D4A2}, + {0x1D4A5, 0x1D4A6}, + {0x1D4A9, 0x1D4AC}, + {0x1D4AE, 0x1D4B9}, + {0x1D4BB, 0x1D4BB}, + {0x1D4BD, 0x1D4C3}, + {0x1D4C5, 0x1D505}, + {0x1D507, 0x1D50A}, + {0x1D50D, 0x1D514}, + {0x1D516, 0x1D51C}, + {0x1D51E, 0x1D539}, + {0x1D53B, 0x1D53E}, + {0x1D540, 0x1D544}, + {0x1D546, 0x1D546}, + {0x1D54A, 0x1D550}, + {0x1D552, 0x1D6A5}, + {0x1D6A8, 0x1D6C0}, + {0x1D6C2, 0x1D6DA}, + {0x1D6DC, 0x1D6FA}, + {0x1D6FC, 0x1D714}, + {0x1D716, 0x1D734}, + {0x1D736, 0x1D74E}, + {0x1D750, 0x1D76E}, + {0x1D770, 0x1D788}, + {0x1D78A, 0x1D7A8}, + {0x1D7AA, 0x1D7C2}, + {0x1D7C4, 0x1D7CB}, + {0x1DF00, 0x1DF1E}, + {0x1DF25, 0x1DF2A}, + {0x1E030, 0x1E06D}, + {0x1E100, 0x1E12C}, + {0x1E137, 0x1E13D}, + {0x1E14E, 0x1E14E}, + {0x1E290, 0x1E2AD}, + {0x1E2C0, 0x1E2EB}, + {0x1E4D0, 0x1E4EB}, + {0x1E5D0, 0x1E5ED}, + {0x1E5F0, 0x1E5F0}, + {0x1E6C0, 0x1E6DE}, + {0x1E6E0, 0x1E6E2}, + {0x1E6E4, 0x1E6E5}, + {0x1E6E7, 0x1E6ED}, + {0x1E6F0, 0x1E6F4}, + {0x1E6FE, 0x1E6FF}, + {0x1E7E0, 0x1E7E6}, + {0x1E7E8, 0x1E7EB}, + {0x1E7ED, 0x1E7EE}, + {0x1E7F0, 0x1E7FE}, + {0x1E800, 0x1E8C4}, + {0x1E900, 0x1E943}, + {0x1E94B, 0x1E94B}, + {0x1EE00, 0x1EE03}, 
+ {0x1EE05, 0x1EE1F}, + {0x1EE21, 0x1EE22}, + {0x1EE24, 0x1EE24}, + {0x1EE27, 0x1EE27}, + {0x1EE29, 0x1EE32}, + {0x1EE34, 0x1EE37}, + {0x1EE39, 0x1EE39}, + {0x1EE3B, 0x1EE3B}, + {0x1EE42, 0x1EE42}, + {0x1EE47, 0x1EE47}, + {0x1EE49, 0x1EE49}, + {0x1EE4B, 0x1EE4B}, + {0x1EE4D, 0x1EE4F}, + {0x1EE51, 0x1EE52}, + {0x1EE54, 0x1EE54}, + {0x1EE57, 0x1EE57}, + {0x1EE59, 0x1EE59}, + {0x1EE5B, 0x1EE5B}, + {0x1EE5D, 0x1EE5D}, + {0x1EE5F, 0x1EE5F}, + {0x1EE61, 0x1EE62}, + {0x1EE64, 0x1EE64}, + {0x1EE67, 0x1EE6A}, + {0x1EE6C, 0x1EE72}, + {0x1EE74, 0x1EE77}, + {0x1EE79, 0x1EE7C}, + {0x1EE7E, 0x1EE7E}, + {0x1EE80, 0x1EE89}, + {0x1EE8B, 0x1EE9B}, + {0x1EEA1, 0x1EEA3}, + {0x1EEA5, 0x1EEA9}, + {0x1EEAB, 0x1EEBB}, + {0x20000, 0x2A6DF}, + {0x2A700, 0x2B81D}, + {0x2B820, 0x2CEAD}, + {0x2CEB0, 0x2EBE0}, + {0x2EBF0, 0x2EE5D}, + {0x2F800, 0x2FA1D}, + {0x30000, 0x3134A}, + {0x31350, 0x33479}, + }; + + for (const auto& r : ranges) { + if (ch >= r.start && ch <= r.end) + return true; + } + return false; +} + +bool is_space(char32_t cp) { + switch (cp) { + case 0x0009: // TAB \t + case 0x000A: // LF \n + case 0x000B: // VT + case 0x000C: // FF + case 0x000D: // CR \r + case 0x0020: // Space + case 0x00A0: // No-Break Space + case 0x1680: // Ogham Space Mark + case 0x2000: // En Quad + case 0x2001: // Em Quad + case 0x2002: // En Space + case 0x2003: // Em Space + case 0x2004: // Three-Per-Em Space + case 0x2005: // Four-Per-Em Space + case 0x2006: // Six-Per-Em Space + case 0x2007: // Figure Space + case 0x2008: // Punctuation Space + case 0x2009: // Thin Space + case 0x200A: // Hair Space + case 0x202F: // Narrow No-Break Space + case 0x205F: // Medium Mathematical Space + case 0x3000: // Ideographic Space + return true; + default: + return false; + } +} + +std::string str_to_lower(const std::string& input) { + std::string result = input; + std::transform(result.begin(), result.end(), result.begin(), + [](unsigned char c) { return std::tolower(c); }); + return result; +} + +// UTF-8 -> 
// UTF-8 -> Unicode code points
//
// Decodes `str` into a sequence of code points. Well-formed input decodes
// one-to-one. Malformed input never aborts decoding: a byte that cannot start
// a sequence, or a lead byte whose continuation bytes are missing or do not
// have the 10xxxxxx form, is dropped on its own and decoding resumes at the
// very next byte, so the characters following a stray byte are preserved.
std::vector<char32_t> utf8_to_codepoints(const std::string& str) {
    const size_t len = str.size();

    std::vector<char32_t> codepoints;
    codepoints.reserve(len);  // upper bound: one code point per byte

    size_t i = 0;
    while (i < len) {
        const unsigned char lead = static_cast<unsigned char>(str[i]);
        char32_t cp              = 0;
        size_t extra_bytes       = 0;

        if ((lead & 0x80) == 0) {
            cp = lead;
        } else if ((lead & 0xE0) == 0xC0) {
            cp          = lead & 0x1F;
            extra_bytes = 1;
        } else if ((lead & 0xF0) == 0xE0) {
            cp          = lead & 0x0F;
            extra_bytes = 2;
        } else if ((lead & 0xF8) == 0xF0) {
            cp          = lead & 0x07;
            extra_bytes = 3;
        } else {
            // Not a valid lead byte (stray continuation byte or 0xF8..0xFF).
            ++i;
            continue;
        }

        // The whole sequence must fit in the string (i + extra_bytes < len).
        bool well_formed = extra_bytes < len - i;
        for (size_t j = 1; well_formed && j <= extra_bytes; ++j) {
            const unsigned char tail = static_cast<unsigned char>(str[i + j]);
            if ((tail & 0xC0) != 0x80) {
                well_formed = false;
            } else {
                cp = (cp << 6) | (tail & 0x3F);
            }
        }

        if (!well_formed) {
            // Drop only the lead byte so the following bytes are re-examined.
            ++i;
            continue;
        }

        codepoints.push_back(cp);
        i += 1 + extra_bytes;
    }
    return codepoints;
}
(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+ +std::vector token_split(const std::string& text) { + std::vector tokens; + auto cps = utf8_to_codepoints(text); + size_t i = 0; + + while (i < cps.size()) { + char32_t cp = cps[i]; + + // `(?i:'s|'t|'re|'ve|'m|'ll|'d)` + if (cp == U'\'' && i + 1 < cps.size()) { + std::string next = str_to_lower(codepoint_to_utf8(cps[i + 1])); + if (next == "s" || next == "t" || next == "m") { + tokens.push_back("'" + next); + i += 2; + continue; + } + if (i + 2 < cps.size()) { + next += str_to_lower(codepoint_to_utf8(cps[i + 2])); + if (next == "re" || next == "ve" || next == "ll" || next == "d") { + tokens.push_back("'" + next); + i += 3; + continue; + } + } + } + + // `\p{N}` + if (is_number(cp)) { + tokens.push_back(codepoint_to_utf8(cp)); + ++i; + continue; + } + + // `[^\r\n\p{L}\p{N}]?\p{L}+` + { + // `[^\r\n\p{L}\p{N}]\p{L}+` + if (!is_letter(cp) && cp != U'\r' && cp != U'\n' && i + 1 < cps.size() && is_letter(cps[i + 1])) { + std::string token = codepoint_to_utf8(cp); + ++i; + + while (i < cps.size() && is_letter(cps[i])) { + token += codepoint_to_utf8(cps[i]); + ++i; + } + tokens.push_back(token); + continue; + } + + // `\p{L}+` + if (is_letter(cp)) { + std::string token = codepoint_to_utf8(cp); + ++i; + while (i < cps.size() && is_letter(cps[i])) { + token += codepoint_to_utf8(cps[i]); + ++i; + } + tokens.push_back(token); + continue; + } + } + + // ` ?[^\s\p{L}\p{N}]+[\r\n]*` + { + // ` [^\s\p{L}\p{N}]+[\r\n]*` + if (cp == U' ' && i + 1 < cps.size() && !isspace(cps[i + 1]) && !is_letter(cps[i + 1]) && !is_number(cps[i + 1])) { + std::string token = codepoint_to_utf8(cp); + token += codepoint_to_utf8(cps[i + 1]); + i += 2; + + while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { + token += codepoint_to_utf8(cps[i]); + ++i; + } + + while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) { + token += 
codepoint_to_utf8(cps[i]); + ++i; + } + + tokens.push_back(token); + continue; + } + + // `[^\s\p{L}\p{N}]+[\r\n]*` + std::string token; + if (!is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { + std::string token = codepoint_to_utf8(cp); + ++i; + + while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { + token += codepoint_to_utf8(cps[i]); + ++i; + } + + while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) { + token += codepoint_to_utf8(cps[i]); + ++i; + } + + tokens.push_back(token); + continue; + } + } + + // `\s*[\r\n]+|\s+(?!\S)|\s+` + if (is_space(cp)) { + std::string token; + bool saw_new_line = false; + + while (i < cps.size() && is_space(cps[i])) { + token += codepoint_to_utf8(cps[i]); + + if (cps[i] == U'\r' || cps[i] == U'\n') { + saw_new_line = true; + } else { + if (saw_new_line) { + break; + } + } + + ++i; + } + + tokens.push_back(token); + continue; + } + + // skip + ++i; + } + + return tokens; +} + +std::vector split_with_special_tokens( + const std::string& text, + const std::vector& special_tokens) { + std::vector result; + size_t pos = 0; + size_t text_len = text.size(); + + while (pos < text_len) { + size_t next_pos = text_len; + std::string matched_token; + + for (const auto& token : special_tokens) { + size_t token_pos = text.find(token, pos); + if (token_pos != std::string::npos && token_pos < next_pos) { + next_pos = token_pos; + matched_token = token; + } + } + + if (next_pos > pos) { + result.push_back(text.substr(pos, next_pos - pos)); + } + + if (!matched_token.empty()) { + result.push_back(matched_token); + pos = next_pos + matched_token.size(); + } else { + break; + } + } + + return result; +} + +// int main() { +// std::string text = "I'm testing C++ token_split function. 
Hello world 123"; +// auto tokens = token_split(text); + +// for (const auto& t : tokens) { +// std::cout << "[" << t << "] "; +// } +// std::cout << "\n"; +// return 0; +// } diff --git a/src/ucache.hpp b/src/ucache.hpp index d3247618..3d785c5e 100644 --- a/src/ucache.hpp +++ b/src/ucache.hpp @@ -6,8 +6,10 @@ #include #include +#include "condition_cache_utils.hpp" #include "denoiser.hpp" #include "ggml_extend.hpp" +#include "tensor.hpp" struct UCacheConfig { bool enabled = false; @@ -29,15 +31,15 @@ struct UCacheCacheEntry { struct UCacheState { UCacheConfig config; - Denoiser* denoiser = nullptr; - float start_sigma = std::numeric_limits::max(); - float end_sigma = 0.0f; - bool initialized = false; - bool initial_step = true; - bool skip_current_step = false; - bool step_active = false; - const SDCondition* anchor_condition = nullptr; - std::unordered_map cache_diffs; + Denoiser* denoiser = nullptr; + float start_sigma = std::numeric_limits::max(); + float end_sigma = 0.0f; + bool initialized = false; + bool initial_step = true; + bool skip_current_step = false; + bool step_active = false; + const void* anchor_condition = nullptr; + std::unordered_map cache_diffs; std::vector prev_input; std::vector prev_output; float output_prev_norm = 0.0f; @@ -233,43 +235,30 @@ struct UCacheState { return base_threshold * multiplier; } - bool has_cache(const SDCondition* cond) const { + bool has_cache(const void* cond) const { auto it = cache_diffs.find(cond); return it != cache_diffs.end() && !it->second.diff.empty(); } - void update_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) { + void update_cache(const void* cond, const sd::Tensor& input, const sd::Tensor& output) { UCacheCacheEntry& entry = cache_diffs[cond]; - size_t ne = static_cast(ggml_nelements(output)); - entry.diff.resize(ne); - float* out_data = (float*)output->data; - float* in_data = (float*)input->data; - - for (size_t i = 0; i < ne; ++i) { - entry.diff[i] = out_data[i] - in_data[i]; 
- } + sd::store_condition_cache_diff(&entry.diff, input, output); } - void apply_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) { + void apply_cache(const void* cond, const sd::Tensor& input, sd::Tensor* output) { auto it = cache_diffs.find(cond); if (it == cache_diffs.end() || it->second.diff.empty()) { return; } - - copy_ggml_tensor(output, input); - float* out_data = (float*)output->data; - const std::vector& diff = it->second.diff; - for (size_t i = 0; i < diff.size(); ++i) { - out_data[i] += diff[i]; - } + sd::apply_condition_cache_diff(it->second.diff, input, output); } - bool before_condition(const SDCondition* cond, - ggml_tensor* input, - ggml_tensor* output, + bool before_condition(const void* cond, + const sd::Tensor& input, + sd::Tensor* output, float sigma, int step_index) { - if (!enabled() || step_index < 0) { + if (!enabled() || step_index < 0 || output == nullptr) { return false; } if (step_index != current_step_index) { @@ -302,13 +291,13 @@ struct UCacheState { return false; } - size_t ne = static_cast(ggml_nelements(input)); + size_t ne = static_cast(input.numel()); if (prev_input.size() != ne) { return false; } - float* input_data = (float*)input->data; - last_input_change = 0.0f; + const float* input_data = input.data(); + last_input_change = 0.0f; for (size_t i = 0; i < ne; ++i) { last_input_change += std::fabs(input_data[i] - prev_input[i]); } @@ -354,7 +343,7 @@ struct UCacheState { return false; } - void after_condition(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) { + void after_condition(const void* cond, const sd::Tensor& input, const sd::Tensor& output) { if (!step_is_active()) { return; } @@ -367,16 +356,16 @@ struct UCacheState { steps_computed_since_active++; consecutive_skipped_steps = 0; - size_t ne = static_cast(ggml_nelements(input)); - float* in_data = (float*)input->data; + size_t ne = static_cast(input.numel()); + const float* in_data = input.data(); prev_input.resize(ne); for 
(size_t i = 0; i < ne; ++i) { prev_input[i] = in_data[i]; } has_prev_input = true; - float* out_data = (float*)output->data; - float output_change = 0.0f; + const float* out_data = output.data(); + float output_change = 0.0f; if (has_prev_output && prev_output.size() == ne) { for (size_t i = 0; i < ne; ++i) { output_change += std::fabs(out_data[i] - prev_output[i]); diff --git a/src/unet.hpp b/src/unet.hpp index f7aa3f05..63e23eb9 100644 --- a/src/unet.hpp +++ b/src/unet.hpp @@ -609,30 +609,31 @@ struct UNetModelRunner : public GGMLRunner { unet.get_param_tensors(tensors, prefix); } - ggml_cgraph* build_graph(ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* c_concat = nullptr, - ggml_tensor* y = nullptr, - int num_video_frames = -1, - std::vector controls = {}, - float control_strength = 0.f) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor = {}, + const sd::Tensor& c_concat_tensor = {}, + const sd::Tensor& y_tensor = {}, + int num_video_frames = -1, + const std::vector>& controls_tensor = {}, + float control_strength = 0.f) { ggml_cgraph* gf = new_graph_custom(UNET_GRAPH_SIZE); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); + ggml_tensor* context = make_optional_input(context_tensor); + ggml_tensor* c_concat = make_optional_input(c_concat_tensor); + ggml_tensor* y = make_optional_input(y_tensor); + std::vector controls; + controls.reserve(controls_tensor.size()); + for (const auto& control_tensor : controls_tensor) { + controls.push_back(make_input(control_tensor)); + } + if (num_video_frames == -1) { num_video_frames = static_cast(x->ne[3]); } - x = to_backend(x); - context = to_backend(context); - y = to_backend(y); - timesteps = to_backend(timesteps); - c_concat = to_backend(c_concat); - - for (int i = 0; i < controls.size(); i++) { - controls[i] = to_backend(controls[i]); - } - auto runner_ctx = 
get_context(); ggml_tensor* out = unet.forward(&runner_ctx, @@ -650,17 +651,15 @@ struct UNetModelRunner : public GGMLRunner { return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* c_concat, - ggml_tensor* y, - int num_video_frames = -1, - std::vector controls = {}, - float control_strength = 0.f, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) { + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context = {}, + const sd::Tensor& c_concat = {}, + const sd::Tensor& y = {}, + int num_video_frames = -1, + const std::vector>& controls = {}, + float control_strength = 0.f) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size] @@ -670,7 +669,7 @@ struct UNetModelRunner : public GGMLRunner { return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); } void test() { @@ -679,8 +678,8 @@ struct UNetModelRunner : public GGMLRunner { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); { // CPU, num_video_frames = 1, x{num_video_frames, 8, 8, 8}: Pass @@ -689,27 +688,37 @@ struct UNetModelRunner : public GGMLRunner { // CUDA, num_video_frames = 3, x{num_video_frames, 8, 8, 8}: nan int num_video_frames = 3; - auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 8, num_video_frames); + sd::Tensor x({8, 8, 8, num_video_frames}); std::vector timesteps_vec(num_video_frames, 999.f); - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); - 
ggml_set_f32(x, 0.5f); + auto timesteps = sd::Tensor::from_vector(timesteps_vec); + x.fill_(0.5f); // print_ggml_tensor(x); - auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 1024, 1, num_video_frames); - ggml_set_f32(context, 0.5f); + sd::Tensor context({1024, 1, num_video_frames}); + context.fill_(0.5f); // print_ggml_tensor(context); - auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, num_video_frames); - ggml_set_f32(y, 0.5f); + sd::Tensor y({768, num_video_frames}); + y.fill_(0.5f); // print_ggml_tensor(y); - ggml_tensor* out = nullptr; + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - compute(8, x, timesteps, context, nullptr, y, num_video_frames, {}, 0.f, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = compute(8, + x, + timesteps, + context, + {}, + y, + num_video_frames, + {}, + 0.f); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("unet test done in %lldms", t1 - t0); } } diff --git a/src/upscaler.cpp b/src/upscaler.cpp index 18e185d0..03f7714e 100644 --- a/src/upscaler.cpp +++ b/src/upscaler.cpp @@ -2,6 +2,7 @@ #include "ggml_extend.hpp" #include "model.h" #include "stable-diffusion.h" +#include "util.h" struct UpscalerGGML { ggml_backend_t backend = nullptr; // general backend @@ -64,6 +65,39 @@ struct UpscalerGGML { return true; } + sd::Tensor upscale_tensor(const sd::Tensor& input_tensor) { + sd::Tensor upscaled; + if (tile_size <= 0 || (input_tensor.shape()[0] <= tile_size && input_tensor.shape()[1] <= tile_size)) { + upscaled = esrgan_upscaler->compute(n_threads, input_tensor); + } else { + auto on_processing = [&](const sd::Tensor& input_tile) -> sd::Tensor { + auto output_tile = esrgan_upscaler->compute(n_threads, input_tile); + if (output_tile.empty()) { + LOG_ERROR("esrgan compute failed while processing a tile"); + return {}; + } + return output_tile; + }; + + upscaled = 
process_tiles_2d(input_tensor, + static_cast(input_tensor.shape()[0] * esrgan_upscaler->scale), + static_cast(input_tensor.shape()[1] * esrgan_upscaler->scale), + esrgan_upscaler->scale, + tile_size, + tile_size, + 0.25f, + false, + false, + on_processing); + } + esrgan_upscaler->free_compute_buffer(); + if (upscaled.empty()) { + LOG_ERROR("esrgan compute failed"); + return {}; + } + return upscaled; + } + sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor) { // upscale_factor, unused for RealESRGAN_x4plus_anime_6B.pth sd_image_t upscaled_image = {0, 0, 0, nullptr}; @@ -72,40 +106,17 @@ struct UpscalerGGML { LOG_INFO("upscaling from (%i x %i) to (%i x %i)", input_image.width, input_image.height, output_width, output_height); - ggml_init_params params; - params.mem_size = static_cast(1024 * 1024) * 1024; // 1G - params.mem_buffer = nullptr; - params.no_alloc = false; - - // draft context - ggml_context* upscale_ctx = ggml_init(params); - if (!upscale_ctx) { - LOG_ERROR("ggml_init() failed"); + sd::Tensor input_tensor = sd_image_to_tensor(input_image); + sd::Tensor upscaled; + int64_t t0 = ggml_time_ms(); + upscaled = upscale_tensor(input_tensor); + if (upscaled.empty()) { return upscaled_image; } - // LOG_DEBUG("upscale work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f); - ggml_tensor* input_image_tensor = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, input_image.width, input_image.height, 3, 1); - sd_image_to_ggml_tensor(input_image, input_image_tensor); - - ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1); - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - return esrgan_upscaler->compute(n_threads, in, &out); - }; - int64_t t0 = ggml_time_ms(); - // TODO: circular upscaling? 
- sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, false, false, on_tiling); - esrgan_upscaler->free_compute_buffer(); - ggml_ext_tensor_clamp_inplace(upscaled, 0.f, 1.f); - uint8_t* upscaled_data = ggml_tensor_to_sd_image(upscaled); - ggml_free(upscale_ctx); - int64_t t3 = ggml_time_ms(); + sd_image_t upscaled_data = tensor_to_sd_image(upscaled); + int64_t t3 = ggml_time_ms(); LOG_INFO("input_image_tensor upscaled, taking %.2fs", (t3 - t0) / 1000.0f); - upscaled_image = { - (uint32_t)output_width, - (uint32_t)output_height, - 3, - upscaled_data, - }; + upscaled_image = upscaled_data; return upscaled_image; } }; diff --git a/src/util.cpp b/src/util.cpp index a94cfd98..2d330a2d 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -479,158 +479,96 @@ const char* sd_get_system_info() { return buffer; } -sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image) { - sd_image_f32_t converted_image; - converted_image.width = image.width; - converted_image.height = image.height; - converted_image.channel = image.channel; +sd_image_t tensor_to_sd_image(const sd::Tensor& tensor, int frame_index) { + const auto& shape = tensor.shape(); + GGML_ASSERT(shape.size() == 4 || shape.size() == 5); + int width = static_cast(shape[0]); + int height = static_cast(shape[1]); + int channel = static_cast(shape[shape.size() == 5 ? 
3 : 2]); + uint8_t* data = (uint8_t*)malloc(static_cast(width * height * channel)); + GGML_ASSERT(data != nullptr); - // Allocate memory for float data - converted_image.data = (float*)malloc(image.width * image.height * image.channel * sizeof(float)); - - for (uint32_t i = 0; i < image.width * image.height * image.channel; i++) { - // Convert uint8_t to float - converted_image.data[i] = (float)image.data[i]; - } - - return converted_image; -} - -// Function to perform double linear interpolation -float interpolate(float v1, float v2, float v3, float v4, float x_ratio, float y_ratio) { - return v1 * (1 - x_ratio) * (1 - y_ratio) + v2 * x_ratio * (1 - y_ratio) + v3 * (1 - x_ratio) * y_ratio + v4 * x_ratio * y_ratio; -} - -sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height) { - sd_image_f32_t resized_image; - resized_image.width = target_width; - resized_image.height = target_height; - resized_image.channel = image.channel; - - // Allocate memory for resized float data - resized_image.data = (float*)malloc(target_width * target_height * image.channel * sizeof(float)); - - for (int y = 0; y < target_height; y++) { - for (int x = 0; x < target_width; x++) { - float original_x = (float)x * image.width / target_width; - float original_y = (float)y * image.height / target_height; - - uint32_t x1 = (uint32_t)original_x; - uint32_t y1 = (uint32_t)original_y; - uint32_t x2 = std::min(x1 + 1, image.width - 1); - uint32_t y2 = std::min(y1 + 1, image.height - 1); - - for (uint32_t k = 0; k < image.channel; k++) { - float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k); - float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k); - float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k); - float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k); - - float x_ratio = original_x - x1; - float y_ratio = original_y - y1; - - 
float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio); - - *(resized_image.data + y * target_width * image.channel + x * image.channel + k) = value; + for (int iw = 0; iw < width; ++iw) { + for (int ih = 0; ih < height; ++ih) { + for (int ic = 0; ic < channel; ++ic) { + float value = shape.size() == 5 ? tensor.index(iw, ih, frame_index, ic, 0) + : tensor.index(iw, ih, ic, frame_index); + value = std::clamp(value, 0.0f, 1.0f); + data[(ih * width + iw) * channel + ic] = static_cast(std::round(value * 255.0f)); } } } - - return resized_image; + return { + static_cast(width), + static_cast(height), + static_cast(channel), + data, + }; } -void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]) { - for (uint32_t y = 0; y < image.height; y++) { - for (uint32_t x = 0; x < image.width; x++) { - for (uint32_t k = 0; k < image.channel; k++) { - int index = (y * image.width + x) * image.channel + k; - image.data[index] = (image.data[index] - means[k]) / stds[k]; +sd::Tensor sd_image_to_tensor(sd_image_t image, + int target_width, + int target_height, + bool scale) { + sd::Tensor tensor = sd::zeros({static_cast(image.width), + static_cast(image.height), + static_cast(image.channel), + 1}); + for (uint32_t iw = 0; iw < image.width; ++iw) { + for (uint32_t ih = 0; ih < image.height; ++ih) { + for (uint32_t ic = 0; ic < image.channel; ++ic) { + tensor.index(iw, ih, ic, 0) = sd_image_get_f32(image, iw, ih, ic, scale); } } } + if (target_width >= 0 && target_height >= 0 && + (tensor.shape()[0] != target_width || tensor.shape()[1] != target_height)) { + tensor = sd::ops::interpolate(tensor, + {target_width, + target_height, + tensor.shape()[2], + tensor.shape()[3]}); + } + return tensor; } // Constants for means and std float means[3] = {0.48145466f, 0.4578275f, 0.40821073f}; float stds[3] = {0.26862954f, 0.26130258f, 0.27577711f}; -// Function to clip and preprocess sd_image_f32_t -sd_image_f32_t clip_preprocess(sd_image_f32_t image, int 
target_width, int target_height) { - float width_scale = (float)target_width / image.width; - float height_scale = (float)target_height / image.height; +sd::Tensor clip_preprocess(const sd::Tensor& image, int target_width, int target_height) { + GGML_ASSERT(image.dim() == 4); + GGML_ASSERT(image.shape()[2] == 3); + GGML_ASSERT(image.shape()[3] == 1); + GGML_ASSERT(target_width > 0 && target_height > 0); - float scale = std::fmax(width_scale, height_scale); + float width_scale = static_cast(target_width) / static_cast(image.shape()[0]); + float height_scale = static_cast(target_height) / static_cast(image.shape()[1]); + float scale = std::fmax(width_scale, height_scale); - // Interpolation - int resized_width = (int)(scale * image.width); - int resized_height = (int)(scale * image.height); - float* resized_data = (float*)malloc(resized_width * resized_height * image.channel * sizeof(float)); + int64_t resized_width = static_cast(scale * static_cast(image.shape()[0])); + int64_t resized_height = static_cast(scale * static_cast(image.shape()[1])); - for (int y = 0; y < resized_height; y++) { - for (int x = 0; x < resized_width; x++) { - float original_x = (float)x * image.width / resized_width; - float original_y = (float)y * image.height / resized_height; + sd::Tensor resized = sd::ops::interpolate( + image, + {resized_width, resized_height, image.shape()[2], image.shape()[3]}); - uint32_t x1 = (uint32_t)original_x; - uint32_t y1 = (uint32_t)original_y; - uint32_t x2 = std::min(x1 + 1, image.width - 1); - uint32_t y2 = std::min(y1 + 1, image.height - 1); + int64_t h_offset = std::max((resized_height - target_height) / 2, 0); + int64_t w_offset = std::max((resized_width - target_width) / 2, 0); - for (uint32_t k = 0; k < image.channel; k++) { - float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k); - float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k); - float v3 = *(image.data + y2 * image.width * 
image.channel + x1 * image.channel + k); - float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k); - - float x_ratio = original_x - x1; - float y_ratio = original_y - y1; - - float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio); - - *(resized_data + y * resized_width * image.channel + x * image.channel + k) = value; + sd::Tensor cropped({target_width, target_height, image.shape()[2], image.shape()[3]}); + for (int64_t y = 0; y < target_height; ++y) { + for (int64_t x = 0; x < target_width; ++x) { + for (int64_t c = 0; c < image.shape()[2]; ++c) { + cropped.index(x, y, c, 0) = resized.index(x + w_offset, y + h_offset, c, 0); } } } - // Clip and preprocess - int h_offset = std::max((int)(resized_height - target_height) / 2, 0); - int w_offset = std::max((int)(resized_width - target_width) / 2, 0); - - sd_image_f32_t result; - result.width = target_width; - result.height = target_height; - result.channel = image.channel; - result.data = (float*)malloc(target_height * target_width * image.channel * sizeof(float)); - - for (uint32_t k = 0; k < image.channel; k++) { - for (uint32_t i = 0; i < result.height; i++) { - for (uint32_t j = 0; j < result.width; j++) { - int src_y = std::min(static_cast(i + h_offset), resized_height - 1); - int src_x = std::min(static_cast(j + w_offset), resized_width - 1); - *(result.data + i * result.width * image.channel + j * image.channel + k) = - fmin(fmax(*(resized_data + src_y * resized_width * image.channel + src_x * image.channel + k), 0.0f), 255.0f) / 255.0f; - } - } - } - - // Free allocated memory - free(resized_data); - - // Normalize - for (uint32_t k = 0; k < image.channel; k++) { - for (uint32_t i = 0; i < result.height; i++) { - for (uint32_t j = 0; j < result.width; j++) { - // *(result.data + i * size * image.channel + j * image.channel + k) = 0.5f; - int offset = i * result.width * image.channel + j * image.channel + k; - float value = *(result.data + offset); - value = (value - means[k]) 
/ stds[k]; - // value = 0.5f; - *(result.data + offset) = value; - } - } - } - - return result; + sd::Tensor normalized = sd::ops::clamp(cropped, 0.0f, 1.0f); + sd::Tensor mean({1, 1, 3, 1}, {means[0], means[1], means[2]}); + sd::Tensor std({1, 1, 3, 1}, {stds[0], stds[1], stds[2]}); + return (normalized - mean) / std; } // Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/prompt_parser.py#L345 diff --git a/src/util.h b/src/util.h index 7dee7bf5..24ce4cf3 100644 --- a/src/util.h +++ b/src/util.h @@ -7,6 +7,7 @@ #include #include "stable-diffusion.h" +#include "tensor.hpp" #define SAFE_STR(s) ((s) ? (s) : "") #define BOOL_STR(b) ((b) ? "true" : "false") @@ -29,20 +30,14 @@ std::string utf32_to_utf8(const std::u32string& utf32_str); std::u32string unicode_value_to_utf32(int unicode_value); // std::string sd_basename(const std::string& path); -typedef struct { - uint32_t width; - uint32_t height; - uint32_t channel; - float* data; -} sd_image_f32_t; +sd_image_t tensor_to_sd_image(const sd::Tensor& tensor, int frame_index = 0); -void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]); +sd::Tensor sd_image_to_tensor(sd_image_t image, + int target_width = -1, + int target_height = -1, + bool scale = true); -sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image); - -sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height); - -sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height); +sd::Tensor clip_preprocess(const sd::Tensor& image, int target_width, int target_height); class MmapWrapper { public: diff --git a/src/vae.hpp b/src/vae.hpp index dafc0d4b..22be8867 100644 --- a/src/vae.hpp +++ b/src/vae.hpp @@ -2,16 +2,64 @@ #define __VAE_HPP__ #include "common_block.hpp" +#include "tensor_ggml.hpp" struct VAE : public GGMLRunner { protected: SDVersion version; - bool scale_input = true; - virtual bool 
_compute(const int n_threads, - ggml_tensor* z, - bool decode_graph, - ggml_tensor** output, - ggml_context* output_ctx) = 0; + bool scale_input = true; + virtual sd::Tensor _compute(const int n_threads, + const sd::Tensor& z, + bool decode_graph) = 0; + + static inline void scale_tensor_to_minus1_1(sd::Tensor* tensor) { + GGML_ASSERT(tensor != nullptr); + for (int64_t i = 0; i < tensor->numel(); ++i) { + (*tensor)[i] = (*tensor)[i] * 2.0f - 1.0f; + } + } + + static inline void scale_tensor_to_0_1(sd::Tensor* tensor) { + GGML_ASSERT(tensor != nullptr); + for (int64_t i = 0; i < tensor->numel(); ++i) { + float value = ((*tensor)[i] + 1.0f) * 0.5f; + (*tensor)[i] = std::max(0.0f, std::min(1.0f, value)); + } + } + + sd::Tensor tiled_compute(const sd::Tensor& input, + int n_threads, + int output_width, + int output_height, + int scale, + int p_tile_size_x, + int p_tile_size_y, + float tile_overlap_factor, + bool circular_x, + bool circular_y, + bool decode_graph, + const char* error_message, + bool silent = false) { + auto on_processing = [&](const sd::Tensor& input_tile) { + auto output_tile = _compute(n_threads, input_tile, decode_graph); + if (output_tile.empty()) { + LOG_ERROR("%s", error_message); + return sd::Tensor(); + } + return output_tile; + }; + return ::process_tiles_2d(input, + output_width, + output_height, + scale, + p_tile_size_x, + p_tile_size_y, + tile_overlap_factor, + circular_x, + circular_y, + on_processing, + silent); + } public: VAE(SDVersion version, ggml_backend_t backend, bool offload_params_to_cpu) @@ -60,133 +108,109 @@ public: tile_size_y = get_tile_size(params.tile_size_y, params.rel_size_y, latent_y); } - ggml_tensor* encode(int n_threads, - ggml_context* work_ctx, - ggml_tensor* x, - sd_tiling_params_t tiling_params, - bool circular_x = false, - bool circular_y = false) { - int64_t t0 = ggml_time_ms(); - ggml_tensor* result = nullptr; - const int scale_factor = get_scale_factor(); - int64_t W = x->ne[0] / scale_factor; - int64_t H = 
x->ne[1] / scale_factor; - int channel_dim = sd_version_is_wan(version) ? 3 : 2; - int64_t C = get_encoder_output_channels(static_cast(x->ne[channel_dim])); - int64_t ne2; - int64_t ne3; - if (sd_version_is_wan(version)) { - int64_t T = x->ne[2]; - ne2 = (T - 1) / 4 + 1; - ne3 = C; - } else { - ne2 = C; - ne3 = x->ne[3]; - } - result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3); - + sd::Tensor encode(int n_threads, + const sd::Tensor& x, + sd_tiling_params_t tiling_params, + bool circular_x = false, + bool circular_y = false) { + int64_t t0 = ggml_time_ms(); + sd::Tensor input = x; + sd::Tensor output; if (scale_input) { - scale_to_minus1_1(x); - } - - if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { - x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]); + scale_tensor_to_minus1_1(&input); } if (tiling_params.enabled) { + const int scale_factor = get_scale_factor(); + int64_t W = input.shape()[0] / scale_factor; + int64_t H = input.shape()[1] / scale_factor; float tile_overlap; int tile_size_x, tile_size_y; - // multiply tile size for encode to keep the compute buffer size consistent get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, tiling_params, W, H, 1.30539f); - LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y); - - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - return _compute(n_threads, in, false, &out, work_ctx); - }; - sd_tiling_non_square(x, result, scale_factor, tile_size_x, tile_size_y, tile_overlap, circular_x, circular_y, on_tiling); + output = tiled_compute(input, + n_threads, + static_cast(W), + static_cast(H), + scale_factor, + tile_size_x, + tile_size_y, + tile_overlap, + circular_x, + circular_y, + false, + "vae encode compute failed while processing a tile"); } else { - _compute(n_threads, x, false, &result, work_ctx); + output = _compute(n_threads, input, false); + free_compute_buffer(); } - free_compute_buffer(); + if (output.empty()) { + 
LOG_ERROR("vae encode compute failed"); + return {}; + } int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing vae encode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - return result; + return std::move(output); } - ggml_tensor* decode(int n_threads, - ggml_context* work_ctx, - ggml_tensor* x, - sd_tiling_params_t tiling_params, - bool decode_video = false, - bool circular_x = false, - bool circular_y = false, - ggml_tensor* result = nullptr, - bool silent = false) { - const int scale_factor = get_scale_factor(); - int64_t W = x->ne[0] * scale_factor; - int64_t H = x->ne[1] * scale_factor; - int64_t C = 3; - if (result == nullptr) { - if (decode_video) { - int64_t T = x->ne[2]; - if (sd_version_is_wan(version)) { - T = ((T - 1) * 4) + 1; - } - result = ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - W, - H, - T, - 3); - } else { - result = ggml_new_tensor_4d(work_ctx, - GGML_TYPE_F32, - W, - H, - C, - x->ne[3]); - } - } - int64_t t0 = ggml_time_ms(); - if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { - x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]); - } + sd::Tensor decode(int n_threads, + const sd::Tensor& x, + sd_tiling_params_t tiling_params, + bool decode_video = false, + bool circular_x = false, + bool circular_y = false, + bool silent = false) { + int64_t t0 = ggml_time_ms(); + sd::Tensor input = x; + sd::Tensor output; + if (tiling_params.enabled) { + const int scale_factor = get_scale_factor(); + int64_t W = input.shape()[0] * scale_factor; + int64_t H = input.shape()[1] * scale_factor; float tile_overlap; int tile_size_x, tile_size_y; - get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, tiling_params, x->ne[0], x->ne[1]); - + get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, tiling_params, input.shape()[0], input.shape()[1]); if (!silent) { LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y); } - - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - return 
_compute(n_threads, in, true, &out, nullptr); - }; - sd_tiling_non_square(x, result, scale_factor, tile_size_x, tile_size_y, tile_overlap, circular_x, circular_y, on_tiling, silent); + output = tiled_compute( + input, + n_threads, + static_cast(W), + static_cast(H), + scale_factor, + tile_size_x, + tile_size_y, + tile_overlap, + circular_x, + circular_y, + true, + "vae decode compute failed while processing a tile", + silent); } else { - if (!_compute(n_threads, x, true, &result, work_ctx)) { - LOG_ERROR("Failed to decode latetnts"); - free_compute_buffer(); - return nullptr; - } + output = _compute(n_threads, input, true); } + free_compute_buffer(); + + if (output.empty()) { + LOG_ERROR("vae decode compute failed"); + return {}; + } if (scale_input) { - scale_to_0_1(result); + scale_tensor_to_0_1(&output); } int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing vae decode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f); - return result; + return std::move(output); } - virtual ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) = 0; - virtual ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) = 0; - virtual ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) = 0; - virtual void get_param_tensors(std::map& tensors, const std::string prefix) = 0; + virtual sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) = 0; + virtual sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) = 0; + virtual sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) = 0; + virtual void get_param_tensors(std::map& tensors, const std::string prefix) = 0; virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); }; }; @@ -198,31 +222,25 @@ struct FakeVAE : public VAE { return input_channels; } - bool _compute(const int n_threads, - ggml_tensor* z, - bool 
decode_graph, - ggml_tensor** output, - ggml_context* output_ctx) override { - if (*output == nullptr && output_ctx != nullptr) { - *output = ggml_dup_tensor(output_ctx, z); - } - ggml_ext_tensor_iter(z, [&](ggml_tensor* z, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { - float value = ggml_ext_tensor_get_f32(z, i0, i1, i2, i3); - ggml_ext_tensor_set_f32(*output, value, i0, i1, i2, i3); - }); - return true; + sd::Tensor _compute(const int n_threads, + const sd::Tensor& z, + bool decode_graph) override { + SD_UNUSED(n_threads); + SD_UNUSED(decode_graph); + return z; } - ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) { + sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override { + SD_UNUSED(rng); return vae_output; } - ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) { - return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) override { + return latents; } - ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) { - return ggml_ext_dup_and_cpy_tensor(work_ctx, latents); + sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) override { + return latents; } void get_param_tensors(std::map& tensors, const std::string prefix) override {} diff --git a/src/wan.hpp b/src/wan.hpp index af8acbfd..6860262c 100644 --- a/src/wan.hpp +++ b/src/wan.hpp @@ -1131,105 +1131,66 @@ namespace WAN { ae.get_param_tensors(tensors, prefix); } - ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr rng) { + sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override { + SD_UNUSED(rng); return vae_output; } - void get_latents_mean_std_vec(ggml_tensor* latents, int channel_dim, std::vector& latents_mean_vec, std::vector& latents_std_vec) { - GGML_ASSERT(latents->ne[channel_dim] == 16 || 
latents->ne[channel_dim] == 48); - if (latents->ne[channel_dim] == 16) { // Wan2.1 VAE - latents_mean_vec = {-0.7571f, -0.7089f, -0.9113f, 0.1075f, -0.1745f, 0.9653f, -0.1517f, 1.5508f, - 0.4134f, -0.0715f, 0.5517f, -0.3632f, -0.1922f, -0.9497f, 0.2503f, -0.2921f}; - latents_std_vec = {2.8184f, 1.4541f, 2.3275f, 2.6558f, 1.2196f, 1.7708f, 2.6052f, 2.0743f, - 3.2687f, 2.1526f, 2.8652f, 1.5579f, 1.6382f, 1.1253f, 2.8251f, 1.9160f}; - } else if (latents->ne[channel_dim] == 48) { // Wan2.2 VAE - latents_mean_vec = {-0.2289f, -0.0052f, -0.1323f, -0.2339f, -0.2799f, 0.0174f, 0.1838f, 0.1557f, - -0.1382f, 0.0542f, 0.2813f, 0.0891f, 0.1570f, -0.0098f, 0.0375f, -0.1825f, - -0.2246f, -0.1207f, -0.0698f, 0.5109f, 0.2665f, -0.2108f, -0.2158f, 0.2502f, - -0.2055f, -0.0322f, 0.1109f, 0.1567f, -0.0729f, 0.0899f, -0.2799f, -0.1230f, - -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f, - 0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f}; - latents_std_vec = { - 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, - 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, - 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, - 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f, - 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, - 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}; + std::pair, sd::Tensor> get_latents_mean_std(const sd::Tensor& latents) { + int channel_dim = latents.dim() == 5 ? 
3 : 2; + std::vector stats_shape(static_cast(latents.dim()), 1); + if (latents.shape()[channel_dim] == 16) { // Wan2.1 VAE + stats_shape[static_cast(channel_dim)] = 16; + + auto mean_tensor = sd::Tensor::from_vector({-0.7571f, -0.7089f, -0.9113f, 0.1075f, -0.1745f, 0.9653f, -0.1517f, 1.5508f, + 0.4134f, -0.0715f, 0.5517f, -0.3632f, -0.1922f, -0.9497f, 0.2503f, -0.2921f}); + mean_tensor.reshape_(stats_shape); + auto std_tensor = sd::Tensor::from_vector({2.8184f, 1.4541f, 2.3275f, 2.6558f, 1.2196f, 1.7708f, 2.6052f, 2.0743f, + 3.2687f, 2.1526f, 2.8652f, 1.5579f, 1.6382f, 1.1253f, 2.8251f, 1.9160f}); + std_tensor.reshape_(stats_shape); + return {std::move(mean_tensor), std::move(std_tensor)}; } + if (latents.shape()[channel_dim] == 48) { // Wan2.2 VAE + stats_shape[static_cast(channel_dim)] = 48; + + auto mean_tensor = sd::Tensor::from_vector({-0.2289f, -0.0052f, -0.1323f, -0.2339f, -0.2799f, 0.0174f, 0.1838f, 0.1557f, + -0.1382f, 0.0542f, 0.2813f, 0.0891f, 0.1570f, -0.0098f, 0.0375f, -0.1825f, + -0.2246f, -0.1207f, -0.0698f, 0.5109f, 0.2665f, -0.2108f, -0.2158f, 0.2502f, + -0.2055f, -0.0322f, 0.1109f, 0.1567f, -0.0729f, 0.0899f, -0.2799f, -0.1230f, + -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f, + 0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f}); + mean_tensor.reshape_(stats_shape); + auto std_tensor = sd::Tensor::from_vector({0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, + 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, + 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, + 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f, + 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, + 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}); + std_tensor.reshape_(stats_shape); + return {std::move(mean_tensor), std::move(std_tensor)}; + } + GGML_ABORT("unexpected latent channel dimension 
%lld for version %d", + (long long)latents.shape()[channel_dim], + version); } - ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) { - ggml_tensor* vae_latents = ggml_dup(work_ctx, latents); - int channel_dim = sd_version_is_wan(version) ? 3 : 2; - std::vector latents_mean_vec; - std::vector latents_std_vec; - get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec); - - float mean; - float std_; - for (int i = 0; i < latents->ne[3]; i++) { - if (channel_dim == 3) { - mean = latents_mean_vec[i]; - std_ = latents_std_vec[i]; - } - for (int j = 0; j < latents->ne[2]; j++) { - if (channel_dim == 2) { - mean = latents_mean_vec[j]; - std_ = latents_std_vec[j]; - } - for (int k = 0; k < latents->ne[1]; k++) { - for (int l = 0; l < latents->ne[0]; l++) { - float value = ggml_ext_tensor_get_f32(latents, l, k, j, i); - value = value * std_ / scale_factor + mean; - ggml_ext_tensor_set_f32(vae_latents, value, l, k, j, i); - } - } - } - } - - return vae_latents; + sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) override { + auto [mean_tensor, std_tensor] = get_latents_mean_std(latents); + return (latents * std_tensor) / scale_factor + mean_tensor; } - ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) { - ggml_tensor* diffusion_latents = ggml_dup(work_ctx, latents); - int channel_dim = sd_version_is_wan(version) ? 
3 : 2; - std::vector latents_mean_vec; - std::vector latents_std_vec; - get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec); - - float mean; - float std_; - for (int i = 0; i < latents->ne[3]; i++) { - if (channel_dim == 3) { - mean = latents_mean_vec[i]; - std_ = latents_std_vec[i]; - } - for (int j = 0; j < latents->ne[2]; j++) { - if (channel_dim == 2) { - mean = latents_mean_vec[j]; - std_ = latents_std_vec[j]; - } - for (int k = 0; k < latents->ne[1]; k++) { - for (int l = 0; l < latents->ne[0]; l++) { - float value = ggml_ext_tensor_get_f32(latents, l, k, j, i); - value = (value - mean) * scale_factor / std_; - ggml_ext_tensor_set_f32(diffusion_latents, value, l, k, j, i); - } - } - } - } - return diffusion_latents; + sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) override { + auto [mean_tensor, std_tensor] = get_latents_mean_std(latents); + return ((latents - mean_tensor) * scale_factor) / std_tensor; } int get_encoder_output_channels(int input_channels) { return static_cast(ae.z_dim); } - ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) { - ggml_cgraph* gf = new_graph_custom(10240 * z->ne[2]); - - z = to_backend(z); + ggml_cgraph* build_graph(const sd::Tensor& z_tensor, bool decode_graph) { + ggml_cgraph* gf = new_graph_custom(10240 * z_tensor.shape()[2]); + ggml_tensor* z = make_input(z_tensor); auto runner_ctx = get_context(); @@ -1240,7 +1201,7 @@ namespace WAN { return gf; } - ggml_cgraph* build_graph_partial(ggml_tensor* z, bool decode_graph, int i) { + ggml_cgraph* build_graph_partial(const sd::Tensor& z_tensor, bool decode_graph, int i) { ggml_cgraph* gf = new_graph_custom(20480); ae.clear_cache(); @@ -1250,7 +1211,7 @@ namespace WAN { ae._feat_map[feat_idx] = feat_cache; } - z = to_backend(z); + ggml_tensor* z = make_input(z_tensor); auto runner_ctx = get_context(); @@ -1269,58 +1230,57 @@ namespace WAN { return gf; } - bool _compute(const int n_threads, - ggml_tensor* z, - bool decode_graph, 
- ggml_tensor** output, - ggml_context* output_ctx = nullptr) override { + sd::Tensor _compute(const int n_threads, + const sd::Tensor& z, + bool decode_graph) override { if (true) { + sd::Tensor input; + if (z.dim() == 4) { + input = z.unsqueeze(2); + } auto get_graph = [&]() -> ggml_cgraph* { - return build_graph(z, decode_graph); + if (input.empty()) { + return build_graph(z, decode_graph); + } else { + return build_graph(input, decode_graph); + } }; - return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); + auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, true), + input.empty() ? z.dim() : input.dim()); + if (!result.empty() && z.dim() == 4) { + result.squeeze_(2); + } + return result; } else { // chunk 1 result is weird ae.clear_cache(); - int64_t t = z->ne[2]; + int64_t t = z.shape()[2]; int i = 0; auto get_graph = [&]() -> ggml_cgraph* { return build_graph_partial(z, decode_graph, i); }; - ggml_tensor* out = nullptr; - bool res = GGMLRunner::compute(get_graph, n_threads, true, &out, output_ctx); + auto out_opt = GGMLRunner::compute(get_graph, n_threads, true); + if (!out_opt.has_value()) { + return {}; + } + sd::Tensor out = std::move(*out_opt); ae.clear_cache(); if (t == 1) { - *output = out; - return res; + return out; } - *output = ggml_new_tensor_4d(output_ctx, GGML_TYPE_F32, out->ne[0], out->ne[1], (t - 1) * 4 + 1, out->ne[3]); - - auto copy_to_output = [&]() { - for (int64_t i3 = 0; i3 < out->ne[3]; i3++) { - for (int64_t i2 = 0; i2 < out->ne[2]; i2++) { - for (int64_t i1 = 0; i1 < out->ne[1]; i1++) { - for (int64_t i0 = 0; i0 < out->ne[0]; i0++) { - float value = ggml_ext_tensor_get_f32(out, i0, i1, i2, i3); - int64_t offset = (i == 0) ? 
0 : (1 + (i - 1) * 4); - ggml_ext_tensor_set_f32(*output, value, i0, i1, offset + i2, i3); - } - } - } - } - }; - - copy_to_output(); - - out = ggml_new_tensor_4d(output_ctx, GGML_TYPE_F32, out->ne[0], out->ne[1], 4, out->ne[3]); + sd::Tensor output = std::move(out); for (i = 1; i < t; i++) { - res = res || GGMLRunner::compute(get_graph, n_threads, true, &out); + auto chunk_opt = GGMLRunner::compute(get_graph, n_threads, true); + if (!chunk_opt.has_value()) { + return {}; + } + out = std::move(*chunk_opt); ae.clear_cache(); - copy_to_output(); + output = sd::ops::concat(output, out, 2); } free_cache_ctx_and_buffer(); - return res; + return output; } } @@ -1330,25 +1290,25 @@ namespace WAN { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); if (true) { // cpu f32, pass // cpu f16, pass // cuda f16, pass // cuda f32, pass - auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 104, 60, 2, 16); - ggml_set_f32(z, 0.5f); - z = load_tensor_from_file(work_ctx, "wan_vae_z.bin"); - print_ggml_tensor(z); - ggml_tensor* out = nullptr; + auto z = sd::load_tensor_from_file_as_tensor("wan_vae_z.bin"); + print_sd_tensor(z); + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - _compute(8, z, true, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = _compute(8, z, true); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("decode test done in %ldms", t1 - t0); } }; @@ -2229,23 +2189,23 @@ namespace WAN { wan.get_param_tensors(tensors, prefix); } - ggml_cgraph* build_graph(ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* clip_fea = nullptr, - ggml_tensor* c_concat = nullptr, - ggml_tensor* time_dim_concat = nullptr, - ggml_tensor* vace_context = nullptr, - 
float vace_strength = 1.f) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor = {}, + const sd::Tensor& clip_fea_tensor = {}, + const sd::Tensor& c_concat_tensor = {}, + const sd::Tensor& time_dim_concat_tensor = {}, + const sd::Tensor& vace_context_tensor = {}, + float vace_strength = 1.f) { ggml_cgraph* gf = new_graph_custom(WAN_GRAPH_SIZE); - x = to_backend(x); - timesteps = to_backend(timesteps); - context = to_backend(context); - clip_fea = to_backend(clip_fea); - c_concat = to_backend(c_concat); - time_dim_concat = to_backend(time_dim_concat); - vace_context = to_backend(vace_context); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); + ggml_tensor* context = make_optional_input(context_tensor); + ggml_tensor* clip_fea = make_optional_input(clip_fea_tensor); + ggml_tensor* c_concat = make_optional_input(c_concat_tensor); + ggml_tensor* time_dim_concat = make_optional_input(time_dim_concat_tensor); + ggml_tensor* vace_context = make_optional_input(vace_context_tensor); pe_vec = Rope::gen_wan_pe(static_cast(x->ne[2]), static_cast(x->ne[1]), @@ -2285,22 +2245,20 @@ namespace WAN { return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - ggml_tensor* clip_fea = nullptr, - ggml_tensor* c_concat = nullptr, - ggml_tensor* time_dim_concat = nullptr, - ggml_tensor* vace_context = nullptr, - float vace_strength = 1.f, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) { + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context = {}, + const sd::Tensor& clip_fea = {}, + const sd::Tensor& c_concat = {}, + const sd::Tensor& time_dim_concat = {}, + const sd::Tensor& vace_context = {}, + float vace_strength = 1.f) { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, clip_fea, 
c_concat, time_dim_concat, vace_context, vace_strength); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); } void test() { @@ -2309,36 +2267,38 @@ namespace WAN { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); { // cpu f16: pass // cuda f16: pass // cpu q8_0: pass - // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 104, 60, 1, 16); + // auto x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 104, 60, 1, 16); // ggml_set_f32(x, 0.01f); - auto x = load_tensor_from_file(work_ctx, "wan_dit_x.bin"); - print_ggml_tensor(x); + auto x = sd::load_tensor_from_file_as_tensor("wan_dit_x.bin"); + print_sd_tensor(x); std::vector timesteps_vec(3, 1000.f); timesteps_vec[0] = 0.f; - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); + auto timesteps = sd::Tensor::from_vector(timesteps_vec); - // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 4096, 512, 1); + // auto context = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 4096, 512, 1); // ggml_set_f32(context, 0.01f); - auto context = load_tensor_from_file(work_ctx, "wan_dit_context.bin"); - print_ggml_tensor(context); - // auto clip_fea = load_tensor_from_file(work_ctx, "wan_dit_clip_fea.bin"); + auto context = sd::load_tensor_from_file_as_tensor("wan_dit_context.bin"); + print_sd_tensor(context); + // auto clip_fea = load_tensor_from_file(ctx, "wan_dit_clip_fea.bin"); // print_ggml_tensor(clip_fea); - ggml_tensor* out = nullptr; + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - compute(8, x, timesteps, context, nullptr, nullptr, nullptr, nullptr, 1.f, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = compute(8, x, timesteps, context, {}, {}, {}, {}, 1.f); + int64_t t1 = 
ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + print_sd_tensor(out); LOG_DEBUG("wan test done in %lldms", t1 - t0); } } diff --git a/src/z_image.hpp b/src/z_image.hpp index 53a7cf82..363ce5f4 100644 --- a/src/z_image.hpp +++ b/src/z_image.hpp @@ -481,20 +481,21 @@ namespace ZImage { z_image.get_param_tensors(tensors, prefix); } - ggml_cgraph* build_graph(ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - std::vector ref_latents = {}, - bool increase_ref_index = false) { + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timesteps_tensor, + const sd::Tensor& context_tensor, + const std::vector>& ref_latents_tensor = {}, + bool increase_ref_index = false) { + ggml_cgraph* gf = new_graph_custom(Z_IMAGE_GRAPH_SIZE); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timesteps = make_input(timesteps_tensor); GGML_ASSERT(x->ne[3] == 1); - ggml_cgraph* gf = new_graph_custom(Z_IMAGE_GRAPH_SIZE); - - x = to_backend(x); - context = to_backend(context); - timesteps = to_backend(timesteps); - - for (int i = 0; i < ref_latents.size(); i++) { - ref_latents[i] = to_backend(ref_latents[i]); + GGML_ASSERT(!context_tensor.empty()); + ggml_tensor* context = make_input(context_tensor); + std::vector ref_latents; + ref_latents.reserve(ref_latents_tensor.size()); + for (const auto& ref_latent_tensor : ref_latents_tensor) { + ref_latents.push_back(make_input(ref_latent_tensor)); } pe_vec = Rope::gen_z_image_pe(static_cast(x->ne[1]), @@ -530,14 +531,12 @@ namespace ZImage { return gf; } - bool compute(int n_threads, - ggml_tensor* x, - ggml_tensor* timesteps, - ggml_tensor* context, - std::vector ref_latents = {}, - bool increase_ref_index = false, - ggml_tensor** output = nullptr, - ggml_context* output_ctx = nullptr) { + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timesteps, + const sd::Tensor& context, + const std::vector>& ref_latents = {}, + bool 
increase_ref_index = false) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size] @@ -545,7 +544,7 @@ namespace ZImage { return build_graph(x, timesteps, context, ref_latents, increase_ref_index); }; - return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); } void test() { @@ -554,30 +553,37 @@ namespace ZImage { params.mem_buffer = nullptr; params.no_alloc = false; - ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); { - // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1); + // auto x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 16, 1); // ggml_set_f32(x, 0.01f); - auto x = load_tensor_from_file(work_ctx, "./z_image_x.bin"); - print_ggml_tensor(x); + auto x = sd::load_tensor_from_file_as_tensor("./z_image_x.bin"); + print_sd_tensor(x); std::vector timesteps_vec(1, 0.f); - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); + auto timesteps = sd::Tensor::from_vector(timesteps_vec); - // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 2560, 256, 1); + // auto context = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 2560, 256, 1); // ggml_set_f32(context, 0.01f); - auto context = load_tensor_from_file(work_ctx, "./z_image_context.bin"); - print_ggml_tensor(context); + auto context = sd::load_tensor_from_file_as_tensor("./z_image_context.bin"); + print_sd_tensor(context); - ggml_tensor* out = nullptr; + sd::Tensor out; - int64_t t0 = ggml_time_ms(); - compute(8, x, timesteps, context, {}, false, &out, work_ctx); - int64_t t1 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); + auto out_opt = compute(8, + x, + timesteps, + context, + {}, + false); + int64_t t1 = ggml_time_ms(); - print_ggml_tensor(out); + GGML_ASSERT(!out_opt.empty()); + out = std::move(out_opt); + 
print_sd_tensor(out); LOG_DEBUG("z_image test done in %lldms", t1 - t0); } }